In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold, StratifiedKFold
import warnings

# Suppress specific warnings (both filters match on the start of the warning message).
# NOTE(review): the first message is presumably the one scikit-learn's search
# classes emit when a candidate's cross-validation score is NaN; silencing it can
# hide a scorer that is failing on every fold — confirm this is intentional.
warnings.filterwarnings("ignore", category=UserWarning, message="One or more of the test scores are non-finite")
# Silences the feature-name mismatch warning raised when an estimator fitted on a
# DataFrame is later given input without column names.
warnings.filterwarnings("ignore", category=UserWarning, message="X does not have valid feature names")


In [None]:
# Folder on the mounted Google Drive that holds every CSV read by this notebook.
# Kept in one place so the location can be changed without editing each read.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/data for if & lof'

# Feature matrices. The tuning cell pairs X1 / X2 / X3 with the imputation
# labels 'zero' / 'median' / 'mean', in that order.
X1 = pd.read_csv(f'{DATA_DIR}/X1.csv')
X2 = pd.read_csv(f'{DATA_DIR}/X2.csv')
X3 = pd.read_csv(f'{DATA_DIR}/X3.csv')

# Frames that carry the 'anomaly' label column: one per sampling strategy
# compared below, plus the full frame.
df_stratified = pd.read_csv(f'{DATA_DIR}/df_stratified.csv')
df_random = pd.read_csv(f'{DATA_DIR}/df_random.csv')
df = pd.read_csv(f'{DATA_DIR}/df.csv')

In [None]:
# NOTE(review): X1 is already read from this same file in the previous cell, so
# this reload looks redundant — confirm nothing modifies X1 in between before removing it.
X1 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data for if & lof/X1.csv')

In [None]:
# Ground-truth labels taken from the full frame `df`; the tuning cells treat
# label 0 as an inlier.
# NOTE(review): `y` follows the row order of `df`. Confirm that order matches
# `df_stratified` / `df_random` before using `y` to mask rows of those frames.
y = df['anomaly']

In [None]:
# Define the desired sampling ratio (1% of the original dataset size).
# The tuning cells below assign the same value again at their top.
sampling_ratio = 0.01

# First hyperparameter tuning
using GridSearchCV on some important hyperparameters in Isolation Forest (contamination, n_estimators, bootstrap).
From first hyperparameter tuning, we can get the best combination of sampling methods and imputation methods, and also the best hyperparameters.

- According to Mensi & Manuele Bicego(2019), applying the Isolation Forest with Novelty Detection framework i.e. only inliers are used in the training phase, will result in better ROC AUC score, but worse classification report.

For more details see:
https://link.springer.com/chapter/10.1007/978-3-030-30642-7_14

In [None]:
%%time
# Define the desired sampling ratio (1% of the original dataset size)
sampling_ratio = 0.01

# Define the number of folds for cross-validation
num_folds = 5

# Define the imputation methods
imputation_methods = ['zero', 'median', 'mean']


# Initialize cross-validation
kf = KFold(n_splits=num_folds)
skf = StratifiedKFold(n_splits=num_folds)

# Define the range for contamination values
contamination_values = np.linspace(0.01, 0.1, num=10)

# Define the hyperparameter search space here
param_grid = {
    'contamination': contamination_values,
}

# Initialize a list to store results
results = []

for impute_method, X_imputed in zip(imputation_methods, [X1, X2, X3]):
    for stratified, df_sampling in [(True, df_stratified), (False, df_random)]:
        fold_results = []
        for train_idx, test_idx in kf.split(df_sampling):
            # Randomly select the specified sample size from the training data
            sample_size = int(len(train_idx) * sampling_ratio)
            selected_train_idx = np.random.choice(train_idx, size=sample_size, replace=False)

            # Create a new dataset with random sampling
            X_train, y_train = X_imputed.iloc[selected_train_idx][y[selected_train_idx] == 0], df_sampling.iloc[selected_train_idx]['anomaly'][y[selected_train_idx] == 0]
            X_test, y_test = X_imputed.iloc[test_idx], df_sampling.iloc[test_idx]['anomaly']

            # Define the Isolation Forest model
            model = IsolationForest(random_state=42)

            # Perform grid search with cross-validation
            grid_search = GridSearchCV(model, param_grid, cv=3, scoring="f1",n_jobs=-1)
            grid_search.fit(X_train, y_train)

            # Get the best hyperparameters
            best_params = grid_search.best_params_

            # Train the model with the best hyperparameters
            best_model = IsolationForest(**best_params, random_state=42)
            best_model.fit(X_train)

            # Evaluate the model on the testing set
            y_pred = best_model.predict(X_test)

            # Convert predictions to 0 (normal) and 1 (anomaly)
            y_pred_binary = np.where(y_pred == -1, 1, 0)

            # Evaluate performance using F1 score
            roc_auc = roc_auc_score(y_test, y_pred_binary)

            fold_results.append(roc_auc)

           # Check if ROC AUC score is not NaN
            if not np.isnan(roc_auc):
                fold_results.append(roc_auc)

        # Store the results for this combination of imputation and sampling
        results.append({
            'Imputation': impute_method,
            'Stratified Sampling': stratified,
            'AUC-ROC Score': np.mean(fold_results),
        })

# Print the results
for result in results:
    print(f"Imputation: {result['Imputation']}, Stratified Sampling: {result['Stratified Sampling']}, "
          f"AUC-ROC Score: {result['AUC-ROC Score']:.4f}")
print(best_model)


Imputation: zero, Stratified Sampling: True, AUC-ROC Score: 0.4996 Best Model: IsolationForest(bootstrap=True, contamination=0.01, n_estimators=10,
                random_state=42)
Imputation: zero, Stratified Sampling: False, AUC-ROC Score: 0.4953 Best Model: IsolationForest(bootstrap=True, contamination=0.01, n_estimators=10,
                random_state=42)
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.5229 Best Model: IsolationForest(bootstrap=True, contamination=0.01, n_estimators=10,
                random_state=42)
Imputation: median, Stratified Sampling: False, AUC-ROC Score: 0.4947 Best Model: IsolationForest(bootstrap=True, contamination=0.01, n_estimators=10,
                random_state=42)
Imputation: mean, Stratified Sampling: True, AUC-ROC Score: 0.5001 Best Model: IsolationForest(bootstrap=True, contamination=0.01, n_estimators=10,
                random_state=42)
Imputation: mean, Stratified Sampling: False, AUC-ROC Score: 0.4961 Best Model: Isolation

In [None]:
# Show the most recently fitted model. `best_model` is reassigned on every fold
# of the tuning loop, so this is only the last fit that ran, not a per-combination best.
print(best_model)

IsolationForest(bootstrap=True, contamination=0.01, n_estimators=5,
                random_state=42)


From here, we can see that applying the Isolation Forest with Novelty Detection framework i.e. only inliers are used in the training phase, will result in better ROC AUC score, but worse classification report.

### The **best combination** of imputation method and sampling method is:

 Imputation: median(X2) with Stratified Sampling (AUC-ROC Score: 0.5229)

## Second hyperparameter tuning
use the the best combination of sampling methods and imputation methods as our dataset(which we get from first hyperparameter tuning) using GridSearchCV on the rest of the hyperparameters in Isolation Forest (contamination, n_estimators, bootstrap).
From second hyperparameter tuning, we can get the final best hyperparameters for Isolation Forest.

In [None]:
%%time
# Define the desired sampling ratio (1% of the original dataset size)
sampling_ratio = 0.01

# Define the number of folds for cross-validation
num_folds = 7

# Define the imputation methods
imputation_methods = ['median']


# Initialize cross-validation
kf = KFold(n_splits=num_folds)
skf = StratifiedKFold(n_splits=num_folds)

# Define the hyperparameter search space here
param_grid = {
    'contamination': [0.01],
    'n_estimators': [5],
    'bootstrap': [True],
    'verbose': [1,2,3,4],
    'warm_start': [True],
}

  fold_results = []
  for train_idx, test_idx in kf.split(df_sampling):
      # Randomly select the specified sample size from the training data
      sample_size = int(len(train_idx) * sampling_ratio)
      selected_train_idx = np.random.choice(train_idx, size=sample_size, replace=False)

      # Create a new dataset with random sampling
      X_train, y_train = X2.iloc[selected_train_idx][y[selected_train_idx] == 0], df_stratified.iloc[selected_train_idx]['anomaly'][y[selected_train_idx] == 0]
      X_test, y_test = X2.iloc[test_idx], df_stratified.iloc[test_idx]['anomaly']

      # Define the Isolation Forest model
      model = IsolationForest(random_state=42)

      # Perform grid search with cross-validation
      grid_search = GridSearchCV(model, param_grid, cv=3, scoring="f1",n_jobs=-1)
      grid_search.fit(X_train, y_train)

      # Get the best hyperparameters
      best_params = grid_search.best_params_

      # Train the model with the best hyperparameters
      best_model = IsolationForest(**best_params, random_state=42)
      best_model.fit(X_train)

      # Evaluate the model on the testing set
      y_pred = best_model.predict(X_test)

      # Convert predictions to 0 (normal) and 1 (anomaly)
      y_pred_binary = np.where(y_pred == -1, 1, 0)

      # Evaluate performance using F1 score
      roc_auc = roc_auc_score(y_test, y_pred_binary)

      # Check if ROC AUC score is not NaN
      if not np.isnan(roc_auc):
          fold_results.append(roc_auc)

# Print the results
print(f"Imputation: median, Stratified Sampling: True, "
      f"AUC-ROC Score: ",np.mean(fold_results),:.4f)
print(best_model)

Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.5004
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4893
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4985
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4965
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4997
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4996
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.5082
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4966
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.5081
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.5032
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4963
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.5093
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.5029
Imputation: median, Stratified Sampling: True, AUC-ROC Score: 0.4917
Imputation: median, Stratified Sam

### The **final best hyperparameters** for Isolation Forest is:
bootstrap=True, contamination=0.01, n_estimators=5, warm_start=True, random_state=42, verbose=1