In [1]:
import itertools

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [2]:
def create_torch_dataset(data_dir='../PHM09_competition_1', num_runs=560,
                         num_rows=122824,
                         columns=('Input Voltage', 'Output Voltage', 'Tachometer')):
    """
    Load the per-run CSV files, min-max scale each run, and stack them into one tensor.

    Files are read as ``{data_dir}/Run_{i}.csv`` for ``i`` in ``1..num_runs``. The
    files are read without a header row (``names=columns``), and at most
    ``num_rows`` rows are taken from each file.

    Parameters:
    data_dir (str): Directory holding the ``Run_<i>.csv`` files.
    num_runs (int): Number of runs to load, starting at ``Run_1.csv``.
    num_rows (int): Maximum number of rows read from each file.
    columns (sequence of str): Column names assigned to the CSV columns.

    Returns:
    torch.Tensor: float32 tensor of shape (num_runs, 1, rows, len(columns)).
        Every file must yield the same number of rows, otherwise ``torch.stack``
        cannot combine the per-run tensors.

    Raises:
    ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    column_names = list(columns)
    # fit_transform() refits the scaler on every call, so each run is scaled
    # to [0, 1] using only its own per-column minimum and maximum.
    scaler = MinMaxScaler()

    run_tensors = []
    for run_id in range(1, num_runs + 1):
        run_df = pd.read_csv(f'{data_dir}/Run_{run_id}.csv',
                             names=column_names, nrows=num_rows)
        normalized_run = scaler.fit_transform(run_df)
        run_tensor = torch.tensor(normalized_run, dtype=torch.float32)
        # The singleton axis is kept so the returned layout stays
        # (num_runs, 1, rows, columns) for existing callers.
        run_tensors.append(run_tensor.unsqueeze(dim=0))

    return torch.stack(run_tensors, dim=0)

# Load every run once when this cell executes; the stacked tensor is reused
# by the anomaly-detection cell below.
tensor_data = create_torch_dataset()

In [3]:
def scoring_function(predictions, expected_anomaly_rate):
    """
    Score a detector by how far its flagged fraction lies from a target rate.

    Parameters:
    predictions (numpy array): Model output labels; entries equal to -1 count as anomalies.
    expected_anomaly_rate (float): Target fraction of anomalies (between 0 and 1).

    Returns:
    float: Absolute gap between the observed and the target anomaly fraction. Lower is better.
    """
    # Boolean mask of the samples the model flagged as anomalous
    is_anomaly = predictions == -1

    # The mean of a boolean mask is the fraction of True entries
    observed_rate = np.mean(is_anomaly)

    return abs(expected_anomaly_rate - observed_rate)
    
# IsolationForest takes a 2-D (n_samples, n_features) input, so everything after
# the first axis of tensor_data is collapsed into one feature vector per entry.
flattened_data = tensor_data.view(tensor_data.shape[0], -1)

# IsolationForest is randomized (sub-sampling, feature and split selection).
# A fixed seed makes the reported anomaly indices reproducible across re-runs.
RANDOM_STATE = 42

best_score = float('inf')
best_params = {}
predictions_best = None

# Define a range of values for each parameter
n_estimators_range = [50, 100, 200]
max_samples_range = [0.25, 0.5, 0.75, 1.0]
contamination_range = [0.04]  # expected share of anomalies: 560 * 0.04 = 22.4
bootstrap_range = [True, False]
# NOTE(review): integer max_features is an absolute feature count per tree, and
# after flattening each sample has far more than 3 features -- confirm that
# training each tree on 1 or 2 of them is intended.
max_features_range = [1, 2]

# around 22 anomalies
expected_anomaly_rate = 0.04

# NOTE(review): with `contamination` fixed, the decision threshold is derived
# from the training scores so that roughly that share of the training data is
# flagged. Every combination therefore scores almost the same, and the strict
# `<` below keeps the earliest of equally good combinations. Consider a more
# discriminating selection criterion.

# Iterate over all possible combinations of parameters. product() varies the
# last range fastest, i.e. the same order as nesting one loop per range.
param_grid = itertools.product(n_estimators_range, max_samples_range,
                               contamination_range, bootstrap_range,
                               max_features_range)

for n_estimators, max_samples, contamination, bootstrap, max_features in param_grid:
    # Initialize the Isolation Forest with the current set of parameters
    iso_forest = IsolationForest(n_estimators=n_estimators, max_samples=max_samples,
                                 contamination=contamination, bootstrap=bootstrap,
                                 max_features=max_features,
                                 random_state=RANDOM_STATE)

    # Fit and predict on the same data (unsupervised: there is no hold-out split)
    iso_forest.fit(flattened_data)
    predictions = iso_forest.predict(flattened_data)

    # Distance between the flagged fraction and the expected anomaly rate
    score = scoring_function(predictions, expected_anomaly_rate)

    # If this is the best score so far, save the parameters and predictions
    if score < best_score:
        best_score = score
        best_params = {'n_estimators': n_estimators, 'max_samples': max_samples,
                       'contamination': contamination, 'bootstrap': bootstrap,
                       'max_features': max_features}
        predictions_best = predictions

# Print out the best parameters
print("Best Parameters:", best_params)

# Separating indices of anomalies and normal
anomalies = np.where(predictions_best == -1)[0]
normal = np.where(predictions_best == 1)[0]

# Print anomalies and normal indices
print(f"Total Anomaly: {len(anomalies)}\n Indices: {anomalies}\n\n" )

Best Parameters: {'n_estimators': 50, 'max_samples': 0.25, 'contamination': 0.04, 'bootstrap': True, 'max_features': 1}
Total Anomaly: 23
 Indices: [  8  78  88  96 126 162 179 183 207 269 288 345 396 403 441 468 472 482
 483 532 537 553 554]




In [10]:
# One-row record describing this model's result; it is turned into a DataFrame
# row by append_to_csv. The list-valued fields hold exactly one element, and
# pandas broadcasts the scalar None to that single row.
data = {
    'model': ['IsolationForest'],
    # .tolist() yields plain Python ints. list(anomalies) would keep NumPy
    # scalars, whose repr under NumPy >= 2 is "np.int64(8)" and would end up
    # verbatim in the CSV text.
    'anomalies': [anomalies.tolist()],
    'length': [len(anomalies)],
    'params': [str(best_params)],
    'hyper_params':None,
}

In [13]:
def append_to_csv(data, path='anomaly_results.csv'):
    """
    Insert or replace one model's result row in the results CSV.

    The record is matched on the first value of its ``'model'`` field. If the
    file already has rows for that model, they are replaced by the new record
    at the position of the first such row; otherwise the record is appended.
    Values are aligned by column name, so the column order of the file does
    not have to match the key order of ``data``.

    Parameters:
    data (dict): Column name -> values, as accepted by ``pd.DataFrame``; must
        contain a non-empty ``'model'`` entry.
    path (str): CSV file to update. A missing or empty file is created.

    Returns:
    pandas.DataFrame: The full table that was written to ``path``.

    Raises:
    KeyError: If an existing, non-empty file has no ``'model'`` column.
    """
    new_data = pd.DataFrame(data)

    try:
        existing_data = pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Nothing stored yet: the new record becomes the whole table.
        existing_data = None

    if existing_data is None:
        combined = new_data
    else:
        model_name = new_data['model'].iloc[0]
        stale = (existing_data['model'] == model_name).to_numpy()

        if stale.any():
            # All rows before the first stale row are kept rows, so its index
            # is also the insertion position within `kept`.
            insert_at = int(stale.argmax())
            kept = existing_data[~stale]
            pieces = [kept.iloc[:insert_at], new_data, kept.iloc[insert_at:]]
        else:
            pieces = [existing_data, new_data]

        # Empty frames are dropped so they cannot influence the result dtypes.
        pieces = [piece for piece in pieces if not piece.empty]
        combined = pd.concat(pieces, ignore_index=True)

        # Keep the file's column order; columns only present in the new
        # record go last.
        extra_columns = [col for col in new_data.columns
                         if col not in existing_data.columns]
        combined = combined[list(existing_data.columns) + extra_columns]

    combined.to_csv(path, index=False)
    return combined

# Write this model's record to the results CSV and keep the returned table.
df = append_to_csv(data)
# Bare last expression so the notebook renders the table.
df

Unnamed: 0,model,anomalies,length,params,hyper_params
0,IsolationForest,"[9, 53, 79, 84, 125, 149, 162, 172, 174, 179, ...",23,"{'n_estimators': 50, 'max_samples': 0.25, 'con...",
1,Autoencoder,"[53, 82, 88, 111, 155, 162, 192, 195, 207, 269...",26,"{'n_epochs': 100, 'loss_fn': 'L1Loss', 'params...",
