In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import math
import os
import plotly.graph_objects as go

### Experiment 1
- Default Parameters
- Normal Distribution

In [2]:
# Experiment 1 configuration: folder of labelled input CSVs and the folder
# that receives one scored CSV per input file (same file name).
folder_path = 'normal_data_labelled/'
result_path = 'if_results_normal_default_parameters'
# exist_ok=True makes directory creation idempotent, so the cell can be re-run.
os.makedirs(result_path, exist_ok=True)

# NOTE(review): this per-file pipeline is repeated in the other experiment
# cells; a shared helper taking (folder, result folder, model kwargs) would
# remove the duplication.
# os.listdir returns entries in arbitrary order; sorting keeps the progress
# log reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue

    # Load one file, keep only rows that have a cycle_time, and restrict the
    # frame to the columns that are written back out.
    df = pd.read_csv(os.path.join(folder_path, filename))
    df = df.dropna(subset=['cycle_time'])
    df = df[['shot_time', 'COUNTER_ID', 'cycle_time', 'shot_id']].copy()
    if df.empty:
        # IsolationForest cannot be fitted on zero rows; skip this file
        # instead of aborting the whole batch.
        print(f"----------- {filename} skipped (no rows with cycle_time) -----------")
        continue
    # Truncate cycle_time down to whole numbers before modelling.
    df['cycle_time'] = df['cycle_time'].apply(math.floor)

    # Fit a fresh Isolation Forest per file on the single cycle_time feature.
    data = ['cycle_time']
    if_model = IsolationForest(contamination='auto', random_state=42)
    if_model.fit(df[data])

    # decision_function: score relative to the fitted offset (negative means
    # outlier per the scikit-learn docs). score_samples: raw score, shifted
    # by +1 here. The 'anomoly' column spelling is kept so the output schema
    # is unchanged.
    df['anomoly_score'] = if_model.decision_function(df[data])
    df['anomoly_score_inverse'] = if_model.score_samples(df[data]) + 1

    # predict returns -1 for outliers and 1 for inliers; store 1 = anomaly,
    # 0 = normal.
    df['output'] = (if_model.predict(df[data]) == -1).astype('int64')

    # Write the scored frame next to its siblings under result_path.
    result_filename = os.path.join(result_path, filename)
    df.to_csv(result_filename, index=False)
    print(f"----------- {filename} complete -----------")


----------- EMA2233M10017_model_response.csv complete -----------
----------- EMA2233M10024_model_response.csv complete -----------
----------- EMA2233M10035_model_response.csv complete -----------
----------- EMA2233M10100_model_response.csv complete -----------
----------- EMA2233M10102_model_response.csv complete -----------
----------- EMA2233M10103_model_response.csv complete -----------
----------- EMA2303M10302_model_response.csv complete -----------


### Experiment 2
- Default Parameters
- Anomalous Distribution

In [3]:
# Experiment 2 configuration: folder of labelled input CSVs and the folder
# that receives one scored CSV per input file (same file name).
folder_path = 'anomalous_data_labelled/'
result_path = 'if_results_anomalous_default_parameters'
# exist_ok=True makes directory creation idempotent, so the cell can be re-run.
os.makedirs(result_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the progress
# log reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue

    # Load one file, keep only rows that have a cycle_time, and restrict the
    # frame to the columns that are written back out.
    df = pd.read_csv(os.path.join(folder_path, filename))
    df = df.dropna(subset=['cycle_time'])
    df = df[['shot_time', 'COUNTER_ID', 'cycle_time', 'shot_id']].copy()
    if df.empty:
        # IsolationForest cannot be fitted on zero rows; skip this file
        # instead of aborting the whole batch.
        print(f"----------- {filename} skipped (no rows with cycle_time) -----------")
        continue
    # Truncate cycle_time down to whole numbers before modelling.
    df['cycle_time'] = df['cycle_time'].apply(math.floor)

    # Fit a fresh Isolation Forest per file on the single cycle_time feature.
    data = ['cycle_time']
    if_model = IsolationForest(contamination='auto', random_state=42)
    if_model.fit(df[data])

    # decision_function: score relative to the fitted offset (negative means
    # outlier per the scikit-learn docs). score_samples: raw score, shifted
    # by +1 here. The 'anomoly' column spelling is kept so the output schema
    # is unchanged.
    df['anomoly_score'] = if_model.decision_function(df[data])
    df['anomoly_score_inverse'] = if_model.score_samples(df[data]) + 1

    # predict returns -1 for outliers and 1 for inliers; store 1 = anomaly,
    # 0 = normal.
    df['output'] = (if_model.predict(df[data]) == -1).astype('int64')

    # Write the scored frame next to its siblings under result_path.
    result_filename = os.path.join(result_path, filename)
    df.to_csv(result_filename, index=False)
    print(f"----------- {filename} complete -----------")


----------- all_combined_EMA2233M10295.csv complete -----------
----------- all_combined_EMA2233M10296.csv complete -----------
----------- all_combined_EMA2233M10297.csv complete -----------
----------- all_combined_EMA2233M10300.csv complete -----------
----------- all_combined_EMA2233M10302.csv complete -----------
----------- all_combined_EMA2233M10303.csv complete -----------
----------- all_combined_EMA2233M10304.csv complete -----------
----------- all_combined_EMA2233M10305.csv complete -----------
----------- all_combined_EMA2233M10308.csv complete -----------


### Experiment 3
- Fine Tuned Parameters - {'bootstrap': True, 'contamination': 'auto', 'max_features': 1.0, 'max_samples': 0.4, 'n_estimators': 100, 'warm_start': True}
- Normal Distribution

In [4]:
# Experiment 3 configuration: folder of labelled input CSVs and the folder
# that receives one scored CSV per input file (same file name).
folder_path = 'normal_data_labelled/'
result_path = 'if_results_normal_tuned_parameters'
# exist_ok=True makes directory creation idempotent, so the cell can be re-run.
os.makedirs(result_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the progress
# log reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue

    # Load one file, keep only rows that have a cycle_time, and restrict the
    # frame to the columns that are written back out.
    df = pd.read_csv(os.path.join(folder_path, filename))
    df = df.dropna(subset=['cycle_time'])
    df = df[['shot_time', 'COUNTER_ID', 'cycle_time', 'shot_id']].copy()
    if df.empty:
        # IsolationForest cannot be fitted on zero rows; skip this file
        # instead of aborting the whole batch.
        print(f"----------- {filename} skipped (no rows with cycle_time) -----------")
        continue
    # Truncate cycle_time down to whole numbers before modelling.
    df['cycle_time'] = df['cycle_time'].apply(math.floor)

    # Fit a fresh Isolation Forest per file on the single cycle_time feature,
    # using the tuned hyper-parameters for this experiment.
    # NOTE(review): warm_start=True only matters when fit() is called again on
    # the same estimator; a new model is built per file here, so it looks
    # inert. Kept to match the documented parameter set.
    data = ['cycle_time']
    if_model = IsolationForest(
        bootstrap=True,
        contamination='auto',
        max_features=1.0,
        max_samples=0.4,
        n_estimators=100,
        random_state=42,
        warm_start=True,
    )
    if_model.fit(df[data])

    # decision_function: score relative to the fitted offset (negative means
    # outlier per the scikit-learn docs). score_samples: raw score, shifted
    # by +1 here. The 'anomoly' column spelling is kept so the output schema
    # is unchanged.
    df['anomoly_score'] = if_model.decision_function(df[data])
    df['anomoly_score_inverse'] = if_model.score_samples(df[data]) + 1

    # predict returns -1 for outliers and 1 for inliers; store 1 = anomaly,
    # 0 = normal.
    df['output'] = (if_model.predict(df[data]) == -1).astype('int64')

    # Write the scored frame next to its siblings under result_path.
    result_filename = os.path.join(result_path, filename)
    df.to_csv(result_filename, index=False)
    print(f"----------- {filename} complete -----------")


----------- EMA2233M10017_model_response.csv complete -----------
----------- EMA2233M10024_model_response.csv complete -----------
----------- EMA2233M10035_model_response.csv complete -----------
----------- EMA2233M10100_model_response.csv complete -----------
----------- EMA2233M10102_model_response.csv complete -----------
----------- EMA2233M10103_model_response.csv complete -----------
----------- EMA2303M10302_model_response.csv complete -----------


### Experiment 4
- Fine Tuned Parameters - {'bootstrap': True, 'contamination': 'auto', 'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 50, 'warm_start': True}
- Anomalous Distribution

In [5]:
# Experiment 4 configuration: folder of labelled input CSVs and the folder
# that receives one scored CSV per input file (same file name).
folder_path = 'anomalous_data_labelled/'
# NOTE(review): 'anomolaus' is spelled differently from the input folder
# ('anomalous'); left unchanged so the output location does not move.
result_path = 'if_results_anomolaus_tuned_parameters'
# exist_ok=True makes directory creation idempotent, so the cell can be re-run.
os.makedirs(result_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the progress
# log reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue

    # Load one file, keep only rows that have a cycle_time, and restrict the
    # frame to the columns that are written back out.
    df = pd.read_csv(os.path.join(folder_path, filename))
    df = df.dropna(subset=['cycle_time'])
    df = df[['shot_time', 'COUNTER_ID', 'cycle_time', 'shot_id']].copy()
    if df.empty:
        # IsolationForest cannot be fitted on zero rows; skip this file
        # instead of aborting the whole batch.
        print(f"----------- {filename} skipped (no rows with cycle_time) -----------")
        continue
    # Truncate cycle_time down to whole numbers before modelling.
    df['cycle_time'] = df['cycle_time'].apply(math.floor)

    # Fit a fresh Isolation Forest per file on the single cycle_time feature,
    # using the tuned hyper-parameters for this experiment.
    # NOTE(review): warm_start=True only matters when fit() is called again on
    # the same estimator; a new model is built per file here, so it looks
    # inert. Kept to match the documented parameter set.
    data = ['cycle_time']
    if_model = IsolationForest(
        bootstrap=True,
        contamination='auto',
        max_features=1.0,
        max_samples=0.1,
        n_estimators=50,
        random_state=42,
        warm_start=True,
    )
    if_model.fit(df[data])

    # decision_function: score relative to the fitted offset (negative means
    # outlier per the scikit-learn docs). score_samples: raw score, shifted
    # by +1 here. The 'anomoly' column spelling is kept so the output schema
    # is unchanged.
    df['anomoly_score'] = if_model.decision_function(df[data])
    df['anomoly_score_inverse'] = if_model.score_samples(df[data]) + 1

    # predict returns -1 for outliers and 1 for inliers; store 1 = anomaly,
    # 0 = normal.
    df['output'] = (if_model.predict(df[data]) == -1).astype('int64')

    # Write the scored frame next to its siblings under result_path.
    result_filename = os.path.join(result_path, filename)
    df.to_csv(result_filename, index=False)
    print(f"----------- {filename} complete -----------")

----------- all_combined_EMA2233M10295.csv complete -----------
----------- all_combined_EMA2233M10296.csv complete -----------
----------- all_combined_EMA2233M10297.csv complete -----------
----------- all_combined_EMA2233M10300.csv complete -----------
----------- all_combined_EMA2233M10302.csv complete -----------
----------- all_combined_EMA2233M10303.csv complete -----------
----------- all_combined_EMA2233M10304.csv complete -----------
----------- all_combined_EMA2233M10305.csv complete -----------
----------- all_combined_EMA2233M10308.csv complete -----------


### Experiment 5
- Fine Tuned Parameters - {'bootstrap': False, 'contamination': 0.1, 'max_features': 0.1, 'max_samples': 0.3, 'n_estimators': 50, 'warm_start': True}
- Combined Data (Anomalous + Normal Distributions)

In [6]:
# Experiment 5 configuration: every CSV from both labelled folders is pooled
# into one frame, a single model is fitted on it, and one scored CSV is
# written to result_path.
folder_path = ['anomalous_data_labelled/', 'normal_data_labelled/']
result_path = 'if_results_combined_data_tuned_parameters'
# exist_ok=True makes directory creation idempotent, so the cell can be re-run.
os.makedirs(result_path, exist_ok=True)

size = 0      # running row count across all folders processed so far
frames = []   # per-file frames, concatenated once after the loop
for path in folder_path:
    print('Processing ----', path.rstrip('/'))
    # Sorted listing fixes the row order of the pooled frame; the forest
    # subsamples rows, so row order would otherwise depend on the arbitrary
    # order os.listdir returns.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.csv'):
            continue
        # Load one file, keep rows that have a cycle_time, and restrict the
        # frame to the columns that are written back out.
        file_df = pd.read_csv(os.path.join(path, filename))
        file_df = file_df.dropna(subset=['cycle_time'])
        file_df = file_df[['shot_time', 'COUNTER_ID', 'cycle_time', 'shot_id']].copy()
        if file_df.empty:
            # Contributes no rows; skipping avoids dtype noise in the concat.
            continue
        # Truncate cycle_time down to whole numbers before modelling.
        file_df['cycle_time'] = file_df['cycle_time'].apply(math.floor)
        frames.append(file_df)
        size += file_df.shape[0]
    # Cumulative number of rows collected after finishing this folder.
    print(size)

if not frames:
    raise ValueError(f"No usable CSV rows found in {folder_path}")

# Concatenate once (growing a frame inside the loop is quadratic) and give
# the pooled frame a fresh, unique index instead of repeated per-file labels.
combined_df = pd.concat(frames, ignore_index=True)

# Fit one Isolation Forest on the pooled single-feature data.
# NOTE(review): with only one feature column, max_features=0.1 presumably
# still resolves to one feature per tree, and warm_start=True looks inert for
# an estimator that is fitted once; confirm against the scikit-learn docs.
data = ['cycle_time']
if_model = IsolationForest(
    bootstrap=False,
    contamination=0.1,
    max_features=0.1,
    max_samples=0.3,
    n_estimators=50,
    random_state=42,
    warm_start=True,
)
if_model.fit(combined_df[data])

# decision_function: score relative to the fitted offset (negative means
# outlier per the scikit-learn docs). score_samples: raw score, shifted by +1
# here. The 'anomoly' column spelling is kept so the output schema is
# unchanged.
combined_df['anomoly_score'] = if_model.decision_function(combined_df[data])
combined_df['anomoly_score_inverse'] = if_model.score_samples(combined_df[data]) + 1

# predict returns -1 for outliers and 1 for inliers; store 1 = anomaly,
# 0 = normal.
combined_df['output'] = (if_model.predict(combined_df[data]) == -1).astype('int64')

# Write the pooled, scored frame as a single CSV.
filename = 'combined_fine_tuned_result.csv'
result_filename = os.path.join(result_path, filename)
combined_df.to_csv(result_filename, index=False)
print(f"----------- {filename} complete -----------")
combined_df.shape

Processing ---- anomalous_data_labelled
140235
Processing ---- normal_data_labelled
162747
----------- combined_fine_tuned_result.csv complete -----------


(162747, 7)