Anomalies are rare events or observations that deviate significantly from what is considered normal. Detecting anomalies is crucial in various fields to identify potential security breaches, fraudulent activities, or abnormal health conditions. 

Isolation Forest is a machine learning algorithm specifically designed for anomaly detection. It works by recursively partitioning the data with random splits: each tree repeatedly picks a random feature and a random split value until individual points are separated from the rest. The algorithm builds an ensemble of such trees and scores each point by how quickly it is isolated; anomalies, being few and different, are typically separated from the rest of the data in fewer splits than normal points. This approach is particularly effective for high-dimensional datasets and is known for its efficiency and scalability.

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Anomaly detection with an Isolation Forest.
# Pipeline: load the CSVs -> keep numeric features -> standardize -> fit the
# model -> score its predictions on the training set against 'Unusual'.

# Load training and test data (both files are read as ';'-delimited)
train_path = r"C:\Users\hp\Downloads\ML-MATT-CompetitionQT2021_train.csv"
test_path = r"C:\Users\hp\Downloads\ML-MATT-CompetitionQT1920_test.csv"
train_data = pd.read_csv(train_path, delimiter=';')
test_data = pd.read_csv(test_path, delimiter=';')

# 'Unusual' is the target variable in train_data
X_train = train_data.drop(columns=['Unusual'])
y_train = train_data['Unusual']

# Drop the non-numeric columns, coerce everything else to numbers
# (unparseable values become NaN) and discard rows with any missing value.
X_train = (
    X_train.drop(columns=['Time', 'CellName'])
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
# errors='ignore' lets the test file omit 'Time' / 'CellName' without failing.
X_test = (
    test_data.drop(columns=['Time', 'CellName'], errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Keep only the labels of the rows that survived dropna() so that X_train
# and y_train stay aligned row-for-row.
y_train = y_train.loc[X_train.index]

# Ensure both datasets have the same columns
common_columns = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_columns]
X_test = X_test[common_columns]

# Normalize the data. The scaler is fitted on the training set only and then
# re-used for the test set. These lines were previously commented out, which
# left X_train_scaled undefined and made the fit below raise NameError on a
# fresh kernel.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Outlier detection using Isolation Forest.
# `contamination` is the expected proportion of outliers in the data.
outlier_fraction = 0.1
model = IsolationForest(contamination=outlier_fraction, random_state=42)
model.fit(X_train_scaled)  # unsupervised: y_train is not used for fitting

# Predict on the training data
train_preds = model.predict(X_train_scaled)
train_preds = np.where(train_preds == 1, 0, 1)  # Convert 1 (inlier) to 0 and -1 (outlier) to 1

# Evaluate the model on the training data
accuracy = accuracy_score(y_train, train_preds)
precision = precision_score(y_train, train_preds)
recall = recall_score(y_train, train_preds)
f1 = f1_score(y_train, train_preds)

print(f"Training Accuracy: {accuracy}")
print(f"Training Precision: {precision}")
print(f"Training Recall: {recall}")
print(f"Training F1 Score: {f1}")
print("\nTraining Classification Report:\n", classification_report(y_train, train_preds))


Training Accuracy: 0.6624214177324951
Training Precision: 0.19181793551882959
Training Recall: 0.06952764411273692
Training F1 Score: 0.10206140983133918

Training Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.89      0.79     26721
           1       0.19      0.07      0.10     10183

    accuracy                           0.66     36904
   macro avg       0.45      0.48      0.45     36904
weighted avg       0.57      0.66      0.60     36904

