In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier



# Loading dataset
file_path = 'FinancialMarketData.xlsx'
data = pd.ExcelFile(file_path)
ews_data = data.parse('EWS')

In [2]:
ews_data.head()

Unnamed: 0,Y,Data,XAU BGNL,ECSURPUS,BDIY,CRY,DXY,JPY,GBP,Cl1,...,LP01TREU,EMUSTRUU,LF94TRUU,MXUS,MXEU,MXJP,MXBR,MXRU,MXIN,MXCN
0,0,2000-01-11,283.25,0.077,1388,157.26,100.56,105.86,1.646,25.77,...,116.4635,230.5267,123.7616,1416.12,127.75,990.59,856.76,224.33,217.34,34.3
1,0,2000-01-18,287.65,0.043,1405,165.01,101.86,105.47,1.6383,28.85,...,117.2674,231.377,123.7616,1428.79,129.5,993.98,925.22,234.37,227.08,32.74
2,0,2000-01-25,287.15,0.135,1368,167.24,102.41,106.04,1.6496,28.28,...,117.9946,232.3895,123.7616,1385.93,126.48,974.83,886.93,216.82,233.0,32.46
3,0,2000-02-01,282.75,0.191,1311,166.85,104.92,107.85,1.6106,28.22,...,120.51,231.9417,122.3281,1385.31,129.19,1007.12,842.6,201.89,237.48,31.29
4,1,2000-02-08,298.4,0.312,1277,165.43,104.22,109.3,1.6108,28.02,...,118.7914,237.8117,122.3281,1411.95,134.67,1034.58,945.15,218.0,258.02,31.32


## Preparing the data : 

In [3]:
# Separating features and target variable

features = ews_data.drop(columns=["Y", "Data"])  # here Data represents the date column (mispelled)
target = ews_data["Y"]

# Handling missing values using mean imputation
imputer = SimpleImputer(strategy="mean")
features_imputed = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)

# Scaling features for uniformity
scaler = StandardScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(features_imputed), columns=features.columns)



In [4]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(features_scaled, target)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

In [5]:
#  Training an Isolation Forest model 
model = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
model.fit(X_train)

In [6]:
# Predicting anomalies and evaluating the model
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Converting Isolation Forest outputs (-1 for anomaly, 1 for normal) to binary classification
y_pred_test_binary = [1 if pred == -1 else 0 for pred in y_pred_test]

# Evaluation Metrics
print("Classification Report:")
print(classification_report(y_test, y_pred_test_binary))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_test_binary))

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.92      0.67       179
           1       0.59      0.13      0.21       171

    accuracy                           0.53       350
   macro avg       0.56      0.52      0.44       350
weighted avg       0.56      0.53      0.44       350

ROC-AUC Score: 0.5224280440393348


In [7]:
# a Random Forest classifier with hyperparameter tuning
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_rf = grid_search.best_estimator_

y_pred = best_rf.predict(X_test)
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]

# Adjust threshold for classification
threshold = 0.4
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

# Evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred_adjusted))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.83      0.90       179
           1       0.85      0.98      0.91       171

    accuracy                           0.91       350
   macro avg       0.91      0.91      0.91       350
weighted avg       0.92      0.91      0.91       350

ROC-AUC Score: 0.98503708059721


In [8]:
import joblib

# Save the trained model and scaler
joblib.dump(best_rf, "rf_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(imputer, "imputer.pkl")


['imputer.pkl']