In [1]:
# 1. Import Libraries and Load Data

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV

import joblib
import pandas as pd
import sys
import os
sys.path.insert(0, os.path.abspath(".."))

from src.preprocessing import load_data, split_data, balance_classes, scale_features

# Make sure the output directory for trained models exists.
# exist_ok=True turns this into a single idempotent call: re-running the cell
# is safe and there is no gap between an existence check and the creation.
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

# Load the raw dataset that the experiments are run on
df = load_data('../data/raw/Data for Task 1.csv')

In [2]:
# 2. Define Configurations

# Build the experiment grid: every class-balancing option paired with every
# feature-scaling option (balancing varies slowest, scaling fastest).
experiments = [
    {'balance': balance_method, 'scaling': scaling_method}
    for balance_method in ('smote', 'adasyn', 'none')
    for scaling_method in ('standard', 'minmax')
]

# Candidate estimators, keyed by a short name.
# The tree-based models are given a fixed random_state.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000),
    DecisionTree=DecisionTreeClassifier(random_state=1),
    RandomForest=RandomForestClassifier(random_state=1),
    GradientBoosting=GradientBoostingClassifier(random_state=1),
)

# Hyper-parameter search space for each estimator name.
# Each inner mapping goes from a constructor argument to the values to try.
param_grids = dict(
    LogisticRegression=dict(
        C=[0.01, 0.1, 1, 10],
    ),
    DecisionTree=dict(
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
    ),
    RandomForest=dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 5, 10],
        min_samples_split=[2, 5],
    ),
    GradientBoosting=dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
    ),
)

In [3]:
# 3. Run Experiments and Sort Results

# Run every (balancing, scaling) configuration against every model, tune each
# model with a grid search, record its scores, and keep the estimator with the
# highest F1 on the test split.
#
# NOTE(review): the winner is picked with the same test split that produces the
# reported 'f1', so that figure is an optimistic estimate for the saved model.
# NOTE(review): balance_classes runs before GridSearchCV builds its folds, so
# 'cv_f1' is measured on whatever balance_classes returns (presumably resampled
# data for 'smote'/'adasyn') and may not be comparable with the 'none' rows.

# Store best overall results.
# best_f1 starts below any achievable score so the first evaluated model is
# always taken; with a 0 sentinel and a strict '>' comparison, a run in which
# every model scored 0.0 would leave best_model as None and pickle None.
best_model = None
best_f1 = float('-inf')

results = []
for config in experiments:
    # NOTE(review): the split is recomputed for each configuration; if
    # split_data is deterministic and the helpers do not modify their inputs,
    # it could be done once before the loop - confirm before hoisting.
    X_train, X_test, y_train, y_test = split_data(df)
    X_train_scaled, X_test_scaled = scale_features(X_train, X_test, method=config['scaling'])
    # Only the training portion goes through balance_classes; the test portion
    # is evaluated exactly as it comes out of scale_features.
    X_train_bal, y_train_bal = balance_classes(X_train_scaled, y_train, method=config['balance'])

    for model_name, model in models.items():
        # Tune this model's hyper-parameters with 3-fold CV, optimising F1
        grid_search = GridSearchCV(model, param_grids[model_name], cv=3, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train_bal, y_train_bal)

        y_pred = grid_search.best_estimator_.predict(X_test_scaled)
        current_f1 = f1_score(y_test, y_pred)

        # Strict '>' keeps the earliest candidate when scores tie
        if current_f1 > best_f1:
            best_model = grid_search.best_estimator_
            best_f1 = current_f1

        results.append({
            'model': model_name,
            **config,
            'best_params': grid_search.best_params_,
            'cv_f1': grid_search.best_score_,
            'f1': current_f1,
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
        })

# One row per (model, configuration), best test F1 first
results_df = pd.DataFrame(results).sort_values('f1', ascending=False)
display(results_df)

# Save the best overall model into the models directory set up in the first
# cell (model_dir) rather than repeating the path here.
joblib.dump(best_model, os.path.join(model_dir, 'best_model.pkl'))

found 0 physical cores < 1
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Unnamed: 0,model,balance,scaling,best_params,cv_f1,f1,precision,recall
0,LogisticRegression,smote,standard,{'C': 0.1},0.987482,0.97619,0.97619,0.97619
4,LogisticRegression,smote,minmax,{'C': 10},0.978721,0.97619,0.97619,0.97619
8,LogisticRegression,adasyn,standard,{'C': 10},0.95627,0.964706,0.953488,0.97619
12,LogisticRegression,adasyn,minmax,{'C': 10},0.933951,0.964706,0.953488,0.97619
14,RandomForest,adasyn,minmax,"{'max_depth': None, 'min_samples_split': 2, 'n...",0.957197,0.963855,0.97561,0.952381
15,GradientBoosting,adasyn,minmax,"{'learning_rate': 0.2, 'max_depth': 3, 'n_esti...",0.964806,0.963855,0.97561,0.952381
7,GradientBoosting,smote,minmax,"{'learning_rate': 0.2, 'max_depth': 3, 'n_esti...",0.973615,0.963855,0.97561,0.952381
20,LogisticRegression,none,minmax,{'C': 10},0.963623,0.963855,0.97561,0.952381
16,LogisticRegression,none,standard,{'C': 0.1},0.9633,0.963855,0.97561,0.952381
10,RandomForest,adasyn,standard,"{'max_depth': None, 'min_samples_split': 5, 'n...",0.95487,0.952381,0.952381,0.952381


['..\\models\\best_model.pkl']