In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

# Use project modules where convenient
from MachineLearningModule.Classifiers import Classifiers
from MachineLearningModule.DataPreProcessing import Conversions, HandleMissingValues, Normalisation, DimReduction

# Prefer the small sample CSV when it exists on disk; otherwise fall back to the full dataset.
SAMPLE_PATH = 'Datasets/Drebin_v1_sample.csv'
FULL_PATH = 'Datasets/Drebin_v1.csv'
DATA_PATH = SAMPLE_PATH if os.path.exists(SAMPLE_PATH) else FULL_PATH
print('Loading', DATA_PATH)
# low_memory=False: parse the file in one pass rather than in chunks, avoiding mixed-dtype columns.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Name of the label column; it is split off from the feature columns further down.
target = 'class'
if target not in df.columns:
    # Fail early and name the missing column so a wrong CSV is easy to diagnose.
    raise ValueError(f"Expected target column '{target}' in dataset")

# Preprocessing pipeline (reuse functions from project), applied in order.
# NOTE(review): each step receives the whole frame -- target column included --
# and runs before the train/test split, so whatever statistics these helpers
# compute would also see the held-out rows. Confirm that this is intended.
preprocessing_steps = (
    Conversions.convert_all_cat_features_to_num_via_label_encoding,
    HandleMissingValues.impute_missing_values_with_feature_mean,
    Normalisation.min_max_normalisation,
)
for apply_step in preprocessing_steps:
    df = apply_step(df)

# Separate the label column from the feature columns.
y = df[target]
X = df.drop(columns=[target])

# Quick train/test split: 20% held out, fixed seed, class proportions preserved via stratify.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Short names of the classifiers to compare; each is passed to Classifiers.get_classifier.
models = ['RF', 'SVM', 'KNN', 'NB', 'MLP']
# Per-model outcome: the classifier object, its test-set metrics and its predictions.
results = {}

# The averaging mode depends only on the label set, so decide it once instead of
# recomputing np.unique(y) for every metric of every model.
is_binary = len(np.unique(y)) == 2
metric_kwargs = {'average': 'binary', 'pos_label': 1} if is_binary else {'average': 'macro'}

for name in models:
    print('Training', name)
    try:
        # Presumably returns an already-fitted estimator: it is handed the training
        # data and only .predict() is called on the result below -- TODO confirm.
        clf = Classifiers.get_classifier(name, X_train, y_train)
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print('Error training', name, e)
        continue

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **metric_kwargs)
    rec = recall_score(y_test, y_pred, **metric_kwargs)
    f1 = f1_score(y_test, y_pred, **metric_kwargs)

    results[name] = {
        'classifier': clf,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'y_pred': y_pred
    }
    print(name, 'accuracy=', acc, 'f1=', f1)

# Display a confusion matrix heatmap for each model that produced predictions.
for model_name, outcome in results.items():
    matrix = confusion_matrix(y_test, outcome['y_pred'])
    # confusion_matrix puts true labels on rows and predictions on columns,
    # which is why the y-axis is 'Actual' and the x-axis is 'Predicted'.
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion matrix: {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# Summary table: one row per successfully evaluated model, best F1 first.
METRIC_COLUMNS = ['accuracy', 'precision', 'recall', 'f1']
summary = pd.DataFrame(
    [
        {'model': model_name, **{metric: outcome[metric] for metric in METRIC_COLUMNS}}
        for model_name, outcome in results.items()
    ],
    # Explicit columns keep the frame well-formed (and sortable by 'f1') even when
    # every model failed to train and `results` is empty.
    columns=['model', *METRIC_COLUMNS],
)
display(summary.sort_values('f1', ascending=False))

### Next
- Use `03_Comparison_MLflow.ipynb` to run cross-validation across models and log results to MLflow for comparison.

## Quick Hyperparameter Tuning (light)
The cell below runs a small `RandomizedSearchCV` for each model that has a search space defined (`NB` has none and is skipped), using very few iterations (`n_iter=3`) and `cv=2`.
This is intended for quick experimentation inside the notebook; final tuning is done in the main flow (now also reduced for speed).

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Best parameter set found per model name; filled in by the loop below.
tuning_results = {}

# Define small search spaces for quick tuning
param_spaces = {
    'RF': {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]},
    'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
    'NB': {},
    'MLP': {'hidden_layer_sizes': [(50,), (100,)], 'alpha': [0.0001, 0.001]}
}

# One zero-argument factory per tunable model, so each search starts from a fresh estimator.
estimator_factories = {
    'RF': lambda: RandomForestClassifier(random_state=42, n_jobs=-1),
    'KNN': lambda: KNeighborsClassifier(n_jobs=-1),
    # probability=True is intentionally left off: it adds an internal 5-fold
    # calibration to every fit, and only best_params_ is kept from this search.
    'SVM': lambda: SVC(random_state=42),
    'MLP': lambda: MLPClassifier(random_state=42),
}

for name in models:
    print('Tuning', name)
    params = param_spaces.get(name, {})
    if not params:
        print('No tuning parameters for', name)
        continue
    make_estimator = estimator_factories.get(name)
    if make_estimator is None:
        # A search space exists but no estimator is registered for this name: skip silently.
        continue

    # Deliberately tiny budget (3 sampled candidates x 2 folds) to keep the cell fast.
    search = RandomizedSearchCV(make_estimator(), params, n_iter=3, cv=2, random_state=42, n_jobs=-1)
    try:
        search.fit(X_train, y_train)
    except Exception as e:
        # Best effort: report the failure and continue with the remaining models.
        print('Tuning failed for', name, e)
    else:
        print('Best params for', name, search.best_params_)
        tuning_results[name] = search.best_params_

tuning_results