In [30]:
# Standard-library and numerical/tabular imports used by the cells below.
import sys, os
import pandas as pd
import numpy as np
# Add the parent of the current working directory to the import path;
# presumably this is the project root that contains the `src` package.
sys.path.append(os.path.abspath("..")) 

# Project-local helpers: data preprocessing, model wrappers, metric evaluation.
from src.preprocessing import Preprocessor
from src.models import LogisticModel, KNNModel, SVMModel, RFModel, get_default_models
# NOTE(review): get_default_models is already imported on the line above; this repeat is redundant.
from src.models import get_default_models
from src.evaluator import Evaluator

In [31]:
# Source CSV for the first experiment.
# NOTE(review): this is a hard-coded absolute path on one machine, and it is
# the same "dataset_2 (1).csv" file that the prep2 cell loads later -- confirm
# that this is the intended dataset for the "non-conductive"/"conductive" runs.
dataset1_csv = "/Users/wendy/Library/CloudStorage/OneDrive-ImperialCollegeLondon/Year 2/MATE50001/Computing/Coursework/dataset_2 (1).csv"

# Build the preprocessor with its default settings and unpack the four
# values it returns into train/test features and train/test labels.
prep1 = Preprocessor(dataset1_csv)
X_train1, X_test1, y_train1, y_test1 = prep1.preprocess()

In [32]:
from sklearn.metrics import accuracy_score

# Quick baseline: fit every default model on the first split and report
# plain test-set accuracy for each one.
models = get_default_models()
ev = Evaluator(class_names=["non-conductive", "conductive"])  # not used in this cell

for name in models:
    model = models[name]
    model.fit(X_train1, y_train1)
    y_pred = model.predict(X_test1)
    acc = accuracy_score(y_test1, y_pred)
    print(name, "test accuracy:", acc)

logistic test accuracy: 0.9594594594594594
knn test accuracy: 0.9324324324324325
svm_rbf test accuracy: 0.9594594594594594
random_forest test accuracy: 1.0


In [38]:
# Make sure the output folders exist (no error if they already do).
for out_dir in ("plots", "metrics"):
    os.makedirs(out_dir, exist_ok=True)

ev = Evaluator(class_names=["non-conductive", "conductive"])

# Fresh model instances, plus a dict that collects one metrics entry per model.
models = get_default_models()
results = {}

for name, model in models.items():
    model.fit(X_train1, y_train1)
    y_pred = model.predict(X_test1)

    # Binary-averaged metrics with label 1 treated as the positive class.
    metrics = ev.compute_metrics(y_test1, y_pred, average="binary", pos_label=1)
    results[name] = metrics

    print(f"=== {name} ===")
    print(metrics)
    

=== logistic ===
{'accuracy': 0.9594594594594594, 'precision': 1.0, 'recall': 0.918918918918919, 'f1': 0.9577464788732394}
=== knn ===
{'accuracy': 0.9324324324324325, 'precision': 1.0, 'recall': 0.8648648648648649, 'f1': 0.927536231884058}
=== svm_rbf ===
{'accuracy': 0.9594594594594594, 'precision': 1.0, 'recall': 0.918918918918919, 'f1': 0.9577464788732394}
=== random_forest ===
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


In [34]:
# Source CSV for the second experiment; the target column is named "label".
# NOTE(review): hard-coded absolute path on one machine -- consider a
# configurable, relative data directory.
dataset2_csv = "/Users/wendy/Library/CloudStorage/OneDrive-ImperialCollegeLondon/Year 2/MATE50001/Computing/Coursework/dataset_2 (1).csv"
prep2 = Preprocessor(dataset2_csv, target_col="label")
X_train2, X_test2, y_train2, y_test2 = prep2.preprocess()

models = get_default_models()
ev = Evaluator(class_names=[0, 1])
results_d2 = {}

# Fit each default model and store its binary-averaged metrics
# (label 1 is the positive class) under the model's name.
for name, model in models.items():
    model.fit(X_train2, y_train2)
    y_pred = model.predict(X_test2)

    metrics = ev.compute_metrics(y_test2, y_pred, average="binary", pos_label=1)
    results_d2[name] = metrics
    print(name, metrics)

# One row per model, one column per metric; displayed as the cell output.
pd.DataFrame(results_d2).T

logistic {'accuracy': 0.9594594594594594, 'precision': 1.0, 'recall': 0.918918918918919, 'f1': 0.9577464788732394}
knn {'accuracy': 0.9324324324324325, 'precision': 1.0, 'recall': 0.8648648648648649, 'f1': 0.927536231884058}
svm_rbf {'accuracy': 0.9594594594594594, 'precision': 1.0, 'recall': 0.918918918918919, 'f1': 0.9577464788732394}
random_forest {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}


Unnamed: 0,accuracy,precision,recall,f1
logistic,0.959459,1.0,0.918919,0.957746
knn,0.932432,1.0,0.864865,0.927536
svm_rbf,0.959459,1.0,0.918919,0.957746
random_forest,1.0,1.0,1.0,1.0


In [35]:
from sklearn.model_selection import learning_curve

# Model whose learning curve is examined.
best_model_name = "random_forest"

best_model = get_default_models()[best_model_name]
# NOTE(review): learning_curve refits the estimator for every fold and size,
# so this up-front fit is probably unnecessary for the curve -- confirm.
best_model.fit(X_train2, y_train2)

# Recombine the two splits so cross-validation runs over every sample.
# NOTE(review): if the Preprocessor fits any transform on the training split
# only, stacking it with the test split may leak information -- confirm.
X2 = np.vstack([X_train2, X_test2])
y2 = np.hstack([y_train2, y_test2])

# 5-fold accuracy at ten training-set fractions from 10% to 100%.
# `best_model.model` is presumably the wrapped scikit-learn estimator.
train_sizes, train_scores, val_scores = learning_curve(
    estimator=best_model.model,
    X=X2,
    y=y2,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring="accuracy",
    n_jobs=None,
)

# Average the per-fold scores so there is one value per training size.
train_mean = np.mean(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)

learning_columns = {
    "train_size": train_sizes,
    "train_accuracy": train_mean,
    "val_accuracy": val_mean,
}
df_learning = pd.DataFrame(learning_columns)
df_learning

Unnamed: 0,train_size,train_accuracy,val_accuracy
0,29,1.0,0.950944
1,58,1.0,0.989152
2,88,1.0,0.994558
3,117,1.0,0.994558
4,147,1.0,0.994558
5,176,1.0,0.994558
6,205,1.0,0.994558
7,235,1.0,0.99726
8,264,1.0,0.997297
9,294,1.0,0.997297


In [36]:
# Minimum mean validation accuracy a training size must reach to qualify.
threshold = 0.70

# Keep only the learning-curve rows that meet the threshold.
good = df_learning[df_learning["val_accuracy"] >= threshold]

if good.empty:
    # Derive the percentage from `threshold` so the message stays correct
    # when the threshold is changed.
    print(f"Model never reaches {threshold*100:.0f}% validation accuracy.")
else:
    # Select the qualifying row with the smallest train_size explicitly,
    # rather than relying on the frame's row order.
    row = good.loc[good["train_size"].idxmin()]
    print(f"Smallest train size with ≥ {threshold*100:.0f}% accuracy:")
    print(row)

Smallest train size with ≥ 70% accuracy:
train_size        29.000000
train_accuracy     1.000000
val_accuracy       0.950944
Name: 0, dtype: float64


In [None]:
from sklearn.inspection import permutation_importance

# Preprocess Dataset 2
# NOTE(review): this relative path differs from the absolute path used when
# results_d2 was computed -- confirm both point at the same file.
prep2 = Preprocessor("../data/dataset_2.csv", target_col="label")
X_train2, X_test2, y_train2, y_test2 = prep2.preprocess()

# Feature names exposed by the preprocessor; they label the importances below.
feature_names = prep2.feature_cols

# Best performing model from results_d2: random_forest has the highest
# accuracy/precision/recall/f1 in the recorded output, and it is the model
# used for the learning curve. Update this if the results change.
best_model_name = "random_forest"
best_model = get_default_models()[best_model_name]

# Fit the model on the training data
best_model.fit(X_train2, y_train2)

# Compute permutation importance on the held-out split: each feature is
# shuffled 20 times and the resulting change in accuracy is recorded.
r = permutation_importance(
    estimator=best_model.model,
    X=X_test2,
    y=y_test2,
    scoring="accuracy",
    n_repeats=20,
    random_state=42
)

# Mean importance per feature, largest first; displayed as the cell output.
importances = pd.Series(r.importances_mean, index=feature_names)
importances.sort_values(ascending=False)