In [1]:
# Imports
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

print('Imports ok')

Imports ok


In [None]:
# Load raw features CSV
# The path is relative to the kernel's current working directory.
csv = '../data/features_glcm_lbp_hsv.csv'
if not os.path.exists(csv):
    # Include the resolved absolute path so the reader can see exactly where the
    # file was expected (the relative path alone is ambiguous across working dirs).
    raise FileNotFoundError(f'{csv} not found (resolved to {os.path.abspath(csv)})')
df = pd.read_csv(csv)

# Extract/normalize label (case-insensitive H# pattern):
# the first "h<digits>" match in each filename is upper-cased and used as the class label.
# Filenames without such a match are labelled 'UNKNOWN' and are kept as their own class.
df['label'] = df['filename'].astype(str).str.extract(r'(?i)(h\d+)')[0].str.upper().fillna('UNKNOWN')
print('Loaded', csv, '->', df.shape)
print('Label distribution:')
print(df['label'].value_counts())

# prepare X,y (exclude filename,label,label_code if present)
# Every remaining column is treated as a feature, in the CSV's column order.
exclude = {'filename','label','label_code'}
feature_cols = [c for c in df.columns if c not in exclude]
X = df[feature_cols].values
# Encode string labels as integers 0..n_classes-1; le.classes_ maps codes back to names.
le = LabelEncoder()
y = le.fit_transform(df['label'].values)
print('Features:', len(feature_cols))
print('Classes:', list(le.classes_))

FileNotFoundError: features_glcm_lbp_hsv.csv not found in working dir

In [None]:
# Train and evaluate (same helper as in NCA notebook)
def train_and_evaluate_raw(X, y, label_names=None, prefix='raw',
                           kernels=('linear', 'rbf', 'poly'), test_size=0.2, random_state=42):
    """Fit one SVC per kernel on a stratified hold-out split and save the most accurate one.

    The features are standardised with a ``StandardScaler`` fitted on the training
    split only. Each kernel is scored on the hold-out split with accuracy and
    weighted precision/recall/F1, and one summary line per kernel is printed.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``train_test_split`` / ``StandardScaler``.
    y : array-like
        Class labels; also used to stratify the split.
    label_names : sequence of str, optional
        Display names for the classes, ordered like the sorted unique values of
        ``y``. When given, a classification report for the best kernel is printed.
    prefix : str, default 'raw'
        Prefix of the two joblib files (``{prefix}_svm_best.joblib`` and
        ``{prefix}_scaler.joblib``) written relative to the working directory.
    kernels : iterable of str, default ('linear', 'rbf', 'poly')
        SVC kernel names to compare.
    test_size : float, default 0.2
        Hold-out fraction forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and the SVC estimators.

    Returns
    -------
    results : dict
        Maps each kernel name to a dict with keys ``'model'``, ``'accuracy'``,
        ``'precision'``, ``'recall'``, ``'f1'`` and ``'y_pred'``.
    best_model : SVC
        The fitted estimator with the highest hold-out accuracy (the earliest
        kernel wins ties).
    scaler : StandardScaler
        The scaler fitted on the training split.

    Raises
    ------
    ValueError
        If ``kernels`` is empty, or if ``label_names`` does not have one entry
        per distinct class in ``y``.
    """
    kernels = list(kernels)
    if not kernels:
        raise ValueError('kernels must contain at least one SVC kernel name')

    # Validate the label names before any training so a mismatch cannot fail
    # only after the models have already been fitted and written to disk.
    class_ids = None
    if label_names is not None:
        class_ids = np.unique(y)
        if len(class_ids) != len(label_names):
            raise ValueError(
                f'label_names has {len(label_names)} entries but y contains '
                f'{len(class_ids)} distinct classes')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state)

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    results = {}
    for k in kernels:
        clf = SVC(kernel=k, random_state=random_state, gamma='scale')
        clf.fit(X_train_s, y_train)
        y_pred = clf.predict(X_test_s)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        results[k] = {'model': clf, 'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'y_pred': y_pred}
        print(f'Kernel={k:6s}  Accuracy={acc:.4f}  Precision={prec:.4f}  Recall={rec:.4f}  F1={f1:.4f}')

    # NOTE: the winner is picked by hold-out accuracy, so the reported score of the
    # best kernel is an optimistic estimate rather than an unbiased one.
    best_k = max(results, key=lambda kk: results[kk]['accuracy'])
    best_model = results[best_k]['model']
    joblib.dump(best_model, f'{prefix}_svm_best.joblib')
    joblib.dump(scaler, f'{prefix}_scaler.joblib')
    print(f'Saved best model ({best_k}) and scaler with prefix {prefix}_')

    if label_names is not None:
        print('Classification report (best):')
        # Pin `labels` to every class in y so the report rows stay aligned with
        # label_names even when a class is missing from the hold-out split.
        print(classification_report(
            y_test, results[best_k]['y_pred'],
            labels=class_ids, target_names=label_names, zero_division=0))
    return results, best_model, scaler

# Run quick baseline on raw features
res_raw, best_raw, scaler_raw = train_and_evaluate_raw(X, y, label_names=le.classes_, prefix='raw')

# ------------------ Robust GridSearchCV (stratified CV + pipeline) ------------------
# Split for grid search evaluation (stratified 80/20 hold-out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scaling lives inside the pipeline so it is re-fitted on the training part of every CV fold.
pipeline = Pipeline([('scaler', StandardScaler()), ('svc', SVC(random_state=42))])
# One sub-grid per kernel so only the hyper-parameters listed for that kernel are searched.
param_grid = [
    { 'svc__kernel': ['rbf'], 'svc__C': [0.1, 1, 10, 100], 'svc__gamma': ['scale', 'auto', 0.01, 0.001] },
    { 'svc__kernel': ['linear'], 'svc__C': [0.01, 0.1, 1, 10, 100] },
    { 'svc__kernel': ['poly'], 'svc__C': [0.1, 1, 10], 'svc__degree': [2, 3], 'svc__gamma': ['scale', 'auto'] },
]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=cv, n_jobs=-1, verbose=1, return_train_score=False)
print('Running GridSearchCV (this may take a while)')
gs.fit(X_train, y_train)
print('Best params:', gs.best_params_)
print('Best CV score:', gs.best_score_)

# evaluate on held-out test set
y_pred = gs.predict(X_test)
acc_test = accuracy_score(y_test, y_pred)
print(f'Test set accuracy (best grid estimator): {acc_test:.4f}')
print('Classification report (grid best):')
# y holds LabelEncoder codes 0..n_classes-1; passing them as `labels` keeps the report
# rows aligned with le.classes_ even if a class is missing from the hold-out split.
print(classification_report(
    y_test, y_pred,
    labels=np.arange(len(le.classes_)), target_names=le.classes_, zero_division=0))

# save best pipeline (scaler + tuned SVC refit by GridSearchCV)
joblib.dump(gs.best_estimator_, 'raw_svm_best_grid.joblib')
print('Saved tuned pipeline -> raw_svm_best_grid.joblib')

Kernel=linear  Accuracy=0.6677  Precision=0.6700  Recall=0.6677  F1=0.6656
Kernel=rbf     Accuracy=0.7683  Precision=0.7786  Recall=0.7683  F1=0.7660
Kernel=poly    Accuracy=0.7180  Precision=0.7332  Recall=0.7180  F1=0.7166
Saved best model (rbf) and scaler with prefix raw_
Classification report (best):
              precision    recall  f1-score   support

          H1       0.94      0.96      0.95        69
         H10       0.74      0.76      0.75        51
          H2       0.81      0.83      0.82        69
          H3       0.77      0.87      0.82        69
          H4       0.71      0.93      0.81        69
          H5       0.98      0.59      0.74        69
          H6       0.70      0.75      0.73        69
          H7       0.70      0.65      0.68        69
          H8       0.75      0.63      0.68        65
          H9       0.65      0.68      0.67        57

    accuracy                           0.77       656
   macro avg       0.78      0.77      0.76 

In [None]:
# Optionally: save feature columns mapping for later use
# The list is written in the same order as the columns of X used for training.
with open('raw_feature_columns.json', 'w', encoding='utf-8') as f:
    json.dump(feature_cols, f, indent=2)
print('Saved raw_feature_columns.json')

Saved raw_feature_columns.json
