In [1]:
import os
import pandas as pd

path = os.getcwd()
parent = os.path.dirname(path) # extract parent dir in cross platform way

path_to_plots = os.path.join(parent, "plots")
path_to_data = os.path.join(parent, "data")

In [2]:
def gridsearch2table(grid_search):
    """
    Given a GridSearchCV object, return a DataFrame with the mean test accuracy
    for each combination of hyperparameters tested.

    Parameters:
    grid_search (GridSearchCV): The fitted GridSearchCV object.

    Returns:
    pd.DataFrame: A DataFrame with the hyperparameters and their corresponding mean test accuracy.
    or
    pd.io.formats.style.Styler: A Styler object with the hyperparameters and their corresponding mean test accuracy.
    """
    # Extract results from the GridSearchCV object
    results = pd.DataFrame(grid_search.cv_results_)
    
    # Select relevant columns (parameters and mean test score)
    param_columns = [col for col in results.columns if col.startswith('param_')]
    score_column = ['mean_test_score']
    accuracy_table = results[param_columns + score_column]
    
    # Rename columns for better readability
    accuracy_table.columns = [col.replace('param_', '').replace('_', ' ').title() for col in accuracy_table.columns]
    
    # Rename the mean test score column
    accuracy_table = accuracy_table.rename(columns={'Mean Test Score': 'Mean Test Accuracy'})

    # Top 10 by accuracy
    accuracy_table = accuracy_table.sort_values("Mean Test Accuracy", ascending=False).head(10)
    
    # return accuracy_table

    # Use Styler to apply background gradient
    styled_table = accuracy_table.copy().style.background_gradient(cmap='viridis_r', subset=['Mean Test Accuracy'])
    return styled_table

## KNN

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

train = pd.read_csv(os.path.join(path_to_data, "train_cleaned.csv"))
test = pd.read_csv(os.path.join(path_to_data, "test_cleaned.csv"))

# Separate features and target variable
X = train.drop("Survived", axis=1)
y = train["Survived"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to run GridSearchCV and return results
def run_grid_search(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return grid_search

# Nearest Neighbors
knn_params = {
    'n_neighbors': range(15, 36),
    'weights': ['uniform', 'distance']
}
knn_clf = KNeighborsClassifier()
grid_search_knn = run_grid_search(knn_clf, knn_params, X_train, y_train)

# Best KNN model
best_knn = grid_search_knn.best_estimator_
y_val_pred_knn = best_knn.predict(X_val)
knn_accuracy = accuracy_score(y_val, y_val_pred_knn)
knn_report = classification_report(y_val, y_val_pred_knn, output_dict=True)

print("Best KNN Parameters:", grid_search_knn.best_params_)
print("KNN Validation Accuracy:", knn_accuracy)
print(classification_report(y_val, y_val_pred_knn))


Best KNN Parameters: {'n_neighbors': 21, 'weights': 'uniform'}
KNN Validation Accuracy: 0.6433566433566433
              precision    recall  f1-score   support

           0       0.67      0.72      0.69        80
           1       0.61      0.54      0.57        63

    accuracy                           0.64       143
   macro avg       0.64      0.63      0.63       143
weighted avg       0.64      0.64      0.64       143



In [4]:
gridsearch2table(grid_search_knn)

Unnamed: 0,N Neighbors,Weights,Mean Test Accuracy
12,21,uniform,0.72231
8,19,uniform,0.720602
10,20,uniform,0.720556
27,28,distance,0.718848
14,22,uniform,0.717063
21,25,distance,0.715339
19,24,distance,0.715339
18,24,uniform,0.715324
25,27,distance,0.715324
16,23,uniform,0.713554


## SVM

In [5]:
# Support Vector Machine (RBF)
svm_params = {
    'svc__C': [1, 10, 100, 500, 1000],
    'svc__gamma': [1, 0.1, 0.01]
}
svm_clf = Pipeline([('scaler', StandardScaler()), ('svc', SVC(kernel='rbf', random_state=42))])
grid_search_svm = run_grid_search(svm_clf, svm_params, X_train, y_train)

# Best SVM model
best_svm = grid_search_svm.best_estimator_
y_val_pred_svm = best_svm.predict(X_val)
svm_accuracy = accuracy_score(y_val, y_val_pred_svm)
svm_report = classification_report(y_val, y_val_pred_svm, output_dict=True)

print("Best SVM Parameters:", grid_search_svm.best_params_)
print("SVM Validation Accuracy:", svm_accuracy)
print(classification_report(y_val, y_val_pred_svm))


Best SVM Parameters: {'svc__C': 100, 'svc__gamma': 0.01}
SVM Validation Accuracy: 0.8251748251748252
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        80
           1       0.90      0.68      0.77        63

    accuracy                           0.83       143
   macro avg       0.84      0.81      0.82       143
weighted avg       0.84      0.83      0.82       143



In [6]:
gridsearch2table(grid_search_svm)

Unnamed: 0,Svc C,Svc Gamma,Mean Test Accuracy
8,100,0.01,0.815494
1,1,0.1,0.801428
11,500,0.01,0.799674
7,100,0.1,0.796181
4,10,0.1,0.794426
5,10,0.01,0.794395
14,1000,0.01,0.789148
0,1,1.0,0.785717
2,1,0.01,0.785623
10,500,0.1,0.780422


## MLP

In [10]:
# MLPClassifier

import warnings
warnings.filterwarnings("ignore") # suppress failed-to-converge warnings

mlp_params = {
    'mlp__hidden_layer_sizes': [(50, 50), (100,)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam' , 'lbfgs'] # lbfgs - often fails to converge even after increasing max_iter
    #'mlp__alpha': [0.0001, 0.001] # L2 regularization 
}
mlp_clf = Pipeline([('scaler', StandardScaler()), ('mlp', MLPClassifier(max_iter=7000, random_state=42))])
grid_search_mlp = run_grid_search(mlp_clf, mlp_params, X_train, y_train)

# Best MLP model
best_mlp = grid_search_mlp.best_estimator_
y_val_pred_mlp = best_mlp.predict(X_val)
mlp_accuracy = accuracy_score(y_val, y_val_pred_mlp)
mlp_report = classification_report(y_val, y_val_pred_mlp, output_dict=True)

print("Best MLP Parameters:", grid_search_mlp.best_params_)
print("MLP Validation Accuracy:", mlp_accuracy)
print(classification_report(y_val, y_val_pred_mlp))


Best MLP Parameters: {'mlp__activation': 'relu', 'mlp__hidden_layer_sizes': (50, 50), 'mlp__solver': 'adam'}
MLP Validation Accuracy: 0.7832167832167832
              precision    recall  f1-score   support

           0       0.79      0.84      0.81        80
           1       0.78      0.71      0.74        63

    accuracy                           0.78       143
   macro avg       0.78      0.78      0.78       143
weighted avg       0.78      0.78      0.78       143



In [11]:
gridsearch2table(grid_search_mlp)

Unnamed: 0,Mlp Activation,Mlp Hidden Layer Sizes,Mlp Solver,Mean Test Accuracy
0,relu,"(50, 50)",adam,0.810278
4,tanh,"(50, 50)",adam,0.801428
6,tanh,"(100,)",adam,0.799705
2,relu,"(100,)",adam,0.79792
5,tanh,"(50, 50)",lbfgs,0.752212
1,relu,"(50, 50)",lbfgs,0.738146
3,relu,"(100,)",lbfgs,0.725912
7,tanh,"(100,)",lbfgs,0.701258


In [9]:
# Summarize results in a table

# QUESTION: different setup = different models OR gridsearch??? what if grid is 3D???

# results = {
#     'Model': ['Decision Tree', 'Random Forest', 'KNN', 'SVM', 'MLP'],
#     'Best Parameters': [
#         grid_search_dt.best_params_,
#         grid_search_rf.best_params_,
#         grid_search_knn.best_params_,
#         grid_search_svm.best_params_,
#         grid_search_mlp.best_params_
#     ],
#     'Validation Accuracy': [
#         dt_accuracy,
#         rf_accuracy,
#         knn_accuracy,
#         svm_accuracy,
#         mlp_accuracy
#     ]
# }

# results_df = pd.DataFrame(results)
# print(results_df)
