In [1]:
import os
import random
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
SEED = 17
def set_seed(seed=SEED):
    """Seed Python's and NumPy's global random number generators.

    Parameters
    ----------
    seed : int, optional
        Value passed to ``random.seed`` and ``np.random.seed``.
        Defaults to the notebook-wide ``SEED``.
    """
    # Use the argument rather than the global constant; otherwise a caller
    # asking for a different seed would silently get SEED anyway.
    random.seed(seed)
    np.random.seed(seed)

set_seed(SEED)

In [3]:
# Expression tables; the first CSV column becomes the row index.
train = pd.read_csv('inputs/tpm_train.csv', index_col=0)
test = pd.read_csv('inputs/tpm_test.csv', index_col=0)

# Label tables, re-indexed by their 'sample' column so they can be joined
# onto the expression tables by index.
label_train = pd.read_csv('inputs/label_train.csv').set_index('sample')
label_test = pd.read_csv('inputs/label_test.csv').set_index('sample')

# The feature list is read without a header and its first row is skipped
# (presumably a header line -- confirm against the file).
feature_table = pd.read_csv("inputs/selected_features_tpm_200.csv", header=None)
selected_features = feature_table.iloc[1:, 0].tolist()

X_selected_train = train[selected_features]
X_selected_test = test[selected_features]

# Attach label / subject / batch columns to the selected features.
meta_columns = ['label', 'subject', 'batch']
data_train = X_selected_train.join(label_train[meta_columns])
data_test = X_selected_test.join(label_test[meta_columns])

train_features, test_features = data_train[selected_features], data_test[selected_features]
train_labels, test_labels = data_train['label'], data_test['label']

In [4]:
# Baseline: logistic regression with scikit-learn defaults, fitted on the
# training split and scored on the test split (test_features / test_labels).
clf = LogisticRegression()
clf.fit(train_features, train_labels)
test_predictions = clf.predict(test_features)

accuracy = accuracy_score(test_labels, test_predictions)
# The score is computed against test_labels, so it is reported as test
# accuracy (it was previously mislabelled as validation accuracy).
print(f'Test accuracy: {accuracy}')

conf_matrix = confusion_matrix(test_labels, test_predictions)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(test_labels, test_predictions)
print('Classification Report:')
print(class_report)

Validation accuracy: 0.8
Confusion Matrix:
[[17  8]
 [ 2 23]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.68      0.77        25
           1       0.74      0.92      0.82        25

    accuracy                           0.80        50
   macro avg       0.82      0.80      0.80        50
weighted avg       0.82      0.80      0.80        50



In [5]:
set_seed(SEED)
def custom_grid_search_lr(params_grid, data_train, groups, folds=5):
    """Select the logistic-regression setting with the best grouped-CV accuracy.

    Parameters
    ----------
    params_grid : iterable of dict
        Each dict needs the keys 'C', 'penalty' and 'solver'; 'l1_ratio' is
        also required when 'penalty' is 'elasticnet'.
    data_train : pandas.DataFrame
        Must contain the columns named in the module-level
        ``selected_features`` list plus a 'label' column.
    groups : array-like
        Group id per row of ``data_train``, handed to ``GroupKFold.split``.
    folds : int, optional
        Number of ``GroupKFold`` splits.

    Returns
    -------
    tuple
        ``(best_params, best_score)``: the first dict reaching the highest
        mean fold accuracy, and that accuracy. ``(None, -inf)`` when
        ``params_grid`` is empty.
    """
    splitter = GroupKFold(n_splits=folds)
    winner, winner_score = None, -np.inf

    for params in params_grid:
        # Every candidate is fitted with max_iter=10000; l1_ratio is only
        # forwarded for the elastic-net penalty.
        model_kwargs = {
            'C': params['C'],
            'penalty': params['penalty'],
            'solver': params['solver'],
            'max_iter': 10000,
        }
        if params['penalty'] == 'elasticnet':
            model_kwargs['l1_ratio'] = params['l1_ratio']

        accuracies = []
        for fit_idx, holdout_idx in splitter.split(data_train, groups=groups):
            fit_part = data_train.iloc[fit_idx]
            holdout_part = data_train.iloc[holdout_idx]

            model = LogisticRegression(**model_kwargs)
            model.fit(fit_part[selected_features].values, fit_part['label'].values)
            predicted = model.predict(holdout_part[selected_features].values)

            accuracies.append(accuracy_score(holdout_part['label'].values, predicted))

        mean_score = np.mean(accuracies)
        print(f"Mean accuracy for params {params}: {mean_score:.4f}")

        # Strict '>' keeps the earliest configuration on ties.
        if mean_score > winner_score:
            winner, winner_score = params, mean_score

    return winner, winner_score

# Candidate values for each logistic-regression hyperparameter.
C_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
penalty_types = ['l1', 'l2', 'elasticnet']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]

# Penalty -> solvers that are skipped for it when the grid is built.
incompatible_combinations = {
    'l1': ['newton-cg', 'sag', 'lbfgs'],
    'elasticnet': ['newton-cg', 'sag', 'liblinear', 'lbfgs']
}

# Enumerate every allowed (C, penalty, solver) triple; elastic-net entries
# are additionally expanded over l1_ratios.
param_grid_lr = []
for C in C_values:
    for penalty in penalty_types:
        skipped_solvers = incompatible_combinations.get(penalty, [])
        for solver in solvers:
            if solver in skipped_solvers:
                continue

            base_config = {'C': C, 'penalty': penalty, 'solver': solver}
            if penalty != 'elasticnet':
                param_grid_lr.append(base_config)
                continue

            for l1_ratio in l1_ratios:
                param_grid_lr.append({**base_config, 'l1_ratio': l1_ratio})


# The 'subject' column supplies the group ids that GroupKFold uses to keep
# each subject's rows within a single fold.
groups = data_train['subject'].values

best_params_lr, best_score_lr = custom_grid_search_lr(
    params_grid=param_grid_lr,
    data_train=data_train,
    groups=groups,
)

print(f"Best Params (LR): {best_params_lr}")
print(f"Best Mean Accuracy: {best_score_lr:.4f}")

Mean accuracy for params {'C': 0.0001, 'penalty': 'l1', 'solver': 'liblinear'}: 0.5000
Mean accuracy for params {'C': 0.0001, 'penalty': 'l1', 'solver': 'saga'}: 0.5000
Mean accuracy for params {'C': 0.0001, 'penalty': 'l2', 'solver': 'newton-cg'}: 0.7846
Mean accuracy for params {'C': 0.0001, 'penalty': 'l2', 'solver': 'lbfgs'}: 0.7846
Mean accuracy for params {'C': 0.0001, 'penalty': 'l2', 'solver': 'liblinear'}: 0.7717
Mean accuracy for params {'C': 0.0001, 'penalty': 'l2', 'solver': 'sag'}: 0.7783
Mean accuracy for params {'C': 0.0001, 'penalty': 'l2', 'solver': 'saga'}: 0.7721
Mean accuracy for params {'C': 0.0001, 'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': 0.1}: 0.5000
Mean accuracy for params {'C': 0.0001, 'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': 0.3}: 0.5000
Mean accuracy for params {'C': 0.0001, 'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': 0.5}: 0.5000
Mean accuracy for params {'C': 0.0001, 'penalty': 'elasticnet', 'solver': 'saga', 'l1_rati

In [6]:
# Pinned logistic-regression hyperparameters; this assignment replaces
# whatever value best_params_lr held before this cell ran.
# NOTE(review): presumably copied from the grid-search output -- confirm.
best_params_lr = dict(C=10000, penalty='l1', solver='liblinear')

In [7]:
# Refit the selected logistic-regression configuration on the full training
# split and score it on the test split.
# max_iter=10000 matches what custom_grid_search_lr used for every candidate;
# with the default iteration cap the model evaluated here would not be the
# configuration that was cross-validated.
best_lr = LogisticRegression(max_iter=10000, **best_params_lr)
best_lr.fit(train_features, train_labels)
test_predictions = best_lr.predict(test_features)

test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Test accuracy: {test_accuracy}')

conf_matrix = confusion_matrix(test_labels, test_predictions)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(test_labels, test_predictions)
print('Classification Report:')
print(class_report)

Test accuracy: 0.64
Confusion Matrix:
[[11 14]
 [ 4 21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.44      0.55        25
           1       0.60      0.84      0.70        25

    accuracy                           0.64        50
   macro avg       0.67      0.64      0.62        50
weighted avg       0.67      0.64      0.62        50



In [8]:
# Baseline: random forest with default hyperparameters (seeded with SEED),
# fitted on the training split and scored on the test split.
rf_clf = RandomForestClassifier(random_state=SEED)
rf_clf.fit(train_features, train_labels)
test_predictions = rf_clf.predict(test_features)

accuracy = accuracy_score(test_labels, test_predictions)
# The score is computed against test_labels, so it is reported as test
# accuracy (it was previously mislabelled as validation accuracy).
print(f'Test accuracy: {accuracy}')

conf_matrix = confusion_matrix(test_labels, test_predictions)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(test_labels, test_predictions)
print('Classification Report:')
print(class_report)

Validation accuracy: 0.76
Confusion Matrix:
[[17  8]
 [ 4 21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.68      0.74        25
           1       0.72      0.84      0.78        25

    accuracy                           0.76        50
   macro avg       0.77      0.76      0.76        50
weighted avg       0.77      0.76      0.76        50



In [9]:
set_seed(SEED)
def custom_grid_search_rf(params_grid, data_train, groups, folds=5):
    """Select the random-forest setting with the best grouped-CV accuracy.

    Parameters
    ----------
    params_grid : iterable of dict
        Each dict needs 'n_estimators'. 'max_depth', 'min_samples_split',
        'min_samples_leaf' and 'max_features' are optional and fall back to
        None, 2, 1 and 'sqrt' respectively.
    data_train : pandas.DataFrame
        Must contain the columns named in the module-level
        ``selected_features`` list plus a 'label' column.
    groups : array-like
        Group id per row of ``data_train``, handed to ``GroupKFold.split``.
    folds : int, optional
        Number of ``GroupKFold`` splits.

    Returns
    -------
    tuple
        ``(best_params, best_score)``: the first dict reaching the highest
        mean fold accuracy, and that accuracy. ``(None, -inf)`` when
        ``params_grid`` is empty.
    """
    splitter = GroupKFold(n_splits=folds)
    winner, winner_score = None, -np.inf

    for params in params_grid:
        # Every forest is seeded with SEED, so fits are repeatable.
        forest_kwargs = dict(
            n_estimators=params['n_estimators'],
            max_depth=params.get('max_depth', None),
            min_samples_split=params.get('min_samples_split', 2),
            min_samples_leaf=params.get('min_samples_leaf', 1),
            max_features=params.get('max_features', 'sqrt'),
            random_state=SEED,
        )

        accuracies = []
        for fit_idx, holdout_idx in splitter.split(data_train, groups=groups):
            fit_part = data_train.iloc[fit_idx]
            holdout_part = data_train.iloc[holdout_idx]

            # Features stay as DataFrames (not arrays) for both fit and predict.
            forest = RandomForestClassifier(**forest_kwargs)
            forest.fit(fit_part[selected_features], fit_part['label'].values)
            predicted = forest.predict(holdout_part[selected_features])

            accuracies.append(accuracy_score(holdout_part['label'].values, predicted))

        mean_score = np.mean(accuracies)
        print(f"Mean accuracy for params {params}: {mean_score:.4f}")

        # Strict '>' keeps the earliest configuration on ties.
        if mean_score > winner_score:
            winner, winner_score = params, mean_score

    return winner, winner_score

# Candidate values for each random-forest hyperparameter.
rf_tree_counts = [100, 500, 1000]
rf_depths = [None, 10, 20, 50]
rf_split_sizes = [2, 5]
rf_leaf_sizes = [1, 2]
rf_feature_rules = ['sqrt', 'log2']

# Full Cartesian product; the right-most hyperparameter varies fastest.
param_grid_rf = [
    dict(
        n_estimators=tree_count,
        max_depth=depth,
        min_samples_split=split_size,
        min_samples_leaf=leaf_size,
        max_features=feature_rule,
    )
    for tree_count in rf_tree_counts
    for depth in rf_depths
    for split_size in rf_split_sizes
    for leaf_size in rf_leaf_sizes
    for feature_rule in rf_feature_rules
]

# The 'subject' column supplies the group ids that GroupKFold uses to keep
# each subject's rows within a single fold.
groups = data_train['subject'].values

best_params_rf, best_score_rf = custom_grid_search_rf(
    params_grid=param_grid_rf,
    data_train=data_train,
    groups=groups,
)

print(f"Best Params (RF): {best_params_rf}")
print(f"Best Mean Accuracy: {best_score_rf:.4f}")

Mean accuracy for params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}: 0.7904
Mean accuracy for params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}: 0.8037
Mean accuracy for params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}: 0.7654
Mean accuracy for params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}: 0.8037
Mean accuracy for params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}: 0.7971
Mean accuracy for params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}: 0.8029
Mean accuracy for params {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': '

In [10]:
# Pinned random-forest hyperparameters; this assignment replaces whatever
# value best_params_rf held before this cell ran.
# NOTE(review): presumably copied from the grid-search output -- confirm.
best_params_rf = dict(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
)

In [11]:
# Refit the selected random-forest configuration (seeded with SEED) on the
# full training split, then score it on the test split.
best_rf = RandomForestClassifier(random_state=SEED, **best_params_rf)
test_predictions = best_rf.fit(train_features, train_labels).predict(test_features)

# Compute all three summaries first, then print them in the usual order.
test_accuracy = accuracy_score(test_labels, test_predictions)
conf_matrix = confusion_matrix(test_labels, test_predictions)
class_report = classification_report(test_labels, test_predictions)

print(f'Test accuracy: {test_accuracy}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')

Test accuracy: 0.78
Confusion Matrix:
[[19  6]
 [ 5 20]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.76      0.78        25
           1       0.77      0.80      0.78        25

    accuracy                           0.78        50
   macro avg       0.78      0.78      0.78        50
weighted avg       0.78      0.78      0.78        50



In [12]:
# Baseline: SVC with default hyperparameters (probability estimates enabled,
# seeded with SEED), fitted on the training split and scored on the test split.
svm_clf = SVC(random_state=SEED, probability=True)
test_predictions = svm_clf.fit(train_features, train_labels).predict(test_features)

# Compute all three summaries first, then print them in the usual order.
accuracy = accuracy_score(test_labels, test_predictions)
conf_matrix = confusion_matrix(test_labels, test_predictions)
class_report = classification_report(test_labels, test_predictions)

print(f'Test accuracy: {accuracy}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')

Test accuracy: 0.74
Confusion Matrix:
[[17  8]
 [ 5 20]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72        25
           1       0.71      0.80      0.75        25

    accuracy                           0.74        50
   macro avg       0.74      0.74      0.74        50
weighted avg       0.74      0.74      0.74        50



In [13]:
set_seed(SEED)
def custom_grid_search_svm(params_grid, data_train, groups, folds=5):
    """Select the SVC setting with the best grouped-CV accuracy.

    Parameters
    ----------
    params_grid : iterable of dict
        Each dict needs 'C' and 'kernel'. 'gamma', 'degree' and 'coef0' are
        optional and fall back to 'scale', 3 and 0.0 respectively.
    data_train : pandas.DataFrame
        Must contain the columns named in the module-level
        ``selected_features`` list plus a 'label' column.
    groups : array-like
        Group id per row of ``data_train``, handed to ``GroupKFold.split``.
    folds : int, optional
        Number of ``GroupKFold`` splits.

    Returns
    -------
    tuple
        ``(best_params, best_score)``: the first dict reaching the highest
        mean fold accuracy, and that accuracy. ``(None, -inf)`` when
        ``params_grid`` is empty.
    """
    kf = GroupKFold(n_splits=folds)
    best_params = None
    best_score = -np.inf

    for params in params_grid:
        fold_scores = []

        for train_index, val_index in kf.split(data_train, groups=groups):
            train_subset = data_train.iloc[train_index]
            val_subset = data_train.iloc[val_index]

            train_features = train_subset[selected_features]
            train_labels = train_subset['label'].values

            val_features = val_subset[selected_features]
            val_labels = val_subset['label'].values

            # probability=True is intentionally not passed: it makes every
            # fit run an extra internal cross-validation to calibrate
            # predict_proba, while this search only calls predict(), whose
            # output does not depend on that calibration.
            svm = SVC(C=params['C'], gamma=params.get('gamma', 'scale'), kernel=params['kernel'],
                      degree=params.get('degree', 3), coef0=params.get('coef0', 0.0), random_state=SEED)
            svm.fit(train_features, train_labels)

            val_predictions = svm.predict(val_features)

            accuracy = accuracy_score(val_labels, val_predictions)
            fold_scores.append(accuracy)

        mean_score = np.mean(fold_scores)
        print(f"Mean accuracy for params {params}: {mean_score:.4f}")

        # Strict '>' keeps the earliest configuration on ties.
        if mean_score > best_score:
            best_score = mean_score
            best_params = params

    return best_params, best_score

# Candidate values shared by the kernel-specific sub-grids.
svm_C_grid = [0.1, 1, 10, 100, 1000]
svm_gamma_grid = ['scale', 0.1, 0.01, 0.001]
svm_coef0_grid = [0, 0.1, 0.5, 1]
svm_degree_grid = [2, 3, 4]

# One sub-grid per kernel, each carrying only the keys that kernel's
# entries need; the right-most hyperparameter varies fastest.
rbf_grid = [
    dict(C=C, gamma=gamma, kernel='rbf')
    for C in svm_C_grid
    for gamma in svm_gamma_grid
]
sigmoid_grid = [
    dict(C=C, gamma=gamma, kernel='sigmoid', coef0=coef0)
    for C in svm_C_grid
    for gamma in svm_gamma_grid
    for coef0 in svm_coef0_grid
]
poly_grid = [
    dict(C=C, gamma=gamma, kernel='poly', degree=degree, coef0=coef0)
    for C in svm_C_grid
    for gamma in svm_gamma_grid
    for degree in svm_degree_grid
    for coef0 in svm_coef0_grid
]
linear_grid = [dict(C=C, kernel='linear') for C in svm_C_grid]

# Concatenation order: rbf, sigmoid, poly, linear.
param_grid_svm = rbf_grid + sigmoid_grid + poly_grid + linear_grid

# The 'subject' column supplies the group ids that GroupKFold uses to keep
# each subject's rows within a single fold.
groups = data_train['subject'].values

best_params_svm, best_score_svm = custom_grid_search_svm(
    params_grid=param_grid_svm,
    data_train=data_train,
    groups=groups,
)

print(f"Best Params (SVM): {best_params_svm}")
print(f"Best Mean Accuracy: {best_score_svm:.4f}")

Mean accuracy for params {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}: 0.7471
Mean accuracy for params {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}: 0.5067
Mean accuracy for params {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}: 0.7971
Mean accuracy for params {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}: 0.7400
Mean accuracy for params {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}: 0.8412
Mean accuracy for params {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}: 0.5254
Mean accuracy for params {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}: 0.8221
Mean accuracy for params {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}: 0.8475
Mean accuracy for params {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}: 0.8925
Mean accuracy for params {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}: 0.5321
Mean accuracy for params {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}: 0.8283
Mean accuracy for params {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}: 0.8925
Mean accuracy for params {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}: 0.8925
Mean accurac

In [14]:
# Pinned SVC hyperparameters; this assignment replaces whatever value
# best_params_svm held before this cell ran.
# NOTE(review): presumably copied from the grid-search output -- confirm.
best_params_svm = dict(C=0.1, kernel='linear')

In [15]:
# Refit the selected SVC configuration (probability estimates enabled,
# seeded with SEED) on the full training split, then score it on the test split.
best_svm = SVC(probability=True, random_state=SEED, **best_params_svm)
test_predictions = best_svm.fit(train_features, train_labels).predict(test_features)

# Compute all three summaries first, then print them in the usual order.
test_accuracy = accuracy_score(test_labels, test_predictions)
conf_matrix = confusion_matrix(test_labels, test_predictions)
class_report = classification_report(test_labels, test_predictions)

print(f'Test accuracy: {test_accuracy}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')

Test accuracy: 0.78
Confusion Matrix:
[[17  8]
 [ 3 22]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.68      0.76        25
           1       0.73      0.88      0.80        25

    accuracy                           0.78        50
   macro avg       0.79      0.78      0.78        50
weighted avg       0.79      0.78      0.78        50

