In [1]:
import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# Paths to the datasets.
# The dataset directory defaults to the original location, but it can be overridden
# with the NC_NPH_DATA_DIR environment variable so the notebook can run on another
# machine without editing this cell. Both paths stay plain strings.
DATA_DIR = os.environ.get(
    'NC_NPH_DATA_DIR',
    '/home/aghasemi/CompBio481/ML_classifiers/datasets',
)
train_dataset = os.path.join(DATA_DIR, 'NC_vs_NPH_train.csv')
test_dataset = os.path.join(DATA_DIR, 'NC_vs_NPH_test.csv')

In [3]:
# Load the training CSV first, then the test CSV, each into its own DataFrame.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_dataset, test_dataset)
)

In [4]:
# Training data: the target is the 'Diagnosis' column; the feature matrix is every
# other column except 'ID_1', which is dropped together with the target.
y_train = train_df['Diagnosis']
X_train = train_df.drop(['ID_1', 'Diagnosis'], axis=1)

In [5]:
# Test data: the target is the 'Diagnosis' column; the feature matrix is every
# other column except 'ID_1', which is dropped together with the target.
y_test = test_df['Diagnosis']
X_test = test_df.drop(['ID_1', 'Diagnosis'], axis=1)

In [6]:
# Logistic Regression: candidate values explored by the grid search
# (regularisation strength C, solver, and iteration cap).
param_grid_lr = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
    max_iter=[100, 200, 500],
)

In [7]:
# Grid-search Logistic Regression over param_grid_lr using 5-fold cross-validation,
# selecting by accuracy.
# random_state is fixed because the grid includes the 'saga' solver, which shuffles
# the data; without a seed the selected parameters can differ between runs.
lr = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # evaluate candidates in parallel on all available cores
)
grid_search_lr.fit(X_train, y_train)
# Best parameter combination found by the search; reused by the evaluation cell.
best_params_lr = grid_search_lr.best_params_



In [8]:
# XGBoost: candidate values explored by the grid search
# (number of trees, tree depth, and learning rate).
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
)

In [9]:
# Grid-search XGBoost over param_grid_xgb using 5-fold cross-validation on accuracy,
# then keep the winning parameter combination.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_search_xgb = GridSearchCV(
    xgb,
    param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the fitted search object itself
best_params_xgb = grid_search_xgb.best_params_

In [10]:
# SVM: candidate values explored by the grid search
# (regularisation strength C, kernel type, and gamma setting).
# NOTE(review): per the scikit-learn SVC docs, gamma applies to the 'rbf', 'poly' and
# 'sigmoid' kernels, so the 'linear' candidates appear to be evaluated once per gamma
# value with identical results; consider a list of per-kernel grids.
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

In [11]:
# Grid-search the SVM over param_grid_svm using 5-fold cross-validation on accuracy,
# then keep the winning parameter combination.
svm = SVC()
grid_search_svm = GridSearchCV(
    svm,
    param_grid_svm,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the fitted search object itself
best_params_svm = grid_search_svm.best_params_

In [12]:
# Evaluate the tuned models on the held-out test set.
# Logistic Regression
# GridSearchCV refits the best configuration on the full training data by default
# (refit=True), so the already-fitted best_estimator_ is reused here. Rebuilding the
# model from best_params_lr would train it a second time and would drop any constructor
# arguments of the searched estimator that are not part of the grid.
lr_best = grid_search_lr.best_estimator_
y_pred_lr = lr_best.predict(X_test)
print("Logistic Regression")
print("Best Parameters:", best_params_lr)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
# Per-class precision / recall / F1 alongside the overall accuracy.
print(classification_report(y_test, y_pred_lr))

Logistic Regression
Best Parameters: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.90      0.95      0.93        59
           1       0.77      0.62      0.69        16

    accuracy                           0.88        75
   macro avg       0.84      0.79      0.81        75
weighted avg       0.87      0.88      0.88        75



In [13]:
# XGBoost
# GridSearchCV refits the best configuration on the full training data by default
# (refit=True), so the already-fitted best_estimator_ is reused here. This avoids a
# second training run and avoids having to repeat the base estimator's constructor
# arguments (use_label_encoder, eval_metric) by hand to keep them in sync.
xgb_best = grid_search_xgb.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)
print("\nXGBoost")
print("Best Parameters:", best_params_xgb)
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
# Per-class precision / recall / F1 alongside the overall accuracy.
print(classification_report(y_test, y_pred_xgb))


XGBoost
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Accuracy: 0.8
              precision    recall  f1-score   support

           0       0.83      0.93      0.88        59
           1       0.56      0.31      0.40        16

    accuracy                           0.80        75
   macro avg       0.69      0.62      0.64        75
weighted avg       0.77      0.80      0.78        75



In [14]:
# SVM
# GridSearchCV refits the best configuration on the full training data by default
# (refit=True), so the already-fitted best_estimator_ is reused here instead of
# constructing and training a second SVC from best_params_svm.
svm_best = grid_search_svm.best_estimator_
y_pred_svm = svm_best.predict(X_test)
print("\nSVM")
print("Best Parameters:", best_params_svm)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
# Per-class precision / recall / F1 alongside the overall accuracy.
print(classification_report(y_test, y_pred_svm))


SVM
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.8533333333333334
              precision    recall  f1-score   support

           0       0.89      0.93      0.91        59
           1       0.69      0.56      0.62        16

    accuracy                           0.85        75
   macro avg       0.79      0.75      0.76        75
weighted avg       0.85      0.85      0.85        75

