# Best Model with 148 Genes and CatBoost

In [1]:
import os
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import  accuracy_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Directory that holds the PPMI expression tables.
# Set the PPMI_DATA_DIR environment variable to point at a different location;
# when it is unset the original author's local directory is used.
path1 = Path(os.environ.get("PPMI_DATA_DIR", "/Users/zainabnazari/large_data_ppmi"))

In [3]:
# Load the tab-separated expression table and display it
# (pandas truncates the rendering of large frames).
expression_file = path1 / "Log_TMM_top_148.txt"
top_genes = pd.read_csv(expression_file, sep='\t')
top_genes

Unnamed: 0,ID,ENSG00000178537,ENSG00000147475,ENSG00000176076,ENSG00000026297,ENSG00000105767,ENSG00000100577,ENSG00000106638,ENSG00000273513,ENSG00000153976,...,ENSG00000267296,ENSG00000102882,ENSG00000142937,ENSG00000004779,ENSG00000130520,ENSG00000107672,ENSG00000243836,ENSG00000180667,ENSG00000109113,Class
0,3000,0.214188,0.256651,2.028779,1.738351,0.575424,0.500757,-0.010251,0.909533,1.348857,...,-0.313192,1.367487,-0.646322,-0.964944,-0.167636,-0.407364,-1.452176,0.011872,0.180085,CTR
1,3001,0.017292,-0.226348,-1.718627,0.083409,-0.327149,1.005505,-0.389756,0.771296,-0.534252,...,-1.536625,0.054046,0.740634,0.312466,-0.094441,0.002080,-0.267186,-0.252018,-0.220479,PD
2,3002,-2.295262,-0.191711,1.437014,-0.539339,-0.708054,-0.929805,-0.669279,-0.648208,2.203919,...,0.945716,-0.331213,0.078177,-0.155409,-0.672555,-0.900814,-1.202376,-0.910379,0.783800,PD
3,3003,0.032919,-0.465098,0.693917,-0.478519,-0.295854,-0.568447,0.721424,-0.557504,0.111526,...,0.100442,-0.188091,-0.318060,0.313729,0.237089,-0.360275,0.729641,0.927829,-1.059551,PD
4,3004,0.639304,0.824057,0.804450,0.370042,0.015323,0.256362,-0.451868,1.240719,0.957138,...,1.642679,0.205678,-0.229659,-0.824420,-0.567233,-0.863398,1.151091,-0.947288,-0.729601,CTR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,4102,0.019741,0.427530,0.266964,0.182688,-0.221460,0.484210,0.227876,0.194415,0.246500,...,0.229888,-0.222252,0.708111,0.601851,0.247423,-0.038397,2.340051,-1.269700,0.282535,PD
541,4108,0.294262,0.564670,0.410063,0.410425,0.335009,-1.095308,-0.270516,-1.851468,0.280611,...,0.367099,0.116123,-0.232081,0.220116,0.132690,-0.102922,0.005361,0.777387,1.762710,PD
542,4115,0.116057,-0.726276,-1.108262,0.954280,-0.768082,-0.456909,-0.459594,0.527362,-0.597880,...,-0.708426,-0.072919,-0.228603,0.203033,-0.539261,-0.026190,-1.429442,0.267668,-0.133665,PD
543,4136,-0.221597,-0.665018,0.742318,-0.366123,0.829450,-0.475974,0.084117,0.255936,1.273915,...,-0.270663,0.706560,-0.474882,-0.994447,-0.506636,-0.122179,-1.856811,0.300377,-0.355611,PD


In [4]:
# Separate features (X) and target variable (y): every column except the
# sample identifier and the class label is used as a feature.
X = top_genes.drop(columns=['ID', 'Class'])

# Encode the string labels of the 'Class' column as integers.
# LabelEncoder numbers the classes in sorted order, so for the CTR / PD labels
# shown above this presumably gives CTR -> 0 and PD -> 1; inspect
# label_encoder.classes_ to confirm.
label_encoder = LabelEncoder()
label = label_encoder.fit_transform(top_genes['Class'])

# Replace the whole column rather than assigning through `.loc[:, 'Class']`:
# on pandas >= 2.0 the `.loc` form writes into the existing object-dtype column
# in place, leaving `y` as an object Series of ints, which scikit-learn's
# classification metrics can reject as an "unknown" target type.
top_genes['Class'] = label

y = top_genes['Class']


# CatBoost

In [7]:
# Hold out 25 % of the samples for validation; the fixed random_state makes
# the split repeatable.
split_parts = train_test_split(X, y, train_size=0.75, random_state=42)
X_train, X_validation, y_train, y_validation = split_parts

In [8]:
# Metrics CatBoost should record during training, in addition to the loss.
tracked_metrics = ['Recall', 'Precision', 'AUC', 'Accuracy']

# Seeded classifier with console logging switched off.
model = CatBoostClassifier(custom_metric=tracked_metrics, random_seed=42, logging_level='Silent')

In [9]:
# Fit on the training split while tracking the metrics on the validation split.
# plot=True renders CatBoost's interactive MetricVisualizer widget; pass
# logging_level='Verbose' to fit() as well if text output is wanted.
# The trailing semicolon keeps the returned model's repr out of the cell output.
validation_pair = (X_validation, y_validation)
model.fit(X_train, y_train, eval_set=validation_pair, plot=True);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [11]:
# Cross-validate on the full data set with the parameters reported by
# model.get_params(), setting Logloss as the loss function.
cv_params = {**model.get_params(), 'loss_function': metrics.Logloss()}
full_pool = Pool(X, y)

# 3-fold cross-validation; plot=True shows the per-iteration metric curves.
cv_data = cv(full_pool, cv_params, fold_count=3, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [14]:
# List the metric series produced by the cross-validation run.
cv_metric_names = list(cv_data)
print(cv_metric_names)

['iterations', 'test-Logloss-mean', 'test-Logloss-std', 'train-Logloss-mean', 'train-Logloss-std', 'test-Recall-mean', 'test-Recall-std', 'train-Recall-mean', 'train-Recall-std', 'test-Precision-mean', 'test-Precision-std', 'train-Precision-mean', 'train-Precision-std', 'test-AUC-mean', 'test-AUC-std', 'test-Accuracy-mean', 'test-Accuracy-std', 'train-Accuracy-mean', 'train-Accuracy-std']


In [12]:
def report_best_cv_score(cv_results, metric):
    """Print the best mean validation score of ``metric`` and return its step.

    Parameters
    ----------
    cv_results : mapping of str to sequence
        Cross-validation results that contain the entries
        ``'test-<metric>-mean'`` and ``'test-<metric>-std'``.
    metric : str
        Metric name as it appears in those keys, e.g. ``'AUC'``.

    Returns
    -------
    int
        Zero-based position of the highest mean score.
    """
    # Work on plain arrays so the argmax position is used positionally and
    # never interpreted as an index label.
    mean_scores = np.asarray(cv_results['test-{}-mean'.format(metric)])
    std_scores = np.asarray(cv_results['test-{}-std'.format(metric)])
    best_step = int(np.argmax(mean_scores))

    print('Best validation {} score: {:.2f}±{:.2f} on step {}'.format(
        metric,
        mean_scores[best_step],
        std_scores[best_step],
        best_step
    ))
    return best_step


# Same report for every metric of interest instead of one copy per metric.
for metric_name in ('AUC', 'Accuracy'):
    report_best_cv_score(cv_data, metric_name)

Best validation AUC score: 0.81±0.03 on step 998
Best validation Accuracy score: 0.75±0.01 on step 720


In [13]:
# Class predictions for the validation split.
# NOTE(review): this split was also passed as eval_set during fit, so these
# figures may be optimistic if the model was selected on it — confirm.
y_pred = model.predict(X_validation)

# Coerce both label vectors to flat integer arrays so scikit-learn sees two
# targets of the same type regardless of the dtype of the 'Class' column.
y_true_int = np.asarray(y_validation).astype(int).ravel()
y_pred_int = np.asarray(y_pred).astype(int).ravel()

# labels=[0, 1] pins the matrix to 2x2 even when one class is missing from the
# split, so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_true_int, y_pred_int, labels=[0, 1]).ravel()

# Specificity = true-negative rate, sensitivity = true-positive rate (class 1
# is the positive class). NaN is reported when a class has no samples.
specificity = tn / (tn + fp) if (tn + fp) else float('nan')
sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')

print(f"Specificity: {specificity:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")

Specificity: 0.4118
Sensitivity: 0.8932


In [15]:
# Stamp the notebook with the local calendar date of this run.
run_timestamp = datetime.now()
current_date = run_timestamp.date()
print("Last update :", current_date)

Last update : 2024-05-20
