In [230]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import KNNImputer
import shap
import sklearn
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re
from sklearn.model_selection import cross_val_score,KFold, LeaveOneOut
from matplotlib import pyplot
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
# Lift pandas' row-display cap so printed frames/series are never truncated.
pd.set_option('display.max_rows', None)

In [231]:
# Path of the Excel workbook to load (blank here; must be filled in before running).
filename = ""

# Read the workbook and use the 'Pat' column as the row index.
df = pd.read_excel(filename).set_index('Pat')

In [232]:
# Columns that are not used downstream and are removed from the frame.
# NOTE(review): 'TrialID' is removed here, yet a later cell sets
# strat_element = 'TrialID' and indexes df with it — confirm which is intended.
columns_drop = [
    'SAL', 'AGEBIN', 'Sex male', 'sex female', 'EXAML', 'SEX',
    'AMLSTAT', 'WBCU', 'HB', 'HBU', 'PLT', 'PLTU', 'LDH', 'LDHU', 'CGKT',
    'ELN2017', 'ELN2022', 'CEBPASTAT', 'FEV',
    'ELN2022 fav', 'ELN2022 int', 'ELN2022 adv', 'ALSCTCR1', 'ALSCTSLV', 'ALSCTOTH',
    'OSTMU', 'EFSTM', 'EFSTMU', 'EFSSTAT', 'RFSTM', 'FLT3R', 'CGNK', 'CGCX',
    'RFSTMU', 'RFSSTAT', 'Unnamed: 0', 'CEBPADM', 'CEBPA.bZIP', 'CEBPA.bZIP.inframe', 'CEBPA.TAD',
    'OSTM_unter24_zensiert', 'OSTM = 24', 'OSTMabove24', 'CR1', 'FLT3T', 'TrialID', 'WBC', 'BMB', 'PBB',
]
df = df.drop(columns=columns_drop)

In [233]:
# Raw column name -> label used in the rest of the notebook and in figures.
replacement_mappings = {
    "t8.21": "t(8;21)",
    "minus.5": "-5",
    "minus.7": "-7",
    "minus.17": "-17",
    "del.5q.": "del(5q)",
    "t.v.11..v.q23.": "t(v;11q23.3)",
    "t.9.11..p21.23.q23.": "t(9;11)(p21.3;q23.3)",
    "t.10.11.": "t(10;11)",
    "t.11.19..q23.p13.": "t(11;19)(q23.3;p13.3)",
    "del.7q.": "del(7q)",
    "del.9q.": "del(9q)",
    "minus.X": "-X",
    "minus.Y": "-Y",
    "inv16_t16.16": "inv(16)(p13.1q22)",
    "CEBPA.bZIP.inframe": "CEBPA-bZip-inf",
    "FLT3I": "FLT3-ITD"
}

# rename() applies the whole mapping in a single pass and ignores keys that are
# not present in df, so no per-column loop is needed.
df = df.rename(columns=replacement_mappings)


In [None]:
# Remove every column whose share of rows equal to 1 is at most 1% of all patients.
RARE_MUTATION_THRESHOLD = 0.01

# Columns that are excluded from the rarity check regardless of their values.
protected_columns = {'AGE', 'OSTM', 'TrialID', 'AGEGRP', 'WBC', 'BMB', 'PBB'}

patients = len(df)
rare_columns = []
for column in df.columns:
    if column in protected_columns:
        continue
    counter = (df[column].values == 1).sum()
    result = counter / patients
    if result <= RARE_MUTATION_THRESHOLD:
        print(f"{column}: {counter}/{patients} = {result}")
        rare_columns.append(column)

# Number of removed columns; kept under a name that does not shadow the builtin sum().
n_dropped = len(rare_columns)

# Drop all rare columns in one call instead of copying the frame once per column.
df = df.drop(columns=rare_columns)

In [235]:
# Coerce the two cytogenetic columns to numeric; unparseable entries become NaN.
for flag_column in ("inv(16)(p13.1q22)", "t(8;21)"):
    df[flag_column] = pd.to_numeric(df[flag_column], errors='coerce')

print(df.shape)

# Turn OSTM into a binary label relative to this cutoff (same unit as OSTM):
#   OSTM >= cutoff                   -> 1
#   OSTM <  cutoff and OSSTAT == 1   -> 0
#   OSTM <  cutoff and OSSTAT == 0   -> row removed
OS_CUTOFF = 24

# All masks are computed from the original values before anything is overwritten.
ostm = df['OSTM']
osstat = df['OSSTAT']
reached_cutoff = ostm >= OS_CUTOFF
event_before_cutoff = (ostm < OS_CUTOFF) & (osstat == 1)
censored_before_cutoff = (ostm < OS_CUTOFF) & (osstat == 0)

df.loc[reached_cutoff, 'OSTM'] = 1
df.loc[event_before_cutoff, 'OSTM'] = 0
# NOTE(review): rows with missing OSTM, or with OSTM < cutoff and an OSSTAT that is
# neither 0 nor 1, match none of the masks and keep their original OSTM value.
df = df.loc[~censored_before_cutoff].copy()

print(df.shape)

(3062, 54)
(2858, 54)


In [236]:
# Encode the age-group labels as integers; labels missing from this table become NaN.
age_group_codes = {'infants': 1,'children': 2,'AYA': 3,'adults': 4,'seniors': 5,'elderly': 6 }
df['AGEGRP'] = df['AGEGRP'].map(age_group_codes)

# Fill every missing entry using the 100 nearest rows, then round back to integers.
# NOTE(review): the imputer is fitted on the whole frame, including OSTM/OSSTAT,
# and before any train/test split — confirm this is intended.
imputer = KNNImputer(n_neighbors=100)
imputed_values = imputer.fit_transform(df)
df = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)
df = df.round().astype(int)

In [337]:
# Benchmark four classifiers over several hold-out sizes. For each
# (test size, classifier) pair the accuracy, ROC AUC and MCC are averaged over
# N_REPEATS stratified random splits and the table is written to Excel.
RANDOM_STATE = 42
N_REPEATS = 3

strat_element = 'TrialID'
if strat_element not in df.columns:
    # Fail with an actionable message rather than a bare KeyError.
    raise KeyError(
        f"Stratification column {strat_element!r} is not in df; "
        "check that it was not removed by an earlier column-drop step."
    )

# Joint stratification key. String concatenation keeps every
# (strat_element, OSTM) pair distinct; adding the two integers would merge
# different pairs with the same sum (e.g. 1+1 and 2+0).
# NOTE(review): train_test_split raises if any joint stratum has fewer than 2 rows.
strata = df[strat_element].astype(str) + '_' + df['OSTM'].astype(str)

classifiers = ["RandomForestClassifier", 'LogisticRegression', 'sklearn.svm.SVC', 'xgb.XGBClassifier']
size_test = [0.1, 0.2, 0.25, 0.3]

# Explicit name -> constructor table (replaces eval on the name strings).
# SVC needs probability=True so that predict_proba is available.
classifier_factories = {
    "RandomForestClassifier": lambda seed: RandomForestClassifier(random_state=seed),
    'LogisticRegression': lambda seed: LogisticRegression(random_state=seed),
    'sklearn.svm.SVC': lambda seed: svm.SVC(probability=True, random_state=seed),
    'xgb.XGBClassifier': lambda seed: xgb.XGBClassifier(random_state=seed),
}

# Features: everything except the outcome columns and the stratification helpers.
# errors='ignore' tolerates a 'split_col' left in df by another cell.
X = df.drop(columns=['OSTM', 'OSSTAT', strat_element, 'split_col'], errors='ignore')
y = df['OSTM']

records = []
for element in size_test:
    for classifier_name in classifiers:
        acc_scores, auc_scores, mcc_scores = [], [], []
        for i in range(N_REPEATS):
            # One seed per repeat: runs are reproducible and every classifier
            # sees the same splits for a given test size.
            seed = RANDOM_STATE + i
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=element, stratify=strata, random_state=seed
            )
            model = classifier_factories[classifier_name](seed)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_pred_prob = model.predict_proba(X_test)

            acc_scores.append(accuracy_score(y_test, y_pred))
            auc_scores.append(roc_auc_score(y_test, y_pred_prob[:, 1]))
            mcc_scores.append(matthews_corrcoef(y_test, y_pred))

        records.append({
            'Test Size': element,
            'Classifier': classifier_name,
            'Accuracy': round(np.mean(acc_scores), 4),
            'AUC': round(np.mean(auc_scores), 4),
            'MCC': round(np.mean(mcc_scores), 4)
        })

# Build the table once from the collected rows (DataFrame.append no longer exists in pandas 2.x).
results_df = pd.DataFrame(records, columns=['Test Size', 'Classifier', 'Accuracy', 'AUC', 'MCC'])

results_df.to_excel('', index=False)

Done


In [None]:
# Fit a single logistic-regression model on a 70/30 split stratified jointly by
# age group and outcome, then report ROC AUC, MCC and the confusion matrix.
RANDOM_STATE = 42
strat_element = 'AGEGRP'
size_test = 0.3

# Joint stratification key. String concatenation keeps every (AGEGRP, OSTM) pair
# distinct; adding the two integers would merge different pairs with the same
# sum (e.g. group 1 with OSTM 1 and group 2 with OSTM 0).
# NOTE(review): train_test_split raises if any joint stratum has fewer than 2 rows.
strata = df[strat_element].astype(str) + '_' + df['OSTM'].astype(str)

# Features: everything except the outcome columns and the stratification helpers.
# errors='ignore' tolerates a 'split_col' left in df by another cell.
X = df.drop(columns=['OSTM', 'OSSTAT', strat_element, 'split_col'], errors='ignore')
y = df['OSTM']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=size_test, stratify=strata, random_state=RANDOM_STATE
)
model = LogisticRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

roc_auc = roc_auc_score(y_test, y_pred_prob[:,1])
mcc=matthews_corrcoef(y_test, y_pred)
print(roc_auc, mcc)

cm = confusion_matrix(y_test,y_pred)
print(cm)

# SHAP explanation of model.predict on the held-out set; X_test is used both as
# the background data for the explainer and as the data being explained.
explainer = shap.Explainer(model.predict,X_test)
shap_values = explainer.shap_values(X_test)

# Figure 1: bar-type summary plot, x axis limited to [0, 0.35].
shap.summary_plot(shap_values, X_test, show=False, class_inds='original', plot_type="bar")
fig = plt.gcf()
bar_ax = fig.gca()
bar_ax.set_xlim(0, 0.35)
bar_ax.legend(loc='lower right')
fig.tight_layout()
fig.savefig('', facecolor='w', dpi=300, format="tiff")
plt.close(fig)

# Figure 2: beeswarm of the 20 top features, x axis limited to [-1, 1].
shap.plots.beeswarm(explainer(X_test), max_display=20, show=False)
fig = plt.gcf()
fig.gca().set_xlim(-1, 1)
fig.tight_layout()
fig.savefig('', facecolor='w', dpi=300, format="tiff")
plt.close(fig)

# Per-age-band evaluation of the fitted model on the held-out set: AUC,
# confusion matrix and SHAP bar / beeswarm figures for each band.

# (exclusive lower bound, inclusive upper bound) on AGE for each band.
age_bounds = {
    'Infants': (-np.inf, 2),
    'Children': (2, 15),
    'AYA': (15, 40),
    'Adults': (40, 65),
    'Seniors': (65, 75),
    'Elderly': (75, np.inf),
}

age_groups = {
    band_name: X_test[(X_test['AGE'] > lower) & (X_test['AGE'] <= upper)]
    for band_name, (lower, upper) in age_bounds.items()
}

for age_group_name, X_age_group in age_groups.items():
    # Explicit label-based lookup of the matching outcomes.
    y_age_group = y_test.loc[X_age_group.index]

    # A band with no held-out rows cannot be scored or explained.
    if X_age_group.empty:
        print(f" For {age_group_name}, no test samples - skipped")
        continue

    # Group-specific names so the overall y_pred / cm / explainer are not overwritten.
    group_pred = model.predict(X_age_group)
    group_pred_prob = model.predict_proba(X_age_group)

    # ROC AUC is undefined when only one outcome class is present in the band.
    if y_age_group.nunique() < 2:
        print(f" For {age_group_name}, AUC: undefined (only one outcome class present)")
    else:
        roc_auc_test = roc_auc_score(y_age_group, group_pred_prob[:,1])
        print(f" For {age_group_name},"f" AUC: {roc_auc_test:.4f}")

    group_cm = confusion_matrix(y_age_group, group_pred)
    print(group_cm)
    print(f"\nShap values for {age_group_name}:")

    # The band's own rows serve as the background data for its explainer.
    group_explainer = shap.Explainer(model.predict,X_age_group)
    group_shap_values = group_explainer.shap_values(X_age_group) #lr, svm

    shap.summary_plot(group_shap_values, X_age_group, show=False, class_inds='original', plot_type="bar")

    f = plt.gcf()
    plt.xlim(0,0.3)
    plt.legend(loc='lower right')
    f.tight_layout()
    f.savefig('' + age_group_name + '_bar.tiff', facecolor='w', dpi=300, format="tiff")
    plt.close()

    shap.plots.beeswarm(group_explainer(X_age_group),max_display=20, show=False) #for RF [:, :, 1]

    f = plt.gcf()
    plt.xlim(-1,1)
    f.tight_layout()
    f.savefig('' + age_group_name + '_beeswarm.tiff', facecolor='w', dpi=300, format="tiff")
    plt.close()