In [1]:
import pandas as pd
import numpy as np

#model selection
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split,RandomizedSearchCV,cross_validate

#############
#preprocessing

#missing features


#scaling
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

#oversampling
from imblearn.pipeline import Pipeline #can't use standard Pipeline with SMOTE
from imblearn.over_sampling import SMOTE


####################
#models 
from sklearn.tree import DecisionTreeClassifier

from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingClassifier



#model_evaluation
from sklearn.metrics import make_scorer, recall_score, precision_score,matthews_corrcoef

# Note: the cell below was changed to load the data from a local CSV file instead of the remote URL.

In [2]:
# Remote copy of the same CSV, kept for reference (loading from it is disabled).
#url = 'https://raw.githubusercontent.com/walleser/brca_tcga/master/brca_merged_data.csv'
#df = pd.read_csv(url,index_col= 0)

# Read the merged table from the working directory; the first CSV column
# becomes the index.
df = pd.read_csv("brca_merged_data.csv", index_col=0)

# Report the table size, then preview the first rows as the cell output.
print(df.shape)
df.head()

(876, 364)


Unnamed: 0,#Patient Identifier,Sample Identifier,Oncotree Code,Cancer Type,Cancer Type Detailed,Tumor Type,Neoplasm Histologic Grade,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Tissue Source Site,...,22_mutcount,3_mutcount,4_mutcount,5_mutcount,6_mutcount,7_mutcount,8_mutcount,9_mutcount,X_mutcount,Y_mutcount
0,TCGA-3C-AALI,TCGA-3C-AALI-01,IDC,Invasive Breast Carcinoma,Breast Invasive Ductal Carcinoma,Infiltrating Ductal Carcinoma,,No,Yes,3C,...,9.0,54.0,29.0,32.0,38.0,41.0,32.0,22.0,29.0,
1,TCGA-3C-AALK,TCGA-3C-AALK-01,IDC,Invasive Breast Carcinoma,Breast Invasive Ductal Carcinoma,Infiltrating Ductal Carcinoma,,No,Yes,3C,...,3.0,5.0,2.0,9.0,5.0,2.0,,4.0,4.0,
2,TCGA-4H-AAAK,TCGA-4H-AAAK-01,ILC,Invasive Breast Carcinoma,Breast Invasive Lobular Carcinoma,Infiltrating Lobular Carcinoma,,Yes,No,4H,...,,1.0,3.0,2.0,,2.0,2.0,4.0,3.0,
3,TCGA-5T-A9QA,TCGA-5T-A9QA-01,BRCNOS,Invasive Breast Carcinoma,Breast Invasive Carcinoma (NOS),Other,,Yes,No,5T,...,4.0,8.0,8.0,,3.0,4.0,2.0,2.0,10.0,
4,TCGA-A1-A0SF,TCGA-A1-A0SF-01,IDC,Invasive Breast Carcinoma,Breast Invasive Ductal Carcinoma,Infiltrating Ductal Carcinoma,,No,Yes,A1,...,1.0,3.0,6.0,3.0,6.0,6.0,1.0,2.0,1.0,


In [3]:
# Missing-value count per column, shown for the columns at positions 210-259.
df.isna().sum().iloc[210:260]


CTNNA1|alpha-Catenin                          876
NKX2-1|TTF1                                   876
CASP3|Caspase-3                               798
CASP9|Caspase-9                               798
PARP1|PARP1                                   798
COPS5|JAB1                                    798
PATIENT_ID                                      0
SUBTYPE                                        79
CANCER_TYPE_ACRONYM                             0
OTHER_PATIENT_ID                                0
AGE                                             0
SEX                                             0
AJCC_PATHOLOGIC_TUMOR_STAGE                     4
AJCC_STAGING_EDITION                          117
DAYS_LAST_FOLLOWUP                             85
DAYS_TO_BIRTH                                  14
DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS            0
ETHNICITY                                     125
FORM_COMPLETION_DATE                            0
HISTORY_NEOADJUVANT_TRTYN                       0


In [4]:
# Keep the column index in `cols` and print its (abbreviated) repr.
cols = df.columns

print(cols)

Index(['#Patient Identifier', 'Sample Identifier', 'Oncotree Code',
       'Cancer Type', 'Cancer Type Detailed', 'Tumor Type',
       'Neoplasm Histologic Grade', 'Tissue Prospective Collection Indicator',
       'Tissue Retrospective Collection Indicator', 'Tissue Source Site',
       ...
       '22_mutcount', '3_mutcount', '4_mutcount', '5_mutcount', '6_mutcount',
       '7_mutcount', '8_mutcount', '9_mutcount', 'X_mutcount', 'Y_mutcount'],
      dtype='object', length=364)


In [5]:
# Candidate modelling columns, in order:
#   1. the columns at positions 18-208 of `df`,
#   2. four columns selected by name,
#   3. the columns at positions 248-320 of `df`.
# NOTE(review): the positional ranges depend on the exact column order of the
# CSV — verify them if the input file changes.
model_cols = [
    *df.columns[18:209],
    "AGE", "SEX", "RACE", "TMB (nonsynonymous)",
    *df.columns[248:321],
]

In [6]:
# Build the modelling frame: select the candidate columns and remove three of
# them by name in one chained expression (`df` itself is left untouched).
df_model = df.loc[:, model_cols].drop(
    columns=['DFS_MONTHS', 'PFS_STATUS', 'PFS_MONTHS']
)

# (rows, columns) of the modelling frame.
df_model.shape

(876, 265)

In [7]:
# (rows, columns) of the modelling frame; same check as at the end of the previous cell.
df_model.shape


(876, 265)

In [8]:
# Number of columns that have more than 100 missing values.
int((df_model.isna().sum() > 100).sum())


69

In [9]:
# Drop sparse columns: with `thresh=100` a column is kept only if it holds at
# least 100 non-missing values.
# NOTE(review): this is a different criterion from "more than 100 missing
# values", which is what the preceding cell counts — confirm which filter is
# actually intended before relying on the resulting feature set.
df_model = df_model.dropna(axis="columns", thresh=100)
df_model.shape


(876, 203)

In [10]:
# Discard the rows whose DSS_STATUS value is missing, then show the new size.
df_model = df_model.dropna(axis="index", subset=["DSS_STATUS"])
df_model.shape


(857, 203)

In [11]:
# Integer encoding of the two DSS_STATUS labels; any other label would map to NaN.
map_dict = {
    "0:ALIVE OR DEAD TUMOR FREE": 0,
    "1:DEAD WITH TUMOR": 1,
}

# Target: the encoded DSS_STATUS column.
y = df_model["DSS_STATUS"].map(map_dict)

# Features: everything except the target and two further columns removed by name.
# NOTE(review): only these three columns are excluded here — check that no other
# outcome/survival column is still present among the features (TODO confirm).
X = df_model.drop(columns=['DSS_STATUS', 'DSS_MONTHS', "DFS_STATUS"])

In [12]:
# Count how many rows fall into each value of the encoded target.
y.value_counts()

0    791
1     66
Name: DSS_STATUS, dtype: int64

In [13]:
# One-hot encode the object/category-typed feature columns; numeric columns
# pass through unchanged.
X = pd.get_dummies(data=X)

In [14]:
# Seeded hold-out split: 33% of the rows form the test set and the split is
# stratified on `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=0,
)


In [15]:
# Shapes of the four parts of the split, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((574, 204), (283, 204), (574,), (283,))

In [16]:
# Scorers built from recall/precision with an explicit positive label:
#   recall on label 1    -> sensitivity     recall on label 0    -> specificity
#   precision on label 1 -> PPV             precision on label 0 -> NPV
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=0)
PPV = make_scorer(precision_score, pos_label=1)
NPV = make_scorer(precision_score, pos_label=0)

# Matthews correlation coefficient wrapped as a scorer.
MCC = make_scorer(matthews_corrcoef)

# Name -> scorer mapping handed to cross_validate; the names become the
# `test_<name>` columns of its result.
score_metrics = dict(
    roc_auc='roc_auc',
    accuracy='accuracy',
    MCC=MCC,
    bal_acc='balanced_accuracy',
    sensitivity=sensitivity,
    specificity=specificity,
    PPV=PPV,
    NPV=NPV,
    f1='f1',
)

In [17]:
# Logistic-regression pipeline.
from sklearn.linear_model import LogisticRegression

# Pre-processing steps: mode imputation, standardisation, seeded SMOTE oversampling.
# (Alternative imputer considered: IterativeImputer(max_iter=100, random_state=0).)
imp = SimpleImputer(strategy="most_frequent")
scale = StandardScaler()
over = SMOTE(random_state=0)

# Classifier with library defaults.
model = LogisticRegression()

# Step order: impute -> scale -> oversample -> classify.
steps = [
    ('impute', imp),
    ('scale', scale),
    ('over', over),
    ('model', model),
]
pipeline = Pipeline(steps=steps, verbose=3)

In [None]:
# Logistic regression: 10-fold cross-validation on the training split, scoring
# every metric in `score_metrics`; folds are evaluated by 5 parallel workers.
from sklearn.model_selection import cross_validate

scores = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=score_metrics,
    cv=10,
    return_train_score=False,
    n_jobs=5,
    verbose=1,
)


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.


In [None]:
# Tabulate the cross-validation result: one row per fold, one column per timing/metric.
scores_df = pd.DataFrame(data=scores)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of every column of `scores_df`.
scores_df.describe()

In [None]:
# K-nearest-neighbours pipeline.
from sklearn.neighbors import KNeighborsClassifier

# Pre-processing steps: mode imputation, standardisation, seeded SMOTE oversampling.
# (Alternative imputer considered: IterativeImputer(max_iter=100, random_state=0).)
imp = SimpleImputer(strategy="most_frequent")
scale = StandardScaler()
over = SMOTE(random_state=0)

# Classifier voting over the 3 nearest neighbours.
model = KNeighborsClassifier(n_neighbors=3)

# Step order: impute -> scale -> oversample -> classify.
steps = [
    ('impute', imp),
    ('scale', scale),
    ('over', over),
    ('model', model),
]
pipeline = Pipeline(steps=steps, verbose=3)

In [None]:
# 10-fold cross-validation of the current pipeline on the training split,
# scoring every metric in `score_metrics` with 5 parallel workers.
scores = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=score_metrics,
    cv=10,
    return_train_score=False,
    n_jobs=5,
    verbose=1,
)


In [None]:
# Tabulate the per-fold results and display their summary statistics.
scores_df = pd.DataFrame(data=scores)
scores_df.describe()

In [24]:
# Decision-tree pipeline: impute -> scale -> oversample -> classify.
from sklearn.tree import DecisionTreeClassifier

# Pre-processing steps: mode imputation, standardisation, seeded SMOTE oversampling.
# (Alternative imputer considered: IterativeImputer(max_iter=100, random_state=0).)
imp = SimpleImputer(strategy="most_frequent")
scale = StandardScaler()
over = SMOTE(random_state=0)

# Seed the tree as well. scikit-learn randomly permutes the candidate features
# at every split, so an unseeded tree can differ from run to run even though
# the train/test split and SMOTE are already seeded with 0.
model = DecisionTreeClassifier(random_state=0)

# Steps of the pipeline, in execution order.
steps = [('impute', imp), ('scale', scale), ('over', over), ('model', model)]
pipeline = Pipeline(steps=steps, verbose=3)

In [25]:
# 10-fold cross-validation of the current pipeline on the training split,
# scoring every metric in `score_metrics` with 5 parallel workers.
scores = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=score_metrics,
    cv=10,
    return_train_score=False,
    n_jobs=5,
    verbose=1,
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    0.3s finished


In [26]:
# Tabulate the per-fold results and display their summary statistics.
scores_df = pd.DataFrame(data=scores)
scores_df.describe()

Unnamed: 0,fit_time,score_time,test_roc_auc,test_accuracy,test_MCC,test_bal_acc,test_sensitivity,test_specificity,test_PPV,test_NPV,test_f1
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.156739,0.009174,0.491415,0.797913,-0.015866,0.491415,0.13,0.85283,0.062619,0.922661,0.084232
std,0.014048,0.000763,0.100656,0.032881,0.15128,0.100656,0.198886,0.031822,0.098962,0.017396,0.131806
min,0.140204,0.007996,0.40566,0.741379,-0.126719,0.40566,0.0,0.792453,0.0,0.901961,0.0
25%,0.147041,0.008998,0.433962,0.789474,-0.109336,0.433962,0.0,0.834906,0.0,0.913506,0.0
50%,0.154557,0.009,0.438679,0.80006,-0.102791,0.438679,0.0,0.867925,0.0,0.919184,0.0
75%,0.162772,0.00903,0.529127,0.809513,0.039085,0.529127,0.2375,0.867925,0.095833,0.932128,0.136555
max,0.18871,0.010907,0.733962,0.844828,0.347679,0.733962,0.6,0.886792,0.3,0.958333,0.4


In [23]:
# Random forest with GridSearchCV (the experiment itself is currently disabled).
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# The exhaustive grid search below is commented out and kept for reference only.
# NOTE(review): it uses `rf`, which in the visible notebook is first assigned in
# the following random-forest cell — define it before re-enabling this code.
# params =  {
#     'min_samples_split': [2, 3, 4],
#     'max_depth': [6, 16, None]
# }

# grid = GridSearchCV(rf, param_grid = params,
#                           cv = 10, n_jobs=1, verbose=2)

# grid.fit(X_train,y_train)

# grid.best_score_


In [25]:
# Random-forest pipeline with an inner hyper-parameter search:
# impute -> scale -> oversample -> RandomizedSearchCV(random forest).
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Pre-processing steps: mode imputation, standardisation, seeded SMOTE oversampling.
# (Alternative imputer considered: IterativeImputer(max_iter=100, random_state=0).)
imp = SimpleImputer(strategy="most_frequent")
scale = StandardScaler()
over = SMOTE(random_state=0)

# Seeded so the forest's bootstrap samples and per-split feature subsets are
# repeatable, matching the seed 0 already used for the split and for SMOTE.
rf = RandomForestClassifier(random_state=0)

# Search space: 3 x 3 = 9 combinations. This is fewer than the search's default
# number of iterations (10), so every combination ends up being evaluated.
params = {
    'min_samples_split': [2, 3, 4],
    'max_depth': [6, 16, None]
}

# Inner 3-fold search using all cores; seeded so candidate sampling is repeatable.
# NOTE(review): the search is the last pipeline step and is therefore fitted on
# the output of the SMOTE step, i.e. its inner validation folds are drawn from
# already-oversampled data — confirm this is acceptable for model selection.
model = RandomizedSearchCV(rf, param_distributions=params,
                           cv=3, n_jobs=-1, verbose=2, random_state=0)

# Steps of the pipeline, in execution order.
steps = [('impute', imp), ('scale', scale), ('over', over), ('model', model)]
pipeline = Pipeline(steps=steps, verbose=3)

In [27]:
# Outer 10-fold cross-validation of the random-forest search pipeline.
# NOTE(review): n_jobs=-5 is passed here (joblib reads negative values as
# "CPU count + 1 + n_jobs" workers) whereas the other cross-validation cells
# use n_jobs=5 — confirm the negative value is intended.
scores = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=score_metrics,
    cv=10,
    return_train_score=False,
    n_jobs=-5,
    verbose=1,
)

# Tabulate the per-fold results and display their summary statistics.
scores_df = pd.DataFrame(data=scores)
scores_df.describe()

[Parallel(n_jobs=-5)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-5)]: Done  10 out of  10 | elapsed:   13.7s finished


Unnamed: 0,fit_time,score_time,test_roc_auc,test_accuracy,test_MCC,test_bal_acc,test_sensitivity,test_specificity,test_PPV,test_NPV,test_f1
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,11.799048,0.026619,0.517547,0.925136,0.088095,0.521557,0.045,0.998113,0.2,0.92655,0.073333
std,0.170037,0.00446,0.081426,0.011393,0.196161,0.048382,0.095598,0.005967,0.421637,0.010226,0.155397
min,11.553709,0.022217,0.419811,0.912281,-0.036711,0.490566,0.0,0.981132,0.0,0.913793,0.0
25%,11.678811,0.0239,0.461085,0.913793,0.0,0.5,0.0,1.0,0.0,0.917488,0.0
50%,11.800612,0.02438,0.504717,0.929825,0.0,0.5,0.0,1.0,0.0,0.929825,0.0
75%,11.877303,0.028018,0.553774,0.929825,0.0,0.5,0.0,1.0,0.0,0.929825,0.0
max,12.073969,0.034861,0.664151,0.947368,0.486423,0.625,0.25,1.0,1.0,0.946429,0.4


In [29]:
# Gradient-boosting pipeline: impute -> scale -> oversample -> classify.

# Pre-processing steps: mode imputation, standardisation, seeded SMOTE oversampling.
# (Alternative imputer considered: IterativeImputer(max_iter=100, random_state=0).)
imp = SimpleImputer(strategy="most_frequent")
scale = StandardScaler()
over = SMOTE(random_state=0)

# Seed the booster as well. Its trees permute the candidate features at each
# split, so an unseeded model can differ from run to run even though the
# train/test split and SMOTE are already seeded with 0.
model = GradientBoostingClassifier(random_state=0)

# Steps of the pipeline, in execution order.
steps = [('impute', imp), ('scale', scale), ('over', over), ('model', model)]
pipeline = Pipeline(steps=steps, verbose=3)

In [30]:
# 10-fold cross-validation of the current pipeline on the training split,
# scoring every metric in `score_metrics` with 5 parallel workers.
scores = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=score_metrics,
    cv=10,
    return_train_score=False,
    n_jobs=5,
    verbose=1,
)

# Tabulate the per-fold results and display their summary statistics.
scores_df = pd.DataFrame(data=scores)
scores_df.describe()

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:    6.2s finished


Unnamed: 0,fit_time,score_time,test_roc_auc,test_accuracy,test_MCC,test_bal_acc,test_sensitivity,test_specificity,test_PPV,test_NPV,test_f1
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,3.102171,0.009316,0.548962,0.89386,0.097895,0.534245,0.11,0.958491,0.223333,0.928663,0.137143
std,0.028171,0.000436,0.154438,0.028383,0.189899,0.061029,0.117379,0.024841,0.32358,0.010506,0.154626
min,3.049995,0.008924,0.254717,0.862069,-0.071734,0.471698,0.0,0.924528,0.0,0.909091,0.0
25%,3.092025,0.009004,0.491981,0.86585,-0.052387,0.481132,0.0,0.943396,0.0,0.924878,0.0
50%,3.115342,0.009093,0.577358,0.894737,0.043909,0.526415,0.1,0.962264,0.1,0.927273,0.1
75%,3.121613,0.00949,0.634198,0.912281,0.213248,0.583491,0.2,0.976415,0.3,0.928571,0.264286
max,3.131135,0.010085,0.807547,0.947368,0.486423,0.625,0.25,1.0,1.0,0.946429,0.4
