In [1]:
import os
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import GridSearchCV,StratifiedKFold, KFold, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression as LogR
from sklearn.preprocessing import OneHotEncoder
from itertools import chain

## Utilities: helper functions are defined in the code cell below

In [2]:


def merge_files(base_filename,filename,base='/home/'):
  """Read two CSV files located under ``base`` and left-join them on their index.

  Args:
    base_filename: Name of the left-hand file, appended to ``base`` exactly
      as given (it must already carry its extension).
    filename: Name of the right-hand file WITHOUT extension; ``".csv"`` is
      appended before reading.
    base: Prefix both names are appended to by plain string concatenation,
      so it has to end with a path separator. Defaults to the previously
      hard-coded ``'/home/'``.

  Returns:
    DataFrame holding the rows of the first file and the columns of both
    files; the first CSV column of each file is used as the join index.
  """
  base_df = pd.read_csv(base + base_filename, index_col=0)
  filename_df = pd.read_csv(base + filename + ".csv", index_col=0)
  # DataFrame.join defaults to a left join on the index.
  return base_df.join(filename_df)

# Parsing Functions:

def get_integer_cols(df):
  """Return the sub-frame of ``df`` made of the columns whose dtype is ``int64``."""
  int64_names=[name for name,dtype in zip(df.columns,df.dtypes) if dtype=='int64']
  return df[int64_names]
def get_floats(df):
  """Return the sub-frame of ``df`` made of the columns whose dtype is ``float64``."""
  float64_names=[name for name,dtype in zip(df.columns,df.dtypes) if dtype=='float64']
  return df[float64_names]
def get_strings(df):
  """Return the columns of ``df`` whose dtype is neither ``int64`` nor ``float64``.

  Despite the name, this keeps every remaining dtype (object, bool, ...),
  not only string columns.
  """
  other_names=[
    name for name,dtype in zip(df.columns,df.dtypes)
    if not (dtype=='int64' or dtype=='float64')
  ]
  return df[other_names]

# Discrete data processing:

def filter_missing_data(df,missing_frac,missing_entry_vals=(66,99,77,88,666,999)):
  """Drop the columns in which too many entries are "missing" sentinel codes.

  Args:
    df: DataFrame to filter.
    missing_frac: A column is kept only while the fraction of its entries
      that equal one of the sentinel codes is strictly below this value.
    missing_entry_vals: Values treated as "missing" markers. Defaults to
      the codes that were previously hard-coded.

  Returns:
    A DataFrame with the same rows as ``df`` and only the retained columns,
    in their original order.
  """
  # Per-column share of cells holding a sentinel code, computed in one
  # vectorised pass instead of a Python-level membership test per cell.
  missing_share=df.isin(list(missing_entry_vals)).mean(axis=0)
  # Written as a negated ">=" so that an undefined share (a frame without
  # rows) keeps the column instead of failing on a division by zero.
  keep_mask=~(missing_share>=missing_frac)
  # Positional boolean mask: no identity checks on boolean values and no
  # re-selection by label.
  return df.loc[:,keep_mask.to_numpy()]

def compute_entropy(x):
  """Return the Shannon entropy (natural log) of the values in ``x``.

  ``x`` is wrapped in a pandas Series; the relative frequency of each
  distinct value is used as its probability.
  """
  probabilities=pd.Series(x).value_counts(normalize=True,sort=False)
  weighted_logs=probabilities*np.log(probabilities)
  return -sum(weighted_logs)

def filter_lowInfo_cols(df,thresh=1):
  """Keep only the columns whose value distribution has entropy above ``thresh``.

  Args:
    df: DataFrame whose columns are scored one at a time with
      ``compute_entropy``.
    thresh: A column is kept when its entropy is strictly greater than
      this value.

  Returns:
    A DataFrame with the same rows as ``df`` and only the retained columns,
    in their original order.
  """
  # Columns are visited by position and selected with a plain boolean mask:
  # this avoids filtering on `is True` identity and avoids re-selecting by
  # label, which repeats columns when labels are not unique.
  keep_mask=[
    bool(compute_entropy(df.iloc[:,pos])>thresh)
    for pos in range(df.shape[1])
  ]
  return df.loc[:,keep_mask]

def filter_lowInfo_cols_ohe(df,thresh=0.15,upper_thresh=None):
  """Drop indicator columns whose mean is too low (and, optionally, too high).

  Args:
    df: DataFrame of numeric indicator columns (e.g. one-hot encodings).
    thresh: A column is kept only when its mean is strictly greater than
      this value.
    upper_thresh: Optional upper bound. When given, a column must also have
      a mean strictly below it, which removes indicators that are almost
      always set. The default ``None`` applies only the lower bound, which
      is the historical behaviour.

  Returns:
    A DataFrame with the same rows as ``df`` and only the retained columns,
    in their original order.
  """
  # skipna=False mirrors plain summation: a column containing NaN gets an
  # undefined mean and is therefore dropped by the comparison below.
  col_means=df.sum(axis=0,skipna=False)/len(df)
  keep_mask=col_means>thresh
  if upper_thresh is not None:
    keep_mask=keep_mask&(col_means<upper_thresh)
  # Positional boolean mask: no identity checks on boolean values and no
  # re-selection by label.
  return df.loc[:,keep_mask.to_numpy()]

def oneHot_wrapper(df_disc):
  """One-hot encode every column of ``df_disc`` into a dense DataFrame.

  Args:
    df_disc: DataFrame of discrete (categorical) columns.

  Returns:
    DataFrame indexed like ``df_disc`` with one 0/1 column per
    (source column, category) pair, named ``"<column>_<category>"``.
  """
  enc=OneHotEncoder(handle_unknown='ignore')
  encoded=enc.fit_transform(df_disc)
  # enc.categories_ holds one array of categories per input column, in
  # column order. str() on the column label keeps the name construction
  # working when labels are not strings (e.g. integer column labels).
  new_colnames=[
    str(col)+"_"+str(category)
    for col,categories in zip(df_disc.columns,enc.categories_)
    for category in categories
  ]
  return pd.DataFrame(encoded.toarray(),index=df_disc.index,columns=new_colnames)

#Continuous data processing:
from sklearn import preprocessing
def normalize_data(df):
  """Log-scale then standardise every column of ``df``.

  Each value ``v`` becomes ``log10(v + 1)``; the result is passed through
  scikit-learn's ``StandardScaler`` fitted on that same data. Column labels
  and index of ``df`` are preserved.
  """
  log_scaled=np.log10(df+1)
  scaler=preprocessing.StandardScaler()
  scaler.fit(log_scaled)
  standardized=scaler.transform(log_scaled)
  return pd.DataFrame(data=standardized,columns=df.columns,index=df.index)

def preprocess_main(df,missing_frac=0.5,ohe_thresh=0.15):
  """Filter and transform the columns of ``df`` into a model-ready matrix.

  ``int64`` columns are treated as categorical: columns dominated by
  "missing" sentinel codes are removed, the rest are one-hot encoded, and
  rarely-set indicator columns are dropped. ``float64`` columns are
  log-scaled and standardised. Columns of any other dtype are not part of
  the result.

  Args:
    df: Raw feature DataFrame.
    missing_frac: Forwarded to ``filter_missing_data`` (default 0.5, the
      previously hard-coded value).
    ohe_thresh: Forwarded to ``filter_lowInfo_cols_ohe`` as ``thresh``
      (default 0.15, the previously hard-coded value).

  Returns:
    DataFrame indexed like ``df`` holding the processed categorical
    indicators followed by the processed float columns.
  """
  #1. Categorical data processing:
  cat_dta=get_integer_cols(df)
  if cat_dta.shape[1]!=0:
    cat_dta=filter_missing_data(cat_dta,missing_frac=missing_frac)
  # Checked again on purpose: the missing-data filter may have removed every
  # column, and the encoder should not be handed a frame without features.
  if cat_dta.shape[1]!=0:
    cat_ohe=oneHot_wrapper(cat_dta)
    cat_dta=filter_lowInfo_cols_ohe(cat_ohe,thresh=ohe_thresh)

  #2. Float normalization:
  float_dta=get_floats(df)
  if float_dta.shape[1]!=0:
    float_dta=normalize_data(float_dta)

  #Merge data together:
  final_df=cat_dta.join(float_dta,how='left')
  return final_df

def extract_features_logistic(input_df,cvs):
  """Tune an L1-penalised logistic regression and return its coefficients.

  Args:
    input_df: DataFrame with a ``'PatientStatus'`` label column; every
      other column is used as a covariate.
    cvs: Passed to ``GridSearchCV`` as ``cv``.

  Returns:
    Tuple ``(best_roc, coefficients)``: the best mean cross-validated
    ROC AUC found over the ``C`` grid, and a Series of the refitted best
    model's coefficients indexed by covariate name.
  """
  # Split the frame into covariates and label vector.
  feature_names=[name for name in input_df.columns if name!='PatientStatus']
  X_dta=input_df[feature_names]
  y_vec=input_df[['PatientStatus']].values[:,0]

  # Grid-search the inverse regularisation strength C on a log scale.
  search=GridSearchCV(
    LogR(penalty='l1',solver='liblinear'),
    param_grid={'C':np.logspace(-2,3,10)},
    scoring='roc_auc',
    cv=cvs,
  )
  search.fit(X_dta,y_vec)

  # Coefficients of the estimator refitted with the best C.
  best_model=search.best_estimator_
  coefficients=pd.Series(data=best_model.coef_[0,:],index=X_dta.columns)
  return search.best_score_,coefficients

## Main script for feature selection

In [None]:
import os
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import GridSearchCV,StratifiedKFold, KFold, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression as LogR

inputFile='/home/Background and Helpful Info.csv'
# File stem of the input (directory and ".csv" stripped); used as the prefix
# of every output file written by this cell.
dta_prefix=inputFile.split('.csv')[0].split('/')[-1]
#1. Load files
# Same absolute path and capitalisation as the other cells that load this
# table; the previous relative, differently-cased name pointed elsewhere.
train_dta=pd.read_csv('/home/Tumor Information_train.csv',index_col=0)
feature_data=pd.read_csv(inputFile,index_col=0).fillna(value=0)
# TODO: might need to add further processing to the data here.

#2. Feature Trimming
proc_dta=preprocess_main(feature_data)
proc_dta.to_csv(dta_prefix+'_'+'new_featureMat.csv',sep=',')
feature_data_merge=train_dta.join(proc_dta).drop(['PrimarySite','Histology','Grade','SEERSummStage2000'],axis=1)
#3. Feature Selection
# The merged frame is passed because it carries the 'PatientStatus' label
# column that extract_features_logistic reads; proc_dta holds features only.
best_score,features=extract_features_logistic(feature_data_merge,cvs=10)
print('Best estimator AUC score: {0}'.format(best_score))
#4. Saving
features.to_csv(dta_prefix+'_featureVector.csv',sep=',')

FileNotFoundError: ignored

In [3]:
# Feature table analysed by the following cells.
inputFile='/home/Herbals.csv'
# Output prefix: the input path with everything from ".csv" on removed, then
# reduced to its last "/"-separated component.
input_stem=inputFile.split('.csv')[0]
dta_prefix=input_stem.split('/')[-1]
#1. load files (first CSV column becomes the index of each table)
train_dta=pd.read_csv('/home/Tumor Information_train.csv',index_col=0)
feature_data=pd.read_csv(inputFile,index_col=0)
feature_data.head()

Unnamed: 0_level_0,HrblEvr,Adph,AdphYrs,AdphDays,AdphNow,Bcoh,BcohYrs,BcohDays,BcohNow,Q10,Q10Yrs,Q10Days,Q10Now,CrnP,CrnPYrs,CrnPDays,CrnPNow,Dong,DongYrs,DongDays,DongNow,EPA,EPAYrs,EPADays,EPANow,GarP,GarPYrs,GarPDays,GarPNow,Gnko,GnkoYrs,GnkoDays,GnkoNow,Gnsg,GnsgYrs,GnsgDays,GnsgNow,Gpsd,GpsdYrs,GpsdDays,GpsdNow,Gluc,GlucYrs,GlucDays,GlucNow,Chon,ChonYrs,ChonDays,ChonNow,Lutn,LutnYrs,LutnDays,LutnNow,Lyco,LycoYrs,LycoDays,LycoNow,Mltn,MltnYrs,MltnDays,MltnNow,MSM,MSMYrs,MSMDays,MSMNow,Soy,SoyYrs,SoyDays,SoyNow,SJW,SJWYrs,SJWDays,SJWNow
QBarcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
100,0,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66
102,1,0,66,66,66,2,1,3,1,0,66,66,66,0,66,66,66,0,66,66,66,2,1,3,1,2,1,3,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,1,1,3,0
104,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,2,2,2,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66
105,1,0,66,66,66,0,66,66,66,0,66,66,66,1,1,1,0,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66
106,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,2,3,3,1


In [4]:
# Replace every NaN with 0; rebinding the name keeps the cell re-runnable.
feature_data=feature_data.fillna(0)
feature_data.head()

Unnamed: 0_level_0,HrblEvr,Adph,AdphYrs,AdphDays,AdphNow,Bcoh,BcohYrs,BcohDays,BcohNow,Q10,Q10Yrs,Q10Days,Q10Now,CrnP,CrnPYrs,CrnPDays,CrnPNow,Dong,DongYrs,DongDays,DongNow,EPA,EPAYrs,EPADays,EPANow,GarP,GarPYrs,GarPDays,GarPNow,Gnko,GnkoYrs,GnkoDays,GnkoNow,Gnsg,GnsgYrs,GnsgDays,GnsgNow,Gpsd,GpsdYrs,GpsdDays,GpsdNow,Gluc,GlucYrs,GlucDays,GlucNow,Chon,ChonYrs,ChonDays,ChonNow,Lutn,LutnYrs,LutnDays,LutnNow,Lyco,LycoYrs,LycoDays,LycoNow,Mltn,MltnYrs,MltnDays,MltnNow,MSM,MSMYrs,MSMDays,MSMNow,Soy,SoyYrs,SoyDays,SoyNow,SJW,SJWYrs,SJWDays,SJWNow
QBarcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
100,0,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66
102,1,0,66,66,66,2,1,3,1,0,66,66,66,0,66,66,66,0,66,66,66,2,1,3,1,2,1,3,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,1,1,3,0
104,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,2,2,2,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66
105,1,0,66,66,66,0,66,66,66,0,66,66,66,1,1,1,0,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66
106,1,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,0,66,66,66,2,3,3,1


In [5]:
#2. Feature Trimming
proc_dta=preprocess_main(feature_data)
# Left-join the processed features onto the patient table by index, then
# remove the four columns that are not used as model inputs here.
unused_patient_cols=['PrimarySite','Histology','Grade','SEERSummStage2000']
feature_data_merge=train_dta.join(proc_dta).drop(columns=unused_patient_cols)
feature_data_merge.head()

Unnamed: 0_level_0,PatientStatus,HrblEvr_0,HrblEvr_1,Adph_0,Bcoh_0,Q10_0,CrnP_0,Dong_0,EPA_0,GarP_0,Gnko_0,Gnsg_0,Gpsd_0,Gluc_0,Chon_0,Lutn_0,Lyco_0,Mltn_0,MSM_0,Soy_0,SJW_0
QBARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1454,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21967,1,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1146,0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2815,1,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
214,1,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Persist the processed feature matrix next to the input data.
feature_matrix_csv='/home/'+dta_prefix+'_'+'new_featureMat.csv'
proc_dta.to_csv(feature_matrix_csv,sep=',')

In [7]:
#3. Feature Selection
# Tune the L1-penalised logistic regression on the merged frame (label column
# 'PatientStatus' plus the processed features) with cv=5. Yields the best
# mean cross-validated ROC AUC and the per-feature coefficients.
best_score,features=extract_features_logistic(feature_data_merge,cvs=5)

In [8]:
# best_score is the cross-validated ROC AUC returned by the grid search.
score_message='Best classifier score: {0}'.format(best_score)
print(score_message)
# Coefficients from most positive to most negative.
features.sort_values(ascending=False)

Best classifier score: 0.5272588877583508


HrblEvr_1    1.457998
HrblEvr_0    1.130671
Lyco_0       0.560450
CrnP_0       0.504984
Lutn_0       0.449759
GarP_0       0.393836
Mltn_0       0.203843
Gnko_0       0.151704
Gluc_0       0.109358
EPA_0        0.105553
Chon_0      -0.047708
Q10_0       -0.075951
Gpsd_0      -0.164669
Soy_0       -0.181784
Adph_0      -0.214775
Dong_0      -0.223950
Bcoh_0      -0.225294
SJW_0       -0.528249
Gnsg_0      -0.538708
MSM_0       -0.601233
dtype: float64

In [9]:
#4. Saving
# Write the per-feature coefficients next to the input data.
feature_vector_csv='/home/'+dta_prefix+'_featureVector.csv'
features.to_csv(feature_vector_csv,sep=',')

In [14]:
# NOTE(review): this name is not read by any other visible cell — confirm it
# is unused before removing it.
feature_matrix='/'

def get_top_features(df,featureRanks,top_n=10):
    """Select the columns of ``df`` with the highest and lowest ranks.

    Args:
        df: DataFrame containing (at least) the columns named in the index
            of ``featureRanks``.
        featureRanks: Series of scores indexed by column name.
        top_n: Number of entries taken from each end of the ranking.

    Returns:
        DataFrame with the ``top_n`` highest-scoring columns (descending
        score) followed by the ``top_n`` lowest-scoring columns (ascending
        score). A column reachable from both ends appears only once, at its
        first position.
    """
    #Top positive features
    highVals=featureRanks.sort_values(ascending=False).iloc[:top_n].index
    #Top negative features
    lowVals=featureRanks.sort_values(ascending=True).iloc[:top_n].index
    # When the ranking has fewer than 2*top_n entries the two selections
    # overlap; dict.fromkeys removes the repeats while keeping the order.
    cols=list(dict.fromkeys(list(highVals)+list(lowVals)))
    return df[cols]


def merge_dataframes(df1,df2):
    """Join ``df2`` onto ``df1`` with ``DataFrame.join`` defaults (left join on the index)."""
    joined=df1.join(df2)
    return joined



In [15]:
# Keep the 10 highest- and 10 lowest-ranked columns of the processed feature
# matrix, ranked by the logistic-regression coefficients in `features`.
Top_features=get_top_features(proc_dta,features,top_n=10)
Top_features.head()

Unnamed: 0_level_0,HrblEvr_1,HrblEvr_0,Lyco_0,CrnP_0,Lutn_0,GarP_0,Mltn_0,Gnko_0,Gluc_0,EPA_0,MSM_0,Gnsg_0,SJW_0,Bcoh_0,Dong_0,Adph_0,Soy_0,Gpsd_0,Q10_0,Chon_0
QBarcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
102,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
104,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
105,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
106,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Persist the selected top-feature columns next to the input data.
top_features_csv='/home/'+dta_prefix+'_top.csv'
Top_features.to_csv(top_features_csv,sep=',')

## Machine learning pipeline

In [22]:
import os
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV,StratifiedKFold, KFold, cross_validate,train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier as RFC

#1. Load in the data:

patient_train=pd.read_csv('/home/Tumor Information_train.csv',index_col=0)
patient_test=pd.read_csv('/home/Tumor Information_test.csv',index_col=0)

#2. Load in our selected features:
featureSet=pd.read_csv('/home/TOTAL_SELECTED_FEATURES_FINAL.csv',index_col=0)

#3. Create dftrain and dftest
dftrain = patient_train.join(featureSet, how="left")
dftest = patient_test.join(featureSet, how="left")

# Train and test rows are stacked so that each categorical column is
# label-encoded once, with the same integer codes in both parts.
total_train_dta=pd.concat([dftrain,dftest])

#Gather columns in "train" that aren't the id or labels
cols = [col for col in total_train_dta.columns if col not in ['PatientStatus']]

types = total_train_dta.dtypes
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
cat_columns = [name for name, dtype in types.items() if dtype not in ['int64', 'float64']]

lbl = preprocessing.LabelEncoder()
for col in cat_columns:
    # Values are cast to str first, so NaN is encoded as its own category.
    total_train_dta[col] = lbl.fit_transform(total_train_dta[col].astype(str))

#4. Split the data into train and test again:
# Positional split: the first len(dftrain) rows of the stacked frame are the
# training rows, the remainder are the test rows.
train_processed_data = total_train_dta.iloc[:len(dftrain)]
test_processed_data = total_train_dta.iloc[len(dftrain):]

#5. Merge and align data:
# Might want to add PrimarySite, Histology or Grade:
# TODO: maybe more feature engineering here.

# Parse the features and the output into a model:
covars=[col for col in train_processed_data.columns if col!='PatientStatus']
y_vec_train=train_processed_data[['PatientStatus']].values[:,0]
X_dta_train=train_processed_data[covars]
X_dta_test=test_processed_data[covars]
#Random Forest param grid:
param_Grid={'n_estimators':np.linspace(25,250,10,dtype='int64'),
            'max_depth':np.linspace(1,10,10,dtype='int64')
            }
# Fixed seed so that a re-run of the notebook grows the same forests.
RANDOM_SEED=42
rf_classif=RFC(criterion='gini',random_state=RANDOM_SEED)
rf_grid=GridSearchCV(rf_classif,param_grid=param_Grid,cv=10,scoring='roc_auc',n_jobs=2)
rf_grid.fit(X_dta_train,y_vec_train)

#Get best estimator
rf_best=rf_grid.best_estimator_
# The format string now carries a placeholder; without it the score passed
# to format() was silently discarded and never printed.
print('Best Random Forest Ensembl AUC: {0}'.format(rf_grid.best_score_))
#Make predictions:
y_test_pred=rf_best.predict_proba(X_dta_test)
#Create response dataframe:
# Column 1 of predict_proba is the probability of the second class in
# rf_best.classes_.
PREDICTIONS=pd.Series(y_test_pred[:,1],index=test_processed_data.index)
PREDICTIONS.to_csv('/home/Team6_TestSet_Predictions.csv')


Best Random Forest Ensembl AUC:


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the visible cells read and write under '/home/', not under
# this mount point — confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Creates a model given an activation and learning rate
# Import Densenet from Keras
from keras.layers import Dense
from keras.models import Sequential
def create_model(learning_rate, activation, input_dim=233):
  """Build and compile a small fully-connected binary classifier.

  Args:
    learning_rate: Learning rate handed to the Adam optimizer.
    activation: Activation function used by both hidden layers.
    input_dim: Number of input features. Defaults to 233, the width that
      was previously hard-coded; it has to match the feature matrix the
      model is fitted on.

  Returns:
    A compiled Keras ``Sequential`` model with hidden layers of 128 and 256
    units and a single sigmoid output, using binary cross-entropy loss and
    the accuracy metric.
  """
  # Imported locally: the notebook only imports Dense and Sequential, so the
  # bare module name `keras` used before is not bound by any visible cell.
  from keras.optimizers import Adam

  # Create an Adam optimizer with the given learning rate.
  # NOTE(review): `lr` is kept as in the original call; newer Keras releases
  # name this argument `learning_rate` — confirm against the installed version.
  opt = Adam(lr = learning_rate)

  # Create the binary classification model
  model = Sequential()
  model.add(Dense(128, input_shape = (input_dim,), activation = activation))
  model.add(Dense(256, activation = activation))
  model.add(Dense(1, activation = 'sigmoid'))

  # Compile the model with the optimizer, loss, and metrics
  model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])
  return model

In [None]:
# Import KerasClassifier from keras scikit learn wrappers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV,cross_val_score
# Create a KerasClassifier
model = KerasClassifier(build_fn = create_model)

# Define the parameters to try out
params = {'activation':['relu', 'tanh'], 'batch_size':[32, 128, 256], 
          'epochs':[50, 100, 200], 'learning_rate':[1E-4, 1E-5]}

# Create a randomize search cv object passing in the parameters to try
random_search = RandomizedSearchCV(model, param_distributions = params, cv = KFold(3),scoring='roc_auc',n_jobs=10,verbose=10)
# Fit on the matrices built by the machine-learning pipeline cell; the names
# X_train / y_train used before are not defined anywhere in the visible notebook.
# NOTE(review): create_model defaults to 233 input features — confirm that
# X_dta_train has that many columns.
grid_result = random_search.fit(X_dta_train.values, y_vec_train)
# Despite the variable name this is the best mean ROC AUC (scoring='roc_auc').
accuracy=grid_result.best_score_
bestnn=grid_result.best_estimator_
# Predict on the test covariates. The previous `test_processed_data[:,-1]`
# is not valid DataFrame indexing and would also have included the label column.
predictions = bestnn.predict(X_dta_test.values)
# NOTE(review): the original referenced an undefined `X_val`; the test
# covariates are assumed to be the intended input — confirm.
predictionsofsurvival=bestnn.predict_proba(X_dta_test.values)[:,1]