In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import pandas as pd
import urllib.request
import numpy as np
import matplotlib as plt
from IPython.display import display
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import scipy.sparse as sp
from sklearn import metrics

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, returns None."""
    return None


# Rebind the module attribute so that code calling ``warnings.warn(...)`` reaches
# the no-op above instead of emitting a warning.
warnings.warn = warn


In [2]:
# Clinical table: tab-separated text, decoded as ISO-8859-1.
patient_data = pd.read_csv(
    '../data/patient_clinical_data.txt',
    sep="\t",
    encoding='iso-8859-1',
)

# Somatic-mutation tables, read in train-then-test order.
pcdf_train, pcdf_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/somatic_mutations_train.csv', '../data/somatic_mutations_test.csv')
)



In [None]:
# Keep only the patient identifier and the three demographic covariates.
# .copy() makes the result an independent frame, so the writes below modify
# patient_data itself rather than a temporary view of the original table.
demographic_cols = ['bcr_patient_barcode', 'gender', 'age_at_initial_pathologic_diagnosis', 'race']
patient_data = patient_data[demographic_cols].copy()

# Missing ages become NaN. A single .loc[mask, column] assignment replaces the
# chained `frame.col[mask] = value` form, which pandas may apply to a copy.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
age_missing = patient_data['age_at_initial_pathologic_diagnosis'] == '[Not Available]'
patient_data.loc[age_missing, 'age_at_initial_pathologic_diagnosis'] = np.nan

# Collapse every "race unknown" placeholder into the single category 'NA'.
race_placeholders = ['[Not Available]', '[Not Evaluated]', '[Unknown]']
patient_data.loc[patient_data['race'].isin(race_placeholders), 'race'] = 'NA'


In [None]:
from sklearn import preprocessing

# One encoder per categorical column, so each keeps its own classes_ mapping
# and can later translate its codes back to the original strings.
# NOTE(review): the name `re` shadows the standard-library `re` module; it is
# kept because it is this cell's existing name for the race encoder.
ge = preprocessing.LabelEncoder()
re = preprocessing.LabelEncoder()

patient_data = patient_data[['bcr_patient_barcode', 'gender', 'age_at_initial_pathologic_diagnosis', 'race']].copy()
patient_data['gender'] = ge.fit_transform(patient_data['gender'])

# Race is encoded with its own encoder; refitting `ge` here would throw away
# the gender classes learned on the line above.
patient_data['race'] = re.fit_transform(patient_data['race'])

# Series.mask swaps the '[Not Available]' placeholder for NaN without chained
# assignment; the column is then converted to float.
age = patient_data['age_at_initial_pathologic_diagnosis']
patient_data['age_at_initial_pathologic_diagnosis'] = age.mask(age == '[Not Available]').astype(float)

# Remaining missing values (e.g. unknown age) are flagged with the sentinel -1.0.
patient_data = patient_data.fillna(-1.0)


In [None]:
# Build one feature matrix (patients x genes, plus demographics) and one label
# vector per split. datasets[0] is the training split, datasets[1] the test split.
datasets = []

for dataset in [pcdf_train, pcdf_test]:
  # Rename the first column of the mutation table to 'Seq'; it is only used as
  # the value that gets counted in the pivot below.
  cols = list(dataset.columns)
  cols[0] = 'Seq'
  dataset.columns = cols

  # patient barcode -> cancer type (the classification label)
  lmap = pd.Series(dataset.cancer_type.values, index=dataset.patient_barcode).to_dict()

  # One row per patient, one column per gene, cell = number of mutation records.
  # Patient/gene pairs with no record are NaN here and become 0 further down.
  by_patient = pd.pivot_table(dataset, index="patient_barcode", columns='gene',
                              values="Seq", aggfunc="count")

  # Optional (disabled): turn the counts into a 1/0 "gene is mutated" indicator.
#  for col in by_patient.columns:
#    by_patient[col] = np.where(by_patient[col]>0, 1.0, 0)

  # Flatten the pivot so patient_barcode becomes an ordinary column.
  by_patient = pd.DataFrame(by_patient.to_records())
  by_patient['cancer_type'] = by_patient["patient_barcode"].map(lmap)

  # add sex, race, age
  by_patient = pd.merge(by_patient, patient_data, left_on='patient_barcode', right_on='bcr_patient_barcode',how='left')

  # Labels are read after the merge so they always have exactly the same rows,
  # in the same order, as the feature matrix derived from the merged frame.
  labels = by_patient['cancer_type']

  by_patient = by_patient.fillna(0.0)

  mat_d = by_patient.drop(['patient_barcode', 'cancer_type', 'bcr_patient_barcode'], axis=1)

  datasets.append({'dataset':mat_d, "labels":labels})

# Ensure both splits expose the same columns in the same order.
train_columns = datasets[0]['dataset'].columns
test_columns = datasets[1]['dataset'].columns
missing_in_train = set(test_columns) - set(train_columns)
missing_in_test = set(train_columns) - set(test_columns)

# Training columns first, then the test-only columns in the order they appear in
# the test frame. Walking the frame's columns (instead of iterating the set)
# keeps the layout identical from one kernel session to the next.
cols = train_columns.tolist() + [c for c in test_columns if c in missing_in_train]

# reindex creates every absent column in one step, filled with 0, and puts both
# frames in the shared order; this avoids inserting columns one at a time.
for split in datasets:
  split['dataset'] = split['dataset'].reindex(columns=cols, fill_value=0)

train_X = datasets[0]['dataset']
train_Y = datasets[0]['labels']

test_X = datasets[1]['dataset']
test_Y = datasets[1]['labels']

# Sanity checks: identical layout across splits, one label per row.
assert list(train_X.columns) == list(test_X.columns)
assert len(train_X) == len(train_Y) and len(test_X) == len(test_Y)


In [None]:
"""
by_ct = pd.pivot_table(pcdf, index="cancer_type", columns='Hugo_Symbol',
                    values="Seq", aggfunc="count")

def find_features(df, count):
  features = set()
  cols = list(df.columns)
  for _, prow in df.iterrows():
    sorted_counts = np.argsort(prow.values)
    idxs = set(sorted_counts[-1*count:])
    features.update([c for i, c in enumerate(cols) if i in idxs])
  return features

features = find_features(by_ct, 100)
"""

In [None]:
"""
shuffle = np.random.permutation(np.arange(mat_d.shape[0]))
#X, Y  = principalComponents[shuffle], labels.iloc[shuffle]
X, Y  = mat_d.iloc[shuffle], labels.iloc[shuffle]
train_X = X[:6000]
train_Y = Y[:6000].values

test_X = X[6000:]
test_Y = Y[6000:].values

"""

# Feature selection

In [None]:
# Largest value found in each training column, then the ten biggest of those.
maxs = [train_X[name].max() for name in train_X.columns]
top_ten = sorted(maxs, reverse=True)[:10]
print(top_ten)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Logistic regression with C=100 acts as the scoring model; 1.0 is the
# importance cut-off handed to SelectFromModel.
clf = LogisticRegression(C=100)
bestfeatures = SelectFromModel(estimator=clf, threshold=1.0)

# fit() hands back the fitted selector, stored under the name later cells use.
best_f = bestfeatures.fit(X=train_X, y=train_Y)


In [None]:
# Number of candidate columns, followed by the first ten entries of the
# selector's keep/drop mask.
n_columns = train_X.shape[1]
print(n_columns)
supp = best_f.get_support()
print(supp[:10])

In [None]:
# Positions in the support mask that are switched on, and how many there are.
positive = [position for position, keep in enumerate(supp) if keep]
print(len(positive))

In [None]:
# Translate the selector's positional indices into column names via `cols`,
# then narrow both splits to those columns.
kept_positions = best_f.get_support(indices=True)
sel_labels = [cols[position] for position in kept_positions]
train_X, test_X = train_X[sel_labels], test_X[sel_labels]


# Logistic regression with L1/L2

In [None]:
# Baseline: default logistic regression on every column currently in train_X.
clf = LogisticRegression()

clf = clf.fit(train_X,  train_Y)
pred_Y = clf.predict(test_X)

print("Accuracy with all features:",metrics.f1_score(test_Y, pred_Y, average='micro'))

# For each L1 strength: how many features survived, and the weighted F1 of an
# L2 model refitted on just those features. Both lists are filled in lockstep.
sizes=[]
accuracies=[]

features = train_X.columns
for c_val in [0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 10, 100, 1000]:
  # Stage 1: L1-penalised fit, used only to pick features. The solver is named
  # explicitly because scikit-learn's default solver (lbfgs) rejects penalty="l1".
  log_r = LogisticRegression(penalty="l1", solver="liblinear", C=c_val, max_iter=200, tol=.01)
  log_r.fit(train_X, train_Y)

  # coef_ has one row per class and one column per feature; a feature is kept
  # when at least one class gives it a non-zero weight.
  keep = np.any(log_r.coef_ != 0, axis=0)
  non_zero1 = list(features[keep])

  if not non_zero1:
    # Every weight was driven to zero, so there is nothing to refit on.
    print("C = %s, Non zero with L1=0, skipping L2 refit" % str(c_val))
    continue

  # Stage 2: ordinary L2 model restricted to the surviving features.
  log_r = LogisticRegression(C=0.5, max_iter=200, penalty="l2")
  log_r.fit(train_X[non_zero1], train_Y)
  dev_preict2 = log_r.predict(test_X[non_zero1])

  score = metrics.f1_score(test_Y, dev_preict2, average='weighted')
  sizes.append(len(non_zero1))
  accuracies.append(score)
  print("C = %s, Non zero with L1=%s,  F1-Score with L2 and reduced vocabulary =%s "%(str(c_val), str(len(non_zero1)), score))


# Logistic regression with PCA

In [None]:
from sklearn.decomposition import PCA

# Fraction of the total variance the retained components must explain.
variance_target = 0.99

# First pass: full decomposition, only to read the explained-variance curve.
pca = PCA()
pca.fit(train_X)

# explained_variance_ratio_ holds fractions of the total variance, so the
# cumulative curve tops out at 1.0. It must be compared with 0.99; a 99.0
# threshold is never reached and would keep every component.
ev = np.cumsum(pca.explained_variance_ratio_)

# searchsorted returns the index of the first component at which the curve
# reaches the target; +1 turns that index into a component count. The min()
# guards against the curve stopping just short of the target through rounding.
evcount = min(int(np.searchsorted(ev, variance_target)) + 1, len(ev))
print("Number of features that expain 99% of the variance", evcount)

# Second pass: keep only that many components and project both splits.
pca = PCA(n_components=evcount)
pca.fit(train_X)
train_data = pca.transform(train_X)
test_data = pca.transform(test_X)

In [None]:
# Default logistic regression trained on the PCA-projected training data.
clf = LogisticRegression().fit(train_data, train_Y)
pred_Y = clf.predict(test_data)

# Micro-averaged F1 on the projected test data, printed under the "Accuracy" label.
micro_f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
print("Accuracy:", micro_f1)

# Random Forest without PCA

In [None]:
# 1000-tree forest trained on the current train_X columns (no PCA projection).
forest = RandomForestClassifier(n_estimators=1000, n_jobs=50)
clf = forest.fit(train_X, train_Y)
pred_Y = clf.predict(test_X)

# Micro-averaged F1 on the test split, printed under the "Accuracy" label.
micro_f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
print("Accuracy:", micro_f1)

# Random Forest with PCA

In [None]:
# 1000-tree forest trained on the PCA-projected training data.
forest = RandomForestClassifier(n_estimators=1000, n_jobs=20)
clf = forest.fit(train_data, train_Y)
pred_Y = clf.predict(test_data)

# Micro-averaged F1 on the projected test data, printed under the "Accuracy" label.
micro_f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
print("Accuracy:", micro_f1)

# Dimensionality reduction using random forest

In [None]:
from sklearn.feature_selection import SelectFromModel

# A random forest serves as the scoring model for a second feature selector,
# fitted on whatever columns train_X currently has.
rf = RandomForestClassifier(n_jobs=50, n_estimators=1000)
sel = SelectFromModel(estimator=rf)

# Left as the cell's last expression so the fitted selector is displayed.
sel.fit(X=train_X, y=train_Y)

In [None]:
import pickle

# Serialise the fitted selector and actually write the bytes to disk. The `with`
# block closes the file afterwards; opening it without writing would only leave
# an empty "rf_selector" file and an unclosed handle.
# NOTE: only unpickle this file from a trusted location, since loading a pickle
# can execute arbitrary code.
s = pickle.dumps(sel)
with open("rf_selector", "wb") as f:
  f.write(s)

In [None]:
# Positional indices (not names) of the columns kept by the fitted selector `sel`;
# the positions refer to the columns of the frame `sel` was fitted on.
sel_feats =  sel.get_support(indices=True)

In [None]:
# Turn the selector's positional indices into names using the columns of
# train_X, the frame `sel` was fitted on. `cols` is the full column list from
# before the first feature selection, so its positions stop matching train_X
# once train_X has been narrowed.
sel_labels = train_X.columns[sel_feats].tolist()
train_X[sel_labels].head()

In [None]:
# Fitted models and their weighted F1 scores, one entry per C value, in grid order.
log_r_class = []
accuracies = []

selected_train = train_X[sel_labels]
selected_test = test_X[sel_labels]

for c_val in [0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 10, 100, 1000]:
  log_r = LogisticRegression(C=c_val)
  log_r.fit(selected_train, train_Y)
  log_r_class.append(log_r)

  dev_preict2 = log_r.predict(selected_test)
  weighted_f1 = metrics.f1_score(test_Y, dev_preict2, average='weighted')
  accuracies.append(weighted_f1)
  print("C = %s, accuracy=%s "%(str(c_val), weighted_f1))


In [None]:
# Display the model stored at position 6 of log_r_class. With the C grid used
# when the list was filled ([0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, ...]) that is
# the C=1 fit.
log_r_class[6]

In [None]:
# Forest restricted to the selected columns, with max_features set to 75.
forest = RandomForestClassifier(n_estimators=1000, n_jobs=50, max_features=75)
clf = forest.fit(train_X[sel_labels], train_Y)
pred_Y = clf.predict(test_X[sel_labels])

# Micro-averaged F1 on the test split, printed under the "Accuracy" label.
micro_f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
print("Accuracy:", micro_f1)


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of a default logistic regression ('lr') and a
# 1000-tree forest ('rf'), trained on the selected columns.
lrc = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=50, max_features=75)
ensemble_members = [('lr', lrc), ('rf', rfc)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# fit() hands back the fitted ensemble, stored under the name later cells use.
vc = voting_clf.fit(train_X[sel_labels], train_Y)


In [None]:
# Ensemble predictions on the selected test columns.
selected_test = test_X[sel_labels]
pred_Y = vc.predict(selected_test)

# Micro-averaged F1 on the test split, printed under the "Accuracy" label.
micro_f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
print("Accuracy:", micro_f1)


In [None]:
# Micro-averaged precision, recall and F1 for the current pred_Y.
precision = metrics.precision_score(test_Y, pred_Y, average='micro')
recall = metrics.recall_score(test_Y, pred_Y, average='micro')
f1 = metrics.f1_score(test_Y, pred_Y, average='micro')
print("Precision:", precision, 'Recall', recall, "F1-score", f1)



In [None]:
# Per-class precision / recall / F1 table for the current pred_Y.
report = metrics.classification_report(y_true=test_Y, y_pred=pred_Y)
print(report)