In [None]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import pandas as pd
import urllib.request
import numpy as np
import matplotlib as plt
from IPython.display import display
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import scipy.sparse as sp
from sklearn import metrics

import warnings


def warn(*args, **kwargs):
    """Do nothing: accept and ignore any positional or keyword arguments."""
    return None


# Rebind warnings.warn to the no-op above, so anything that calls
# warnings.warn after this point produces no output.
warnings.warn = warn


In [None]:
# Load the merged pan-cancer mutation table into `pcdf`. The path is relative,
# so the CSV must be in the notebook's working directory.
pcdf = pd.read_csv('pancancer_mutations_merged.csv')


In [None]:
# Keep only the rows whose BIOTYPE is 'protein_coding'.
pcdf = pcdf.loc[pcdf['BIOTYPE'] == 'protein_coding']

# Rename the first column to 'Seq'; it is the column counted by the pivot below.
cols = ['Seq'] + list(pcdf.columns[1:])
pcdf.columns = cols

# Lookup: patient barcode -> cancer type.
lmap = pd.Series(pcdf['cancer_type'].values, index=pcdf['bcr_patient_barcode']).to_dict()

# One row per patient, one column per gene symbol; each cell is the number of
# 'Seq' entries for that (patient, gene) pair, with absent pairs filled as 0.0.
by_patient = pcdf.pivot_table(
    index="bcr_patient_barcode",
    columns='Hugo_Symbol',
    values="Seq",
    aggfunc="count",
).fillna(0.0)
# Optional (disabled): binarize the counts to 1/0 depending on whether the gene
# has any entry for the patient.
#for col in by_patient.columns:
#  by_patient[col] = np.where(by_patient[col]>0, 1.0, 0)


In [None]:
# Turn the pivot's patient index into an ordinary 'bcr_patient_barcode' column.
# Guarded so the cell is safe to re-run: once the frame has been flattened,
# calling to_records() again would emit the RangeIndex as an extra 'index'
# column, which would then be picked up downstream as if it were a gene feature.
if 'bcr_patient_barcode' not in by_patient.columns:
    by_patient = pd.DataFrame(by_patient.to_records())

# Attach each patient's cancer type via the barcode lookup and keep it as the
# label vector.
by_patient['cancer_type'] = by_patient["bcr_patient_barcode"].map(lmap)
labels = by_patient['cancer_type']

In [None]:
# Feature matrix: everything in by_patient except the identifier and the label.
mat_d = by_patient.drop(columns=['bcr_patient_barcode', 'cancer_type'])

In [None]:
# One row per cancer type, one column per gene symbol; each cell counts the
# 'Seq' entries for that pair. Pairs with no entries are left as NaN.
by_ct = pcdf.pivot_table(
    index="cancer_type",
    columns='Hugo_Symbol',
    values="Seq",
    aggfunc="count",
)

def find_features(df, count):
  """Return the union, over all rows of `df`, of each row's top-`count` columns.

  Args:
    df: DataFrame of numeric values (rows = groups, columns = candidate
      features). NaN cells are treated as "no observation" and ranked below
      every real value.
    count: how many highest-valued columns to take from each row. Values
      <= 0 select nothing.

  Returns:
    A set of column labels.
  """
  features = set()
  if count <= 0:
    # Without this guard, the slice [-0:] would select every column.
    return features
  cols = list(df.columns)
  for _, prow in df.iterrows():
    values = np.asarray(prow.values, dtype=float)
    # np.argsort places NaN last, i.e. in the "largest" positions; map NaN to
    # -inf so missing entries can never outrank real counts.
    ranked = np.argsort(np.where(np.isnan(values), -np.inf, values))
    features.update(cols[i] for i in ranked[-count:])
  return features

# Union of the 100 top-ranked gene columns of each by_ct row (one row per cancer type).
# NOTE(review): the L1 sweep cell further down rebinds `features` to
# train_X.columns, so this set is not consumed by the visible modelling code.
features = find_features(by_ct, 100)


In [None]:
# Shuffle the patients with a fixed seed so that the train/test split, and
# every score computed from it, is reproducible across kernel restarts.
# A private RandomState is used so the global NumPy RNG is left untouched.
SPLIT_SEED = 0
TRAIN_SIZE = 6000  # the first TRAIN_SIZE shuffled rows train; the rest test

shuffle = np.random.RandomState(SPLIT_SEED).permutation(mat_d.shape[0])
#X, Y  = principalComponents[shuffle], labels.iloc[shuffle]
X, Y  = mat_d.iloc[shuffle], labels.iloc[shuffle]

# Fail here, with a clear message, instead of handing an empty test set to the
# model cells below.
if X.shape[0] <= TRAIN_SIZE:
    raise ValueError(
        "Need more than %d rows to leave a non-empty test split, got %d"
        % (TRAIN_SIZE, X.shape[0])
    )

train_X = X.iloc[:TRAIN_SIZE]
train_Y = Y.iloc[:TRAIN_SIZE].values

test_X = X.iloc[TRAIN_SIZE:]
test_Y = Y.iloc[TRAIN_SIZE:].values



# Logistic regression with L1/L2

In [None]:
# Baseline: logistic regression with default settings on all features.
clf = LogisticRegression()

clf = clf.fit(train_X,  train_Y)
pred_Y = clf.predict(test_X)

# Micro-averaged F1; for a single-label multiclass problem this equals accuracy.
print("Accuracy:",metrics.f1_score(test_Y, pred_Y, average='micro'))

# Per C value: number of features kept by L1, and the weighted F1 of an L2
# model refit on only those features.
sizes=[]
accuracies=[]
features = train_X.columns

for c_val in [0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1, 10, 100, 1000]:
  # The solver is named explicitly: on scikit-learn versions whose default
  # solver is lbfgs, penalty="l1" raises, whereas liblinear supports it.
  log_r = LogisticRegression(penalty="l1", solver="liblinear", C=c_val, max_iter=200, tol=.01)
  log_r.fit(train_X, train_Y)

  # coef_.T has one row per feature; a feature is kept when any of its
  # weights is non-zero.
  wts = dict(zip(features, log_r.coef_.T))
  non_zero1 = [feat for feat, wt in wts.items() if np.count_nonzero(wt) != 0]

  if not non_zero1:
    # L1 zeroed every coefficient; an L2 model cannot be fit on zero columns.
    print("C = %s, L1 removed every feature; skipping L2 refit" % str(c_val))
    continue

  log_r = LogisticRegression(C=0.5, max_iter=200, penalty="l2")
  log_r.fit(train_X[non_zero1], train_Y)
  dev_preict2 = log_r.predict(test_X[non_zero1])

  f1_weighted = metrics.f1_score(test_Y, dev_preict2, average='weighted')
  sizes.append(len(non_zero1))
  accuracies.append(f1_weighted)
  print("C = %s, Non zero with L1=%s,  F1-Score with L2 and reduced vocabulary =%s "%(str(c_val), str(len(non_zero1)), 
                                             f1_weighted))


# Logistic regression with PCA

In [None]:
from sklearn.decomposition import PCA

# Fraction of total variance the retained components must reach.
VARIANCE_TARGET = 0.99

# First pass: full PCA, used only to read off the variance spectrum.
pca = PCA()
pca.fit(train_X)

# explained_variance_ratio_ holds each component's share of the total variance
# (the shares sum to 1), so its running sum can be compared with a fractional
# target. explained_variance_ holds absolute variances, whose cumulative sum
# has no fixed relation to "99".
ev = np.cumsum(pca.explained_variance_ratio_)

# Components whose running total is still below the target, plus the one that
# crosses it; capped at the number of available components.
evcount = min(len(ev), int(np.count_nonzero(ev < VARIANCE_TARGET)) + 1)
print("Number of features that expain 99% of the variance", evcount)

# Second pass: refit with only that many components and project both splits.
# The PCA is fit on the training split alone.
pca = PCA(n_components=evcount)
pca.fit(train_X)
train_data = pca.transform(train_X)
test_data = pca.transform(test_X)

In [None]:
# Default logistic regression trained on the PCA-projected training data and
# scored on the projected test data.
clf = LogisticRegression().fit(train_data, train_Y)
pred_Y = clf.predict(test_data)

print(
    "Accuracy:",
    metrics.f1_score(test_Y, pred_Y, average='micro'),
)

# Random Forest with PCA

In [None]:
# 1000-tree random forest (20 parallel jobs) on the PCA-projected data.
clf = RandomForestClassifier(n_estimators=1000, n_jobs=20).fit(train_data, train_Y)
pred_Y = clf.predict(test_data)

# Report the micro-averaged F1 of the test predictions.
print(
    "Accuracy:",
    metrics.f1_score(test_Y, pred_Y, average='micro'),
)