In [1]:
import warnings
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from textwrap import wrap
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import glob


In [2]:
# TCGA dictionary information
#
# The dictionaries file is read line by line. Every line that starts with "#"
# is counted as a header; any non-header line read while exactly five headers
# have been seen is evaluated and bound to `code_to_disease` (if several such
# lines exist, the last one wins).
dict_name_index = 0  # Number of "#" header lines seen so far

# Open the file in a context manager so the handle is always closed, even if
# evaluating one of the lines raises.
with open("./data/tcga_dictionaries.txt", "r") as tcga_dict:
    for line in tcga_dict:
        if line.startswith("#"):  # A "#" line announces the next dictionary
            dict_name_index += 1
        elif dict_name_index == 5:
            # SECURITY NOTE(review): eval() executes whatever expression the
            # file contains. This is only acceptable while the file is a
            # trusted, locally maintained resource; consider
            # ast.literal_eval if the lines are plain dict literals.
            code_to_disease = eval(line)
        

In [3]:
def getDataAndLabels(features):
    """Split a feature table into model inputs and integer-encoded labels.

    Parameters
    ----------
    features : pandas.DataFrame
        Table exposing a ``cancer_type`` column. Everything from the fourth
        column onward is treated as feature data.

    Returns
    -------
    dict
        ``'data'``          -- ``features`` restricted to columns 3 and up,
        ``'labels'``        -- ``cancer_type`` encoded by a freshly fitted
                               ``LabelEncoder``,
        ``'label_encoder'`` -- that fitted encoder.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_labels = encoder.fit_transform(features.cancer_type)

    # Drop the three leading columns and keep the rest as features.
    # NOTE(review): presumably those three are a row index, the patient id and
    # the cancer type -- confirm against the CSV layout.
    feature_columns = features.columns[3:]

    return {
        'data': features[feature_columns],
        'labels': encoded_labels,
        'label_encoder': encoder,
    }

In [4]:
print('Loading training data ...')

# Each training file is named ./data/features_<name>.train.csv; the <name>
# part becomes the key under which the parsed data and labels are stored.
train_files = glob.glob("./data/features_*.train.csv")
all_train_data = {}
for filename in train_files:
    # Cut off the fixed path prefix and the ".train.csv" suffix.
    name = filename[len("./data/features_"):-len(".train.csv")]
    print(" ", name)

    train_features = pd.read_csv(filename)
    all_train_data[name] = getDataAndLabels(train_features)

print("done.")

Loading training data ...
  l1reg_c1
  l1reg_c100
  l1reg_c0.5
  l1reg_c10
  topgenes_small
  bestfit_med
  bestfit_large
  all
  bestfit_with_topgenes
done.


In [5]:
print('Loading test data ...')

# Each test file is named ./data/features_<name>.test.csv; the <name> part
# becomes the key under which the parsed data and labels are stored.
test_files = glob.glob("./data/features_*.test.csv")
all_test_data = {}
for filename in test_files:
    # Cut off the fixed path prefix and the ".test.csv" suffix.
    name = filename[len("./data/features_"):-len(".test.csv")]
    print(" ", name)

    test_features = pd.read_csv(filename)
    all_test_data[name] = getDataAndLabels(test_features)

print("done.")

Loading test data ...
  l1reg_c10
  topgenes_small
  bestfit_large
  l1reg_c0.5
  l1reg_c100
  l1reg_c1
  bestfit_med
  bestfit_with_topgenes
  all
done.


In [9]:
def getBestParamsLogit(train_data, train_labels):
    """Grid-search the ``C`` parameter of an L2 logistic regression.

    Candidate values 0.1, 0.25 and 0.5 are compared with ``GridSearchCV``
    using ``cv=5`` and accuracy as the score. The winning parameters and
    their score (rounded to four decimals) are printed.

    Parameters
    ----------
    train_data, train_labels
        Passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    candidate_grid = {'C': [0.1, 0.25, 0.5]}
    base_model = LogisticRegression(
        penalty='l2',
        multi_class='ovr',
        solver='liblinear',
        max_iter=150,
    )
    search = GridSearchCV(
        estimator=base_model,
        param_grid=candidate_grid,
        cv=5,
        scoring='accuracy',
        return_train_score=True,
    )
    search.fit(train_data, train_labels)

    # Report the selected C and its cross-validated accuracy.
    best_params = search.best_params_
    print(' Best param:', best_params)
    print(' Accuracy:  ', np.round(search.best_score_, 4))

    return best_params

In [10]:
def getBestParamsSVM(train_data, train_labels):
    """Grid-search the ``C`` parameter of an L2-penalised ``LinearSVC``.

    Candidate values 0.01, 0.1 and 0.5 are compared with ``GridSearchCV``
    using ``cv=4`` and accuracy as the score. The winning parameters and
    their score (rounded to four decimals) are printed.

    Parameters
    ----------
    train_data, train_labels
        Passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    candidate_grid = {'C': [0.01, 0.1, 0.5]}
    search = GridSearchCV(
        estimator=LinearSVC(penalty='l2'),
        param_grid=candidate_grid,
        cv=4,
        scoring='accuracy',
        return_train_score=True,
    )
    search.fit(train_data, train_labels)

    # Report the selected C and its cross-validated accuracy.
    best_params = search.best_params_
    print(' Best param:', best_params)
    print(' Accuracy:  ', np.round(search.best_score_, 4))

    return best_params

In [37]:
def runClassifiers(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Fit logistic regression and a linear SVM, then record their test scores.

    Parameters
    ----------
    train_data, train_labels
        Data both classifiers are fitted on.
    test_data, test_labels
        Data the fitted classifiers are scored against.
    name
        Key identifying the feature set; used to look up ``hyper_params``,
        in the printed headings, and as the key written into ``scores``.
    hyper_params : dict
        Optional pre-selected parameters, ``{name: {'lr': {'C': ...},
        'svm': {'C': ...}}}``. A missing entry triggers a grid search for
        that classifier.
    scores : dict
        Mutated in place: ``scores[name]`` becomes ``{'lr': [...],
        'svm': [...]}`` where each list holds weighted precision, weighted
        recall, weighted f1, the per-label precision/recall/f-score/support
        tuple, and the confusion matrix.

    Returns
    -------
    None
    """
    preselected = hyper_params[name] if name in hyper_params else {}

    # Resolve C for both classifiers up front, searching only when no value
    # was supplied for this feature set.
    if 'lr' in preselected:
        logit_params = preselected['lr']
    else:
        print("Running grid search on Logistic Regression...")
        logit_params = getBestParamsLogit(train_data, train_labels)

    if 'svm' in preselected:
        svm_params = preselected['svm']
    else:
        print("Running grid search on Linear SVM...")
        svm_params = getBestParamsSVM(train_data, train_labels)

    def fit_and_score(model):
        # Fit on the training split, predict the test split, and collect the
        # weighted scores, per-label scores and confusion matrix.
        model.fit(train_data, train_labels)
        predicted = model.predict(test_data)
        weighted = precision_recall_fscore_support(test_labels, predicted, average='weighted')
        by_label = precision_recall_fscore_support(test_labels, predicted, average=None)
        confusion = confusion_matrix(test_labels, predicted)
        return [weighted[0], weighted[1], weighted[2], by_label, confusion]

    def print_summary(heading, result):
        # Print the three weighted scores rounded to four decimals.
        print(heading, name)
        print("  precision:", np.round(result[0], 4))
        print("  recall:   ", np.round(result[1], 4))
        print("  f1:       ", np.round(result[2], 4))

    # Logistic regression with L2 regularization on the reduced feature set.
    logit_model = LogisticRegression(penalty='l2', tol=.01, max_iter=150,
                                     C=logit_params['C'],
                                     solver="liblinear", multi_class="ovr")
    logit_result = fit_and_score(logit_model)

    # Linear SVM with L2 penalty.
    svm_model = LinearSVC(penalty='l2', C=svm_params['C'])
    svm_result = fit_and_score(svm_model)

    print_summary("\nLogistic Regression", logit_result)
    print_summary("\nLinear SVM", svm_result)

    scores[name] = {'lr': logit_result, 'svm': svm_result}
    

In [38]:
# Fresh encoder instance; it is not referenced again within this cell.
label_encoder = preprocessing.LabelEncoder()

# Pre-selected regularisation strength C per feature set and classifier.
# Feature sets missing from this table fall back to a grid search inside
# runClassifiers.
# NOTE(review): presumably these values came from earlier grid-search runs --
# confirm before relying on them.
hyper_params = {
    'l1reg_c0.5':            {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'l1reg_c1':              {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'l1reg_c10':             {'lr': {'C': 0.1},  'svm': {'C': 0.01}},
    'l1reg_c100':            {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'topgenes_small':        {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'bestfit_med':           {'lr': {'C': 0.1},  'svm': {'C': 0.01}},
    'bestfit_large':         {'lr': {'C': 0.1},  'svm': {'C': 0.01}},
    'all':                   {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'bestfit_with_topgenes': {'lr': {'C': 0.1},  'svm': {'C': 0.01}},
}

# Filled by runClassifiers: one entry per feature set.
scores = {}

for name in all_train_data:
    # Banner separating the output of each feature set.
    print("************************", name, "************************", sep="\n")

    train = all_train_data[name]
    test = all_test_data[name]

    runClassifiers(
        train['data'], train['labels'],
        test['data'], test['labels'],
        name, hyper_params, scores,
    )

# One column per feature set, one row per classifier ('lr', 'svm').
df = pd.DataFrame(scores)
display(df)


************************
l1reg_c1
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression l1reg_c1
  precision: 0.5681
  recall:    0.5445
  f1:        0.5322

Linear SVM l1reg_c1
  precision: 0.5478
  recall:    0.548
  f1:        0.5287
************************
l1reg_c100
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression l1reg_c100
  precision: 0.5681
  recall:    0.5465
  f1:        0.5277

Linear SVM l1reg_c100
  precision: 0.5521
  recall:    0.55
  f1:        0.5305
************************
l1reg_c0.5
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression l1reg_c0.5
  precision: 0.552
  recall:    0.5385
  f1:        0.5268

Linear SVM l1reg_c0.5
  precision: 0.5404
  recall:    0.5425
  f1:        0.5241
************************
l1reg_c10
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression l1reg_c10
  precision: 0.5743
  recall:    0.5514
  f1:        0.5304

Linear SVM l1reg_c10
  precision: 0.5488
  recall:    0.5475
  f1:        0.5292
************************
topgenes_small
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression topgenes_small
  precision: 0.5569
  recall:    0.535
  f1:        0.5204

Linear SVM topgenes_small
  precision: 0.5423
  recall:    0.537
  f1:        0.5192
************************
bestfit_med
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression bestfit_med
  precision: 0.5603
  recall:    0.531
  f1:        0.519

Linear SVM bestfit_med
  precision: 0.53
  recall:    0.5275
  f1:        0.5111
************************
bestfit_large
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression bestfit_large
  precision: 0.5681
  recall:    0.5425
  f1:        0.5256

Linear SVM bestfit_large
  precision: 0.5437
  recall:    0.538
  f1:        0.523
************************
all
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression all
  precision: 0.5706
  recall:    0.5504
  f1:        0.5313

Linear SVM all
  precision: 0.549
  recall:    0.547
  f1:        0.5273
************************
bestfit_with_topgenes
************************


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Logistic Regression bestfit_with_topgenes
  precision: 0.5603
  recall:    0.531
  f1:        0.519

Linear SVM bestfit_with_topgenes
  precision: 0.53
  recall:    0.5275
  f1:        0.5111


Unnamed: 0,l1reg_c1,l1reg_c100,l1reg_c0.5,l1reg_c10,topgenes_small,bestfit_med,bestfit_large,all,bestfit_with_topgenes
lr,"[0.5681154499739296, 0.5444555444555444, 0.532...","[0.5681442037933181, 0.5464535464535465, 0.527...","[0.5520063533832971, 0.5384615384615384, 0.526...","[0.5742944787609393, 0.5514485514485514, 0.530...","[0.5569367667214056, 0.534965034965035, 0.5204...","[0.5602604447718631, 0.5309690309690309, 0.519...","[0.5681430686396486, 0.5424575424575424, 0.525...","[0.5706167814222141, 0.5504495504495505, 0.531...","[0.5602604447718631, 0.5309690309690309, 0.519..."
svm,"[0.5477702176996838, 0.547952047952048, 0.5287...","[0.5521135462932529, 0.5499500499500499, 0.530...","[0.5403681901438986, 0.5424575424575424, 0.524...","[0.5488495909450991, 0.5474525474525475, 0.529...","[0.5423094578926713, 0.5369630369630369, 0.519...","[0.5300064794198983, 0.5274725274725275, 0.511...","[0.543693044130823, 0.537962037962038, 0.52301...","[0.5490403202231957, 0.5469530469530469, 0.527...","[0.5300064794198983, 0.5274725274725275, 0.511..."


In [73]:
# Flatten the nested score table into one row per (feature set, classifier).
# Rows are emitted for every feature set under 'lr' first, then under 'svm'.
rows = []
for classifier in ('lr', 'svm'):
    for name in all_train_data.keys():
        # Each cell of `df` is a list whose first three entries are the
        # weighted precision, recall and f1. A single .loc[row, col] lookup
        # replaces the chained df.loc[row][col] indexing.
        cell = df.loc[classifier, name]
        rows.append([name, classifier, cell[0], cell[1], cell[2]])

df_report = pd.DataFrame(rows, columns=['name', 'classifier', 'precision', 'recall', 'f1'])
display(df_report)

# Show the single best-scoring row for each metric, in the order
# precision, recall, f1. The intermediate frame is deliberately NOT named
# `sorted`: doing so would shadow the built-in sorted() for every cell that
# runs afterwards in the same kernel.
for metric in ('precision', 'recall', 'f1'):
    best_row = df_report.sort_values(by=metric, ascending=False).head(1)
    display(best_row)



Unnamed: 0,name,classifier,precision,recall,f1
0,l1reg_c1,lr,0.568115,0.544456,0.53216
1,l1reg_c100,lr,0.568144,0.546454,0.527696
2,l1reg_c0.5,lr,0.552006,0.538462,0.526784
3,l1reg_c10,lr,0.574294,0.551449,0.530438
4,topgenes_small,lr,0.556937,0.534965,0.520447
5,bestfit_med,lr,0.56026,0.530969,0.519019
6,bestfit_large,lr,0.568143,0.542458,0.525563
7,all,lr,0.570617,0.55045,0.531318
8,bestfit_with_topgenes,lr,0.56026,0.530969,0.519019
9,l1reg_c1,svm,0.54777,0.547952,0.528737


Unnamed: 0,name,classifier,precision,recall,f1
3,l1reg_c10,lr,0.574294,0.551449,0.530438


Unnamed: 0,name,classifier,precision,recall,f1
3,l1reg_c10,lr,0.574294,0.551449,0.530438


Unnamed: 0,name,classifier,precision,recall,f1
0,l1reg_c1,lr,0.568115,0.544456,0.53216
