In [None]:
import warnings
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from textwrap import wrap
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import glob


In [None]:
# TCGA dictionary information
#
# The dictionaries file is scanned line by line.  Every line that starts with
# "#" is treated as a section header and bumps the section counter; any other
# non-blank line seen while the counter equals 5 is evaluated and stored in
# `code_to_disease`.
# NOTE(review): judging by the variable name, section 5 maps cancer-type codes
# to disease names -- confirm against the data file.
dict_name_index = 0  # number of "#" header lines seen so far
with open("./data/tcga_dictionaries.txt", "r") as tcga_dict:  # closed on exit
    for line in tcga_dict:
        if line.startswith("#"):  # header: the following line(s) hold a dictionary
            dict_name_index += 1
        elif dict_name_index == 5 and line.strip():
            # Blank lines are skipped so that trailing whitespace in the
            # section does not make eval() raise a SyntaxError.
            #
            # NOTE(review): eval() executes whatever expression the file
            # contains.  If the file only ever holds a plain dict literal,
            # ast.literal_eval would be the safe replacement -- left as-is
            # here because the file format is not visible from this notebook.
            code_to_disease = eval(line)
        

In [None]:
def getDataAndLabels(name, features):
    """Split a feature frame into model inputs and integer-encoded labels.

    Parameters
    ----------
    name : label stored unchanged under the 'name' key of the result.
    features : DataFrame with a `cancer_type` column; the first three
        columns are excluded from the model inputs.

    Returns
    -------
    dict with keys 'name', 'feature_size' (number of input columns),
    'data' (the input columns), 'labels' (encoded `cancer_type`) and
    'label_encoder' (the encoder that produced 'labels').

    Note: a fresh LabelEncoder is fit on every call, so the integer codes are
    specific to the frame that was passed in.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_labels = encoder.fit_transform(features.cancer_type)

    # Keep everything from the fourth column onward as model input.
    # (The leading columns include cancer type and patient_id.)
    input_columns = features.columns[3:]
    inputs = features[input_columns]

    return {
        'name': name,
        'feature_size': inputs.shape[1],
        'data': inputs,
        'labels': encoded_labels,
        'label_encoder': encoder,
    }

In [None]:
print('Loading training data ...')

# The feature-set name is whatever sits between the fixed path prefix and the
# ".train.csv" suffix; derive the slice bounds from those literals.
train_prefix_len = len("./data/features_")
train_suffix_len = len(".train.csv")

train_files = glob.glob("./data/features_*.train.csv")
all_train_data = {}
for filename in train_files:
    name = filename[train_prefix_len:-train_suffix_len]
    print(" ", name)

    # One entry per feature set, keyed by its name.
    train_features = pd.read_csv(filename)
    all_train_data[name] = getDataAndLabels(name, train_features)

print("done.")

In [None]:
print('Loading test data ...')

# The feature-set name is whatever sits between the fixed path prefix and the
# ".test.csv" suffix; derive the slice bounds from those literals.
test_prefix_len = len("./data/features_")
test_suffix_len = len(".test.csv")

test_files = glob.glob("./data/features_*.test.csv")
all_test_data = {}
for filename in test_files:
    name = filename[test_prefix_len:-test_suffix_len]
    print(" ", name)

    # One entry per feature set, keyed by its name.
    test_features = pd.read_csv(filename)
    all_test_data[name] = getDataAndLabels(name, test_features)

print("done.")

In [None]:
def getBestParamsLogit(train_data, train_labels):
    """Grid-search the C parameter of an L2 logistic regression.

    Runs a 5-fold cross-validated grid search (scored on accuracy) over
    C in {0.1, 0.25, 0.5} using a one-vs-rest liblinear model, prints the
    winning parameters and their cross-validated accuracy (rounded to four
    decimals), and returns the winning parameters.

    Parameters
    ----------
    train_data : training feature matrix passed to `fit`.
    train_labels : training labels passed to `fit`.

    Returns
    -------
    The search's `best_params_` dict (contains the key 'C').
    """
    candidate_grid = {'C': [0.1, 0.25, 0.5]}
    base_model = LogisticRegression(
        penalty='l2',
        multi_class='ovr',
        solver='liblinear',
        max_iter=150,
    )
    search = GridSearchCV(
        estimator=base_model,
        param_grid=candidate_grid,
        cv=5,
        scoring='accuracy',
        return_train_score=True,
    )
    search.fit(train_data, train_labels)

    # Report the best C and the accuracy it achieved in cross-validation.
    best = search.best_params_
    print(' Best param:', best)
    print(' Accuracy:  ', np.round(search.best_score_, 4))

    return best

In [None]:
def getBestParamsSVM(train_data, train_labels):
    """Grid-search the C parameter of an L2-penalised linear SVM.

    Runs a 4-fold cross-validated grid search (scored on accuracy) over
    C in {0.01, 0.1, 0.5}, prints the winning parameters and their
    cross-validated accuracy (rounded to four decimals), and returns the
    winning parameters.

    Parameters
    ----------
    train_data : training feature matrix passed to `fit`.
    train_labels : training labels passed to `fit`.

    Returns
    -------
    The search's `best_params_` dict (contains the key 'C').
    """
    candidate_grid = {'C': [0.01, 0.1, 0.5]}
    search = GridSearchCV(
        estimator=LinearSVC(penalty='l2'),
        param_grid=candidate_grid,
        cv=4,
        scoring='accuracy',
        return_train_score=True,
    )
    search.fit(train_data, train_labels)

    # Report the best C and the accuracy it achieved in cross-validation.
    best = search.best_params_
    print(' Best param:', best)
    print(' Accuracy:  ', np.round(search.best_score_, 4))

    return best

In [None]:
def _evaluateClassifier(classifier, train_data, train_labels, test_data, test_labels):
    """Fit `classifier`, predict on the test set and collect its scores.

    Returns a list laid out as
    [weighted precision, weighted recall, weighted f1,
     per-label precision/recall/f1/support tuple, confusion matrix].
    """
    classifier.fit(train_data, train_labels)
    predict = classifier.predict(test_data)

    # Weighted averages for the headline numbers, per-label values for detail.
    weighted = precision_recall_fscore_support(test_labels, predict, average='weighted')
    by_label = precision_recall_fscore_support(test_labels, predict, average=None)
    confusion = confusion_matrix(test_labels, predict)

    return [weighted[0], weighted[1], weighted[2], by_label, confusion]


def _printScores(title, name, result):
    """Print the weighted precision/recall/f1 held in the first three slots of `result`."""
    print("\n" + title, name)
    print("  precision:", np.round(result[0], 4))
    print("  recall:   ", np.round(result[1], 4))
    print("  f1:       ", np.round(result[2], 4))


def runClassifiers(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Train and evaluate logistic regression and a linear SVM on one feature set.

    Parameters
    ----------
    train_data, train_labels : training features and labels.
    test_data, test_labels : held-out features and labels used for scoring.
    name : feature-set name; used to look up `hyper_params` and as the key
        written into `scores`.
    hyper_params : dict mapping feature-set name to
        {'lr': {'C': ...}, 'svm': {'C': ...}}.  Any missing entry triggers a
        grid search for that classifier.
    scores : dict updated in place; `scores[name]` becomes
        {'lr': [...], 'svm': [...]}, each list being
        [precision, recall, f1, per-label scores, confusion matrix].

    Returns
    -------
    None.  Results are written to `scores` and a summary is printed.
    """
    # Use the preset C when one exists for this feature set; otherwise search.
    if name in hyper_params and 'lr' in hyper_params[name]:
        best_params_logit = hyper_params[name]['lr']
    else:
        print("Running grid search on Logistic Regression...")
        best_params_logit = getBestParamsLogit(train_data, train_labels)

    if name in hyper_params and 'svm' in hyper_params[name]:
        best_params_svm = hyper_params[name]['svm']
    else:
        print("Running grid search on Linear SVM...")
        best_params_svm = getBestParamsSVM(train_data, train_labels)

    # Logistic regression with L2 regularization on the reduced feature set.
    lr = LogisticRegression(penalty='l2', tol=.01, max_iter=150,
                            C=best_params_logit['C'],
                            solver="liblinear", multi_class="ovr")
    logit_result = _evaluateClassifier(lr, train_data, train_labels, test_data, test_labels)

    # Linear SVM with L2 penalty.
    svm = LinearSVC(penalty='l2', C=best_params_svm['C'])
    svm_result = _evaluateClassifier(svm, train_data, train_labels, test_data, test_labels)

    _printScores("Logistic Regression", name, logit_result)
    _printScores("Linear SVM", name, svm_result)

    scores[name] = {
        'lr': logit_result,
        'svm': svm_result,
    }
    

In [None]:
# NOTE(review): `label_encoder` is not referenced by the cells visible here;
# kept in case a later cell relies on it.
label_encoder            = preprocessing.LabelEncoder()

# Preset regularisation strengths per feature set.  Feature sets (or
# classifiers) missing from this table are grid-searched by runClassifiers.
hyper_params = {
    'l1reg_c0.5':           {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'l1reg_c1':             {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'l1reg_c10':            {'lr': {'C': 0.1},  'svm': {'C': 0.01}},
    'l1reg_c100':           {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'topgenes_small':       {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'bestfit_med':          {'lr': {'C': 0.1 }, 'svm': {'C': 0.01}},
    'bestfit_large':        {'lr': {'C': 0.1 }, 'svm': {'C': 0.01}},
    'all':                  {'lr': {'C': 0.25}, 'svm': {'C': 0.01}},
    'bestfit_with_topgenes':{'lr': {'C': 0.1 }, 'svm': {'C': 0.01}}
}


# Filled in by runClassifiers: scores[name] = {'lr': [...], 'svm': [...]}
scores = {}


for name in all_train_data.keys():
    print("************************")
    print(name)
    print("************************")

    train      = all_train_data[name]
    test       = all_test_data[name]

    # The train and test labels were integer-encoded by two separately fitted
    # LabelEncoders, so the same integer only means the same cancer type when
    # both splits contain exactly the same set of classes.  Map the test
    # labels back to their original strings and re-encode them with the
    # training encoder so both sides share one code space.  When the class
    # sets already match this is a no-op; a test class that never occurs in
    # training makes transform() raise ValueError instead of silently
    # producing misaligned metrics.
    test_labels = train['label_encoder'].transform(
        test['label_encoder'].inverse_transform(test['labels']))

    runClassifiers(train['data'], train['labels'], test['data'], test_labels, name, hyper_params, scores)
    



In [None]:
# Columns are feature-set names, index is the classifier key ('lr' / 'svm'),
# and each cell holds that classifier's score list.
df_scores = pd.DataFrame(scores)

# Build one report row per (classifier, feature set): every 'lr' row first,
# followed by every 'svm' row.
rows = []
for clf_key in ('lr', 'svm'):
    clf_scores = df_scores.loc[clf_key]
    for name in all_train_data.keys():
        prf = clf_scores[name]  # [precision, recall, f1, ...]
        rows.append([
            name,
            all_train_data[name]['feature_size'],
            clf_key,
            prf[0],
            prf[1],
            prf[2],
        ])

report_columns = ['name', 'feature_size', 'classifier', 'precision', 'recall', 'f1']
df_report = pd.DataFrame(rows, columns=report_columns)
display(df_report)

# Show the single best row for precision, then recall, then f1.
for metric in ('precision', 'recall', 'f1'):
    sorted_df = df_report.sort_values(by=metric, ascending=False)
    display(sorted_df.head(1))

