In [None]:
import warnings
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from textwrap import wrap
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
#import xgboost as xgb

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.layers import Dense as Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout


import glob
from textwrap import wrap

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#cd /content/drive/My Drive/berkeley/W207 machine learning/Final Project/w207_6_sum19_g5_final_project

## Load the data

In [None]:
# TCGA dictionary information
# The file is scanned line by line: every line starting with "#" is counted as a
# header, and each non-header line read while the header count is 5 is evaluated
# and stored in code_to_disease (the last such line wins).
dict_name_index = 0  # Number of "#" header lines seen so far
# A context manager guarantees the file handle is closed, even if eval() raises.
with open("./data/tcga_dictionaries.txt", "r") as tcga_dict:
    for line in tcga_dict:
        if line.startswith("#"):  # A header: the next line will be a known dictionary
            dict_name_index += 1
        elif dict_name_index == 5:
            # SECURITY NOTE(review): eval() executes whatever expression the file
            # contains. This is only acceptable while the file is trusted; if the
            # line is always a plain literal, ast.literal_eval would be a safer
            # replacement.
            code_to_disease = eval(line)
        

In [None]:
def getDataAndLabels(name, features, label_encoder):
    """Split a feature table into model inputs and encoded labels.

    Args:
        name: Feature-set name. The value 'after_pca' selects a different
            column slice than every other name.
        features: DataFrame with a ``cancer_type`` column plus feature columns.
        label_encoder: Object exposing ``fit_transform``; it is applied to the
            ``cancer_type`` column.

    Returns:
        dict with keys 'name', 'feature_size' (number of selected columns),
        'data' (the selected columns), 'labels' (encoded cancer types) and
        'label_encoder' (the encoder that was passed in).

    NOTE(review): ``fit_transform`` refits the encoder on every call, so the
    label ids depend only on the labels present in the table given here.
    """
    encoded_labels = label_encoder.fit_transform(features.cancer_type)

    # Keep only the feature columns. The 'after_pca' table uses columns[1:-2];
    # all other tables use columns[3:] (presumably skipping the id / barcode /
    # cancer-type columns -- confirm against the CSV layout).
    all_columns = features.columns
    feature_columns = all_columns[1:-2] if name == 'after_pca' else all_columns[3:]
    feature_frame = features[feature_columns]

    return dict(
        name=name,
        feature_size=feature_frame.shape[1],
        data=feature_frame,
        labels=encoded_labels,
        label_encoder=label_encoder,
    )

In [None]:
print('Loading training data ...')

# One encoder instance is handed to every getDataAndLabels call below.
label_encoder   = preprocessing.LabelEncoder()

# Each training table is stored as ./data/features_<name>.train.csv
train_files = glob.glob("./data/features_*.train.csv")

# Feature-set name -> dict returned by getDataAndLabels for that file
all_train_data = {}
for filename in train_files:
    # The feature-set name is whatever sits between the fixed prefix and suffix.
    name = filename[len("./data/features_"):-len(".train.csv")]
    print(" ", name)
    train_features = pd.read_csv(filename)
    all_train_data[name] = getDataAndLabels(name, train_features, label_encoder)

print("done.")

In [None]:
print('Loading test data ...')

# Each test table is stored as ./data/features_<name>.test.csv
test_files = glob.glob("./data/features_*.test.csv")

# Feature-set name -> dict returned by getDataAndLabels for that file
all_test_data = {}
for filename in test_files:
    # The feature-set name is whatever sits between the fixed prefix and suffix.
    name = filename[len("./data/features_"):-len(".test.csv")]
    print(" ", name)
    test_features = pd.read_csv(filename)
    all_test_data[name] = getDataAndLabels(name, test_features, label_encoder)

print("done.")

## Functions for running different classifiers

In [None]:
def getBestParamsLogit(train_data, train_labels):
    """Grid-search the regularization strength C for logistic regression.

    Runs a 5-fold cross-validated search over C in [0.1, 0.25, 0.5] for an
    L2-penalized, one-vs-rest, liblinear logistic regression, scored by
    accuracy. Prints the winning parameters and their cross-validated score.

    Returns:
        The ``best_params_`` dict of the search (key 'C').
    """
    estimator = LogisticRegression(penalty='l2', multi_class='ovr',
                                   solver='liblinear', max_iter=150)
    search = GridSearchCV(
        estimator,
        {'C': [0.1, 0.25, 0.5]},
        cv=5,
        scoring='accuracy',
        return_train_score=True,
    )
    search.fit(train_data, train_labels)

    # Report the selected C and the accuracy it reached during cross-validation
    best = search.best_params_
    print(' Best param:', best)
    print(' Accuracy:  ', np.round(search.best_score_, 4))

    return best

In [None]:
def getBestParamsSVM(train_data, train_labels):
    """Grid-search the regularization strength C for a linear SVM.

    Runs a 4-fold cross-validated search over C in [0.01, 0.1, 0.5] for an
    L2-penalized LinearSVC, scored by accuracy. Prints the winning parameters
    and their cross-validated score.

    Returns:
        The ``best_params_`` dict of the search (key 'C').
    """
    search = GridSearchCV(
        LinearSVC(penalty='l2'),
        {'C': [0.01, 0.1, 0.5]},
        cv=4,
        scoring='accuracy',
        return_train_score=True,
    )
    search.fit(train_data, train_labels)

    # Report the selected C and the accuracy it reached during cross-validation
    best = search.best_params_
    print(' Best param:', best)
    print(' Accuracy:  ', np.round(search.best_score_, 4))

    return best

In [None]:
#
# Logistic regression
#
def run_logistic_regression(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Fit an L2 logistic regression on the training set and score it on the test set.

    The value of C is taken from ``hyper_params[name]['lr']`` when present;
    otherwise it is chosen by ``getBestParamsLogit``. ``scores`` is accepted
    for signature parity with the other runners and is not used here.

    Returns:
        ``[precision, recall, f1, per_label_scores, confusion_matrix]`` where
        the first three are weighted averages and ``per_label_scores`` is the
        un-averaged result of ``precision_recall_fscore_support``.
    """
    # Fall back to a grid search only when no preset exists for this feature set
    if name not in hyper_params or 'lr' not in hyper_params[name]:
        print("Running grid search on Logistic Regression...")
        chosen_params = getBestParamsLogit(train_data, train_labels)
    else:
        chosen_params = hyper_params[name]['lr']

    model = LogisticRegression(penalty='l2', tol=.01, max_iter=150,
                               C=chosen_params['C'],
                               solver="liblinear", multi_class="ovr")
    model.fit(train_data, train_labels)
    predicted = model.predict(test_data)

    # Weighted summary, per-label breakdown, and confusion matrix
    weighted = precision_recall_fscore_support(test_labels, predicted, average='weighted')
    per_label = precision_recall_fscore_support(test_labels, predicted, average=None)
    confusion = confusion_matrix(test_labels, predicted)

    print("\nLogistic Regression", name)
    captions = ("  precision:", "  recall:   ", "  f1:       ")
    for caption, value in zip(captions, weighted):
        print(caption, np.round(value, 4))

    return [*weighted[:3], per_label, confusion]
           


In [None]:
#
# Linear SVM
#
def run_linear_svm(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Fit an L2 LinearSVC on the training set and score it on the test set.

    The value of C is taken from ``hyper_params[name]['svm']`` when present;
    otherwise it is chosen by ``getBestParamsSVM``. ``scores`` is accepted for
    signature parity with the other runners and is not used here.

    Returns:
        ``[precision, recall, f1, per_label_scores, confusion_matrix]`` where
        the first three are weighted averages and ``per_label_scores`` is the
        un-averaged result of ``precision_recall_fscore_support``.
    """
    print("\nLinear SVM", name)

    # Fall back to a grid search only when no preset exists for this feature set
    if name not in hyper_params or 'svm' not in hyper_params[name]:
        print("Running grid search on Linear SVM...")
        chosen_params = getBestParamsSVM(train_data, train_labels)
    else:
        chosen_params = hyper_params[name]['svm']

    model = LinearSVC(penalty='l2', C=chosen_params['C'])
    model.fit(train_data, train_labels)
    predicted = model.predict(test_data)

    # Weighted summary, per-label breakdown, and confusion matrix
    weighted = precision_recall_fscore_support(test_labels, predicted, average='weighted')
    per_label = precision_recall_fscore_support(test_labels, predicted, average=None)
    confusion = confusion_matrix(test_labels, predicted)

    captions = ("  precision:", "  recall:   ", "  f1:       ")
    for caption, value in zip(captions, weighted):
        print(caption, np.round(value, 4))

    return [*weighted[:3], per_label, confusion]


In [None]:
#
# Decision tree
#
def run_decision_tree(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Fit a default DecisionTreeClassifier and score it on the test set.

    ``hyper_params`` and ``scores`` are accepted for signature parity with the
    other runners and are not used here.

    Returns:
        ``[precision, recall, f1, per_label_scores, confusion_matrix]`` where
        the first three are weighted averages and ``per_label_scores`` is the
        un-averaged result of ``precision_recall_fscore_support``.
    """
    print("\nDecision Tree", name)

    tree = DecisionTreeClassifier()
    tree.fit(train_data, train_labels)
    predicted = tree.predict(test_data)

    # Weighted summary, per-label breakdown, and confusion matrix
    weighted = precision_recall_fscore_support(test_labels, predicted, average='weighted')
    per_label = precision_recall_fscore_support(test_labels, predicted, average=None)
    confusion = confusion_matrix(test_labels, predicted)

    captions = ("  precision:", "  recall:   ", "  f1:       ")
    for caption, value in zip(captions, weighted):
        print(caption, np.round(value, 4))

    return [*weighted[:3], per_label, confusion]



In [None]:
#
# Random forest
#
def run_random_forest(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Fit a 500-tree RandomForestClassifier and score it on the test set.

    ``hyper_params`` and ``scores`` are accepted for signature parity with the
    other runners and are not used here.

    Returns:
        ``[precision, recall, f1, per_label_scores, confusion_matrix]`` where
        the first three are weighted averages and ``per_label_scores`` is the
        un-averaged result of ``precision_recall_fscore_support``.
    """
    print("\nRandom Forest", name)

    forest = RandomForestClassifier(n_estimators=500)
    forest.fit(train_data, train_labels)
    predicted = forest.predict(test_data)

    # Weighted summary, per-label breakdown, and confusion matrix
    weighted = precision_recall_fscore_support(test_labels, predicted, average='weighted')
    per_label = precision_recall_fscore_support(test_labels, predicted, average=None)
    confusion = confusion_matrix(test_labels, predicted)

    captions = ("  precision:", "  recall:   ", "  f1:       ")
    for caption, value in zip(captions, weighted):
        print(caption, np.round(value, 4))

    return [*weighted[:3], per_label, confusion]





In [None]:
#
# Neural Net
#
def run_neural_net(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Train a dense feed-forward network and score it on the test set.

    The network has five ReLU hidden layers (2000, 1000, 400, 100, 64 units),
    each followed by 20% dropout, and a softmax output layer with one unit per
    class. It is trained for 100 epochs with Adam and categorical
    cross-entropy. ``hyper_params`` and ``scores`` are accepted for signature
    parity with the other runners and are not used here.

    Returns:
        ``[precision, recall, f1, per_label_scores, confusion_matrix]`` where
        the first three are weighted averages and ``per_label_scores`` is the
        un-averaged result of ``precision_recall_fscore_support``.
    """
    print("\nNeural Net", name)

    # Derive the class count from the labels instead of hard-coding 32, and use
    # it for both one-hot encodings so train and test targets always have the
    # same width as the output layer.
    n_classes = int(max(np.max(train_labels), np.max(test_labels))) + 1
    tr_lab = to_categorical(train_labels, num_classes=n_classes)
    test_lab = to_categorical(test_labels, num_classes=n_classes)

    model = K.Sequential()
    model.add(Dense(2000, input_dim=train_data.shape[1], activation='relu',
                    kernel_regularizer=regularizers.l1_l2(l2=0.01, l1=0.01)))
    model.add(Dropout(0.2))
    for units in (1000, 400, 100, 64):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(0.2))
    # Softmax (not sigmoid) is the output activation that matches
    # categorical cross-entropy for single-label multi-class targets.
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["accuracy"])

    model.fit(train_data, tr_lab, epochs=100, batch_size=100)
    # Kept for its logged test loss/accuracy; the return value is not needed.
    model.evaluate(x=test_data, y=test_lab)

    # Convert class probabilities to predicted label ids once and reuse them.
    predicted_labels = np.argmax(model.predict(test_data), axis=1)

    # Get precision, recall, f1 scores
    nn_prf_scores      = precision_recall_fscore_support(test_labels, predicted_labels, average='weighted')
    nn_scores_by_label = precision_recall_fscore_support(test_labels, predicted_labels, average=None)

    # Get confusion matrix (previously returned as an empty list, which made the
    # downstream confusion-matrix report fail whenever this model ranked best)
    nn_confusion       = confusion_matrix(test_labels, predicted_labels)

    print("  precision:", np.round(nn_prf_scores[0], 4))
    print("  recall:   ", np.round(nn_prf_scores[1], 4))
    print("  f1:       ", np.round(nn_prf_scores[2], 4))

    return [
          nn_prf_scores[0],
          nn_prf_scores[1],
          nn_prf_scores[2],
          nn_scores_by_label,
          nn_confusion]




In [None]:
#
# XGBoost
#
def run_xg_boost(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Fit an XGBoost classifier and score it on the test set.

    NOTE(review): this function needs the name ``xgb`` in scope, but the
    ``import xgboost as xgb`` line at the top of the notebook is commented
    out; re-enable it before calling this runner. ``hyper_params`` and
    ``scores`` are accepted for signature parity with the other runners and
    are not used here.

    Returns:
        ``[precision, recall, f1, per_label_scores, confusion_matrix]`` where
        the first three are weighted averages and ``per_label_scores`` is the
        un-averaged result of ``precision_recall_fscore_support``.
    """
    print("\nXG Boost", name)

    # NOTE(review): these keys are passed straight to XGBClassifier; confirm
    # that 'silent', 'verbose' and 'num_boost_round' are honoured by the
    # installed xgboost version.
    xgb_params = {
    'max_depth': 2,
    'eta': 0.3,
    'silent': False,
    'verbose': True,
    'objective': 'multi:softprob',
    'num_class': 32,
    'num_boost_round' : 2}

    xgb_cfr = xgb.XGBClassifier(**xgb_params)
    xgb_cfr.fit(train_data, train_labels)

    # Fixed typo: the original referenced the undefined name `test_ata`.
    predict = xgb_cfr.predict(test_data)

    # Get precision, recall, f1 scores
    xgb_prf_scores      = precision_recall_fscore_support(test_labels, predict, average='weighted')
    xgb_scores_by_label = precision_recall_fscore_support(test_labels, predict, average=None)

    # Get confusion matrix
    xgb_confusion       = confusion_matrix(test_labels, predict)

    print("  precision:", np.round(xgb_prf_scores[0], 4))
    print("  recall:   ", np.round(xgb_prf_scores[1], 4))
    print("  f1:       ", np.round(xgb_prf_scores[2], 4))

    return  [
            xgb_prf_scores[0],
            xgb_prf_scores[1],
            xgb_prf_scores[2],
            xgb_scores_by_label,
            xgb_confusion]
           

    

## Run the different classifiers 

In [None]:
def runClassifiers(train_data, train_labels, test_data, test_labels, name, hyper_params, scores):
    """Run every classifier on one feature set and record the results.

    The runners are called in the order logistic regression, linear SVM,
    decision tree, random forest, neural net. Once all of them have finished,
    ``scores[name]`` is set to a dict keyed by 'lr', 'svm', 'dt', 'rf', 'nn'
    holding each runner's return value. Nothing is returned.
    """
    shared_args = (train_data, train_labels, test_data, test_labels, name, hyper_params, scores)

    lr_scores = run_logistic_regression(*shared_args)
    svm_scores = run_linear_svm(*shared_args)
    dt_scores = run_decision_tree(*shared_args)
    rf_scores = run_random_forest(*shared_args)
    nn_scores = run_neural_net(*shared_args)

    # Only publish results after every classifier has completed
    scores[name] = {
        'lr': lr_scores,
        'svm': svm_scores,
        'dt': dt_scores,
        'rf': rf_scores,
        'nn': nn_scores,
    }
    

In [None]:
# Pre-selected C for logistic regression, per feature set. Having a preset lets
# the runners skip their grid search for that feature set.
lr_c_by_feature_set = {
    'l1reg_c0.5':            0.25,
    'l1reg_c1':              0.25,
    'l1reg_c10':             0.1,
    'l1reg_c100':            0.25,
    'topgenes_small':        0.25,
    'bestfit_med':           0.1,
    'bestfit_large':         0.1,
    'all':                   0.25,
    'bestfit_with_topgenes': 0.1,
    'after_pca':             0.5,
}

# Every feature set uses C = 0.01 for the linear SVM.
hyper_params = {
    feature_set: {'lr': {'C': lr_c}, 'svm': {'C': 0.01}}
    for feature_set, lr_c in lr_c_by_feature_set.items()
}

# Feature-set name -> {classifier key -> runner result}; filled by runClassifiers
scores = {}

for name, train in all_train_data.items():
    banner = "************************"
    print(banner)
    print(name)
    print(banner)

    test = all_test_data[name]

    runClassifiers(train['data'], train['labels'], test['data'], test['labels'], name, hyper_params, scores)

## Visualize performance across feature sets and classifiers

In [None]:
# Plot color for each classifier key
colors = dict(lr='olivedrab', svm='slateblue',
              dt='mediumseagreen', rf='goldenrod',
              xgb='coral', nn='crimson')

# Columns are feature-set names, rows are classifier keys; each cell holds the
# list returned by the corresponding runner.
df_scores = pd.DataFrame(scores)

# Flatten to one row per (feature set, classifier) with the weighted metrics
rows = []
for name in all_train_data.keys():
    feature_size = all_train_data[name]['feature_size']
    for classifier in ['lr', 'svm', 'dt', 'rf', 'nn']:
        precision, recall, f1 = df_scores.at[classifier, name][:3]
        rows.append([name, feature_size, classifier, precision, recall, f1])

report_columns = ['name', 'feature_size', 'classifier', 'precision', 'recall', 'f1']
df_report = pd.DataFrame(rows, columns=report_columns)


In [None]:
def plot_classifier_metrics(df_report, label_encoder):
    """Plot precision and recall per feature set, one pair of lines per classifier.

    Args:
        df_report: DataFrame with columns 'name', 'feature_size', 'classifier',
            'precision' and 'recall'.
        label_encoder: Unused; kept so existing callers keep working.

    The x axis has one tick per (feature_size, name) pair, ordered by feature
    size and then name. Precision is drawn solid and recall dashed, both in the
    color registered for the classifier in the module-level ``colors`` dict.

    Assumes every classifier has exactly one row per (feature_size, name) pair
    so that each line has as many points as there are x labels.
    """
    plt.rcParams["figure.figsize"] = (20,20)

    # groupby iterates its keys in sorted order, which matches the sort below
    labels = [str(feature_size) + '\n' + feature_name
              for (feature_size, feature_name), _ in df_report.groupby(['feature_size', 'name'])]

    sorted_df_report = df_report.sort_values(by=['classifier', 'feature_size', 'name'],
                                             ascending=[True, True, True])

    # Group by the column name itself rather than a one-element list: with a
    # list, recent pandas versions yield 1-tuple keys such as ('lr',), which
    # breaks both the colors lookup and the label concatenation.
    for classifier, group in sorted_df_report.groupby('classifier'):

        plt.plot(labels, group.precision.values, color=colors[classifier],
                 linewidth=3, label=classifier + " precision", marker='o' )
        plt.plot(labels, group.recall.values, color=colors[classifier], linestyle="dashed",
                 linewidth=3, label=classifier + " recall", marker='o' )

    plt.yticks(np.arange(0, .65, .01))
    plt.ylabel('Precision, Recall', fontsize=20)
    plt.xlabel('Precision and Recall across different Features and Classifiers', fontsize=20, labelpad=20)
    plt.legend()
    plt.grid()
    plt.show()

    

In [None]:
def show_precision_recall_by_label(precision_by_label, recall_by_label, name, classifier, label_encoder):
    """Draw side-by-side horizontal bar charts of per-label precision and recall.

    Args:
        precision_by_label: Sequence of precision values, indexed by encoded label.
        recall_by_label: Sequence of recall values, indexed by encoded label.
        name: Unused; kept so existing callers keep working.
        classifier: Classifier key, used for the bar color (via the module-level
            ``colors`` dict) and in both panel titles.
        label_encoder: Encoder whose ``inverse_transform`` maps each index back
            to its original label for the y-axis tick text.
    """
    # Recover the original label for every encoded index
    labels = [label_encoder.inverse_transform([idx])[0]
              for idx in range(len(precision_by_label))]
    y_pos = np.arange(len(labels))

    fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False)

    # Mirror the left panel so the two charts grow outward from the centre
    ax1.invert_xaxis()
    ax1.yaxis.tick_right()

    for axis in (ax1, ax2):
        axis.set_yticks(y_pos)
        axis.set_yticklabels(labels)

    ax1.barh(y_pos, precision_by_label, color=colors[classifier], label="precision")
    ax2.barh(y_pos, recall_by_label, color=colors[classifier], label='recall')

    ax1.set_title('Precision( ' + classifier + ')')
    ax2.set_title('Recall (' + classifier + ')')

    plt.grid()
    plt.show()

In [None]:
def coords_of_max(theArray, n):
    """Return the coordinates of the ``n`` largest values of an array.

    Args:
        theArray: Array-like (typically a 2D confusion matrix).
        n: Number of top values wanted. Values larger than the number of cells
            are clamped to the array size; ``n <= 0`` yields no coordinates.

    Returns:
        A tuple with one index array per dimension of ``theArray`` (row
        indices first, then column indices for a 2D input), ordered from the
        largest value to the smallest.
    """
    theArray = np.asarray(theArray)
    flat = theArray.flatten()

    # Clamp n: argpartition raises when asked for more elements than exist,
    # and a literal 0 would turn the [-n:] slice below into "everything".
    n = min(int(n), flat.size)
    if n <= 0:
        return tuple(np.array([], dtype=np.intp) for _ in theArray.shape)

    # Partition so the n largest values occupy the last n positions; this
    # avoids fully sorting the array.
    indices = np.argpartition(flat, -n)[-n:]
    # Order those n candidates from highest to lowest value
    indices = indices[np.argsort(-flat[indices])]
    # Translate flat positions back into per-dimension coordinates. For a
    # confusion matrix the row index is the actual label and the column index
    # is the label it was confused with.
    return np.unravel_index(indices, theArray.shape)

In [None]:
def show_confusion_matrix(conf_mx, label_encoder, top_n=20):
    """Display the most frequent misclassification pairs of a confusion matrix.

    Args:
        conf_mx: Square confusion matrix of counts (rows = actual label,
            columns = predicted label). An empty value is reported and skipped.
        label_encoder: Encoder whose ``inverse_transform`` maps matrix indices
            back to label codes.
        top_n: Maximum number of pairs to list (default 20, the previously
            hard-coded value). Clamped to the number of matrix cells.

    Each listed pair shows the actual and predicted codes, their names looked
    up in the module-level ``code_to_disease`` mapping, the error rate (count
    divided by the actual label's row total) and the raw count.
    """
    conf_mx = np.asarray(conf_mx)
    if conf_mx.size == 0:
        # e.g. a classifier that did not produce a confusion matrix
        print("No confusion matrix available.")
        return

    # Determine the error rates for each misclassification pair. Rows without
    # any samples stay at 0 instead of becoming NaN, which would otherwise be
    # ranked as the largest values.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = np.divide(conf_mx, row_sums,
                             out=np.zeros(conf_mx.shape, dtype=float),
                             where=row_sums > 0)
    # Set the error rates for correctly classified pairs (the diagonal) to zero
    np.fill_diagonal(norm_conf_mx, 0)

    # Never ask for more pairs than the matrix has cells
    n_pairs = max(0, min(int(top_n), norm_conf_mx.size))
    if n_pairs > 0:
        actual_idxs, predicted_idxs = coords_of_max(norm_conf_mx, n_pairs)
    else:
        actual_idxs, predicted_idxs = [], []

    confusion_rows = []
    for actual_idx, predicted_idx in zip(actual_idxs, predicted_idxs):
        actual_label = label_encoder.inverse_transform([actual_idx])[0]
        predicted_label = label_encoder.inverse_transform([predicted_idx])[0]

        confusion_rows.append([
            actual_label,
            predicted_label,
            code_to_disease[actual_label][0],
            code_to_disease[predicted_label][0],
            norm_conf_mx[actual_idx, predicted_idx],   # error rate
            conf_mx[actual_idx, predicted_idx],        # error count
        ])

    df = pd.DataFrame(confusion_rows, columns=['actual', 'predicted',  'actual_name', 'predicted_name', 'error_rate', 'error_count'])
    display(df)

        

In [None]:
# Plot precision and recall for every classifier across the feature sets
plot_classifier_metrics(df_report, label_encoder)

In [None]:
# Show the full metrics table: one row per (feature set, classifier) pair
display(df_report)

In [None]:
# Pick the top row of the report for each metric (precision, recall, f1)
sorted_df = df_report.sort_values(by='precision', ascending=False)
best_precision = sorted_df.head(1)

sorted_df = df_report.sort_values(by='recall', ascending=False)
best_recall = sorted_df.head(1)

sorted_df = df_report.sort_values(by='f1', ascending=False)
best_f1 = sorted_df.head(1)

# Show the feature set and classifier with the best
# precision, recall, and f1 scores
for heading, best_row in (("\n\nBest precision", best_precision),
                          ("\n\nBest recall", best_recall),
                          ("\n\nBest f1", best_f1)):
    print(heading)
    display(best_row)

# The run with the highest precision is treated as the best prediction;
# pull its per-label scores and confusion matrix out of the scores dict.
best_prediction = best_precision
best_name       = best_prediction['name'].values[0]
best_classifier = best_prediction['classifier'].values[0]

best_entry = scores[best_name][best_classifier]
precision_by_label, recall_by_label = best_entry[3][0], best_entry[3][1]
best_confusion_matrix = best_entry[4]

# show a side-by-side barchart of precision and recall for each label
print("\n\nPrecision and Recall by Label for classifier ")
print("Classifier:", best_classifier, "Feature set:", best_name)
show_precision_recall_by_label(precision_by_label, recall_by_label,
                               best_name, best_classifier, label_encoder)

# show the confusion matrix for the best performing classifier/feature set
show_confusion_matrix(best_confusion_matrix, label_encoder)


In [None]:

# Write the summary report (one row per feature set / classifier with its
# precision, recall and f1) to ./data/metrics.csv
print("\nWriting metrics ...")
df_report.to_csv("./data/metrics.csv")
print("done.")

# Show the table that was just written
display(df_report)


In [None]:

# Wrap the best run's confusion matrix and per-label scores in DataFrames
df_confusion_matrix = pd.DataFrame(best_confusion_matrix)
df_precision_by_label = pd.DataFrame(precision_by_label)
df_recall_by_label = pd.DataFrame(recall_by_label)

# Write each table to its own CSV file under ./data
for frame, csv_path in (
    (df_confusion_matrix, "./data/metrics_confusion_matrix.csv"),
    (df_precision_by_label, "./data/metrics_precision_by_label.csv"),
    (df_recall_by_label, "./data/metrics_recall_by_label.csv"),
):
    print("\nWriting metrics ...")
    frame.to_csv(csv_path)
    print("done.")

