In [20]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import glob
from sklearn import preprocessing
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# to make this notebook's output stable across runs
np.random.seed(42)

In [3]:
# TCGA dictionary information
tcga_dict = open("./data/tcga_dictionaries.txt","r")
dict_name_index = 0 #Set dictionary index counter to 0
for line in tcga_dict:
    if line.startswith("#"): #If line starts with #, the next line will be a known dictionary
        dict_name_index += 1
    elif dict_name_index == 5:
        code_to_disease = eval(line)

In [4]:
def getDataAndLabels(name, features):
    labels_string = features.cancer_type
    le            = preprocessing.LabelEncoder()
    labels        = le.fit_transform(labels_string)

    # Get rid of the cancer type and patient_id columns 
    data = features[features.columns[3:]]
    return {'name': name, 'feature_size': data.shape[1],
            'data': data, 'labels': labels , 'label_encoder': le }

In [9]:
print('Loading training data ...')

train_files = glob.glob("./data/features_*.train.csv")
all_train_data = {}
for filename in train_files:
    
    name = filename[16:-10]
    print(" ", name)
    train_features = pd.read_csv(filename)
    all_train_data[name] = getDataAndLabels(name, train_features)

print("done.")

Loading training data ...
  l1reg_c1
  l1reg_c100
  l1reg_c0.5
  l1reg_c10
  topgenes_small
  bestfit_med
  bestfit_large
  all
  bestfit_with_topgenes
done.


In [10]:
print('Loading test data ...')

test_files = glob.glob("./data/features_*.test.csv")
all_test_data = {}
for filename in test_files:
    
    name = filename[16:-9]
    print(" ", name)
    test_features = pd.read_csv(filename)
    all_test_data[name] = getDataAndLabels(name, test_features)

print("done.")

Loading test data ...
  l1reg_c10
  topgenes_small
  bestfit_large
  l1reg_c0.5
  l1reg_c100
  l1reg_c1
  bestfit_med
  bestfit_with_topgenes
  all
done.


In [18]:
def runClassifiers(train_data, train_labels, test_data):
    xgb_cfr = xgb.XGBClassifier(max_depth=2, random_state=42)
    xgb_cfr.fit(train_data, train_labels)
    
    xgb_pred = xgb_cfr.predict(test_data)
    
    return xgb_cfr, xgb_pred

In [None]:
for name in all_train_data.keys():
    ask = input("Would you like to run XGBoost on: %s\t" %(name))
    if ask.lower() == 'yes':
        train      = all_train_data[name]
        test       = all_test_data[name]
        X_train    = train['data']
        Y_train    = train['labels']
        X_test     = test['data']
        Y_test     = test['labels']
        cfr, pred = runClassifiers(X_train, Y_train, X_test)
        xgb_cfr_report = classification_report(Y_test, pred)
        accuracy_score(Y_test, pred)

Would you like to run XGBoost on: l1reg_c1	no
Would you like to run XGBoost on: l1reg_c100	no
Would you like to run XGBoost on: l1reg_c0.5	yes
