In [None]:
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [None]:
# Read the per-case feature table from the working directory into `cases`,
# printing a short progress message before and after the read.
print("Loading case data ...")
cases = pd.read_csv(filepath_or_buffer="pancancer_case_features.csv")
print('done.')

In [None]:
# Turn the cancer-type strings into integer class ids; `le` is kept so the
# ids can be mapped back to their names later.
labels_string = cases["cancer_type"]
le = preprocessing.LabelEncoder()
labels = le.fit_transform(labels_string)

# Feature matrix: everything from the fourth column onward.
# NOTE(review): this drops the first THREE columns by position, while the
# original comment named only two (cancer type and patient_id) — confirm what
# the third leading column is and that it should be excluded.
feature_columns = cases.columns[3:]
data = cases[feature_columns]

# Quick visual check of the encoded labels and the first feature rows.
display(labels, data.head())

In [None]:
# Walk through a 10-fold stratified split and print the train/test sizes of
# each fold. The fold variables are rebound on every pass, so only the last
# fold's arrays remain bound once the loop finishes.
skf = StratifiedKFold(n_splits=10)
feature_matrix = data.values  # materialise the ndarray once, not per fold
for train_index, test_index in skf.split(data, labels):
    train_data = feature_matrix[train_index]
    test_data = feature_matrix[test_index]
    train_labels = labels[train_index]
    test_labels = labels[test_index]
    print(len(train_data), len(test_data))
    



In [None]:
# Hold out 25% of the cases for testing. `stratify=labels` makes the split
# preserve the class proportions of `labels` in both partitions.
# `random_state` is pinned so that Restart & Run All reproduces the same
# partition (and therefore the same downstream scores); without it the split
# is reshuffled differently on every run.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labels,
    stratify=labels,
    test_size=0.25,
    random_state=42,
)

In [None]:



#
# Multinomial Naive Bayes
#
# Grid-search the smoothing parameter `alpha` with 5-fold cross-validation on
# the training split, scoring each candidate by accuracy. After this section
# `mnb` is the fitted GridSearchCV object wrapping the Naive Bayes estimator.
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 0.5]}
mnb = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=params,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)
mnb.fit(train_data, train_labels)

# Report the winning alpha and its mean cross-validated accuracy.
print('\nMultinomial Naive Bayes Classifier')
print(' Best param:', mnb.best_params_)
print(' Accuracy:  ', np.round(mnb.best_score_, 4))


#
# Logistic Regression
#
# L2-penalised, one-vs-rest logistic regression fitted with the liblinear
# solver. The inverse regularisation strength `C` is chosen by a 5-fold
# accuracy grid search on the training split.
# NOTE(review): recent scikit-learn releases appear to deprecate the
# `multi_class` argument — confirm against the installed version.
lr = LogisticRegression(
    penalty='l2',
    multi_class='ovr',
    solver='liblinear',
    max_iter=150,
)
params = {'C': [0.01, 0.18, 0.2, 1, 10]}
logit = GridSearchCV(
    estimator=lr,
    param_grid=params,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)
logit.fit(train_data, train_labels)

# Report the winning C and its mean cross-validated accuracy.
print('Logistic Regression Classifier')
print(' Best param:', logit.best_params_)
print(' Accuracy:  ', np.round(logit.best_score_, 4))




