# Data analysis
## Running some simple models first

![Scikit-learn algorithm cheat-sheet for choosing an estimator](https://scikit-learn.org/stable/_static/ml_map.png)


In [1]:
#Setting up environment
import pandas as pd
import os
import numpy as np
from IPython.display import Math
 
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn import metrics, svm,tree,preprocessing
from sklearn.tree import DecisionTreeClassifier
from pprint import pprint
from sklearn.metrics import (auc, confusion_matrix, roc_curve, accuracy_score, precision_score)

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the processed feature table, using the first CSV column as the row index
processed_csv = "/Suzi fun files/QB course/QB_DS_FinalProject/data/processed/data_all_features.csv"
df_processed = pd.read_csv(processed_csv, index_col=[0])
df_processed.head()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [3]:
# Separate the label column from the feature matrix, then hold out 20% of the rows for testing
target = df_processed['diagnosis_category']
X = df_processed.drop(columns='diagnosis_category').values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Sanity check: (rows, columns) of the loaded table, label column included
df_processed.shape

(569, 31)

## First algorithm: decision tree
- suitable for data with collinearity
- can be visualised 
- our dataset is quite small so computational complexity not a big problem 

I tune my grid search on ROC AUC (`scoring='roc_auc'`) and additionally report recall on the test set, as I want high recall in tumour diagnosis. 

In [None]:
# Tune Decision Tree
# Exhaustive grid over depth, leaf size and split size, ranked by cross-validated ROC AUC
random_state = 42
depth = np.arange(3, 21)
num_leaves = [5, 10, 15, 20, 30, 40, 50]
num_samples_split = np.arange(10, 20)
params_DT = dict(
    max_depth=depth,
    min_samples_leaf=num_leaves,
    min_samples_split=num_samples_split,
)
classifier_DT = DecisionTreeClassifier(random_state=random_state)
grid_DT = GridSearchCV(estimator=classifier_DT, param_grid=params_DT, scoring='roc_auc')
grid_DT.fit(X_train, y_train)

# Hard labels and class probabilities on the held-out split
prediction_DT = grid_DT.predict(X_test)
probability_DT = grid_DT.predict_proba(X_test)

In [None]:
# Tabulate the cross-validation results of every parameter combination and preview the first rows
grid_DT_df = pd.DataFrame(data=grid_DT.cv_results_)
grid_DT_df.head()

In [None]:
# Fraction of test samples the tuned tree labels correctly
accuracy_DT = accuracy_score(y_test, prediction_DT)
print("Accuracy:", accuracy_DT)

In [None]:
# Draw the best tree found by the grid search.
model_DT = grid_DT.best_estimator_
# Select the feature columns by name rather than assuming the label is the
# last column, and pass a plain list: some scikit-learn releases reject a
# pandas Index for feature_names.
feature_names_DT = list(df_processed.columns.drop('diagnosis_category'))
plt.figure(figsize=(20, 10))
tree.plot_tree(model_DT, feature_names=feature_names_DT, impurity=False, proportion=True)
plt.show()

In [None]:
# Confusion matrix of the tuned tree on the test split (rows: true class, columns: predicted class)
cm_DT = confusion_matrix(y_test, prediction_DT)
print(cm_DT)

In [None]:
# scikit-learn flattens a binary confusion matrix as (tn, fp, fn, tp).
# The previous unpacking swapped fp and fn, so the value printed as "recall"
# was really precision.
tree_tn, tree_fp, tree_fn, tree_tp = metrics.confusion_matrix(y_test, prediction_DT).ravel()
# Recall (sensitivity) = TP / (TP + FN): share of positive cases that were detected
recall_DT = tree_tp / (tree_tp + tree_fn)
print(recall_DT)

In [None]:
# Mean cross-validated score for each tree depth, averaged over the other
# hyper-parameters of the grid.
mean_score_by_depth = (
    grid_DT_df[['mean_test_score', 'param_max_depth']]
    .groupby('param_max_depth')
    .mean()
)
plt.plot(mean_score_by_depth, marker='o', linestyle='None', markersize=10.0)
plt.xlabel('Depth of tree')
# The decision-tree grid search is scored with ROC AUC, not recall,
# so the axis is labelled accordingly.
plt.ylabel('Average testing score (ROC AUC)')
plt.title('Average testing score according to the depth of the tree')

print(grid_DT.best_params_)

## That's my decision tree done.
## Let's try another non-parametric algorithm, support vector machines. 

For SVM, I need to standardise my data to zero mean and unit variance. 

Some info on SVM https://statinfer.com/204-6-8-svm-advantages-disadvantages-applications/

Application to this dataset https://towardsdatascience.com/breast-cancer-classification-using-support-vector-machine-svm-a510907d4878

Tuning inspired by https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0

In [None]:
# Standardise features to zero mean / unit variance.
# The scaler is fitted on the training split only and then applied to both
# splits: scaling the test split with its own statistics would put train and
# test in different feature spaces and leak test-set information.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Tune support vector machine
Cs = [0.01, 0.1, 1, 10, 100]
gammas = [0.001, 0.01, 0.1, 1]
# Only the linear kernel is searched. The poly / rbf grids used to be bare
# dict expressions that Python evaluated and threw away; they are kept under
# a name so the intent stays visible without changing the search.
# To search them as well, pass `[params_SVM] + params_SVM_nonlinear` to
# GridSearchCV -- but note that `coef_` (used for the SVM feature-importance
# plot) is only available when the selected kernel is linear.
params_SVM = {"kernel": ['linear'], "C": Cs}
params_SVM_nonlinear = [
    {"kernel": ['poly'], "gamma": gammas, "C": Cs},
    {"kernel": ['rbf'], "gamma": gammas, "C": Cs},
]
# probability=True is required for predict_proba on the fitted model
classifier_SVM = svm.SVC(probability=True)
grid_SVM = GridSearchCV(classifier_SVM, params_SVM, scoring='roc_auc')
grid_SVM.fit(X_train_scaled, y_train)

In [None]:
# Cross-validation results of the SVM grid search as a table; preview the first five rows
grid_SVM_df = pd.DataFrame(data=grid_SVM.cv_results_)
grid_SVM_df.head(5)

In [None]:
# Evaluate the tuned SVM on the scaled test split
prediction_SVM = grid_SVM.predict(X_test_scaled)
probability_SVM = grid_SVM.predict_proba(X_test_scaled)

print(grid_SVM.best_params_)
accuracy_SVM = accuracy_score(y_test, prediction_SVM)
print("Accuracy:", accuracy_SVM)
cm_SVM = confusion_matrix(y_test, prediction_SVM)
print(cm_SVM)

In [None]:
# scikit-learn flattens a binary confusion matrix as (tn, fp, fn, tp).
# The previous unpacking swapped fp and fn, so the value printed as "recall"
# was really precision.
svm_tn, svm_fp, svm_fn, svm_tp = metrics.confusion_matrix(y_test, prediction_SVM).ravel()
# Recall (sensitivity) = TP / (TP + FN)
recall_SVM = svm_tp / (svm_tp + svm_fn)
print(recall_SVM)

### As my SVM model uses a linear kernel, I can assess feature importance
taken from https://stackoverflow.com/questions/41592661/determining-the-most-contributing-features-for-svm-classifier-in-sklearn

In [None]:
def f_importances(coef, estimator, names, top=-1):
    """Draw a horizontal bar chart of feature scores, ranked from highest to lowest.

    Parameters
    ----------
    coef : sequence of numbers
        One score per feature.
    estimator : str
        Model label inserted into the plot title.
    names : sequence of str
        Feature names, paired positionally with ``coef``. If the two lengths
        differ, the surplus entries of the longer one are ignored.
    top : int, default -1
        How many of the highest-scoring features to draw. ``-1``, or any value
        larger than the number of features, draws all of them.

    Raises
    ------
    ValueError
        If ``coef`` / ``names`` yield no (score, name) pair to plot.
    """
    # Sort the (score, name) pairs ascending, then flip so the best comes first
    ranked = sorted(zip(coef, names))[::-1]
    if not ranked:
        raise ValueError("f_importances needs at least one (score, name) pair to plot")
    imp, names = zip(*ranked)

    # Clamp `top`: asking for more bars than there are features would otherwise
    # hand barh() positions and widths of different lengths.
    if top == -1 or top > len(names):
        top = len(names)
    plt.figure(figsize=(10, 8))
    plt.barh(range(top), imp[:top], align='center')
    plt.yticks(range(top), names[:top])
    plt.title("Feature importance in {}.".format(estimator))
    plt.show()

# Feature columns only. The label column is not a model input; including it
# gave one more name than there are coefficients, which only lined up because
# zip() silently dropped the trailing entry.
features_names = list(df_processed.columns.drop('diagnosis_category'))
model_SVM = grid_SVM.best_estimator_
# coef_ is only exposed by a linear-kernel SVC; the magnitude of each weight
# is used as that feature's importance.
f_importances(abs(model_SVM.coef_[0]), "SVM", features_names, top=-1)


## K-nearest neighbours, popular with this dataset
See https://www.geeksforgeeks.org/ml-kaggle-breast-cancer-wisconsin-diagnosis-using-knn/ or *Sarkar M, Leong TY. Application of K-nearest neighbors algorithm on breast cancer diagnosis problem. Proc AMIA Symp. 2000;759–763.*

In [None]:
# Tune KNN
# Grid over the neighbourhood size and the Minkowski power parameter p
neighbours = [5, 10, 15, 20]
power_parameter = [1, 2]
params_KNN = dict(n_neighbors=neighbours, p=power_parameter)
classifier_KNN = KNeighborsClassifier()
grid_KNN = GridSearchCV(estimator=classifier_KNN, param_grid=params_KNN, scoring='roc_auc')
grid_KNN.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the tuned KNN on the scaled test split
prediction_KNN = grid_KNN.predict(X_test_scaled)
probability_KNN = grid_KNN.predict_proba(X_test_scaled)

print(grid_KNN.best_params_)
accuracy_KNN = accuracy_score(y_test, prediction_KNN)
print("Accuracy:", accuracy_KNN)
cm_KNN = confusion_matrix(y_test, prediction_KNN)
print(cm_KNN)

In [None]:
# scikit-learn flattens a binary confusion matrix as (tn, fp, fn, tp).
# The previous unpacking swapped fp and fn, so the value printed as "recall"
# was really precision.
knn_tn, knn_fp, knn_fn, knn_tp = metrics.confusion_matrix(y_test, prediction_KNN).ravel()
# Recall (sensitivity) = TP / (TP + FN)
recall_KNN = knn_tp / (knn_tp + knn_fn)
print(recall_KNN)

## A simple ensemble model: combine my KNN and SVM
using the best parameters from my grid search

In [None]:
# Soft-voting ensemble built from the best KNN and the best SVM found by the grid searches
model_KNN = grid_KNN.best_estimator_
model_SVM = grid_SVM.best_estimator_
ensemble_members = [('knn', model_KNN), ('svm', model_SVM)]
classifier_SVC_KNN = VotingClassifier(estimators=ensemble_members, voting='soft')
classifier_SVC_KNN.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the KNN + SVM ensemble on the scaled test split
prediction_SVC_KNN = classifier_SVC_KNN.predict(X_test_scaled)
probability_SVC_KNN = classifier_SVC_KNN.predict_proba(X_test_scaled)

accuracy_SVC_KNN = accuracy_score(y_test, prediction_SVC_KNN)
print("Accuracy:", accuracy_SVC_KNN)
cm_SVC_KNN = confusion_matrix(y_test, prediction_SVC_KNN)
print(cm_SVC_KNN)

In [None]:
# scikit-learn flattens a binary confusion matrix as (tn, fp, fn, tp).
# The previous unpacking swapped fp and fn, so the value printed as "recall"
# was really precision.
svc_knn_tn, svc_knn_fp, svc_knn_fn, svc_knn_tp = metrics.confusion_matrix(y_test, prediction_SVC_KNN).ravel()
# Recall (sensitivity) = TP / (TP + FN)
recall_SVC_KNN = svc_knn_tp / (svc_knn_tp + svc_knn_fn)
print(recall_SVC_KNN)

## Bagging: Random Forest

In [None]:
# Tune RF
# Randomised search: 300 parameter combinations are sampled from the grid
# below and ranked by 5-fold cross-validated ROC AUC.
n_estimators = [int(x) for x in np.linspace(start=50, stop=150, num=11)]
# 'auto' is omitted: for classifiers it was an alias of 'sqrt', and newer
# scikit-learn releases reject it. With warnings silenced, those candidates
# would fail quietly and be scored as NaN.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(5, 55, num=11)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [10, 20, 30, 40]
bootstrap = [True, False]
params_RF = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
# Seed the forest itself as well as the search, so a re-run reproduces the same model
classifier_RF = RandomForestClassifier(random_state=random_state)
grid_RF = RandomizedSearchCV(
    classifier_RF,
    params_RF,
    n_iter=300,
    scoring='roc_auc',
    random_state=random_state,
    n_jobs=-1,
    cv=5,
    verbose=2,
)
grid_RF.fit(X_train, y_train)

In [None]:
# Evaluate the tuned random forest on the test split
prediction_RF = grid_RF.predict(X_test)
probability_RF = grid_RF.predict_proba(X_test)

print(grid_RF.best_params_)
accuracy_RF = accuracy_score(y_test, prediction_RF)
print("Accuracy:", accuracy_RF)
cm_RF = confusion_matrix(y_test, prediction_RF)
print(cm_RF)

In [None]:
# scikit-learn flattens a binary confusion matrix as (tn, fp, fn, tp).
# The previous unpacking swapped fp and fn, so the value printed as "recall"
# was really precision.
rf_tn, rf_fp, rf_fn, rf_tp = metrics.confusion_matrix(y_test, prediction_RF).ravel()
# Recall (sensitivity) = TP / (TP + FN)
recall_RF = rf_tp / (rf_tp + rf_fn)
print(recall_RF)

### Random Forest feature importance
https://blog.datadive.net/selecting-good-features-part-iii-random-forests/

In [None]:
# List (score, feature) pairs of the tuned forest, highest score first
print ("Features sorted by their score(decrease of impurity):")
rounded_importances = [round(score, 4) for score in grid_RF.best_estimator_.feature_importances_]
scored_features = sorted(zip(rounded_importances, features_names), reverse=True)
print (scored_features)

In [None]:
# Bar chart of the tuned forest's feature importances
importances_RF = grid_RF.best_estimator_.feature_importances_
f_importances(abs(importances_RF), "Random Forest", features_names, top=-1)

## Boosting: AdaBoost
mostly used on Decision trees https://machinelearningmastery.com/boosting-and-adaboost-for-machine-learning/

In [None]:
# AdaBoost with its default base learner and 40 boosting rounds.
# The explicit `base_estimator=None` was dropped: it only restated the
# default, and the keyword was renamed to `estimator` in newer scikit-learn
# releases, where passing the old name fails.
classifier_AB = AdaBoostClassifier(n_estimators=40, random_state=random_state)
classifier_AB.fit(X_train, y_train)

In [None]:
# Evaluate AdaBoost on the test split
prediction_AB = classifier_AB.predict(X_test)
probability_AB = classifier_AB.predict_proba(X_test)

accuracy_AB = accuracy_score(y_test, prediction_AB)
print("Accuracy:", accuracy_AB)
cm_AB = confusion_matrix(y_test, prediction_AB)
print(cm_AB)

In [None]:
# scikit-learn flattens a binary confusion matrix as (tn, fp, fn, tp).
# The previous unpacking swapped fp and fn, so the value printed as "recall"
# was really precision.
ab_tn, ab_fp, ab_fn, ab_tp = metrics.confusion_matrix(y_test, prediction_AB).ravel()
# Recall (sensitivity) = TP / (TP + FN)
recall_AB = ab_tp / (ab_tp + ab_fn)
print(recall_AB)

## Gradient boosting
tuning: https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

In [None]:
# Tuning with same parameters as my random forest
# (n_estimators, max_features, max_depth, min_samples_split and
# min_samples_leaf are reused from the random-forest cell), plus a learning rate.
learning_rate = [0.1, 0.2, 0.3]
# Drop the 'auto' alias if the shared list still contains it: newer
# scikit-learn releases reject it, and with warnings silenced those
# candidates would fail quietly and be scored as NaN.
max_features_GB = [option for option in max_features if option != 'auto']
params_GB = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'max_features': max_features_GB,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
# Seed the booster itself as well as the search, so a re-run reproduces the same model
classifier_GB = GradientBoostingClassifier(random_state=random_state)
grid_GB = RandomizedSearchCV(
    classifier_GB,
    params_GB,
    n_iter=300,
    scoring='roc_auc',
    random_state=random_state,
    n_jobs=-1,
    cv=5,
    verbose=2,
)
grid_GB.fit(X_train, y_train)

In [None]:
# Evaluate the tuned gradient-boosting model on the test split
prediction_GB = grid_GB.predict(X_test)
probability_GB = grid_GB.predict_proba(X_test)

print(grid_GB.best_params_)
accuracy_GB = accuracy_score(y_test, prediction_GB)
print("Accuracy:", accuracy_GB)
cm_GB = confusion_matrix(y_test, prediction_GB)
print(cm_GB)

In [None]:
# scikit-learn flattens a binary confusion matrix as (tn, fp, fn, tp).
# The previous unpacking swapped fp and fn, so the value printed as "recall"
# was really precision.
gb_tn, gb_fp, gb_fn, gb_tp = metrics.confusion_matrix(y_test, prediction_GB).ravel()
# Recall (sensitivity) = TP / (TP + FN)
recall_GB = gb_tp / (gb_tp + gb_fn)
print(recall_GB)

In [None]:
# Bar chart of the tuned gradient-boosting model's feature importances
importances_GB = grid_GB.best_estimator_.feature_importances_
f_importances(abs(importances_GB), "Gradient Boosting", features_names, top=-1)

## Compare all models

In [None]:
# ROC points of every model, computed from its predicted probability for class 1
false_pos_rate_DT, true_pos_rate_DT, thresholds_DT = roc_curve(y_test, probability_DT[:, 1])
false_pos_rate_SVM, true_pos_rate_SVM, thresholds_SVM = roc_curve(y_test, probability_SVM[:, 1])
false_pos_rate_KNN, true_pos_rate_KNN, thresholds_KNN = roc_curve(y_test, probability_KNN[:, 1])
false_pos_rate_SVC_KNN, true_pos_rate_SVC_KNN, thresholds_SVC_KNN = roc_curve(y_test, probability_SVC_KNN[:, 1])
false_pos_rate_RF, true_pos_rate_RF, thresholds_RF = roc_curve(y_test, probability_RF[:, 1])
false_pos_rate_AB, true_pos_rate_AB, thresholds_AB = roc_curve(y_test, probability_AB[:, 1])
false_pos_rate_GB, true_pos_rate_GB, thresholds_GB = roc_curve(y_test, probability_GB[:, 1])

# Plot the ROC curve
roc_curves = [
    ('DT', false_pos_rate_DT, true_pos_rate_DT),
    ('SVM', false_pos_rate_SVM, true_pos_rate_SVM),
    ('KNN', false_pos_rate_KNN, true_pos_rate_KNN),
    ('SVC/KNN', false_pos_rate_SVC_KNN, true_pos_rate_SVC_KNN),
    ('RF', false_pos_rate_RF, true_pos_rate_RF),
    ('AB', false_pos_rate_AB, true_pos_rate_AB),
    ('GB', false_pos_rate_GB, true_pos_rate_GB),
]
fig = plt.figure(figsize=(12, 12))
# Dashed diagonal: the curve of a classifier that guesses at random
plt.plot([0, 1], [0, 1], 'k--')
for curve_label, curve_fpr, curve_tpr in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC')
plt.legend(loc='best')
plt.show()

In [None]:
# Decision thresholds at which the ROC points of the tree, SVM and KNN were evaluated
for roc_thresholds in (thresholds_DT, thresholds_SVM, thresholds_KNN):
    print(roc_thresholds)

In [None]:
# Summary
# Test-set accuracy and recall of every model, in percent.
print("Accuracy DT:",metrics.accuracy_score(y_test, prediction_DT)*100,"Recall DT:",recall_DT*100)
print("Accuracy SVM:",metrics.accuracy_score(y_test, prediction_SVM)*100,"Recall SVM:",recall_SVM*100)
print("Accuracy KNN:",metrics.accuracy_score(y_test, prediction_KNN)*100,"Recall KNN:",recall_KNN*100)
print("Accuracy SVC/KNN:",metrics.accuracy_score(y_test, prediction_SVC_KNN)*100,"Recall SVC/KNN:",recall_SVC_KNN*100)
print("Accuracy RF:",metrics.accuracy_score(y_test, prediction_RF)*100,"Recall RF:",recall_RF*100)
print("Accuracy AB:",metrics.accuracy_score(y_test, prediction_AB)*100,"Recall AB:",recall_AB*100)
# Gradient boosting was trained and evaluated above but missing from the summary
print("Accuracy GB:",metrics.accuracy_score(y_test, prediction_GB)*100,"Recall GB:",recall_GB*100)

## Save my test and train data for use in different notebooks

In [None]:
# Inspect the container type of the test features before saving them
type(X_test)

In [None]:
# Inspect the container type of the training labels before saving them
type(y_train)

In [None]:
# Show the training labels and their container type
print(y_train)
type(y_train)

In [None]:
#y_train.to_csv("/Suzi fun files/QB course/QB_DS_FinalProject/data/interim/y_train.csv")
#y_test.to_csv("/Suzi fun files/QB course/QB_DS_FinalProject/data/interim/y_test.csv")

In [None]:
#pd.DataFrame(X_test).to_csv("/Suzi fun files/QB course/QB_DS_FinalProject/data/interim/X_test.csv")
#pd.DataFrame(X_train).to_csv("/Suzi fun files/QB course/QB_DS_FinalProject/data/interim/X_train.csv")