# Breast Cancer Dataset
# This is an example script that uses the Breast Cancer Wisconsin dataset (https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)). To search for the dataset, just type in the keywords: breast cancer.

###############################################################
## what we apply in this project:
#
### K-Nearest Neighbors
## Logistic Regression
## Decision Tree
## Random Forests
## Neural Network
## Support Vector Machine

##Teacher Assistant: Shibo Yao
##  Please be sure to run conda installation for python-graphviz. 
## conda install python-graphviz


In [1]:
conda install python-graphiz

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.

PackagesNotFoundError: The following packages are not available from current channels:

  - python-graphiz

Current channels:

  - https://repo.anaconda.com/pkgs/main/osx-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/osx-64
  - https://repo.anaconda.com/pkgs/r/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.



Note: you may need to restart the kernel to use updated packages.


In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier     #KNN
from sklearn.linear_model import LogisticRegression    #Logistic Regression
from sklearn.tree import DecisionTreeClassifier        #Decision Tree
from sklearn.ensemble import RandomForestClassifier    #Random Forest
from sklearn.neural_network import MLPClassifier       #Neural Network
from sklearn.svm import SVC                            #SVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
import matplotlib.pylab as plt
import numpy as np
import graphviz

%matplotlib inline

In [None]:
# Load the breast cancer dataset through scikit-learn's loader and print
# its built-in description as a first piece of exploratory analysis.
cancer = load_breast_cancer()
print(cancer.DESCR)

In [None]:
# Show the names of the input features.
print(cancer.feature_names)

In [None]:
# Show the names of the target classes.
print(cancer.target_names)

In [None]:
# Echo the raw feature matrix as the cell result.
cancer.data

In [None]:
# Inspect the container type and the dimensions of the feature matrix.
# A notebook cell only echoes its final expression, so the type is printed
# explicitly; the shape is still shown as the cell's result.
print(type(cancer.data))
cancer.data.shape

In [None]:
# ---------- KNN classifier
# Hold out a stratified test split, then record train/test accuracy for each
# neighbourhood size so the two curves can be compared on one plot.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=66,
)

# Candidate values of k: range(1, 15) covers k = 1 .. 14.
neighbors_setting = range(1, 15)

training_accuracy = []
test_accuracy = []

for n_neighbors in neighbors_setting:
    # fit() returns the estimator, so construction and training are chained.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    training_accuracy.append(knn.score(X_train, y_train))
    test_accuracy.append(knn.score(X_test, y_test))

plt.plot(neighbors_setting, training_accuracy, label='Accuracy of the training set')
plt.plot(neighbors_setting, test_accuracy, label='Accuracy of the test set')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()

# According to the author's reading of the plot, the best result occurs at
# n_neighbors = 6.

In [None]:
# Report the k = 6 scores. Index 5 is k = 6 because neighbors_setting starts
# at 1. The format spec must be '.3f' (three decimals); '3f' only sets a
# minimum field width and prints six decimals.
print("Accuracy of the training set for 6NN: {:.3f}".format(training_accuracy[5]))
print("Accuracy of the test set for 6NN: {:.3f}".format(test_accuracy[5]))

In [None]:
# ---------------- Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# The features are not scaled at this point, and the default iteration budget
# (max_iter=100) is typically too small for the solver to converge on such
# data. Allow more iterations so the reported scores come from a converged fit.
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)

print('Accuracy on the training set: {:.3f}'.format(log_reg.score(X_train,y_train)))
print('Accuracy on the testing set: {:.3f}'.format(log_reg.score(X_test,y_test)))

# It seems to do better than KNN.
# NOTE(review): this split uses random_state=42 while the KNN cell used 66,
# so the comparison is only indicative.

In [None]:
# Print the coefficients of the fitted logistic regression model.
print('Coefficients: \n', log_reg.coef_)

In [None]:
# Class probabilities and hard predictions for every test instance.
probs = log_reg.predict_proba(X_test)
predicted_labels = log_reg.predict(X_test)

# Show the predicted class of the first test instance together with the
# probability the model assigned to that class.
first_label = predicted_labels[0]
print('Class "', cancer.target_names[first_label], '":', probs[0][first_label])

In [None]:
# ----------------- Decision Tree
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=42,
)

# Depth limits to try: range(1, 15) covers max_depth = 1 .. 14.
max_dep = range(1, 15)

training_accuracy = []
test_accuracy = []

for md in max_dep:
    # criterion may be "gini" or "entropy"
    my_decision_tree = DecisionTreeClassifier(criterion="gini", max_depth=md, random_state=0)
    my_decision_tree.fit(X_train, y_train)
    training_accuracy.append(my_decision_tree.score(X_train, y_train))
    test_accuracy.append(my_decision_tree.score(X_test, y_test))

# After the loop, my_decision_tree holds the last model fitted (max_depth=14).

plt.plot(max_dep, training_accuracy, label='Accuracy of the training set')
plt.plot(max_dep, test_accuracy, label='Accuracy of the test set')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()

# With a larger max_depth (> 5) the model overfits the training data: per the
# author's note, training accuracy goes up while test accuracy goes down.

# Other parameters that can be tuned (names as in the original note; check the
# scikit-learn docs for exact spelling):
# - min_samples_leaf, max_sample_leaf
# - max_leaf_node

# Read the best max_depth off the plot (the original note left it blank).


In [None]:
import os
import subprocess
#import PIL.Image
from IPython.display import Image
# Before drawing this tree, the Graphviz `dot` program must be installed and
# reachable on PATH.
tree.export_graphviz(my_decision_tree, out_file='cancerTree.dot', class_names=['malignant','benign'], feature_names=cancer.feature_names, impurity=False, filled=True)
# Run `dot` with an argument list (no shell) and check=True so that a missing
# or failing Graphviz raises here, instead of silently leaving a stale or
# absent cancerTree.png for the next line to pick up.
subprocess.run(["dot", "-Tpng", "cancerTree.dot", "-o", "cancerTree.png"], check=True)
Image(filename="cancerTree.png")



In [None]:
import os
from sklearn import tree
from IPython.display import Image
#export_graphviz(tree, out_file='cancerTree.dot', class_names=['malignant','benign'], feature_names=cancer.feature_names, impurity=False, filled=True)

# A function that gives a visual representation of the decision tree

def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    """Render a fitted decision tree to a PNG file and return it for display.

    Writes ``<name>.dot`` with ``tree.export_graphviz`` and then runs the
    Graphviz ``dot`` program to produce ``<name>.png``.

    Args:
        decision_tree: Fitted tree estimator passed to ``export_graphviz``.
        feature_names: Feature labels forwarded to ``export_graphviz``.
        name: Base file name (without extension) for the .dot and .png files.

    Returns:
        An ``IPython.display.Image`` loaded from the generated PNG.

    Raises:
        FileNotFoundError: If the ``dot`` executable cannot be found.
        subprocess.CalledProcessError: If ``dot`` exits with a non-zero status.
    """
    import subprocess

    dot_path = name + '.dot'
    png_path = name + '.png'

    # Export the decision tree to Graphviz format. The export goes to the
    # file, so the call's return value is not kept.
    tree.export_graphviz(decision_tree, out_file=dot_path, feature_names=feature_names)

    # Call Graphviz with an argument list rather than a shell string: `name`
    # is never interpreted by a shell, and check=True surfaces a failed render
    # instead of returning a stale or missing image.
    subprocess.run(["dot", "-Tpng", dot_path, "-o", png_path], check=True)

    # Return the .png image so the notebook can display it.
    return Image(filename=png_path)

# Draw the most recently fitted decision tree. With the default base name the
# files written are temp.dot and temp.png.
Decision_Tree_Image(my_decision_tree, cancer.feature_names)


In [None]:
# Print the decision tree's feature importances, then echo the type of the
# object that holds them as the cell result.
importances = my_decision_tree.feature_importances_
print('Feature importances: {}'.format(importances))
type(importances)

In [None]:
# Feature importance of the decision tree as a horizontal bar chart,
# one bar per input feature, labelled with the feature names.
n_feature = cancer.data.shape[1]
bar_positions = np.arange(n_feature)
plt.barh(bar_positions, my_decision_tree.feature_importances_, align='center')
plt.yticks(bar_positions, cancer.feature_names)
plt.ylabel('Feature')
plt.xlabel('Feature Importance')
plt.show()

In [None]:
# Decision Trees perform well and we don't need to standardize features
# But as you can see, they can easily overfit

In [None]:
# ---------------- Random Forests
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

# Parameters you can tune include:
# - n_jobs (how many cores; n_jobs=-1 => all cores)
# - max_depth
# - max_features
forest = RandomForestClassifier(criterion="gini", n_estimators=100, random_state=0)
forest.fit(X_train, y_train)

train_acc = forest.score(X_train, y_train)
test_acc = forest.score(X_test, y_test)
print('acc for training data: {:.3f}'.format(train_acc))
print('acc for test data: {:.3f}'.format(test_acc))


In [None]:
# Feature importance of the random forest as a horizontal bar chart,
# one bar per input feature, labelled with the feature names.
n_feature = cancer.data.shape[1]
bar_positions = np.arange(n_feature)
plt.barh(bar_positions, forest.feature_importances_, align='center')
plt.yticks(bar_positions, cancer.feature_names)
plt.ylabel('Feature')
plt.xlabel('Feature Importance')
plt.show()

In [None]:
# Random Forests perform well and we don't need to standardize features
# Better than a single decision tree because of randomization
# It may not work well with sparse data

In [None]:
# ------------- Neural Network
# Baseline: a default MLP trained on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)

train_acc = mlp.score(X_train, y_train)
test_acc = mlp.score(X_test, y_test)
print('acc for training data: {:.3f}'.format(train_acc))
print('acc for test data: {:.3f}'.format(test_acc))

In [None]:
# Per-feature maximum over the whole dataset (max taken along axis 0).
print('The max per each feature:\n{}'.format(cancer.data.max(axis=0)))

In [None]:
# Let's improve on the neural network.

# 1- Scale X: fit the scaler once on the training split only, then apply that
#    same fitted transform to both splits. (Refitting before transforming the
#    test data was redundant work on identical input.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

# First printout is the scaled training split, second is the scaled test split.
print('The max per each feature:\n{}'.format(X_train_scaled.max(axis=0)))
print('The max per each feature:\n{}'.format(X_test_scaled.max(axis=0)))

## In scikit-learn, a neural network is termed "multilayer perceptron (MLP)"

In [None]:
# Retrain the MLP on the standardized features with a larger iteration budget.
mlp = MLPClassifier(random_state=42, max_iter=1000)
mlp.fit(X_train_scaled, y_train)

train_acc = mlp.score(X_train_scaled, y_train)
test_acc = mlp.score(X_test_scaled, y_test)
print('acc for training data: {:.3f}'.format(train_acc))
print('acc for test data: {:.3f}'.format(test_acc))

In [None]:
# Echo the estimator so the notebook shows its configuration.
mlp

In [None]:
# 2- Change alpha: same setup as before, but with alpha raised to 1.
mlp = MLPClassifier(alpha=1, max_iter=1000, random_state=42)
mlp.fit(X_train_scaled, y_train)

train_acc = mlp.score(X_train_scaled, y_train)
test_acc = mlp.score(X_test_scaled, y_test)
print('acc for training data: {:.3f}'.format(train_acc))
print('acc for test data: {:.3f}'.format(test_acc))

In [None]:
#we can play around with other hyper parameter to improve the performance

In [None]:
# Heat map of the first weight matrix of the trained MLP, with one labelled
# row per input feature.
n_inputs = cancer.data.shape[1]  # derived from the data instead of a hard-coded 30
plt.figure(figsize=(20,5))
plt.imshow(mlp.coefs_[0],interpolation='None',cmap='GnBu')
# Number of columns in the weight matrix.
print(len(mlp.coefs_[0][0]))
plt.yticks(range(n_inputs),cancer.feature_names)
plt.xlabel('Colums in weight matrix')
plt.ylabel('Input feature')
plt.colorbar()

In [None]:
# --------- SVM (Support Vector Machine)
# Baseline: default SVC on the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=98,
)

svm = SVC()
svm.fit(X_train, y_train)

train_acc = svm.score(X_train, y_train)
test_acc = svm.score(X_test, y_test)
print('acc on train dataset: {:.3f}'.format(train_acc))
print('acc on test dataset: {:.3f}'.format(test_acc))

In [None]:
# The author reads the previous scores as overfitting to the training data.
# Hyperparameters can be changed to improve the model; one option is to
# rescale the inputs, so look at the per-feature extremes first.
feature_min = X_train.min(axis=0)
feature_max = X_train.max(axis=0)
print(feature_min)
print(feature_max)

In [None]:
# We can see that there are huge differences between min and max and between different features. Some features are
# on different scales.

In [None]:
# Plot the per-feature minimum and maximum of the training data on a
# log-scaled y axis to make the differences in magnitude visible.
feature_min = X_train.min(axis=0)
feature_max = X_train.max(axis=0)
plt.plot(feature_min, 'o', label='Min')
plt.plot(feature_max, 'v', label='Max')
plt.yscale('log')
plt.xlabel('Feature Index')
plt.ylabel('Feature Magnitude in Log Scale')
plt.legend(loc='upper right')

In [None]:
# Manual min-max scaling, using statistics from the training split only.
min_train = X_train.min(axis=0)
max_train = X_train.max(axis=0)

# Per-feature range computed two ways; the two printouts can be compared.
train_offset = X_train - min_train
range_train = train_offset.max(axis=0)
print(range_train)

range_train_new = max_train - min_train
print(range_train_new)

# The test split is shifted and divided by the training minimum and range.
X_train_scaled = train_offset / range_train
X_test_scaled = (X_test - min_train) / range_train

print(X_train_scaled.shape)

print(min_train.shape)





In [None]:
# Linear-kernel SVC on the min-max scaled features.
svm = SVC(kernel='linear')
svm.fit(X_train_scaled, y_train)
train_acc = svm.score(X_train_scaled, y_train)
test_acc = svm.score(X_test_scaled, y_test)
print('acc on train dataset: {:.3f}'.format(train_acc))
print('acc on test dataset: {:.3f}'.format(test_acc))

In [None]:
# Polynomial-kernel SVC on the min-max scaled features.
svm = SVC(kernel='poly')
svm.fit(X_train_scaled, y_train)
train_acc = svm.score(X_train_scaled, y_train)
test_acc = svm.score(X_test_scaled, y_test)
print('acc on train dataset: {:.3f}'.format(train_acc))
print('acc on test dataset: {:.3f}'.format(test_acc))

In [None]:
# RBF-kernel SVC on the min-max scaled features.
svm = SVC(kernel='rbf')
svm.fit(X_train_scaled, y_train)
train_acc = svm.score(X_train_scaled, y_train)
test_acc = svm.score(X_test_scaled, y_test)
print('acc on train dataset: {:.3f}'.format(train_acc))
print('acc on test dataset: {:.3f}'.format(test_acc))

In [None]:
# we did much better now, but now we are underfitting
# to fix it we need change hyper parameters

In [None]:
# Linear-kernel SVC with the penalty parameter C raised to 1000.
svm = SVC(C=1000, kernel='linear')
svm.fit(X_train_scaled, y_train)
train_acc = svm.score(X_train_scaled, y_train)
test_acc = svm.score(X_test_scaled, y_test)
print('acc on train dataset: {:.3f}'.format(train_acc))
print('acc on test dataset: {:.3f}'.format(test_acc))

In [None]:
# Polynomial-kernel SVC with the penalty parameter C raised to 1000.
svm = SVC(C=1000, kernel='poly')
svm.fit(X_train_scaled, y_train)
train_acc = svm.score(X_train_scaled, y_train)
test_acc = svm.score(X_test_scaled, y_test)
print('acc on train dataset: {:.3f}'.format(train_acc))
print('acc on test dataset: {:.3f}'.format(test_acc))

In [None]:
# RBF-kernel SVC with the penalty parameter C raised to 100000.
svm = SVC(C=100000, kernel='rbf')
svm.fit(X_train_scaled, y_train)
train_acc = svm.score(X_train_scaled, y_train)
test_acc = svm.score(X_test_scaled, y_test)
print('acc on train dataset: {:.3f}'.format(train_acc))
print('acc on test dataset: {:.3f}'.format(test_acc))

## A grid search is needed to find the best hyper-parameters!

In [None]:
# Manual sweep: for C = 2**0 .. 2**19 and each kernel, fit an SVC on the
# min-max scaled training data and print train and test accuracy.
# NOTE(review): every setting is scored on the test split; choosing the best
# row from this output would tune on the test set. Consider a validation
# split or cross-validation for the actual selection.
for penalty in range(20):
    penalty_value = 2 ** penalty
    for kernel_function in ['linear', 'rbf', 'poly']:
        svm = SVC(C=penalty_value, kernel=kernel_function)
        svm.fit(X_train_scaled, y_train)
        train_acc = svm.score(X_train_scaled, y_train)
        test_acc = svm.score(X_test_scaled, y_test)
        print('penalty:', penalty_value, "kernel function", kernel_function, 'acc on train dataset: {:.3f}'.format(train_acc))
        print('acc on test dataset: {:.3f}'.format(test_acc))