In [22]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [23]:
# Load the breast-cancer recurrence dataset directly from the public datasets repository.
ds = pd.read_csv('https://raw.githubusercontent.com/datasets/breast-cancer/master/data/breast-cancer.csv')
# Discard every row that contains at least one missing value.
ds = ds.dropna()
# Preview the first two rows. DataFrame has no `t` method (and `.T` is not callable),
# so the previous `ds.t(2)` raised AttributeError; `head(2)` matches the captured output.
ds.head(2)

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,premefalse,15-19,0-2,True,3,right,left_up,False,recurrence-events
1,50-59,ge40,15-19,0-2,False,1,right,central,False,false-recurrence-events


In [24]:
# Categorical columns to expand into indicator (dummy) columns. The spellings
# match the headers shown in the frame preview, so they are kept verbatim.
features_to_encode = [
    'age',
    'mefalsepause',
    'tumor-size',
    'inv-falsedes',
    'breast',
    'breast-quad',
    'falsede-caps',
    'irradiat',
]

# One-hot encode the listed columns; columns not listed (e.g. 'deg-malig',
# 'class') are carried over as they are.
ds = pd.get_dummies(data=ds, columns=features_to_encode)
ds.head(n=2)

Unnamed: 0,deg-malig,class,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,mefalsepause_ge40,mefalsepause_lt40,...,breast_right,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,falsede-caps_False,falsede-caps_True,irradiat_False,irradiat_True
0,3,recurrence-events,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,1,1,0
1,1,false-recurrence-events,0,0,0,1,0,0,1,0,...,1,1,0,0,0,0,1,0,1,0


In [25]:
from sklearn.preprocessing import LabelEncoder

# Turn the string class labels into integer codes; `le` is kept around so the
# codes can be mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(ds['class'].values)

# Remove the target from the frame in place so only feature columns remain.
# NOTE: because of the in-place drop, re-running this cell on its own fails
# (the 'class' column is already gone) -- re-run from the loading cell instead.
ds.drop(columns='class', inplace=True)
print(ds.columns.tolist())

# Feature matrix as a plain NumPy array, one column per remaining frame column.
X = ds.values

['deg-malig', 'age_20-29', 'age_30-39', 'age_40-49', 'age_50-59', 'age_60-69', 'age_70-79', 'mefalsepause_ge40', 'mefalsepause_lt40', 'mefalsepause_premefalse', 'tumor-size_0-4', 'tumor-size_10-14', 'tumor-size_15-19', 'tumor-size_20-24', 'tumor-size_25-29', 'tumor-size_30-34', 'tumor-size_35-39', 'tumor-size_40-44', 'tumor-size_45-49', 'tumor-size_5-9', 'tumor-size_50-54', 'inv-falsedes_0-2', 'inv-falsedes_12-14', 'inv-falsedes_15-17', 'inv-falsedes_24-26', 'inv-falsedes_3-5', 'inv-falsedes_6-8', 'inv-falsedes_9-11', 'breast_left', 'breast_right', 'breast-quad_central', 'breast-quad_left_low', 'breast-quad_left_up', 'breast-quad_right_low', 'breast-quad_right_up', 'falsede-caps_False', 'falsede-caps_True', 'irradiat_False', 'irradiat_True']


In [5]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Single source of truth for every random_state in this notebook. It is defined
# before the split so that changing it here also changes the train/test partition
# (previously the split hard-coded its own `1`, independent of SEED).
SEED = 1

# Hold out 10% of the rows for evaluation; the partition is reproducible via SEED.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

In [6]:
# Instantiate the three baseline models; SEED pins the ones that accept a random_state.
lr = LogisticRegression(random_state=SEED)
knn = KNN()
rf = RandomForestClassifier(random_state=SEED)

# (display name, estimator) pairs, evaluated in this order.
classifiers = list(zip(('Logistic', 'K Nearest', 'Forest'), (lr, knn, rf)))

for clf_name, clf in classifiers:
    # Train on the training split, then predict the held-out labels
    # (scikit-learn's fit() returns the estimator, so the calls chain).
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # Per-class precision / recall / f1 on the test split, prefixed by the model name.
    report = classification_report(y_test, y_pred)
    print(clf_name + ':' + report)

Logistic:              precision    recall  f1-score   support

           0       0.81      0.89      0.85        19
           1       0.67      0.50      0.57         8

    accuracy                           0.78        27
   macro avg       0.74      0.70      0.71        27
weighted avg       0.77      0.78      0.77        27

K Nearest:              precision    recall  f1-score   support

           0       0.79      1.00      0.88        19
           1       1.00      0.38      0.55         8

    accuracy                           0.81        27
   macro avg       0.90      0.69      0.71        27
weighted avg       0.85      0.81      0.78        27

Forest:              precision    recall  f1-score   support

           0       0.86      1.00      0.93        19
           1       1.00      0.62      0.77         8

    accuracy                           0.89        27
   macro avg       0.93      0.81      0.85        27
weighted avg       0.90      0.89      0.88        27



In [7]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23;
# joblib is a standalone package that scikit-learn itself depends on, so import it
# directly and only fall back to the vendored copy on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Serialize the fitted random forest to disk; dump() returns the list of files written.
joblib.dump(rf, 'rfmodel.pkl')



['rfmodel.pkl']