# Breast Cancer Wisconsin (Diagnostic) - model_v2
# Yuming Yao (100904611)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

###### Part I: Importing Data

In [2]:
with open("breast-cancer-wisconsin.names") as f:
    print(f.read())

Citation Request:
   This breast cancer databases was obtained from the University of Wisconsin
   Hospitals, Madison from Dr. William H. Wolberg.  If you publish results
   when using this database, then please include this information in your
   acknowledgements.  Also, please cite one or more of:

   1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear 
      programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

   2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of 
      pattern separation for medical diagnosis applied to breast cytology", 
      Proceedings of the National Academy of Sciences, U.S.A., Volume 87, 
      December 1990, pp 9193-9196.

   3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern recognition 
      via linear programming: Theory and application to medical diagnosis", 
      in: "Large-scale numerical optimization", Thomas F. Coleman and Yuying
      Li, editors, SIAM Publications, Philadelphia 199

In [3]:
df = pd.read_csv("breast-cancer-wisconsin.data", sep=",", header = None)

In [4]:
headers = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 
           'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
           'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class'] 
df.columns = headers
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
df.shape

(699, 11)

In [6]:
df.dtypes

Sample code number              int64
Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object

In [7]:
df.replace('?', np.NaN, inplace = True)

In [8]:
df.isnull().sum()

Sample code number              0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [9]:
df = df.dropna()

In [10]:
df['Bare Nuclei'] = df['Bare Nuclei'].astype(str).astype(int)

In [11]:
df.dtypes

Sample code number             int64
Clump Thickness                int64
Uniformity of Cell Size        int64
Uniformity of Cell Shape       int64
Marginal Adhesion              int64
Single Epithelial Cell Size    int64
Bare Nuclei                    int32
Bland Chromatin                int64
Normal Nucleoli                int64
Mitoses                        int64
Class                          int64
dtype: object

In [12]:
df.Class.unique()

array([2, 4], dtype=int64)

In [13]:
df['Diagnosis'] = df['Class'].map({4:1,2:0})
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class,Diagnosis
0,1000025,5,1,1,1,2,1,3,1,1,2,0
1,1002945,5,4,4,5,7,10,3,2,1,2,0
2,1015425,3,1,1,1,2,2,3,1,1,2,0
3,1016277,6,8,8,1,3,4,3,7,1,2,0
4,1017023,4,1,1,3,2,1,3,1,1,2,0


In [14]:
df.drop(columns=['Sample code number', 'Class'], inplace=True)

In [15]:
df.shape

(683, 10)

In [16]:
df.describe()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Diagnosis
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,0.349927
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.477296
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0


###### Part II: Training

In [17]:
y = df["Diagnosis"]
y

0      0
1      0
2      0
3      0
4      0
      ..
694    0
695    0
696    1
697    1
698    1
Name: Diagnosis, Length: 683, dtype: int64

In [18]:
X = df.drop(["Diagnosis"], axis = 1)
X

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 

In [20]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [21]:
y_pred = model.predict(X_test)

###### Part III: Evaluation

In [22]:
print('Accuracy of Decision Tree Classifier on train set: {:.4f}'.format(model.score(X_train, y_train)))
print('Accuracy of Decision Tree Classifier on test set: {:.4f}'.format(model.score(X_test, y_test)))
print('Precision of Decision Tree Classifier on test set: {:.4f}'.format(precision_score(y_test, y_pred)))

Accuracy of Decision Tree Classifier on train set: 1.0000
Accuracy of Decision Tree Classifier on test set: 0.9317
Precision of Decision Tree Classifier on test set: 0.9306


In [23]:
confusion_matrix(y_test, y_pred)

array([[124,   5],
       [  9,  67]], dtype=int64)

In [24]:
print("Accuracy of the model is:", metrics.accuracy_score(y_test, y_pred))

Accuracy of the model is: 0.9317073170731708
