# Project Cancer Detection

# Breast Cancer Wisconsin (Diagnostic) Data Set

# Importing Dataset

In [47]:
import numpy as np
import pandas as pd

In [48]:
col = ['id', 'clump Thickness', 'Uniformity of cell size', 
       'uniformity of cell shape', 'marginal adhesion',
       'single epithelial cell size', 'bare nuclei',
       'bland chromatin', 'normal nucleoli', 'mitoses', 'class']

In [49]:
df = pd.read_csv("breast_cancer_wisconsin.data.csv", names=col, header=None)
df.head()

Unnamed: 0,id,clump Thickness,Uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Data Preprocessing

In [50]:
np.where(df.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   clump Thickness              699 non-null    int64 
 2   Uniformity of cell size      699 non-null    int64 
 3   uniformity of cell shape     699 non-null    int64 
 4   marginal adhesion            699 non-null    int64 
 5   single epithelial cell size  699 non-null    int64 
 6   bare nuclei                  699 non-null    object
 7   bland chromatin              699 non-null    int64 
 8   normal nucleoli              699 non-null    int64 
 9   mitoses                      699 non-null    int64 
 10  class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [52]:
df["bare nuclei"].describe()

count     699
unique     11
top         1
freq      402
Name: bare nuclei, dtype: object

In [53]:
df["bare nuclei"].value_counts()

bare nuclei
1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: count, dtype: int64

In [54]:
df[df["bare nuclei"] == "?"]

Unnamed: 0,id,clump Thickness,Uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses,class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [55]:
df["class"].value_counts()

class
2    458
4    241
Name: count, dtype: int64

In [56]:
df["bare nuclei"].replace("?", 0, inplace=True)

Note that for class: 2 is benign, 4 is for malignant

$$\frac{\text{df[Class]}}{2} - 1$$



In [57]:
df['class'] = df['class'] / 2 - 1

In [58]:
df["class"].value_counts()

class
0.0    458
1.0    241
Name: count, dtype: int64

Now 0 is for benign<br>
and 1 is for malignant

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           699 non-null    int64  
 1   clump Thickness              699 non-null    int64  
 2   Uniformity of cell size      699 non-null    int64  
 3   uniformity of cell shape     699 non-null    int64  
 4   marginal adhesion            699 non-null    int64  
 5   single epithelial cell size  699 non-null    int64  
 6   bare nuclei                  699 non-null    object 
 7   bland chromatin              699 non-null    int64  
 8   normal nucleoli              699 non-null    int64  
 9   mitoses                      699 non-null    int64  
 10  class                        699 non-null    float64
dtypes: float64(1), int64(9), object(1)
memory usage: 60.2+ KB


In [60]:
# Data

In [61]:
X = df.drop(["id", "class"], axis=1)
X_col = X.columns

In [62]:
y = df["class"]

In [63]:
from sklearn.preprocessing import StandardScaler

In [64]:
X = StandardScaler().fit_transform(X.values)

# Training & Testing KNN

In [65]:
from sklearn.model_selection import train_test_split

In [66]:
df1 = pd.DataFrame(X, columns=X_col)

In [67]:
df1.head()

Unnamed: 0,clump Thickness,Uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses
0,0.206936,-0.699995,-0.743299,-0.633247,-0.549561,-0.677144,-0.179662,-0.611825,-0.343912
1,0.206936,0.283845,0.266875,0.768621,1.710106,1.796673,-0.179662,-0.284112,-0.343912
2,-0.503866,-0.699995,-0.743299,-0.633247,-0.549561,-0.402275,-0.179662,-0.611825,-0.343912
3,0.562336,1.595632,1.613773,-0.633247,-0.097628,0.147462,-0.179662,1.354454,-0.343912
4,-0.148465,-0.699995,-0.743299,0.067687,-0.549561,-0.677144,-0.179662,-0.611825,-0.343912


In [68]:
X_train, X_test, y_train, y_test = train_test_split(df1, y,
                                                    train_size=0.8, 
                                                    random_state=42)

<h4> Normalisation

In [69]:
from sklearn.preprocessing import MinMaxScaler

pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id', 'class'], axis=1).values), columns=X_col).head()

Unnamed: 0,clump Thickness,Uniformity of cell size,uniformity of cell shape,marginal adhesion,single epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses
0,0.444444,0.0,0.0,0.0,0.111111,0.1,0.222222,0.0,0.0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0
2,0.222222,0.0,0.0,0.0,0.111111,0.2,0.222222,0.0,0.0
3,0.555556,0.777778,0.777778,0.0,0.222222,0.4,0.222222,0.666667,0.0
4,0.333333,0.0,0.0,0.222222,0.111111,0.1,0.222222,0.0,0.0


In [73]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [77]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))        

In [78]:
from sklearn.neighbors import KNeighborsClassifier

In [79]:
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric="minkowski")

In [80]:
knn.fit(X_train, y_train)

In [83]:
print_score(knn, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9678

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       363
         1.0       0.95      0.96      0.95       196

    accuracy                           0.97       559
   macro avg       0.96      0.97      0.96       559
weighted avg       0.97      0.97      0.97       559


Confusion Matrix: 
 [[353  10]
 [  8 188]]

Average Accuracy: 	 0.9624
Accuracy SD: 		 0.0148


In [82]:
print_score(knn, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9714

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.97      0.99      0.98        95
         1.0       0.98      0.93      0.95        45

    accuracy                           0.97       140
   macro avg       0.97      0.96      0.97       140
weighted avg       0.97      0.97      0.97       140


Confusion Matrix: 
 [[94  1]
 [ 3 42]]



# Grid Search

In [85]:
from sklearn.model_selection import GridSearchCV

In [86]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [87]:
params = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [96]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),
                              params, 
                              n_jobs=-1,
                              verbose=1,
                             return_train_score=True)

In [97]:
grid_search_cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [98]:
grid_search_cv.best_estimator_

In [99]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9696

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       363
         1.0       0.95      0.96      0.96       196

    accuracy                           0.97       559
   macro avg       0.97      0.97      0.97       559
weighted avg       0.97      0.97      0.97       559


Confusion Matrix: 
 [[354   9]
 [  8 188]]

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds

In [100]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9786

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.99      0.98        95
         1.0       0.98      0.96      0.97        45

    accuracy                           0.98       140
   macro avg       0.98      0.97      0.98       140
weighted avg       0.98      0.98      0.98       140


Confusion Matrix: 
 [[94  1]
 [ 2 43]]



In [101]:
grid_search_cv.best_params_

{'n_neighbors': 7}

In [102]:
grid_search_cv.cv_results_['mean_train_score']

array([1.        , 0.97629434, 0.97853348, 0.96735279, 0.96958793,
       0.96824465, 0.97048278, 0.96869307, 0.96824465, 0.96645494])

In [103]:
grid_search_cv.cv_results_

{'mean_fit_time': array([0.00570216, 0.00609617, 0.00452557, 0.00492921, 0.00454154,
        0.00253539, 0.01293206, 0.00280905, 0.00330563, 0.00181136]),
 'std_fit_time': array([0.00317758, 0.00483189, 0.00152618, 0.00142976, 0.00577922,
        0.00192855, 0.0084193 , 0.00232457, 0.00226601, 0.00162913]),
 'mean_score_time': array([0.01314921, 0.01699862, 0.01706176, 0.01037621, 0.01645765,
        0.02090645, 0.01986675, 0.01945415, 0.01756372, 0.01654835]),
 'std_score_time': array([0.0043043 , 0.00303272, 0.00247989, 0.00448476, 0.0094558 ,
        0.00313578, 0.00577794, 0.01183036, 0.00631299, 0.00148995]),
 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 1},
  {'n_neighbors': 2},
  {'n_neighbors': 3},
  {'n_neighbors': 4},
  {'n_neighbors': 5},
  {'n_neighbors': 6},


# SVM, Random Forest, XGBoost

In [104]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:

accuracy score: 0.9750

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.99      0.97      0.98       363
         1.0       0.95      0.98      0.96       196

    accuracy                           0.97       559
   macro avg       0.97      0.98      0.97       559
weighted avg       0.98      0.97      0.98       559


Confusion Matrix: 
 [[353  10]
 [  4 192]]

Average Accuracy: 	 0.9660
Accuracy SD: 		 0.0170
Test Result:

accuracy score: 0.9786

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      0.99      0.98        95
         1.0       0.98      0.96      0.97        45

    accuracy                           0.98       140
   macro avg       0.98      0.97      0.98       140
weighted avg       0.98      0.98      0.98       140


Confusion Matrix: 
 [[94  1]
 [ 2 43]]



In [105]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       363
         1.0       1.00      1.00      1.00       196

    accuracy                           1.00       559
   macro avg       1.00      1.00      1.00       559
weighted avg       1.00      1.00      1.00       559


Confusion Matrix: 
 [[363   0]
 [  0 196]]

Average Accuracy: 	 0.9588
Accuracy SD: 		 0.0254
Test Result:

accuracy score: 0.9643

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.97      0.98      0.97        95
         1.0       0.95      0.93      0.94        45

    accuracy                           0.96       140
   macro avg       0.96      0.96      0.96       140
weighted avg       0.96      0.96      0.96       140


Confusion Matrix: 
 [[93  2]
 [ 3 42]]



In [106]:
import xgboost as xgb
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:

accuracy score: 0.9982

Classification Report: 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       363
         1.0       1.00      0.99      1.00       196

    accuracy                           1.00       559
   macro avg       1.00      1.00      1.00       559
weighted avg       1.00      1.00      1.00       559


Confusion Matrix: 
 [[363   0]
 [  1 195]]

Average Accuracy: 	 0.9535
Accuracy SD: 		 0.0183
Test Result:

accuracy score: 0.9429

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.94      0.98      0.96        95
         1.0       0.95      0.87      0.91        45

    accuracy                           0.94       140
   macro avg       0.95      0.92      0.93       140
weighted avg       0.94      0.94      0.94       140


Confusion Matrix: 
 [[93  2]
 [ 6 39]]



***
***