# Comparing Naive Bayes and K-Nearest Neighbor Classification Methods of Breast Cancer in Coimbra, Portugal 2018 with Python
Data yang digunakan adalah data kanker payudara di area Coimbra, Portugal pada 2018 dengan jumlah 116 data dan 10 variabel. Variabel tersebut antara lain:
1.  Age (years) 
2. BMI (kg/m2) 
3. Glucose (mg/dL) 
4. Insulin (µU/mL) 
5. HOMA 
6. Leptin (ng/mL) 
7. Adiponectin (µg/mL) 
8. Resistin (ng/mL) 
9. MCP-1 (pg/dL) 
10. Classification (1= healthy controls and 2= patients)

In [187]:
#Importing all the needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [188]:
# Load the breast cancer dataset from the local CSV file and preview the first rows.
cancer = pd.read_csv("DataR2.csv")
cancer.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


In [144]:
# Summarise the DataFrame: column names, non-null counts, dtypes and memory usage.
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
Age               116 non-null int64
BMI               116 non-null float64
Glucose           116 non-null int64
Insulin           116 non-null float64
HOMA              116 non-null float64
Leptin            116 non-null float64
Adiponectin       116 non-null float64
Resistin          116 non-null float64
MCP.1             116 non-null float64
Classification    116 non-null int64
dtypes: float64(7), int64(3)
memory usage: 9.1 KB


In [189]:
# Independent variables (X): every column except the "Classification" label.
x = cancer.drop(["Classification"], axis = 1)
x.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92


In [190]:
# Dependent variable (y): the "Classification" label column.
y = cancer["Classification"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Classification, dtype: int64

In [303]:
from sklearn.model_selection import train_test_split

In [304]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123)

In [305]:
# Inspect the held-out test labels (the index is the row label from the original DataFrame).
y_test

4      1
89     2
53     2
100    2
28     1
105    2
59     2
65     2
90     2
33     1
50     1
8      1
5      1
0      1
13     1
44     1
82     2
101    2
70     2
60     2
62     2
42     1
85     2
95     2
Name: Classification, dtype: int64

In [341]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# Klasifikasi : Naive Bayes

In [365]:
from sklearn.naive_bayes import GaussianNB

In [366]:
# Create the Gaussian Naive Bayes classifier (not yet fitted).
modelnb = GaussianNB()

In [367]:
# Fit the Naive Bayes classifier on the training data. fit() returns the
# estimator itself, so nbtrain refers to the same fitted model as modelnb.
nbtrain = modelnb.fit(x_train, y_train)

In [368]:
# Number of training samples observed for each class.
nbtrain.class_count_

array([42., 50.])

In [369]:
# Predict the class labels for the test features with the fitted Naive Bayes model.
y_pred = nbtrain.predict(x_test)
y_pred

array([1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       2, 1], dtype=int64)

In [347]:
# Predicted class probabilities for each test sample (one row per sample, one column per class).
nbtrain.predict_proba(x_test)

array([[9.87788163e-01, 1.22118368e-02],
       [1.45443011e-12, 1.00000000e+00],
       [9.50310237e-01, 4.96897629e-02],
       [9.80202417e-01, 1.97975829e-02],
       [9.94829631e-01, 5.17036944e-03],
       [7.14666090e-03, 9.92853339e-01],
       [9.66509660e-01, 3.34903396e-02],
       [5.55875956e-01, 4.44124044e-01],
       [9.30422221e-69, 1.00000000e+00],
       [4.40813323e-08, 9.99999956e-01],
       [1.90471059e-17, 1.00000000e+00],
       [9.59193538e-01, 4.08064616e-02],
       [9.68870247e-01, 3.11297533e-02],
       [9.22726655e-01, 7.72733452e-02],
       [9.99217914e-01, 7.82085682e-04],
       [9.47188022e-01, 5.28119781e-02],
       [8.98562658e-18, 1.00000000e+00],
       [9.84267435e-01, 1.57325650e-02],
       [9.73658595e-01, 2.63414051e-02],
       [9.79376416e-01, 2.06235836e-02],
       [9.33941625e-01, 6.60583747e-02],
       [9.91441926e-01, 8.55807439e-03],
       [1.11533698e-02, 9.88846630e-01],
       [9.57619888e-01, 4.23801120e-02]])

In [348]:
from sklearn.metrics import confusion_matrix

In [349]:
# Confusion matrix of the Naive Bayes predictions: rows are the actual classes,
# columns are the predicted classes.
confusion_matrix(y_test, y_pred)

array([[8, 2],
       [9, 5]], dtype=int64)

In [350]:
# Tidy up the confusion matrix as a labelled cross-tabulation.
# The actual and predicted labels are taken from the test split and the fitted
# Naive Bayes model instead of being typed in by hand, so the table stays
# correct if the data, the split or the model changes.
# y_test carries the row labels of the original DataFrame; .values drops them
# so both Series share a 0..n-1 index and pd.crosstab pairs them by position.
y_actual = pd.Series(y_test.values, name="actual")
y_pred = pd.Series(nbtrain.predict(x_test), name="prediction")
df_confusion = pd.crosstab(y_actual, y_pred)

In [351]:
# Display the cross-tabulated confusion matrix (actual classes in rows, predictions in columns).
df_confusion

prediction,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8,2
2,9,5


In [352]:
from sklearn.metrics import classification_report

In [353]:
# Precision, recall, F1-score and support per class for the Naive Bayes predictions.
print (classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.47      0.80      0.59        10
          2       0.71      0.36      0.48        14

avg / total       0.61      0.54      0.52        24



In [354]:
# Accuracy = (TP + TN) / n: the share of test samples that were classified correctly.
# Computed from the labels rather than from hand-copied confusion-matrix
# counts, so it cannot drift out of sync with the predictions.
from sklearn.metrics import accuracy_score

Accuracy = accuracy_score(y_test, y_pred)
Accuracy

0.5416666666666666

In [319]:
from pycm import*

In [214]:
# Plain Python lists of the actual and Naive Bayes-predicted test labels, for pycm.
# Derived from the test split and the fitted model instead of being typed in
# by hand, so they always match the current predictions.
y_actual1 = y_test.tolist()
y_pred1 = nbtrain.predict(x_test).tolist()

In [297]:
# Print the pycm confusion matrix and its statistics for the Naive Bayes predictions.
# NOTE(review): the recorded output below reports every sample as predicted
# class 1, which disagrees with y_pred1 and with the sklearn confusion matrix
# ([[8, 2], [9, 5]]) shown earlier; it looks like stale output from an
# out-of-order run, so re-run this cell before relying on these statistics.
print(ConfusionMatrix(y_actual1, y_pred1))

Predict          1     2     
Actual
1                10    0     

2                14    0     





Overall Statistics : 

95% CI                                                           (0.21942,0.61391)
AUNP                                                             0.5
AUNU                                                             0.5
Bennett S                                                        -0.16667
CBA                                                              0.20833
Chi-Squared                                                      None
Chi-Squared DF                                                   1
Conditional Entropy                                              -0.0
Cramer V                                                         None
Cross Entropy                                                    -0.0
Gwet AC1                                                         0.00592
Hamming Loss                                                     0.58333
Joint Entropy

# Klasifikasi: K-Nearest Neighbor

In [355]:
from sklearn.neighbors import KNeighborsClassifier 

In [356]:
# Create the K-Nearest Neighbor classifier, using the 5 nearest training samples per prediction.
classifier = KNeighborsClassifier(n_neighbors=5) 

In [357]:
# Fit the KNN classifier on the training data.
classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [358]:
# Predict the class labels for the test features with the fitted KNN model.
y_pred = classifier.predict(x_test) 
y_pred

array([1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1,
       1, 2], dtype=int64)

In [359]:
# Show the actual test labels again, for comparison with the KNN predictions.
y_test

4      1
89     2
53     2
100    2
28     1
105    2
59     2
65     2
90     2
33     1
50     1
8      1
5      1
0      1
13     1
44     1
82     2
101    2
70     2
60     2
62     2
42     1
85     2
95     2
Name: Classification, dtype: int64

In [360]:
# Predicted class probabilities for each test sample (one row per sample, one column per class).
classifier.predict_proba(x_test)

array([[0.8, 0.2],
       [0. , 1. ],
       [0.2, 0.8],
       [0.8, 0.2],
       [1. , 0. ],
       [0.4, 0.6],
       [0.4, 0.6],
       [0. , 1. ],
       [0. , 1. ],
       [0.4, 0.6],
       [0.4, 0.6],
       [0.8, 0.2],
       [0.2, 0.8],
       [0.4, 0.6],
       [0.6, 0.4],
       [0.8, 0.2],
       [0.2, 0.8],
       [0.6, 0.4],
       [0.6, 0.4],
       [0. , 1. ],
       [0.2, 0.8],
       [1. , 0. ],
       [0.6, 0.4],
       [0.4, 0.6]])

In [361]:
from sklearn.metrics import classification_report, confusion_matrix 

In [362]:
# Confusion matrix of the KNN predictions: rows are the actual classes,
# columns are the predicted classes.
print(confusion_matrix(y_test, y_pred)) 

[[ 6  4]
 [ 4 10]]


In [370]:
# Tidy up the confusion matrix as a labelled cross-tabulation.
# The actual and predicted labels are taken from the test split and the fitted
# KNN model instead of being typed in by hand, so the table stays correct if
# the data, the split or the model changes.
# y_test carries the row labels of the original DataFrame; .values drops them
# so both Series share a 0..n-1 index and pd.crosstab pairs them by position.
y_actual = pd.Series(y_test.values, name="actual")
y_pred = pd.Series(classifier.predict(x_test), name="prediction")
df_confusion = pd.crosstab(y_actual, y_pred)

In [371]:
# Display the cross-tabulated confusion matrix (actual classes in rows, predictions in columns).
df_confusion

prediction,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,4
2,4,10


In [229]:
# Precision, recall, F1-score and support per class for the KNN predictions.
print(classification_report(y_test, y_pred)) 

             precision    recall  f1-score   support

          1       0.60      0.60      0.60        10
          2       0.71      0.71      0.71        14

avg / total       0.67      0.67      0.67        24



In [372]:
# Accuracy = (TP + TN) / n: the share of test samples that were classified correctly.
# Computed from the labels rather than from hand-copied confusion-matrix
# counts, so it cannot drift out of sync with the predictions.
from sklearn.metrics import accuracy_score

Accuracy = accuracy_score(y_test, y_pred)
Accuracy

0.6666666666666666

In [230]:
# Plain Python lists of the actual and KNN-predicted test labels, for pycm.
# Derived from the test split and the fitted model instead of being typed in
# by hand, so they always match the current predictions.
y_actual1 = y_test.tolist()
y_pred1 = classifier.predict(x_test).tolist()

In [231]:
# Print the pycm confusion matrix and its statistics for the KNN predictions.
print(ConfusionMatrix(y_actual1, y_pred1))

Predict          1     2     
Actual
1                6     4     

2                4     10    





Overall Statistics : 

95% CI                                                           (0.47807,0.85527)
AUNP                                                             0.65714
AUNU                                                             0.65714
Bennett S                                                        0.33333
CBA                                                              0.65714
Chi-Squared                                                      2.37061
Chi-Squared DF                                                   1
Conditional Entropy                                              0.90805
Cramer V                                                         0.31429
Cross Entropy                                                    0.97987
Gwet AC1                                                         0.35135
Hamming Loss                                                     0.