In [1]:
from logistic_regression import LogisticRegression
from polynomialize import polynomialize
from metrix import confusion_matrix, accuracy, precision, recall, f1_score
from train_test_split import train_test_split

import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Column labels for the header-less data file, in file order.
columns = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the raw file and attach the column labels in one step.
df = pd.read_csv(
    'example_datasets/breast-cancer-wisconsin.data',
    header=None,
    names=columns,
)

In [3]:
# Display the frame as loaded, before the Class column is recoded.
df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [4]:
# Recode the target from the raw codes {2, 4} to binary labels {0, 1}.
# An explicit mapping states the intent directly and is safe to re-run:
# once the column holds 0/1 nothing matches 2 or 4, so the labels are left
# untouched (the former `/ 2 - 1` arithmetic turned 0/1 into -1/0 on re-run).
df['Class'] = df['Class'].replace({2: 0, 4: 1}).astype('int')

In [5]:
# Display the frame again to check that Class now holds 0/1 labels.
df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,0
695,841769,2,1,1,1,2,1,1,1,1,0
696,888820,5,10,10,3,7,3,8,10,2,1
697,897471,4,8,6,4,3,4,10,6,1,1


In [6]:
# Pairwise correlations between the numeric columns.
# 'Bare Nuclei' is missing from the recorded output, i.e. it is not parsed
# as a numeric column. Recent pandas releases no longer drop such columns
# silently inside DataFrame.corr() and raise instead, so restrict the frame
# to numeric dtypes explicitly; the resulting table is the same.
df.select_dtypes(include='number').corr()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
Sample code number,1.0,-0.055308,-0.041603,-0.041576,-0.064878,-0.045528,-0.060051,-0.052072,-0.034901,-0.080226
Clump Thickness,-0.055308,1.0,0.644913,0.654589,0.486356,0.521816,0.558428,0.535835,0.350034,0.716001
Uniformity of Cell Size,-0.041603,0.644913,1.0,0.906882,0.705582,0.751799,0.755721,0.722865,0.458693,0.817904
Uniformity of Cell Shape,-0.041576,0.654589,0.906882,1.0,0.683079,0.719668,0.735948,0.719446,0.438911,0.818934
Marginal Adhesion,-0.064878,0.486356,0.705582,0.683079,1.0,0.599599,0.666715,0.603352,0.417633,0.6968
Single Epithelial Cell Size,-0.045528,0.521816,0.751799,0.719668,0.599599,1.0,0.616102,0.628881,0.479101,0.682785
Bland Chromatin,-0.060051,0.558428,0.755721,0.735948,0.666715,0.616102,1.0,0.665878,0.344169,0.756616
Normal Nucleoli,-0.052072,0.535835,0.722865,0.719446,0.603352,0.628881,0.665878,1.0,0.428336,0.712244
Mitoses,-0.034901,0.350034,0.458693,0.438911,0.417633,0.479101,0.344169,0.428336,1.0,0.42317
Class,-0.080226,0.716001,0.817904,0.818934,0.6968,0.682785,0.756616,0.712244,0.42317,1.0


In [7]:
# Decision boundary is linear:
#     Theta_0 + Theta_1 * X_1 + Theta_2 * X_2
model = LogisticRegression(C=0.5)

X = df[['Uniformity of Cell Size', 'Bland Chromatin']].values
y = df['Class'].values

# Splitting the data into train (80%) and test (20%).
# Row positions are split once and the same partition is applied to both
# X and y, so every feature row keeps its own label even if
# train_test_split reorders its input. Two independent calls (one for X,
# one for y) only line up when the split happens to be deterministic.
train_idx, test_idx = train_test_split(np.arange(len(y)))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Last expression of the cell: the value returned by fit is displayed.
model.fit(X_train, y_train)

(array([0.53482376, 1.35653742, 1.17443952]), 0.15432837680836833)

In [8]:
# Predict labels for the test split with the model fitted in the previous cell.
pred = model.predict(X_test)

In [9]:
# Class balance of the test labels as (count of 1s, count of 0s).
tuple(y_test[y_test == label].size for label in (1, 0))

(35, 105)

In [10]:
# Accuracy of the linear model's predictions against the test labels.
accuracy(y_test, pred)

0.9714285714285714

In [11]:

# Layout of the table shown below:
#     (   True Positive,  False Positive
#         False Negative, True Negative  )
confusion_matrix(y_test, pred)

Unnamed: 0,Actual Positive,Actual Negative
Predicted Positive,35,4
Predicted Negative,0,101


In [12]:
# Precision = True Pos / Predicted Positive
precision(y_test, pred)

0.8974358974358975

In [13]:
# Recall = True Pos / Actual Positive
recall(y_test, pred)

1.0

In [14]:
# F1 = 2 * Precision * Recall / (Precision + Recall)
f1_score(y_test, pred)

0.945945945945946

In [15]:
# Decision boundary is a 4th-order polynomial in X_1 and X_2, i.e. a
# weighted sum of the monomials X_1^a * X_2^b with a + b <= 4:
#     Theta_0 + Theta_1 * X_1 + Theta_2 * X_2 + ... + Theta_j * X_1 * X_2
#     + ... + Theta_k * X_1^4 + ... + Theta_i * X_2^4
# NOTE(review): add_intercept is disabled here, presumably because
# polynomialize already emits the constant term (the fit reports 15
# coefficients, the number of such monomials including the constant) --
# confirm against polynomialize.
model = LogisticRegression(add_intercept=False, C=0.01)

X = df[['Uniformity of Cell Size', 'Bland Chromatin']].values
y = df['Class'].values

# Expand the two features into polynomial terms up to degree 4.
# np.asarray is a no-op for an ndarray and guarantees that the positional
# indexing below works.
X_ = np.asarray(polynomialize(X[:, 0], X[:, 1], 4))

# Split row positions once and apply the same partition to features and
# labels, so each row keeps its own label even if train_test_split
# reorders its input. Two independent calls (one for X_, one for y) only
# line up when the split happens to be deterministic.
train_idx, test_idx = train_test_split(np.arange(len(y)))
X_train, X_test = X_[train_idx], X_[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Last expression of the cell: the value returned by fit is displayed.
model.fit(X_train, y_train)

(array([0.99854255, 0.98965714, 0.98965763, 0.98971239, 0.98970734,
        0.98971962, 0.98995675, 0.98991739, 0.98993203, 0.99003161,
        0.99101602, 0.99082623, 0.99081069, 0.99102333, 0.99171532]),
 0.016338016740051614)

In [16]:
# Predict labels for the test split with the polynomial-feature model fitted in the previous cell.
pred = model.predict(X_test)

In [17]:
# Class balance of the test labels as (count of 1s, count of 0s).
tuple(y_test[y_test == label].size for label in (1, 0))

(35, 105)

In [18]:
# Accuracy of the polynomial-feature model's predictions against the test labels.
accuracy(y_test, pred)

0.9785714285714285

In [19]:
# Confusion matrix for the polynomial-feature model: rows are predicted
# classes, columns are actual classes (see the table rendered below).
confusion_matrix(y_test, pred)

Unnamed: 0,Actual Positive,Actual Negative
Predicted Positive,32,0
Predicted Negative,3,105


In [20]:
# Recall (True Pos / Actual Positive) of the polynomial-feature model.
recall(y_test, pred)

0.9142857142857143

In [21]:
# Precision (True Pos / Predicted Positive) of the polynomial-feature model.
precision(y_test, pred)

1.0

In [22]:
# F1 score (2 * Precision * Recall / (Precision + Recall)) of the polynomial-feature model.
f1_score(y_test, pred)

0.955223880597015