In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score

In [2]:
dataset = pd.read_csv('C:\\Users\\WesleyZhou\\Desktop\\CS559\\Assignment2\\breast-cancer-wisconsin.data.csv')

In [3]:
col = ["F1","F2","F3","F4","F5","F6","F7","F8","F9"]
X = dataset[col].values
y = dataset["Class"].values

In [4]:
count = []
for k in range(len(X)):
    if(X[k][5] == '?'):
        count.append(k)
X = np.delete(X,count,0)
y = np.delete(y,count,0)

In [5]:
X = X.astype(int)

In [6]:
# 2 for benign, 4 for malignant so change 2=>0, 4 =>1
for a in range(len(y)):
    if(y[a] == 2):
        y[a] = 0
    else:
        y[a] = 1

In [7]:
#给X添加dummy列
dummy = np.ones(len(X))
X = np.c_[X,dummy]

In [8]:
X_train, X_test_Final, y_train, y_test_Final = train_test_split(X,y,test_size = 0.3,random_state=None)
X = X_train
y = y_train

In [9]:
def sigmoid(X,w):
    z = np.dot(X,w)
    return 1.0 / (1.0 + np.exp(-z))

In [10]:
#需要使用*乘
def lossFunction(y,fx):
    return (-y * np.log(fx) - ((1-y) * np.log(1-fx))).mean()

In [11]:
def predict(X,w):
    return np.where(sigmoid(X,w)>=0.5,1,0)

In [12]:
def SGD(X,y,w,alpha,epochs):
    lossSGD = []
    
    for i in range(epochs):
        index = np.arange(0,len(X))
        np.random.shuffle(index)
        for j in range(len(index)):
            fx = sigmoid(X[j],w)
            gradient = np.dot(X[j].T,fx-y[j])
            w -= alpha * gradient
            
        hypothesisSGD = sigmoid(X,w)
        lossSGD.append(lossFunction(y,hypothesisSGD))
    
    return w

In [13]:
def mini_batch_GD(X,y,w,alpha,epochs,batch_size):
    lossMini = []
    index = np.arange(0,len(X))
    
    for i in range(epochs):
        index = np.arange(0,len(X))
        np.random.shuffle(index)
        start = 0
        for j in range(0,len(index),batch_size):
            if(j == 0):
                continue        
            fx = sigmoid(X[start:j],w)
            gradient = np.dot(X[start:j].T,fx-y[start:j])/batch_size
            w -= alpha * gradient
            start = j
            
        hypothesisMini = sigmoid(X,w)
        lossMini.append(lossFunction(y,hypothesisMini))
    return w

In [14]:
#K-Fold SGD and Mini_Batch SGD
SGD_Model = []
SGD_Correct = []
mini_Model = []
mini_Correct = []

kf = KFold(n_splits=10,shuffle = False)
for train_index,test_index in kf.split(X):
    X_train = X[[train_index]]
    y_train = y[[train_index]]
    X_test = X[[test_index]]
    y_test = y[[test_index]]
    
    #SGD
    w = np.zeros(X.shape[1])
    alpha = 0.01
    epochs = 500
    SGDres = SGD(X_train,y_train,w,alpha,epochs)
    predSGD = predict(X_test,SGDres)
    SGD_correct_rate = (predSGD == y_test).mean()
    
    SGD_Model.append(SGDres)
    SGD_Correct.append(SGD_correct_rate)
    
    #Mini_Batch SGD
    w = np.zeros(X.shape[1])
    alpha = 0.01
    epochs = 500
    batch_size = 20
    minires = mini_batch_GD(X_train,y_train,w,alpha,epochs,batch_size)
    predmini = predict(X_test,minires)
    mini_correct_rate = (predmini == y_test).mean()
    
    mini_Model.append(minires)
    mini_Correct.append(mini_correct_rate)
    

In [15]:
SGD_Correct

[0.9583333333333334,
 0.9791666666666666,
 0.9791666666666666,
 0.9583333333333334,
 0.9583333333333334,
 0.9375,
 0.9791666666666666,
 1.0,
 1.0,
 1.0]

In [16]:
mini_Correct

[0.9375,
 0.9791666666666666,
 0.9583333333333334,
 0.9791666666666666,
 0.9375,
 0.9583333333333334,
 0.9375,
 1.0,
 1.0,
 0.9787234042553191]

In [17]:
w_SGD = mini_Model[9]
w_Mini = mini_Model[8]

pred_SGD = predict(X_test_Final,w_SGD)
pred_Mini = predict(X_test_Final,w_Mini)

In [18]:
SGD_Accuracy = (pred_SGD == y_test_Final).mean()
Mini_Accuracy = (pred_Mini == y_test_Final).mean()


In [19]:
precision_index = []
for i in range(len(y_test_Final)):
    if y_test_Final[i] == 1:
        precision_index.append(i)

precision_tp_SGD = 0
precision_fn_SGD = 0
precision_tp_Mini = 0
precision_fn_Mini = 0

for idx in precision_index:
    if pred_SGD[idx] == y_test_Final[idx]:
        precision_tp_SGD+=1
        
    if pred_SGD[idx] != y_test_Final[idx]:
        precision_fn_SGD+=1
        
    if pred_Mini[idx] == y_test_Final[idx]:
        precision_tp_Mini+=1
        
    if pred_Mini[idx] != y_test_Final[idx]:
        precision_fn_Mini+=1
        
recall_SGD = precision_tp_SGD/(precision_tp_SGD+precision_fn_SGD)
recall_Mini = precision_tp_Mini/(precision_tp_Mini+precision_fn_Mini)

In [20]:
precision_index = []
for i in range(len(y_test_Final)):
    if y_test_Final[i] == 0:
        precision_index.append(i)
        
precision_fp_SGD = 0
precision_fp_Mini = 0

for idx in precision_index:
    if y_test_Final[idx] != pred_SGD[idx]:
        precision_fp_SGD+=1
    if y_test_Final[idx] != pred_Mini[idx]:
        precision_fp_Mini+=1
        
precision_SGD = precision_tp_SGD/(precision_tp_SGD+precision_fp_SGD)
precision_Mini = precision_tp_Mini/(precision_tp_Mini+precision_fp_Mini)

In [23]:
print("%s,%f,%s,%f,%s,%f,%s,%f,%s,%f,%s,%f"%("SGD_Accuracy",SGD_Accuracy,"Mini_Accuracy",Mini_Accuracy,"recall_SGD",recall_SGD,"recall_Mini",recall_Mini,"precision_SGD",precision_SGD,"precision_Mini",precision_Mini))

SGD_Accuracy,0.965854,Mini_Accuracy,0.965854,recall_SGD,0.935897,recall_Mini,0.935897,precision_SGD,0.973333,precision_Mini,0.973333
