In [1]:
import numpy as np
import pandas as pd
import csv
from random import randint
from sklearn.metrics import accuracy_score

In [2]:
#Load training and testing data    
df_test = pd.read_csv("LRTest.csv")
df_train = pd.read_csv("LRTrain.csv")

In [3]:
#Split X and y
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:,-1]
X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:,-1]

In [4]:
# initialize
np.random.seed(100)
w = np.zeros(X_train.shape[1])
gamma = 0.00001
T = 2000

In [5]:
#define a function that calculates gradient descent
def gradientdes(w, X_train, y_train):
    n = X_train.shape[0]
    return 1/n * sum ((1 / (1+ np.exp(-w.T.dot(X_train.iloc[i]))) - y_train.iloc[i]) * X_train.iloc[i]  for i in range(n))

In [6]:
# train 
for i in range(T):
    w -= gamma * gradientdes(w, X_train, y_train) 

In [7]:
threshold = np.linspace(0,1,11,endpoint = True)

In [8]:
# create a list consisting of the probability predicted
def get_prob(w):
    result = []
    for i in range(X_test.shape[0]):
        result.append (1 / (1 + np.exp( - w.T.dot(X_test.iloc[i]))))
    return result

In [9]:
result = get_prob(w)

In [10]:
pred = [0 if y <= 0.6 else 1 for y in result]
accuracy_score(y_test,pred)

0.9256505576208178

# Try different gammas

In [11]:
w1 = np.zeros(X_train.shape[1])
gamma = 0.0001
for i in range(T):
    w1 -= gamma * gradientdes(w1, X_train, y_train) 
result1 = get_prob(w1)

In [12]:
pred1 = [0 if y <= 0.6 else 1 for y in result1]
accuracy_score(y_test,pred1)

0.9182156133828996

In [13]:
w2 = np.zeros(X_train.shape[1])
gamma = 0.000001
for i in range(T):
    w2 -= gamma * gradientdes(w2, X_train, y_train) 
result2 = get_prob(w2)

In [14]:
pred2 = [0 if y <= 0.6 else 1 for y in result2]
accuracy_score(y_test,pred2)

0.9033457249070632

### gamma of 0.00001 performed the best.
# Now try different iteration times on 0.00001

In [15]:
w3 = np.zeros(X_train.shape[1])
T = 1000
for i in range(T):
    w3 -= gamma * gradientdes(w3, X_train, y_train) 
result3 = get_prob(w3)

In [16]:
pred3 = [0 if y <= 0.6 else 1 for y in result3]
accuracy_score(y_test,pred3)

0.8884758364312267

In [17]:
w4 = np.zeros(X_train.shape[1])
T = 3000
for i in range(T):
    w4 -= gamma * gradientdes(w4, X_train, y_train) 
result4 = get_prob(w4)

In [18]:
pred4 = [0 if y <= 0.6 else 1 for y in result4]
accuracy_score(y_test,pred4)

0.9144981412639405

# So the default values work the best. Now creating the performance table

In [19]:
NofP = y_test.sum()
NofN = y_test.count() - NofP

In [20]:
TPR,FPR,TNR,FNR = [],[],[],[]
for t in threshold:    
    temp = result
    pred = [0 if y <= t else 1 for y in temp]
    TP = 0
    FP = 0
    for i in range(X_test.shape[0]):
        if pred[i] == y_test[i] == 1:
            TP += 1
        if pred[i] == 1 and y_test[i] != pred[i]:
            FP +=1
    TPR.append(TP/NofP)
    FPR.append(FP/NofN)
    TNR.append(1 - FP/NofN)
    FNR.append(1 - TP/NofP)

TPR,FPR,TNR,FNR = pd.Series(TPR),pd.Series(FPR),pd.Series(TNR),pd.Series(FNR)

performance_table = pd.concat([TPR,FPR,TNR,FNR],axis=1)
performance_table.rename(columns = {0:'TPR',1:'FPR',2:'TNR',3:'FNR'})
performance_table['t'] = threshold
performance_table.set_index('t')

Unnamed: 0_level_0,0,1,2,3
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1.0,1.0,0.0,0.0
0.1,0.979592,0.584795,0.415205,0.020408
0.2,0.959184,0.216374,0.783626,0.040816
0.3,0.908163,0.105263,0.894737,0.091837
0.4,0.887755,0.05848,0.94152,0.112245
0.5,0.857143,0.040936,0.959064,0.142857
0.6,0.846939,0.02924,0.97076,0.153061
0.7,0.785714,0.011696,0.988304,0.214286
0.8,0.734694,0.005848,0.994152,0.265306
0.9,0.714286,0.0,1.0,0.285714
