In [1]:
# Load the survey data into a DataFrame and preview it
import pandas as pd

knn_df = pd.read_csv(filepath_or_buffer='binary_gtky.csv')
# Show the first five rows as the cell output
knn_df.head(n=5)

Unnamed: 0,co_op,credit_hours,work_hours
0,0,18,0
1,0,16,0
2,1,12,8
3,0,17,0
4,1,12,5


In [2]:
# Build the label vector and the design matrix
import numpy as np
from sklearn import preprocessing

# Convert the frame once: column 0 is the label, the remaining columns are features
knn_arr = knn_df.to_numpy()
y = knn_arr[:, 0]
raw_X = knn_arr[:, 1:]

# Pre-process so that the two numeric features are on the same scale
scale_X = preprocessing.scale(raw_X)

# Append a column of ones so the last weight acts as the intercept
ones_col = np.ones((scale_X.shape[0], 1))
Phi = np.hstack([scale_X, ones_col])
Phi

array([[ 1.14917373, -0.84036991,  1.        ],
       [-0.26117585, -0.84036991,  1.        ],
       [-3.08187502,  0.22739421,  1.        ],
       [ 0.44399894, -0.84036991,  1.        ],
       [-3.08187502, -0.17301734,  1.        ],
       [ 0.44399894,  1.16168782,  1.        ],
       [-0.26117585, -0.43995837,  1.        ],
       [ 0.44399894,  0.76127628,  1.        ],
       [ 0.44399894, -0.84036991,  1.        ],
       [-0.26117585,  2.49639298,  1.        ],
       [ 0.44399894, -0.84036991,  1.        ],
       [ 1.14917373, -0.84036991,  1.        ],
       [ 0.44399894, -0.84036991,  1.        ],
       [-0.26117585, -0.17301734,  1.        ],
       [ 1.14917373, -0.84036991,  1.        ],
       [-0.26117585,  0.76127628,  1.        ],
       [ 0.44399894,  1.16168782,  1.        ],
       [-0.26117585, -0.84036991,  1.        ],
       [-0.26117585, -0.84036991,  1.        ],
       [ 1.14917373,  0.49433524,  1.        ],
       [-0.26117585,  0.49433524,  1.   

In [3]:
# The sigmoid
def sigmoid(x, w):
    """Logistic sigmoid of the linear score ``x.T.dot(w)``.

    Parameters
    ----------
    x : np.ndarray
        Feature vector (or an array whose transpose is multiplied by ``w``).
    w : np.ndarray
        Weight vector.

    Returns
    -------
    np.ndarray or numpy scalar
        ``1 / (1 + exp(-z))`` evaluated element-wise for ``z = x.T.dot(w)``;
        the result has the same shape as ``z``.
    """
    z = x.T.dot(w)
    # 1/(1 + exp(-z)) == exp(-log(1 + exp(-z))).  np.logaddexp evaluates the
    # log term without overflowing, so very negative scores return ~0 cleanly
    # instead of triggering "overflow encountered in exp" RuntimeWarnings.
    return np.exp(-np.logaddexp(0, -z))

def logistic_grad(X, y, w):
    """Average gradient of the logistic loss with respect to ``w``.

    Evaluates ``(1/n) * sum_i (sigmoid(x_i, w) - y_i) * x_i`` over the rows
    ``x_i`` of ``X`` with a single matrix product rather than a Python loop.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Design matrix, one observation per row.
    y : array-like with n entries
        Labels aligned with the rows of ``X``; flattened before use, so both
        (n,) and (n, 1) layouts are accepted.
    w : np.ndarray, shape (d, 1)
        Current weights.

    Returns
    -------
    np.ndarray, shape (d, 1)
        The mean gradient as a column vector.

    Raises
    ------
    ValueError
        If ``X`` has no rows (the mean over zero observations is undefined).
    """
    X = np.asarray(X)
    n_obs = X.shape[0]
    if n_obs == 0:
        raise ValueError("logistic_grad requires at least one observation in X")

    # sigmoid(x, w) computes x.T.dot(w), so passing X.T scores every row at once.
    probs = np.reshape(sigmoid(X.T, w), -1)
    residual = probs - np.reshape(y, -1)

    # X.T @ residual sums residual_i * x_i over all rows; keep the (d, 1) layout
    # so the result can be subtracted directly from a column-vector w.
    dL_dw = X.T.dot(residual).reshape(-1, 1) / n_obs
    return dL_dw

def gradient_descent(X, y, w, eta, n_iter=2000):
    """Run fixed-step gradient descent on the logistic loss.

    Parameters
    ----------
    X : np.ndarray, shape (n, d)
        Design matrix passed through to ``logistic_grad``.
    y : np.ndarray
        Labels passed through to ``logistic_grad``.
    w : np.ndarray, shape (d, 1)
        Starting weights. The input array is not modified; each step builds
        a new array.
    eta : float
        Step size (learning rate).
    n_iter : int, optional
        Number of update steps to take. Defaults to 2000, the value that was
        previously hard-coded.

    Returns
    -------
    np.ndarray, shape (d, 1)
        The weights after ``n_iter`` updates.
    """
    for _ in range(n_iter):
        w = w - eta * logistic_grad(X, y, w)
    return w

In [4]:
# Initial weights: a column vector of ones (two features + intercept)
w0 = np.ones((3, 1), dtype=int)

# Predictions under the initial weights, compared with the true labels
preds = []
for row, truth in zip(Phi, y):
    # Evaluate the model once per row and reuse the probability
    prob = sigmoid(row, w0)[0]
    preds.append(int(prob > .5))
    print(f'Pred v Truth: {prob, truth}')

Pred v Truth: (np.float64(0.787312922986581), np.int64(0))
Pred v Truth: (np.float64(0.4746353512768594), np.int64(0))
Pred v Truth: (np.float64(0.13534765713422703), np.int64(1))
Pred v Truth: (np.float64(0.6464861313300426), np.int64(0))
Pred v Truth: (np.float64(0.09492829535050801), np.int64(1))
Pred v Truth: (np.float64(0.9312266746533144), np.int64(0))
Pred v Truth: (np.float64(0.5741652250099661), np.int64(1))
Pred v Truth: (np.float64(0.9007222280006686), np.int64(1))
Pred v Truth: (np.float64(0.6464861313300426), np.int64(0))
Pred v Truth: (np.float64(0.962138262646873), np.int64(1))
Pred v Truth: (np.float64(0.6464861313300426), np.int64(1))
Pred v Truth: (np.float64(0.787312922986581), np.int64(0))
Pred v Truth: (np.float64(0.6464861313300426), np.int64(0))
Pred v Truth: (np.float64(0.6377950579215216), np.int64(1))
Pred v Truth: (np.float64(0.787312922986581), np.int64(1))
Pred v Truth: (np.float64(0.8175894539237655), np.int64(1))
Pred v Truth: (np.float64(0.93122667465331

In [5]:
# Accuracy of the initial-weight predictions
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_true=y, y_pred=preds)

# Rows are the true class, columns the predicted class
true_negative, false_positive = conf_mat[0, 0], conf_mat[0, 1]
false_negative, true_positive = conf_mat[1, 0], conf_mat[1, 1]

# Accuracy: share of students whose class was predicted correctly
n_correct = true_positive + true_negative
accuracy = n_correct / len(knn_df)
print(f'Accuracy of the algorithm: {accuracy}')

Accuracy of the algorithm: 0.5185185185185185


In [6]:
# Fit the weights with gradient descent, starting from w0 with step size 0.1,
# in the hope of improving on the initial predictions
w_gd = gradient_descent(X=Phi, y=y, w=w0, eta=0.1)
print(w_gd)

[[-0.76924703]
 [ 0.64441136]
 [ 0.32423452]]


In [7]:
# Predictions under the gradient-descent weights, compared with the true labels
preds = []
for row, truth in zip(Phi, y):
    # Evaluate the model once per row and reuse the probability
    prob = sigmoid(row, w_gd)[0]
    preds.append(int(prob > .5))
    print(f'Pred v Truth: {prob, truth}')

Pred v Truth: (np.float64(0.24949491869888654), np.int64(0))
Pred v Truth: (np.float64(0.4958999295674604), np.int64(0))
Pred v Truth: (np.float64(0.9448772823106036), np.int64(1))
Pred v Truth: (np.float64(0.3638126030499743), np.int64(0))
Pred v Truth: (np.float64(0.9297896981504709), np.int64(1))
Pred v Truth: (np.float64(0.6750892742500156), np.int64(0))
Pred v Truth: (np.float64(0.560115076209392), np.int64(1))
Pred v Truth: (np.float64(0.6161561840443207), np.int64(1))
Pred v Truth: (np.float64(0.3638126030499743), np.int64(0))
Pred v Truth: (np.float64(0.894149688894559), np.int64(1))
Pred v Truth: (np.float64(0.3638126030499743), np.int64(1))
Pred v Truth: (np.float64(0.24949491869888654), np.int64(0))
Pred v Truth: (np.float64(0.3638126030499743), np.int64(0))
Pred v Truth: (np.float64(0.6019625009126487), np.int64(1))
Pred v Truth: (np.float64(0.24949491869888654), np.int64(1))
Pred v Truth: (np.float64(0.7341377469082329), np.int64(1))
Pred v Truth: (np.float64(0.67508927425

In [8]:
# Confusion matrix for the gradient-descent predictions
conf_mat = confusion_matrix(y_true=y, y_pred=preds)

# Rows are the true class, columns the predicted class
true_negative, false_positive = conf_mat[0, 0], conf_mat[0, 1]
false_negative, true_positive = conf_mat[1, 0], conf_mat[1, 1]

# Accuracy: share of students whose class was predicted correctly
n_correct = true_positive + true_negative
accuracy = n_correct / len(knn_df)
print(f'Accuracy of the algorithm: {accuracy}')

Accuracy of the algorithm: 0.7037037037037037


In [9]:
# Other measures of binary classification fit, built from the confusion-matrix
# counts computed in the previous cell.
# Could compare to KNN classifier from knn.ipynb

# Counts shared by the three measures
predicted_positive = true_positive + false_positive
actual_positive = true_positive + false_negative

# Precision: of the students predicted to be co-op, the share that really are
precision = true_positive / predicted_positive

# Recall: of the students who really are co-op, the share that were predicted so
recall = true_positive / actual_positive

# F1 score: between 0 and 1, where 1 is best
F1_score = (2*true_positive) / (predicted_positive + actual_positive)

print(f'Precision of the algorithm: {precision}')
print(f'Recall of the algorithm: {recall}')
print(f'F1 Score of the algorithm: {F1_score}')

Precision of the algorithm: 0.7692307692307693
Recall of the algorithm: 0.6666666666666666
F1 Score of the algorithm: 0.7142857142857143


In [10]:
# Compare against scikit-learn's logistic regression
from sklearn.linear_model import LogisticRegression

# Phi already carries the column of 1s, so sklearn must not add its own intercept.
# NOTE(review): LogisticRegression applies L2 regularization by default, which
# presumably explains why these coefficients are smaller in magnitude than w_gd
# -- confirm before treating the two fits as directly comparable.
clf = LogisticRegression(random_state=0, fit_intercept=False)
clf.fit(Phi, y)

print(clf.coef_)
print(clf.predict(Phi))

[[-0.57701581  0.5373397   0.2421836 ]]
[0 0 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 0 0 0 1 1 0 1 0 1 0]


In [11]:
# Confusion matrix for the scikit-learn model's predictions
preds = clf.predict(Phi)
conf_mat = confusion_matrix(y_true=y, y_pred=preds)

# Rows are the true class, columns the predicted class
true_negative, false_positive = conf_mat[0, 0], conf_mat[0, 1]
false_negative, true_positive = conf_mat[1, 0], conf_mat[1, 1]

# Accuracy: share of students whose class was predicted correctly
n_correct = true_positive + true_negative
accuracy = n_correct / len(knn_df)
print(f'Accuracy of the algorithm: {accuracy}')

Accuracy of the algorithm: 0.7037037037037037


In [12]:
# Note: again, we should really be CROSS-VALIDATING to properly assess
# performance, but this data set is so small that there is little point.
