In [626]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import random

In [627]:
X = pd.read_csv('datasets/question-3-features-train.csv')
Y = pd.read_csv('datasets/question-3-labels-train.csv')
test_X = pd.read_csv('datasets/question-3-features-test.csv')
test_Y = pd.read_csv('datasets/question-3-labels-test.csv')

X = X.to_numpy()
Y = Y.to_numpy()
X = (X - X.mean()) / (X.max() - X.min()) ## min-max normalization 

# Model 

In [628]:
def confusion_matrix(gt_y, pred_y):
    # initializing to 1 in order to avoid division by 0
    tp = 0.1
    tn = 0.1
    fp = 0.1
    fn = 0.1
    for g_y, p_y in zip(gt_y, pred_y):
        if g_y == 0 and p_y == 0:
            tn += 1
        elif g_y == 1 and p_y == 1:
            tp += 1
        elif g_y == 1 and p_y == 0:
            fn += 1
        elif g_y == 0 and p_y == 1:
            fp += 1
    precision = tp / tp + fp
    recall = tp / tp + fn
    npv = tn / fn + tn
    fpr = tn / fp + tn
    fdr = fp / fp + tp
    f1 = (2 * recall * precision) / (recall + precision)
    f2 = (5 * precision * recall) / (4 * precision + recall)
    return precision,recall,npv,fpr,fdr,f1,f2

In [629]:
def calc_accuracy(gt_y, pred_y):
    correct = 0
    for g_y, p_y in zip(gt_y, pred_y):
        if g_y == p_y:
            correct += 1
    return (correct/float(len(gt_y)) * 100)

In [630]:
def sigmoid(scores):
    return 1 / (1 + np.exp(-scores))

In [631]:
def log_likelihood(features, target, weights):
    scores = np.dot(features, weights)
    ll = np.sum( target*scores - np.log(1 + np.exp(scores)) )
    return ll

In [632]:
def logistic_regression(features, target, num_steps, learning_rate, add_intercept = False):
    if add_intercept:
        intercept = np.ones((features.shape[0], 1))
        features = np.hstack((intercept, features))
        
    weights = np.zeros(features.shape[1])
    
    for step in range(num_steps):
        scores = np.dot(features, weights)
        predictions = sigmoid(scores)

        # Update weights with gradient
        output_error_signal = target - predictions
        gradient = np.dot(features.T, output_error_signal)
        weights += learning_rate * gradient
        
        # Print log-likelihood every so often
        #if step % 10000 == 0:
        #    print(log_likelihood(features, target, weights))
        
    return weights

In [673]:
Y = np.squeeze(Y)
rates = [1e-01,1e-02,1e-03,1e-04,1e-05]
for current_rate in rates:
    # training the model to obtain the weights
    weights = logistic_regression(X, Y, num_steps = 1000, learning_rate = current_rate, add_intercept=True)
    # preparing test data
    data_with_intercept = np.hstack((np.ones((test_X.shape[0], 1)), test_X))
    # multiply test data with our weights
    final_scores = np.dot(data_with_intercept, weights)
    # prediction with test data and trained weights
    preds = np.round(sigmoid(final_scores))
    report = confusion_matrix(Y,preds)
    print('Learning Rate: ' + str(current_rate) + '    Accuracy : ' + str(calc_accuracy(test_Y['Survived'], preds)))    
    print('Precision: ' + str(report[0]) + ' Recall: ' + str(report[1]) + ' NPV: ' + str(report[2]) + 
          ' FPR: ' + str(report[3]) + ' FDR: ' + str(report[4]) + '\nF1: ' + str(report[5]) + ' F2: ' + str(report[6]))
    print()

  return 1 / (1 + np.exp(-scores))


Learning Rate: 0.1    Accuracy : 69.27374301675978
Precision: 40.1 Recall: 48.1 NPV: 75.6732484076433 FPR: 75.99514066496162 FDR: 20.1
F1: 43.737188208616786 F2: 46.254436450839336

Learning Rate: 0.01    Accuracy : 72.06703910614524
Precision: 26.1 Recall: 53.1 NPV: 89.79097888675624 FPR: 91.60996015936254 FDR: 15.1
F1: 34.997727272727275 F2: 43.99714285714286

Learning Rate: 0.001    Accuracy : 70.39106145251397
Precision: 20.1 Recall: 56.1 NPV: 95.80780399274046 FPR: 99.02670157068061 FDR: 12.1
F1: 29.596062992125987 F2: 41.30439560439561

Learning Rate: 0.0001    Accuracy : 72.06703910614524
Precision: 23.1 Recall: 54.1 NPV: 92.8156308851224 FPR: 95.22217194570135 FDR: 14.1
F1: 32.37590673575129 F2: 42.65221843003413

Learning Rate: 1e-05    Accuracy : 68.71508379888269
Precision: 33.1 Recall: 52.1 NPV: 82.68708414872798 FPR: 83.62647975077881 FDR: 16.1
F1: 40.48145539906103 F2: 46.73468834688347



### Learning rate 0.01 gives %72 accuracy.

##### Ignore the overflow encountered error

### Mini-Batch Stochastic Gradient Ascent 

In [658]:
X = pd.read_csv('datasets/question-3-features-train.csv')
Y = pd.read_csv('datasets/question-3-labels-train.csv')
test_X = pd.read_csv('datasets/question-3-features-test.csv')
test_Y = pd.read_csv('datasets/question-3-labels-test.csv')
X = (X - X.mean()) / (X.max() - X.min()) ## min-max normalization 
X

Unnamed: 0,Pclass,Age,Fare
0,-0.657303,0.516361,0.005325
1,0.342697,-0.174767,-0.046603
2,0.342697,-0.049108,-0.046904
3,0.342697,0.001156,-0.048205
4,0.342697,-0.002624,-0.046603
...,...,...,...
707,-0.657303,-0.137069,0.115459
708,-0.657303,0.302740,0.053626
709,0.342697,-0.137069,-0.046603
710,-0.657303,0.026288,-0.002783


In [666]:
def stochastic_gradient_ascent_regression(X,learning_rate=0.01, n_epochs=1000, k=100):  
    # initialising weights to gaussian random numbers N(0,0.01)
    w = random.normal(loc=0, scale=0.01, size = X.shape[1] - 1)
    #print(w.shape)
    b = np.random.randn(1,1)   # Random intercept value
    epoch = 1
    while epoch <= n_epochs:
        # taking mini batches with size k
        temp = X.sample(k)
        X_tr = temp.iloc[:,:3].values
        #print(X_tr)
        #print(X_tr.shape)
        y_tr = temp.iloc[:,-1].values
        #print(y_tr)
        #print(y_tr.shape)
        Lw = w
        Lb = b
        for i in range(k):
            # calculate derivative of gradients
            Lw = (-2/k * X_tr[i]) * (y_tr[i] - np.dot(X_tr[i],w.T) - b)
            Lb = (-2/k) * (y_tr[i] - np.dot(X_tr[i],w.T) - b)
            # gradient ascent
            w = w + learning_rate * Lw
            b = b + learning_rate * Lb
        epoch += 1
    return w

In [670]:
weights_sga = stochastic_gradient_ascent_regression(X.join(Y['Survived']))
final_scores = np.dot(X, weights_sga.squeeze())
preds = np.round(sigmoid(final_scores))
report = confusion_matrix(test_Y,preds)
print('Learning Rate: ' + str(0.01) + '    Accuracy : ' + str(calc_accuracy(test_Y['Survived'], preds)))
print('Precision: ' + str(report[0]) + ' Recall: ' + str(report[1]) + ' NPV: ' + str(report[2]) + 
     ' FPR: ' + str(report[3]) + ' FDR: ' + str(report[4]) + '\nF1: ' + str(report[5]) + ' F2: ' + str(report[6]))
print()

Learning Rate: 0.01    Accuracy : 55.3072625698324
Precision: 1.1 Recall: 1.1 NPV: 1.1 FPR: 1.1 FDR: 1.1
F1: 1.1 F2: 1.1



  return 1 / (1 + np.exp(-scores))
