In [74]:
## import libraries
import numpy as np
import pandas as pd
import operator
import warnings
warnings.filterwarnings("ignore")

In [75]:
train_df = pd.read_csv("../datasets/question4/q4-train-dataset.csv")
test_df = pd.read_csv("../datasets/question4/q4-test-dataset.csv")
train = train_df.to_numpy()
test = test_df.to_numpy()

In [76]:
def normalize(data):
    data  -= np.min(data,axis = 0)
    data /= (np.max(data, axis = 0) - np.min(data,axis = 0))
    return data        

In [77]:
train = train_df.to_numpy()
test = test_df.to_numpy()

In [78]:
x_train = train[:, :-1]
y_train = train[:, 29:30]
x_test = test[:, :-1]
y_test = test[:, 29:30]
x_train = normalize(x_train)
x_test = normalize(x_test)

In [79]:
def sigmoid_func(x_train, weights):
    return 1 / (1 + np.exp(- np.dot(x_train, weights))) ## p(y = 0| x, w)

def update_rule(learning_rate, weights, coefficients):
     ## new parameters  = old parameters + step size
    weights = weights + learning_rate * coefficients
    return weights

def gradient_ascent(learning_rate, iteration_count, x_train, y_train, batch_size):
    weights = np.random.normal(0, 0.01, x_train.shape[1]) ## N(0,0.01) distribution for initial weights
    for i in range(iteration_count):
        coefficients = 0  
        for j in range(y_train.shape[0]):
            likelihoods = y_train[j][0] - sigmoid_func(x_train[j], weights) ## predicted - likelihood
            coefficients += np.dot(x_train[j].T, likelihoods) # coefficients for new weights
            if( (j == y_train.shape[0] - 1) or ( j != 0 and j % batch_size == 0 )):
                weights = update_rule(learning_rate, weights, coefficients)
                gradient = 0
    return weights

def classifier(predictions):
    for i in range(predictions.shape[0]):
        if predictions[i] >= 0:
            predictions[i] = 1
        else:
            predictions[i] = 0
    return predictions

def test(x_test, y_test, weights):
    linear_predictions = np.dot(x_train, weights)
    predictions = classifier(linear_predictions)
    return predictions


def calc_accuracy(actual_values, predictions):
    correct = 0
    for pred, act in zip(predictions, actual_values):
        if pred == act[0]:
            correct += 1
    return (correct/float(len(actual_values))*100)

def get_performance_metrics(actual_values, predictions):
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    precision = 0
    recall = 0
    fdr = 0
    fpr = 0
    npv = 0
    for pred, act in zip(predictions, actual_values):
        if pred == 1:
            if act[0] == 1.0:
                tp += 1
            else:
                fp += 1
        else:
            if act[0] == 1.0:
                fn += 1
            else:
                tn += 1
    if tp == 0 and fp == 0:
        precision =  0
        recall = 0
        fdr = 0
        fpr = 0
    else:
        precision = ( (tp/(tp + fp)) )
        recall = ( (tp / (tp + fn) ) )
        fdr = ( (fp / (tp + fp)))
        fpr = ( (fp / (fp + tn)))
    if tn == 0 and fn == 0:
        npv = 0
    else:
        npv = ( (tn / (tn + fn)) )
    return (fp, tp, fn, tn, precision, recall, fdr, fpr, npv)

def get_f_scores( beta, precision, recall ):
    return ( (1 + np.power(beta, 2)) * precision * recall) / ((np.power(beta,2) * precision) + recall)
         
            

In [80]:
def print_performance_metric(fp, tp, fn, tn, precision, recall, fdr, fpr, npv, f1, f2):
    print("True Positives: ", tp)
    print("False Positives: ", fp)     
    print("False Negatives: ", fn)  
    print("True Negatives: ", tn) 
    print("Presicion: ", precision)
    print("Recall: ", recall)
    print("False Positive Rate: ", fpr)
    print("False Discovery Rate: ", fdr)
    print("Negative Predictive Value: ", npv)
    print( "F1: ", f1 )
    print( "F2: ", f2 )    

In [81]:
def get_sorted_weights(weights):
    sorted_weights = []
    for i in range(len(weights)):
        sorted_weights.append((i, abs(weights[i])))
    sorted_weights.sort(key = operator.itemgetter(1))
    return sorted_weights

In [82]:
iterations = np.arange(100, 1100, 100)
learning_rates = [0.001, 0.01, 0.1]

In [83]:
weights = []
for it in iterations:
    for lr in learning_rates:
        weight = gradient_ascent(lr, it, x_train, y_train, x_train.shape[0]) ## full batch
        weights.append((it, lr, weight))

In [84]:
predictions = []
for w in weights:
    predictions.append((w[0], w[1], test(x_test, y_test, w[2])))

In [85]:
accuracies = [] 
for prediction in predictions:
    acc = calc_accuracy(y_test, prediction[2])
    accuracies.append( (prediction[0], prediction[1], acc) )

In [86]:
for acc in accuracies:
    print( "iteration" , acc[0], " learning rate:", acc[1], " accuracy:" , acc[2] )

iteration 100  learning rate: 0.001  accuracy: 96.03174603174604
iteration 100  learning rate: 0.01  accuracy: 75.39682539682539
iteration 100  learning rate: 0.1  accuracy: 95.23809523809523
iteration 200  learning rate: 0.001  accuracy: 96.03174603174604
iteration 200  learning rate: 0.01  accuracy: 96.82539682539682
iteration 200  learning rate: 0.1  accuracy: 95.23809523809523
iteration 300  learning rate: 0.001  accuracy: 96.82539682539682
iteration 300  learning rate: 0.01  accuracy: 96.03174603174604
iteration 300  learning rate: 0.1  accuracy: 86.5079365079365
iteration 400  learning rate: 0.001  accuracy: 96.82539682539682
iteration 400  learning rate: 0.01  accuracy: 96.03174603174604
iteration 400  learning rate: 0.1  accuracy: 96.03174603174604
iteration 500  learning rate: 0.001  accuracy: 96.82539682539682
iteration 500  learning rate: 0.01  accuracy: 96.03174603174604
iteration 500  learning rate: 0.1  accuracy: 96.03174603174604
iteration 600  learning rate: 0.001  accu

In [87]:
(fp, tp, fn, tn, precision, recall, fdr, fpr, npv) = get_performance_metrics(y_test, predictions[27][2]) ## iteration = 1000, lr = 0.001

In [88]:
f1 = get_f_scores(1, precision, recall)
f2 = get_f_scores(2, precision, recall)

In [89]:
print_performance_metric(fp, tp, fn, tn, precision, recall, fdr, fpr, npv, f1, f2)

True Positives:  60
False Positives:  1
False Negatives:  3
True Negatives:  62
Presicion:  0.9836065573770492
Recall:  0.9523809523809523
False Positive Rate:  0.015873015873015872
False Discovery Rate:  0.01639344262295082
Negative Predictive Value:  0.9538461538461539
F1:  0.9677419354838709
F2:  0.9584664536741213


In [90]:
full_weights = []
for i in range(len(weights)):
    if weights[i][1] == 0.001 and weights[i][0] == 1000:
        for j in range(len(weights[i][2])):
            full_weights.append(  weights[i][2][j] )

In [91]:
sorted_weights = get_sorted_weights(full_weights)
np.array(sorted_weights)[19:29]

array([[20.        ,  1.68047411],
       [28.        ,  1.68506205],
       [15.        ,  1.95950926],
       [26.        ,  2.03871339],
       [ 1.        ,  2.45340442],
       [12.        ,  2.79422803],
       [11.        ,  3.17434624],
       [10.        ,  4.27014621],
       [13.        ,  6.2043206 ],
       [ 3.        ,  7.46958511]])

In [92]:
## mini batch
### from now an I choosed my learning rate 0.001 which is the best learning rate
mini_batch_weights = []
for it in iterations:
    weight = gradient_ascent(0.001, it, x_train, y_train, 32)
    mini_batch_weights.append((it, weight))

In [93]:
mini_batch_predictions = []
for w in mini_batch_weights:
    mini_batch_predictions.append((w[0], test(x_test, y_test, w[1])))

In [94]:
mini_batch_accuracies = [] 
for prediction in mini_batch_predictions:
    acc = calc_accuracy(y_test, prediction[1])
    mini_batch_accuracies.append( (prediction[0], acc) )

In [95]:
for acc in mini_batch_accuracies:
    print( "iteration" , acc[0], " accuracy", acc[1] )

iteration 100  accuracy 96.82539682539682
iteration 200  accuracy 96.82539682539682
iteration 300  accuracy 96.82539682539682
iteration 400  accuracy 96.82539682539682
iteration 500  accuracy 96.82539682539682
iteration 600  accuracy 96.82539682539682
iteration 700  accuracy 88.09523809523809
iteration 800  accuracy 93.65079365079364
iteration 900  accuracy 96.82539682539682
iteration 1000  accuracy 96.03174603174604


In [96]:
### iteration_count = 1000, learning_rate = 0.001
(mini_batch_fp, mini_batch_tp, mini_batch_fn, mini_batch_tn, mini_batch_precision, mini_batch_recall, mini_batch_fdr, mini_batch_fpr, mini_batch_npv) = get_performance_metrics(y_test, mini_batch_predictions[9][1])

In [97]:
f1 = get_f_scores(1, mini_batch_precision, mini_batch_recall)
f2 = get_f_scores(2, mini_batch_precision, mini_batch_recall)

In [98]:
print_performance_metric(mini_batch_fp, mini_batch_tp, mini_batch_fn, mini_batch_tn, mini_batch_precision, mini_batch_recall, mini_batch_fdr, mini_batch_fpr, mini_batch_npv, f1, f2)

True Positives:  63
False Positives:  5
False Negatives:  0
True Negatives:  58
Presicion:  0.9264705882352942
Recall:  1.0
False Positive Rate:  0.07936507936507936
False Discovery Rate:  0.07352941176470588
Negative Predictive Value:  1.0
F1:  0.9618320610687023
F2:  0.9843750000000001


In [99]:
## stochastic gradient ascent

In [100]:
stochastic_weights = []
for it in iterations:
    weight = gradient_ascent(0.001, it, x_train, y_train, 1) ## stochastic gradient ascent can be provided by setting the batch_size = 1
    stochastic_weights.append((it, weight))

In [101]:
stochastic_predictions = []
for w in stochastic_weights:
    stochastic_predictions.append((w[0], test(x_test, y_test, w[1])))

In [102]:
stochastic_accuracies = [] 
for prediction in stochastic_predictions:
    acc = calc_accuracy(y_test, prediction[1])
    stochastic_accuracies.append( (prediction[0], acc) )

In [103]:
for acc in stochastic_accuracies:
    print( "iteration" , acc[0], " accuracy", acc[1] )

iteration 100  accuracy 96.03174603174604
iteration 200  accuracy 99.20634920634922
iteration 300  accuracy 98.4126984126984
iteration 400  accuracy 98.4126984126984
iteration 500  accuracy 98.4126984126984
iteration 600  accuracy 98.4126984126984
iteration 700  accuracy 97.61904761904762
iteration 800  accuracy 97.61904761904762
iteration 900  accuracy 97.61904761904762
iteration 1000  accuracy 97.61904761904762


In [104]:
(fp, tp, fn, tn, precision, recall, fdr, fpr, npv) = get_performance_metrics(y_test, stochastic_predictions[9][1]) ## iteration = 1000, lr = 0.001

In [105]:
f1 = get_f_scores(1, precision, recall)
f2 = get_f_scores(2, precision, recall)

In [106]:
print_performance_metric(fp, tp, fn, tn, precision, recall, fdr, fpr, npv, f1, f2)

True Positives:  61
False Positives:  1
False Negatives:  2
True Negatives:  62
Presicion:  0.9838709677419355
Recall:  0.9682539682539683
False Positive Rate:  0.015873015873015872
False Discovery Rate:  0.016129032258064516
Negative Predictive Value:  0.96875
F1:  0.976
F2:  0.9713375796178345
