In [65]:
import numpy as np
import csv
import os

(2) Gaussian Discriminant Analysis
---------------------------------------
We have two classes from (1). We now fit a Gaussian Discriminant Analysis (GDA) model, assuming that the class-conditional densities are Gaussian and that the two classes share the same covariance matrix. The parameters are estimated by maximum likelihood.


In [70]:
def learn(training_set):
    """Fit a shared-covariance Gaussian Discriminant Analysis model by maximum likelihood.

    Parameters
    ----------
    training_set : str
        Path to a header-less CSV file. Each row holds the feature values
        followed by the class label in the last column: -1 for class 0
        (negative class) and 1 for class 1 (positive class).

    Returns
    -------
    w_1 : numpy.ndarray
        Feature weights, ``cov^-1 (u_0 - u_1)``.
    w_0 : scalar
        Bias term. ``sigmoid(np.dot(w_1, x) + w_0)`` is then the posterior
        probability of class 0 given ``x``.

    Raises
    ------
    ValueError
        If the file contains no rows, or if either class has no examples
        (the class means and the log prior ratio would be undefined).

    Side effects
    ------------
    Prints the learned weights and writes them (bias first) to
    'Assignment2_260601793_2_1_coefficients.txt' in the current working
    directory.
    """
    # Read every row, then close the file immediately (the original leaked
    # the handle). Blank lines come back from csv.reader as empty lists and
    # would make the array ragged, so they are dropped.
    with open(training_set, newline='') as csv_file:
        rows = [row for row in csv.reader(csv_file) if row]
    if not rows:
        raise ValueError("Training set is empty: {}".format(training_set))

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    data = np.asarray(rows).astype(float)
    features = data[:, :-1]
    labels = data[:, -1]

    is_class_0 = labels == -1
    is_class_1 = labels == 1

    # Rows carrying any other label are reported and left out of the class
    # statistics (they still count towards `total`, as before).
    for _ in range(int(np.count_nonzero(~(is_class_0 | is_class_1)))):
        print("Made a mistake with dataset creation!")

    total = len(data)
    count_0 = int(np.count_nonzero(is_class_0))
    count_1 = int(np.count_nonzero(is_class_1))
    if count_0 == 0 or count_1 == 0:
        raise ValueError(
            "Both classes need at least one example "
            "(class -1: {}, class 1: {})".format(count_0, count_1)
        )

    # Class means.
    u_0 = features[is_class_0].mean(axis=0)
    u_1 = features[is_class_1].mean(axis=0)

    # Class priors (Bernoulli(pi), where pi = N_0 / N).
    pclass_0 = count_0 / total
    pclass_1 = count_1 / total

    # Per-class ML covariance: mean of (x(n) - u)(x(n) - u)^T over the class.
    centered_0 = features[is_class_0] - u_0
    centered_1 = features[is_class_1] - u_1
    s_0 = np.dot(centered_0.T, centered_0) / count_0
    s_1 = np.dot(centered_1.T, centered_1) / count_1

    # Shared covariance matrix: prior-weighted sum of the class covariances.
    cov = (pclass_0 * s_0) + (pclass_1 * s_1)

    # Finally, get weights w0, w1
    cov_inverse = np.linalg.inv(cov)
    w_1 = np.dot(cov_inverse, (u_0 - u_1))
    w_0 = (-0.5 * np.dot(u_0, np.dot(cov_inverse, u_0))
           + 0.5 * np.dot(u_1, np.dot(cov_inverse, u_1))
           + np.log(pclass_0 / pclass_1))
    print("%%%%%%%%  Weights   %%%%%%%\n")
    print("w_1 : \n", w_1)
    print("w_0 : \n", w_0)

    # Persist the coefficients, bias first, one value per line.
    weight_dir = os.path.join(os.getcwd(), 'Assignment2_260601793_2_1_coefficients.txt')
    text_coefs = np.append(w_0, w_1)
    np.savetxt(weight_dir, text_coefs, delimiter=",", header='w_0 (the bias term) first, then w_1 (the feature weights)')

    return w_1, w_0
   

In [71]:
def get_pclass_0(w_1, w_0, x):
    """Probability of class 0 (the negative class) for the input ``x``.

    The linear activation ``np.dot(w_1, x) + w_0`` is passed through the
    sigmoid.
    """
    return sigmoid(np.dot(w_1, x) + w_0)

def get_pclass_1(w_1, w_0, x):
    """Return the probability of class 1 (positive class) given ``x``.

    With two classes this is the complement of the class-0 probability
    returned by ``get_pclass_0`` for the same weights and input.
    """
    # The original called the undefined name `pclass_0`, which raised a
    # NameError on every call; the helper is named `get_pclass_0`.
    return 1 - get_pclass_0(w_1, w_0, x)

def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)); use as sigmoid(w^T x + w0)."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [72]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate(test_set, w_1, w_0):
    """Print accuracy, precision, recall and F-measure of the learned model.

    Parameters
    ----------
    test_set : str
        Path to a header-less CSV file laid out like the training file:
        feature values followed by the class label (-1 or 1) in the last
        column.
    w_1 : array-like
        Feature weights passed on to ``get_pclass_0``.
    w_0 : scalar
        Bias term passed on to ``get_pclass_0``.

    Returns
    -------
    None
        The metrics are only printed. Any metric whose denominator is zero
        (empty test set, no positive predictions, no positive examples) is
        reported as 0.0 instead of raising ZeroDivisionError.
    """
    # Read every row and close the file right away (the original leaked the
    # handle); blank lines are skipped so the array is not ragged.
    with open(test_set, newline='') as csv_file:
        rows = [row for row in csv.reader(csv_file) if row]

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    examples = np.asarray(rows).astype(float)

    tp = 0
    tn = 0
    fp = 0
    fn = 0

    # For each test example, compute the class-0 probability. Below 0.5 the
    # positive class (label 1) is predicted, otherwise the negative class
    # (label -1); the prediction is then compared against the true label.
    for row in examples:
        target_class = row[-1]
        pclass_0 = get_pclass_0(w_1, w_0, row[:-1])
        if pclass_0 < 0.5:
            if target_class == 1:
                tp += 1
            else:
                fp += 1
        else:
            if target_class == -1:
                tn += 1
            else:
                fn += 1

    # Every example lands in exactly one of the four confusion-matrix cells.
    total = tp + tn + fp + fn

    acc = _safe_ratio(tn + tp, total)
    prec = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    fmeasure = _safe_ratio(2 * prec * recall, prec + recall)

    print("\n%%%%%%%%%% Performance Metrics %%%%%%%%%%%%%\n")
    print("Accuracy: {}".format(acc))
    print("Precision: {}".format(prec))
    print("Recall: {}".format(recall))
    print("F-Measure: {}".format(fmeasure))


In [73]:
# Relative paths to the training and test CSV files.
training_set = 'DS1_train_set.csv'
test_set = 'DS1_test_set.csv'
# Fit the model on the training file, then print its metrics on the test file.
w_1, w_0 = learn(training_set)
evaluate(test_set, w_1, w_0)

%%%%%%%%  Weights   %%%%%%%

w_1 : 
 [ 14.61824043  -8.54453874  -5.45640546  -2.78454392  -9.72679861
  -4.52021572  16.5667912  -24.47268517 -29.16183147   9.45524633
 -13.36125264 -12.35934761  15.60255283  12.82041171  -5.82533067
  13.43813039  29.30757966  -6.9410348   -0.12493205  -5.28734351]
w_0 : 
 27.681970714115067

%%%%%%%%%% Performance Metrics %%%%%%%%%%%%%

Accuracy: 0.95375
Precision: 0.9437652811735942
Recall: 0.965
F-Measure: 0.9542645241038319
