In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
data_path = "data/"
gen_data_path = "generated/"
c1_m1_file = "DS2_c1_m1.txt" # positive 
c1_m2_file = "DS2_c1_m2.txt" # positive 
c1_m3_file = "DS2_c1_m3.txt" # positive 
c2_m1_file = "DS2_c2_m1.txt" # negative
c2_m2_file = "DS2_c2_m2.txt" # negative
c2_m3_file = "DS2_c2_m3.txt" # negative
cov1_file = "DS2_Cov1.txt"
cov2_file = "DS2_Cov2.txt"
cov3_file = "DS2_Cov3.txt"

probability_mixture = [0.1,0.42,0.48]
num_class = 2
num_features = 20
num_obs = 2000
train_val_test_sizes = [0.6, 0.2, 0.2]

In [3]:
def get_matrix(feat, file_path, squeeze = True):
    if squeeze:
        return np.squeeze(pd.read_csv(file_path, header = None).drop(columns = feat).values, axis = 0)
    else:
        return pd.read_csv(file_path, header = None).drop(columns = feat).values
    
def get_x_y_values(dataset):
    return dataset.drop('class', axis = 1).values, dataset['class'].values

In [4]:
df = pd.read_csv(gen_data_path + 'DS2', index_col = 'Unnamed: 0')
df_train = pd.read_csv(gen_data_path + 'DS2_train', index_col = 'Unnamed: 0')
test_x, test_y = get_x_y_values(df)

In [5]:
def sigmoid(a):
    return 1 / (1 + math.exp(-a))

def set_coeff(data):
    neg_xs = np.array(df_train.loc[df_train['class'] == 0].drop('class', axis = 1))
    pos_xs = np.array(df_train.loc[df_train['class'] == 1].drop('class', axis = 1))
    all_xs = np.array(df_train.drop('class', axis = 1))
    
    # get means and covariance metrix
    cov = np.cov(all_xs.T)
    m0 = np.mean(neg_xs, axis = 0)
    m1 = np.mean(pos_xs, axis = 0)
    
    # calculate w and w0
    inv_cov = np.linalg.inv(cov)
    w = np.matmul(inv_cov, (m0-m1))
    w0_0 = -0.5 * np.matmul(m0.T, np.matmul(inv_cov, m0))
    w0_1 = 0.5 * np.matmul(m1.T, np.matmul(inv_cov, m1))
    w0_2 = np.log((len(neg_xs)/len(all_xs))/(len(pos_xs)/len(all_xs)))
    
    # adding b/c 50/50 percent chances
    w0 = w0_0 + w0_1 + w0_2
    return w, w0

def eval_x(x):
    a = np.matmul(w_glob, x) + w0_glob
    return round(1 - sigmoid(a))

In [6]:
w_glob, w0_glob = set_coeff(df_train)

In [7]:
def test_model(data_x, data_y):
    TN, TP, FN, FP = 0, 0, 0, 0
    for x, y in zip(data_x, data_y):
        y_pred = eval_x(x)
        if y_pred == y:
            if y_pred:
                TP += 1
            else: 
                TN += 1
        else: 
            if y_pred:
                FP += 1
            else: 
                FN += 1
    
    accuracy = (TN + TP)/len(data_y)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F_measure = (2 * recall * precision) / (recall + precision)
    
    return accuracy, precision, recall, F_measure

In [8]:
accuracy, precision, recall, F_measure = test_model(test_x, test_y)

In [9]:
print("The accuracy is: %s." % (accuracy))
print("The precision is: %s." % (precision))
print("The recall is: %s." % (recall))
print("The F-measure is: %s." % (F_measure))

The accuracy is: 0.485.
The precision is: 0.48292682926829267.
The recall is: 0.49748743718592964.
The F-measure is: 0.49009900990099003.


In [10]:
print("The learnt w0 coefficiants is: %s." % w0_glob)
print("The learnt w coefficiants are: %s." % w_glob)

The learnt w0 coefficiants is: 0.0186337247932.
The learnt w coefficiants are: [ 0.00862355 -0.03733938 -0.11616578  0.05516113 -0.06615171 -0.09799603
  0.00843698 -0.11474018  0.0557585   0.00328878 -0.04304396 -0.02190424
 -0.0071664   0.11735962 -0.02379976  0.05307851  0.023536    0.06232784
  0.00127837  0.11328722].
