In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
# Directory prefixes (trailing slash included so they can be concatenated
# directly with a file name).
data_path, gen_data_path = "data/", "generated/"

# File names for the DS1 parameters -- presumably the two class means and the
# covariance matrix, judging by the names; verify against the generator.
m0_file, m1_file = "DS1_m_0.txt", "DS1_m_1.txt"
cov_file = "DS1_Cov.txt"

# Number of feature columns (the learnt `w` printed by this notebook has 20
# entries).
num_features = 20

# Presumably the train / validation / test fractions, in that order.
train_val_test_sizes = [0.6, 0.2, 0.2]

In [3]:
def get_matrix(feat, file_path, squeeze = True):
    """Read a header-less CSV file and return its values as a numpy array.

    Parameters
    ----------
    feat : column label or list of labels
        Column(s) to drop before converting to an array. Because the file is
        read with ``header=None`` the labels are the integer column positions.
    file_path : str
        Path of the CSV file to read.
    squeeze : bool, default True
        When true, remove the leading axis with ``np.squeeze(..., axis=0)``;
        this raises ``ValueError`` unless the file holds exactly one row.

    Returns
    -------
    numpy.ndarray
        The remaining values, 2-D when ``squeeze`` is false and with the first
        axis removed otherwise.
    """
    values = pd.read_csv(file_path, header = None).drop(columns = feat).values
    return np.squeeze(values, axis = 0) if squeeze else values

In [4]:
def get_x_y_values(dataset):
    """Split a labelled frame into a feature array and a label array.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``'class'`` column; every other column is treated
        as a feature.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` where ``features`` holds all columns except
        ``'class'`` and ``labels`` holds the ``'class'`` column.
    """
    features = dataset.drop(columns = 'class')
    labels = dataset['class']
    return features.values, labels.values

In [5]:
# Load the generated DS1 frames. 'Unnamed: 0' is the label pandas assigns to a
# header-less first column, so it is used as the index here (presumably the
# row index written out when the files were generated -- confirm).
df = pd.read_csv(gen_data_path + 'DS1', index_col = 'Unnamed: 0')
df_train = pd.read_csv(gen_data_path + 'DS1_train', index_col = 'Unnamed: 0')

In [6]:
# Features / labels used for evaluation.
# NOTE(review): `df` is loaded from 'DS1', not from a dedicated test split. If
# DS1 also contains the rows of DS1_train, the metrics reported further down
# include training data -- confirm which rows DS1 holds.
test_x, test_y = get_x_y_values(df)

In [7]:
def sigmoid(a):
    """Return the logistic sigmoid ``1 / (1 + exp(-a))`` of a scalar.

    The two branches are algebraically identical; they are chosen so that
    ``math.exp`` is only ever called with a non-positive argument. The naive
    form raises ``OverflowError`` for large negative ``a`` (``exp(-a)`` exceeds
    the float range), whereas this version saturates to ``0.0`` / ``1.0``.

    Parameters
    ----------
    a : float
        Scalar input (``math.exp`` does not accept arrays).

    Returns
    -------
    float
        Value in the closed interval ``[0, 1]``.
    """
    if a >= 0:
        return 1 / (1 + math.exp(-a))
    # a < 0: exp(a) is in (0, 1), so it cannot overflow.
    exp_a = math.exp(a)
    return exp_a / (1 + exp_a)

def set_coeff(data):
    """Estimate the linear discriminant coefficients ``(w, w0)`` from ``data``.

    With ``m0`` / ``m1`` the per-class feature means, ``S`` the covariance
    matrix and ``N0`` / ``N1`` the class counts, this computes::

        w  = S^-1 (m0 - m1)
        w0 = -1/2 m0^T S^-1 m0 + 1/2 m1^T S^-1 m1 + ln(N0 / N1)

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with a ``'class'`` column holding 0 / 1 labels; every
        other column is used as a feature. (Previously this argument was
        ignored and the global ``df_train`` was read instead.)

    Returns
    -------
    tuple
        ``(w, w0)``: ``w`` is a 1-D array with one entry per feature and
        ``w0`` is a scalar.

    Raises
    ------
    ValueError
        If ``data`` has no rows of class 0 or no rows of class 1.
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    features = data.drop('class', axis = 1)
    labels = data['class']

    neg_xs = np.array(features.loc[labels == 0])
    pos_xs = np.array(features.loc[labels == 1])
    all_xs = np.array(features)

    if len(neg_xs) == 0 or len(pos_xs) == 0:
        raise ValueError("set_coeff needs at least one sample of each class (0 and 1)")

    # get means and covariance matrix
    # NOTE(review): the covariance is taken over all rows pooled together, not
    # as the class-weighted within-class covariance -- confirm this is intended.
    cov = np.cov(all_xs.T)
    m0 = np.mean(neg_xs, axis = 0)
    m1 = np.mean(pos_xs, axis = 0)

    inv_cov = np.linalg.inv(cov)

    # calculate w and w0
    w = np.matmul(inv_cov, (m0 - m1))

    quad_neg = -0.5 * np.matmul(m0.T, np.matmul(inv_cov, m0))
    quad_pos = 0.5 * np.matmul(m1.T, np.matmul(inv_cov, m1))
    # Ratio of the class priors; the common 1/N factor cancels.
    log_prior_ratio = np.log(len(neg_xs) / len(pos_xs))

    w0 = quad_neg + quad_pos + log_prior_ratio
    return w, w0

def eval_x(x):
    """Predict the class label (0 or 1) of a single feature vector.

    Uses the module-level coefficients ``w_glob`` / ``w0_glob``. The linear
    score is passed through ``sigmoid``; its complement is rounded to the
    nearest integer with Python's ``round`` (which rounds an exact 0.5 to 0).

    Parameters
    ----------
    x : array-like
        Feature vector compatible with ``w_glob`` under matrix multiplication.

    Returns
    -------
    int
        The predicted label.
    """
    score = w_glob @ x + w0_glob
    complement = 1 - sigmoid(score)
    return round(complement)

In [8]:
# Fit the coefficients once; eval_x reads w_glob / w0_glob as module globals.
w_glob, w0_glob = set_coeff(df_train)

In [9]:
def test_model(data_x, data_y):
    TN, TP, FN, FP = 0, 0, 0, 0
    for x, y in zip(data_x, data_y):
        y_pred = eval_x(x)
        if y_pred == y:
            if y_pred:
                TP += 1
            else: 
                TN += 1
        else: 
            if y_pred:
                FP += 1
            else: 
                FN += 1
    
    accuracy = (TN + TP)/len(data_y)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F_measure = (2 * recall * precision) / (recall + precision)
    
    return accuracy, precision, recall, F_measure

In [10]:
# Evaluate the fitted classifier on the evaluation arrays built earlier.
accuracy, precision, recall, F_measure = test_model(test_x, test_y)

In [11]:
# Report the four metrics returned by test_model.
print("The accuracy is: %s." % (accuracy))
print("The precision is: %s." % (precision))
print("The recall is: %s." % (recall))
print("The F-measure is: %s." % (F_measure))

The accuracy is: 0.9725.
The precision is: 0.9753694581280788.
The recall is: 0.9705882352941176.
The F-measure is: 0.972972972972973.


In [12]:
# Show the bias term and weight vector returned by set_coeff.
print("The learnt w0 coefficiants is: %s." % w0_glob)
print("The learnt w coefficiants are: %s." % w_glob)

The learnt w0 coefficiants is: 6.95577207861.
The learnt w coefficiants are: [ 3.68175705 -2.23886983 -1.37359386 -0.76470616 -2.53453359 -1.14754645
  4.25172267 -6.26509508 -7.47615356  2.44177177 -3.34774001 -3.02371555
  3.93788275  3.26562175 -1.46690355  3.40306674  7.50179268 -1.73910642
 -0.06064171 -1.26718443].
