In [28]:
import numpy as np
import math
from sklearn.metrics import roc_auc_score

In [29]:
def load_data(filename):
    '''
    Load a whitespace-delimited text file as a list of token rows.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One entry per non-blank line, in file order. Each entry holds that
        line's whitespace-separated tokens as strings; no numeric
        conversion is performed.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened or read.
    '''
    rows = []
    # The context manager closes the handle even if reading raises.
    with open(filename, 'r') as f:
        for line in f:
            # split() with no argument tolerates repeated spaces, tabs,
            # trailing blanks and a trailing carriage return from CRLF
            # files, and yields [] for a blank line (which is skipped).
            tokens = line.split()
            if tokens:
                rows.append(tokens)
    return rows

In [30]:
def samples_dict(features, labels):
    '''
    Group sparse feature rows into one dictionary per sample.

    Parameters
    ----------
    features : iterable of sequences
        Each row is indexed as row[0] (sample id), row[1] (feature id) and
        row[2] (value). Values are stored exactly as given.
    labels : sized
        Only its length is used: an empty entry is created for every id
        str(1) .. str(len(labels)), so samples without features still appear.

    Returns
    -------
    dict
        Maps sample id -> {feature id -> value}.

    Raises
    ------
    KeyError
        If a row's sample id is not one of the pre-created ids.
    '''
    per_sample = {str(sample_no): {} for sample_no in range(1, len(labels) + 1)}
    for row in features:
        sample_id, feature_id, amount = row[0], row[1], row[2]
        per_sample[sample_id][feature_id] = amount
    return per_sample

In [31]:
def log_p_pair(log_p0, log_p1, dict_x):
    '''
    Return the log posterior of both classes for one sample.

    Each class score starts from a log prior of log(0.5) and adds
    count * log-probability for every feature in the sample. The two
    scores are then normalised with the log-sum-exp trick (the larger
    score is subtracted before exponentiating) so that very negative
    scores do not underflow to zero for both classes.

    Parameters
    ----------
    log_p0, log_p1 : mapping
        Feature id -> log-probability of that feature under class 0 / 1.
    dict_x : dict
        Feature id -> count for the sample; counts are passed through int().

    Returns
    -------
    tuple of float
        (log P(class 0 | x), log P(class 1 | x)).

    Raises
    ------
    KeyError
        If a feature of the sample is missing from log_p0 or log_p1.
    '''
    score0 = math.log(0.5)
    score1 = math.log(0.5)
    for feature, raw_count in dict_x.items():
        count = int(raw_count)
        score0 += count * log_p0[feature]
        score1 += count * log_p1[feature]

    # Shift by the larger score so at least one exponent is exactly 0.
    pivot = max(score0, score1)
    log_norm = pivot + math.log(math.exp(score0 - pivot) + math.exp(score1 - pivot))
    return (score0 - log_norm, score1 - log_norm)

In [32]:
# Load the raw splits. Feature rows are consumed downstream as
# [sample id, feature id, count]; label rows are flattened into a 1-D array
# (presumably one label per line -- confirm against the data files).
test_features = load_data('test-features.txt')
test_labels = np.ravel(load_data('test-labels.txt'))
train_features = load_data('train-features.txt')
train_labels = np.ravel(load_data('train-labels.txt'))

# Per-sample {feature id: count} dictionaries, keyed by 1-based sample id.
train_samples = samples_dict(train_features, train_labels)
test_samples = samples_dict(test_features, test_labels)

# Distinct feature ids (column 1) in each split. np.asarray replaces np.mat:
# the matrix class is discouraged and the np.mat alias no longer exists in
# NumPy 2.0; slicing a plain 2-D array already yields the 1-D column.
uni_features = np.unique(np.asarray(train_features)[:, 1])
test_uni_features = np.unique(np.asarray(test_features)[:, 1])

# Largest feature id seen in either split; it sizes the smoothed count tables.
total_words = int(max(uni_features.astype(int).max(),
                      test_uni_features.astype(int).max()))

In [33]:
# Additive smoothing: every feature id from 0 to total_words (inclusive)
# starts with a pseudo-count of `smoothing` in both classes, so no feature
# ends up with zero probability.
smoothing = 1
feature_ids = [str(feature_no) for feature_no in range(total_words + 1)]
features_times_0 = dict.fromkeys(feature_ids, smoothing)
features_times_1 = dict.fromkeys(feature_ids, smoothing)

# Add each training row's count to the table of its sample's class.
# Sample ids are 1-based, labels are 0-based; any label other than '0' is
# treated as class 1.
for row in train_features:
    sample_label = train_labels[int(row[0]) - 1]
    class_counts = features_times_0 if sample_label == '0' else features_times_1
    class_counts[row[1]] += int(row[2])

# Total smoothed count per class (the normaliser for the probabilities).
total_times_0 = sum(features_times_0.values())
total_times_1 = sum(features_times_1.values())

In [34]:
# Per-class log-probability of each feature:
# log(smoothed count) - log(total smoothed count of the class).
log_total_0 = math.log(total_times_0)
log_total_1 = math.log(total_times_1)
log_p_0 = {feature: math.log(count) - log_total_0
           for feature, count in features_times_0.items()}
log_p_1 = {feature: math.log(count) - log_total_1
           for feature, count in features_times_1.items()}

In [35]:
# Score every training sample with the fitted model.
prob_arr = []   # P(class 1 | sample), used as the ranking score for AUC
results = []    # predicted labels as '0' / '1' strings, like train_labels
for i in range(len(train_labels)):
    p_0, p_1 = log_p_pair(log_p_0, log_p_1, train_samples[str(i+1)])
    prob_arr.append(math.exp(p_1))
    # Ties are resolved in favour of class '0'.
    results.append('0' if p_0 >= p_1 else '1')

# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the print statement is a SyntaxError on Python 3.
print('3(a) classification error on training set')
err = 0.0
for predicted, actual in zip(results, train_labels):
    if predicted != actual:
        err += 1
print(err/len(train_labels))

print('3(a) AUC on training set')
print(roc_auc_score(train_labels.astype(int), prob_arr))

3(a) classification error on training set
0.0128571428571
3(a) AUC on training set
0.999763265306


In [36]:
# Score every held-out test sample with the model fitted on the training set.
test_prob_arr = []   # P(class 1 | sample), used as the ranking score for AUC
test_results = []    # predicted labels as '0' / '1' strings, like test_labels
for i in range(len(test_labels)):
    p_0, p_1 = log_p_pair(log_p_0, log_p_1, test_samples[str(i+1)])
    test_prob_arr.append(math.exp(p_1))
    # Ties are resolved in favour of class '0'.
    test_results.append('0' if p_0 >= p_1 else '1')

# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the print statement is a SyntaxError on Python 3.
print('3(b) classification error on test set')
test_err = 0.0
for predicted, actual in zip(test_results, test_labels):
    if predicted != actual:
        test_err += 1
print(test_err/len(test_labels))

print('3(b) AUC on test set')
print(roc_auc_score(test_labels.astype(int), test_prob_arr))

3(b) classification error on test set
0.0192307692308
3(b) AUC on test set
0.989881656805


3(b)  Comparison

Comparing the results on the test set with those on the training set, the test AUC (0.9899) is slightly lower than the training AUC (0.9998), and the test classification error (1.92%) is higher than the training error (1.29%). Nevertheless, the model fitted on the training set still performs well on the test set.