In [13]:
from mnist import MNIST
import sklearn.metrics as metrics
import numpy as np
import scipy as sp
import numpy.ma as ma
from scipy.stats import mode
from scipy.io import loadmat
import csv, time, math
import matplotlib.pyplot as plt
from collections import defaultdict


def load_data(filename):
    data_dict = loadmat(filename, mat_dtype=True)
    return data_dict['train']

def write_prediction(label_test, filename):
    print "Writing to file {}...".format(filename)
    label_test = label_test.flatten()
    with open(filename, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=' ', quotechar=' ', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["Id,Category"])
        for i, cat in enumerate(label_test):
            spamwriter.writerow([str(i+1) + "," + str(int(cat))])

def naive_svd(U, S, V, data, filename):
    valid, labels = [], []
    R_estimate = U.dot(S).dot(V.T)
    valid_data = np.zeros((100, 100))
    with open(filename) as fh:
        for line in fh:
            user, joke, score = np.array(line.split(','), dtype=int)
            valid_data[user-1][joke-1] = score
            valid.append(score)
            labels.append(1 if R_estimate[user-1][joke-1]>0 else 0)
    return valid, labels

def predict_report(R_estimate, filename):
    valid, labels = [], []
    valid_data = np.zeros((100, 100))
    with open(filename) as fh:
        for line in fh:
            user, joke, score = np.array(line.split(','), dtype=int)
            valid_data[user-1][joke-1] = score
            valid.append(score)
            labels.append(1 if R_estimate[user-1][joke-1]>0 else 0)
    return valid, labels

def MSE(Up, Vp, data, d, max_iter=1000, reg=10):
    Up_old, Vp_old = Up.copy(), Vp.copy()
    valid, labels = [], []
    for _ in range(max_iter):
        for i in range(Up.shape[0]):
            A = reg*np.eye(d)
            B = np.zeros((d, ))
            for j in range(Vp.shape[0]):
                if not np.isnan(data[i][j]):
                    A += np.outer(Vp[j], Vp[j])
                    B += data[i][j]*Vp[j]
            u = sp.linalg.solve(A, B)
            Up[i] = u

        for j in range(Vp.shape[0]):
            A = reg*np.eye(d)
            B = np.zeros((d, ))
            for i in range(Up.shape[0]):
                if not np.isnan(data[i][j]):
                    A += np.outer(Up[i], Up[i])
                    B += data[i][j]*Up[i]
            v = sp.linalg.solve(A, B)
            Vp[j] = v

        if np.allclose(Up_old, Up, atol=1e-08) and np.allclose(Vp_old, Vp, atol=1e-08):
            break
        else:
            Up_old, Vp_old = Up.copy(), Vp.copy()
    R_estimate = Up.dot(Vp.T)
    return R_estimate

In [2]:
raw_data = load_data("./joke_data/joke_train.mat")
data = np.nan_to_num(raw_data)

In [3]:
U, S, V = np.linalg.svd(data)
U2, S2, V2 = U[:, :2], np.diag(S[:2]), V[:2, :].T
U5, S5, V5 = U[:, :5], np.diag(S[:5]), V[:5, :].T
U10, S10, V10 = U[:, :10], np.diag(S[:10]), V[:10, :].T
U20, S20, V20 = U[:, :20], np.diag(S[:20]), V[:20, :].T

In [5]:
valid, labels = naive_svd(U2, S2, V2, data, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.705149051491..


In [6]:
valid, labels = naive_svd(U5, S5, V5, data, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.715447154472..


In [7]:
valid, labels = naive_svd(U10, S10, V10, data, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.716531165312..


In [8]:
valid, labels = naive_svd(U20, S20, V20, data, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.685907859079..


In [30]:
R_estimate = MSE(U2, V2, raw_data, 2, max_iter=10, reg=300)
valid, labels = predict_report(R_estimate, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.706775067751..


In [31]:
R_estimate = MSE(U5, V5, raw_data, 5, max_iter=10, reg=300)
valid, labels = predict_report(R_estimate, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.719783197832..


In [32]:
R_estimate = MSE(U10, V10, raw_data, 10, max_iter=10, reg=300)
valid, labels = predict_report(R_estimate, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.731978319783..


In [33]:
R_estimate = MSE(U20, V20, raw_data, 20, max_iter=10, reg=300)
valid, labels = predict_report(R_estimate, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.733604336043..


In [34]:
R_estimate = MSE(U10, V10, raw_data, 10, max_iter=10, reg=300)
valid, labels = predict_report(R_estimate, "./joke_data/validation.txt")
print "Accuracy Score: {}..".format(metrics.accuracy_score(labels, valid))

Accuracy Score: 0.733875338753..


In [35]:
labels = []
with open("./joke_data/query.txt") as fh:
    for line in fh:
        id, user, joke = np.array(line.split(','), dtype=int)
        labels.append(1 if R_estimate[user-1][joke-1]>0 else 0)
write_prediction(np.array(labels), "kaggle_submission.txt")

Writing to file kaggle_submission.txt...
