In [8]:
import functools
import os
import pickle
import sys
import time

import numpy as np
from scipy import sparse
from scipy.spatial.distance import cosine
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [6]:
def cal_time(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    The wrapped function's return value is passed through unchanged. If
    ``func`` raises, the exception propagates and no timing line is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature as ``func`` that keeps its
        ``__name__`` and ``__doc__``.
    """
    # Without functools.wraps every decorated function would report itself
    # as "wrapper" and lose its docstring.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print("%s training time: %s secs." % (func.__name__, t2 - t1))
        return result
    return wrapper


def load_data(path='data/'):
    """Load the train and test matrices stored as SciPy sparse ``.npz`` files.

    Args:
        path: Directory containing ``train.npz`` and ``test.npz``. A trailing
            path separator is optional.

    Returns:
        A tuple ``(train, test_X, test_y)`` where
        ``train`` is the train matrix converted to a dense array,
        ``test_X`` is a two-column array holding the (row, col) index of each
        stored entry of the test matrix, and
        ``test_y`` holds the corresponding stored values, in the same order.
    """
    # os.path.join keeps this working whether or not `path` ends with a
    # separator; plain concatenation silently built "datatrain.npz".
    train = sparse.load_npz(os.path.join(path, "train.npz"))
    # Convert to COO once and reuse it for rows, columns and values.
    test = sparse.load_npz(os.path.join(path, "test.npz")).tocoo()
    test_X = np.c_[test.row, test.col]
    test_y = test.data
    return train.toarray(), test_X, test_y


@cal_time
def get_similarity(M):
    """Compute the pairwise cosine similarity between the rows of ``M``.

    Args:
        M: 2-D array; each row is treated as one vector.

    Returns:
        A square array whose ``[i, j]`` entry is
        ``(M[i] . M[j]) / (|M[i]| * |M[j]|)``. Rows with zero norm get a
        similarity of 0 to every row (including themselves) instead of NaN.
    """
    module = np.linalg.norm(M, axis=1).reshape(-1, 1)
    # An all-zero row has norm 0, which would yield 0/0 = NaN (plus a runtime
    # warning). Its dot products are all 0 anyway, so dividing by 1 instead
    # gives a clean similarity of 0 and leaves every other entry untouched.
    safe_module = module.copy()
    safe_module[safe_module == 0] = 1.0
    return M @ M.T / safe_module.T / safe_module

@cal_time
def trainCF(train, SIM, test_X):
    """Predict a score for each (user, movie) pair by similarity-weighted averaging.

    For every pair, the rows of ``train`` that have a non-zero entry in the
    movie's column are collected, and their entries are averaged using the
    matching ``SIM[user, ...]`` values as weights.

    Args:
        train: 2-D array indexed as ``[user, movie]``; a zero entry is treated
            as "no rating".
        SIM: 2-D array indexed as ``[user, other_user]`` holding the weights.
        test_X: Iterable of ``(user, movie)`` index pairs to predict.

    Returns:
        A list with one predicted score per pair, in the order of ``test_X``.
        When the weighted average is undefined (no rows rated the movie, or
        the weights sum to zero or are not finite) the prediction falls back
        to the plain mean of the movie's ratings, and if there are none, to
        the mean of all non-zero entries of ``train`` (0.0 if it has none).
    """
    # Last-resort fallback: mean over every observed (non-zero) rating.
    n_observed = np.count_nonzero(train)
    global_mean = float(np.sum(train) / n_observed) if n_observed else 0.0

    pred_y = []
    for user, movie in test_X:
        others = train[:, movie].nonzero()[0]
        sims = SIM[user, others].reshape(-1)
        score = train[others, movie].reshape(-1)
        denom = np.sum(sims)
        if denom != 0 and np.isfinite(denom):
            s = np.sum(sims * score) / denom
        elif score.size:
            # Weights carry no information; use the unweighted movie mean
            # rather than emitting NaN (which would break the RMSE step).
            s = float(np.mean(score))
        else:
            s = global_mean
        pred_y.append(s)
    return pred_y

def rmse(test_y, pred_y):
    """Return the root of the mean squared error between ``test_y`` and ``pred_y``."""
    mse = mean_squared_error(test_y, pred_y)
    return np.sqrt(mse)

In [3]:
# Dense train matrix, plus the (row, col) indices and values of the test entries.
(train, test_X, test_y) = load_data(path="data/")
# Precomputed similarity matrix read from disk.
# NOTE(review): no visible cell writes data/sims.npy; presumably it is the
# saved output of get_similarity(train) — confirm before a fresh run.
SIM = np.load("data/sims.npy")

In [9]:
# Predict every test pair, then report the error against the true values.
pred_y = trainCF(train=train, SIM=SIM, test_X=test_X)
print("RMSE=%f" % (rmse(test_y, pred_y),))

trainCF training time: 188.69590973854065 secs.
RMSE=1.018369
