In [1]:
import numpy as np
import pandas as pd

In [2]:
import operator

from copy import copy

In [3]:
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv("../data/train.txt", sep='\t', header=None, names=["users", "items", "rank"])
df.head()

Unnamed: 0,users,items,rank
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [5]:
df["rank"].unique()

array([5, 3, 4, 1, 2])

In [6]:
n_users = df["users"].max()
n_items = df["items"].max()

In [7]:
df[["users", "items"]] -= 1
df.head()

Unnamed: 0,users,items,rank
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3


In [8]:
def score(matrix, values):
    y_pred = [matrix[user, item] for user, item, rank in values]
    y_true = map(operator.itemgetter(2), values)
    return mean_squared_error(y_true, y_pred)

In [9]:
def iter_step(weights, bias, df_grouped, C, alpha):
    weights_, bias_ = copy(weights), copy(bias)
    
    for user, group in df_grouped["users"]:
        index = group["items"].values
        
        Q = weights_["items"][index]
        Q = np.hstack((np.ones(shape=(len(index), 1), dtype=float), Q))
        r = group["rank"].values - bias_["items"][index]
        G = np.diag(1.0 + alpha * r)
        
        A = np.dot(Q.T, np.dot(G, Q)) + C * np.eye(Q.shape[1])
        b = np.dot(Q.T, np.dot(G, r))
        
        solution = np.linalg.solve(A, b)
        weights_["users"][user], bias_["users"][user] = solution[1:], solution[0] 
    
    for item, group in df_grouped["items"]:
        index = group["users"].values
        
        P = weights_["users"][index]
        P = np.hstack((np.ones(shape=(len(index), 1), dtype=float), P))
        r = group["rank"].values - bias_["users"][index]
        G = np.diag(1.0 + alpha * r)
        
        A = np.dot(P.T, np.dot(G, P)) + C * np.eye(P.shape[1])
        b = np.dot(P.T, np.dot(G, r))
        
        solution = np.linalg.solve(A, b)
        weights_["items"][item], bias_["items"][item] = solution[1:], solution[0]
        
    return weights_, bias_

In [10]:
index = np.random.permutation(df.shape[0])
beta = int(0.8 * len(index))

df_train, df_valid = df.iloc[index[:beta]], df.iloc[index[beta:]]

In [11]:
df_grouped = {cat : df_train.groupby(by=cat) for cat in ["users", "items"]}

In [12]:
C = 5.0
n_features = 2

In [13]:
alpha_range = np.arange(0, 0.31, 0.01).tolist()
alpha = float(len(df)) / (n_users * n_items)
alpha_range.append(alpha)
alpha_range = sorted(alpha_range)

In [14]:
weights = dict()
weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
bias = dict()
bias["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, ))
bias["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, ))

scores = []

for alpha_i, alpha in enumerate(alpha_range):
    weights_, bias_ = copy(weights), copy(bias)
    
    for i in range(20):
        weigths, bias = iter_step(weights, bias, df_grouped, C, alpha)
            
    matrix = bias["users"].reshape(-1, 1) + bias["items"].reshape(1, -1) \
        + np.dot(weights["users"], weights["items"].T)
    scores.append(
        (score(matrix, df_train.values),
         score(matrix, df_valid.values))
    )
        
    print "\r{} of {} iters passed...".format(alpha_i + 1, len(alpha_range)),
    
for alpha, scr in zip(alpha_range, scores):
    print "\r{:>10.5f} {:>10.5f} {:>10.5f}".format(alpha, *scr)

   0.00000    0.74190    0.87380
   0.01000    0.73447    0.86290
   0.02000    0.73247    0.86090
   0.03000    0.73047    0.85918
   0.04000    0.72869    0.85768
   0.05000    0.72712    0.85637
   0.05710    0.72612    0.85555
   0.06000    0.72574    0.85524
   0.07000    0.72453    0.85428
   0.08000    0.72348    0.85346
   0.09000    0.72257    0.85277
   0.10000    0.72179    0.85220
   0.11000    0.72113    0.85174
   0.12000    0.72056    0.85137
   0.13000    0.72010    0.85109
   0.14000    0.71971    0.85088
   0.15000    0.71938    0.85074
   0.16000    0.71910    0.85067
   0.17000    0.71889    0.85070
   0.18000    0.71876    0.85080
   0.19000    0.71871    0.85096
   0.20000    0.71873    0.85117
   0.21000    0.71880    0.85144
   0.22000    0.71892    0.85174
   0.23000    0.71909    0.85208
   0.24000    0.71931    0.85246
   0.25000    0.71956    0.85288
   0.26000    0.71986    0.85332
   0.27000    0.72020    0.85379
   0.28000    0.72056    0.85428
   0.29000