In [1]:
import numpy as np
import pandas as pd

In [2]:
import operator

from copy import copy

In [3]:
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv("../data/train.txt", sep='\t', header=None, names=["users", "items", "rank"])
df.head()

Unnamed: 0,users,items,rank
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [5]:
df["rank"].unique()

array([5, 3, 4, 1, 2])

In [6]:
n_users = df["users"].max()
n_items = df["items"].max()

In [7]:
df[["users", "items"]] -= 1
df.head()

Unnamed: 0,users,items,rank
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3


In [8]:
def score(matrix, values):
    y_pred = [matrix[user, item] for user, item, rank in values]
    y_true = map(operator.itemgetter(2), values)
    return mean_squared_error(y_true, y_pred)

In [9]:
def iter_step(weights, bias, df_grouped, C, alpha):
    weights_, bias_ = copy(weights), copy(bias)
    
    for user, group in df_grouped["users"]:
        index = group["items"].values
        
        Q = weights_["items"][index]
        Q = np.hstack((np.ones(shape=(len(index), 1), dtype=float), Q))
        r = group["rank"].values - bias_["items"][index]
        G = np.diag(1.0 + alpha * r)
        
        A = np.dot(Q.T, np.dot(G, Q)) + C * np.eye(Q.shape[1])
        b = np.dot(Q.T, np.dot(G, r))
        
        solution = np.linalg.solve(A, b)
        weights_["users"][user], bias_["users"][user] = solution[1:], solution[0] 
    
    for item, group in df_grouped["items"]:
        index = group["users"].values
        
        P = weights_["users"][index]
        P = np.hstack((np.ones(shape=(len(index), 1), dtype=float), P))
        r = group["rank"].values - bias_["users"][index]
        G = np.diag(1.0 + alpha * r)
        
        A = np.dot(P.T, np.dot(G, P)) + C * np.eye(P.shape[1])
        b = np.dot(P.T, np.dot(G, r))
        
        solution = np.linalg.solve(A, b)
        weights_["items"][item], bias_["items"][item] = solution[1:], solution[0]
        
    return weights_, bias_

In [10]:
index = np.random.permutation(df.shape[0])
beta = int(0.8 * len(index))

df_train, df_valid = df.iloc[index[:beta]], df.iloc[index[beta:]]

In [11]:
df_grouped = {cat : df_train.groupby(by=cat) for cat in ["users", "items"]}

In [12]:
alpha_range = np.arange(0, 0.31, 0.01).tolist()
alpha = float(len(df)) / (n_users * n_items)
alpha_range.append(alpha)
alpha_range = sorted(alpha_range)

In [15]:
C_range = [0.01, 0.1, 1.0, 5.0, 10.0, 25.0, 50.0, 100.0]
features_range = range(3, 10) + [15, 30, 50, 100, 150, 200]

In [16]:
scores = dict()

for feature_i, n_features in enumerate(features_range):
    weights = dict()
    weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
    weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
    bias = dict()
    bias["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, ))
    bias["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, ))
    
    weights_, bias_ = copy(weights), copy(bias)
    
    for C in C_range:
        for alpha in alpha_range:
            weights = copy(weights_)
            bias = copy(bias_)
        
            scores_ = []
            for i in range(20):
                weigths, bias = iter_step(weights, bias, df_grouped, C, alpha)
                matrix = bias["users"].reshape(-1, 1) + bias["items"].reshape(1, -1) \
                    + np.dot(weights["users"], weights["items"].T)
                scores_.append(
                    (score(matrix, df_train.values),
                     score(matrix, df_valid.values))
                )

            scores_ = sorted(scores_, key=operator.itemgetter(1))
            params = (n_features, C, alpha)
            scores[params] = scores_[0]
        
            print "n_factors = {:>6}; C = {:>6.4f}; alpha = {:>6.2f}; {:>10.5f} {:>10.5f}".format(
                params[0], params[1], params[2], scores_[0][0], scores_[0][1])

n_factors =      3; C = 0.0100; alpha =   0.00;    0.77484    1.19125
n_factors =      3; C = 0.0100; alpha =   0.01;    0.62065    1.41355
n_factors =      3; C = 0.0100; alpha =   0.02;    0.61929    1.50615
n_factors =      3; C = 0.0100; alpha =   0.03;    0.61699    1.63346
n_factors =      3; C = 0.0100; alpha =   0.04;    0.61663    1.56553
n_factors =      3; C = 0.0100; alpha =   0.05;    0.61664    1.54525
n_factors =      3; C = 0.0100; alpha =   0.06;    0.61689    1.53733
n_factors =      3; C = 0.0100; alpha =   0.06;    0.61693    1.52698
n_factors =      3; C = 0.0100; alpha =   0.07;    0.61752    1.52225
n_factors =      3; C = 0.0100; alpha =   0.08;    0.61815    1.50816
n_factors =      3; C = 0.0100; alpha =   0.09;    0.61872    1.50100
n_factors =      3; C = 0.0100; alpha =   0.10;    0.61954    1.50016
n_factors =      3; C = 0.0100; alpha =   0.11;    0.61981    1.49410
n_factors =      3; C = 0.0100; alpha =   0.12;    0.62055    1.52387
n_factors =      3; 

KeyboardInterrupt: 

In [None]:
def comparator(a, b):
    result = a[1][1] - b[1][1]
    if result == 0:
        return a[1][0] - b[1][0]
    else:
        return result

In [None]:
sorted(scores.items(), cmp=comparator)

In [None]:
C_range = [1.0, 5.0, 10.0, 25.0, 50.0]
features_range = range(3, 10)

In [None]:
print " " * (len("\rtrain: {:>4}".format(0)) - 1),
for C in C_range:
    print "{:>10}".format(C),
print

for feature_i, n_features in enumerate(features_range):
    weights = dict()
    weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
    weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
    bias = dict()
    bias["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, ))
    bias["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, ))
    
    weights_, bias_ = copy(weights), copy(bias)
    scores = []
    
    for C_i, C in enumerate(C_range):
        weights = copy(weights_)
        bias = copy(bias_)
        
        for i in range(20):
            weigths, bias = iter_step(weights, bias, df_grouped, C, alpha)
            
        matrix = bias["users"].reshape(-1, 1) + bias["items"].reshape(1, -1) \
            + np.dot(weights["users"], weights["items"].T)
        scores.append(
            (score(matrix, df_train.values),
             score(matrix, df_valid.values))
        )
        
        print "\r{} of {} iters passed...".format(C_i + 1, len(C_range)),
        
    print "\rtrain: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(0), scores):
        print "{:>10.5f}".format(scr),
    print
    
    print "\rvalid: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(1), scores):
        print "{:>10.5f}".format(scr),
    print
    