In [1]:
import numpy as np
import pandas as pd

In [2]:
import operator

from copy import copy

In [3]:
from sklearn.metrics import mean_squared_error

In [4]:
# The ratings file is tab-separated and has no header row, so the three
# column names are supplied here.
df = pd.read_csv(
    "../data/train.txt",
    sep="\t",
    header=None,
    names=["users", "items", "rank"],
)
df.head()

Unnamed: 0,users,items,rank
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [5]:
# Distinct values present in the "rank" column, in order of first appearance.
pd.unique(df["rank"])

array([5, 3, 4, 1, 2])

In [6]:
# Largest id on each axis. These values size the per-user / per-item parameter
# arrays, so they are taken here, before the ids are shifted to start at zero.
n_users, n_items = df[["users", "items"]].max()

In [7]:
# Convert the 1-based ids from the file into 0-based array indices.
# The shift is guarded so that re-running this cell is a no-op: an unguarded
# second run would produce id -1, which numpy indexing silently resolves to
# the *last* row instead of raising an error.
if (df[["users", "items"]].min() >= 1).all():
    df[["users", "items"]] -= 1
df.head()

Unnamed: 0,users,items,rank
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3


In [8]:
def score(matrix, values):
    """Mean squared error between `matrix` entries and observed ranks.

    Parameters
    ----------
    matrix : 2-D array indexable as ``matrix[user, item]``
        Predicted value for every (user, item) pair.
    values : iterable of (user, item, rank) triples
        Observations to evaluate; ``user`` and ``item`` are used directly as
        indices into `matrix`.

    Returns
    -------
    The value returned by ``mean_squared_error(y_true, y_pred)``.
    """
    y_true, y_pred = [], []
    for user, item, rank in values:
        y_pred.append(matrix[user, item])
        y_true.append(rank)
    return mean_squared_error(y_true, y_pred)

In [9]:
def iter_step(weights, bias, df_grouped, C):
    """Run one alternating least-squares sweep: all users, then all items.

    For every user the item parameters are held fixed and a ridge regression
    is solved for that user's bias and factor vector. The item sweep then does
    the same against the freshly updated user parameters. The penalty ``C`` is
    added to the whole diagonal of the normal equations, so it applies to the
    bias term as well as to every factor.

    Parameters
    ----------
    weights : dict
        ``{"users": array, "items": array}`` of latent factors, one row per id.
    bias : dict
        ``{"users": array, "items": array}`` of per-id biases.
    df_grouped : dict
        ``{"users": groupby, "items": groupby}``; every group must provide the
        columns ``"users"``, ``"items"`` and ``"rank"``.
    C : float
        Ridge penalty.

    Returns
    -------
    (dict, dict)
        The updated weights and bias dicts.

    Notes
    -----
    ``copy`` is shallow: the returned dicts are new objects, but they hold the
    very same arrays as the arguments, so the caller's arrays are updated in
    place.
    """
    new_weights = copy(weights)
    new_bias = copy(bias)

    def refit(side, other):
        # One regularised least-squares solve per id on `side`, treating the
        # parameters of `other` as constants.
        for key, rows in df_grouped[side]:
            partners = rows[other].values

            # The leading column of ones lets the same solve produce the bias
            # (first coefficient) together with the factors (the rest).
            design = np.column_stack(
                (np.ones(len(partners), dtype=float), new_weights[other][partners])
            )
            target = rows["rank"].values - new_bias[other][partners]

            gram = design.T.dot(design) + C * np.identity(design.shape[1])
            moment = design.T.dot(target)

            fitted = np.linalg.solve(gram, moment)
            new_weights[side][key] = fitted[1:]
            new_bias[side][key] = fitted[0]

    refit("users", "items")
    refit("items", "users")

    return new_weights, new_bias

In [10]:
# Random 80/20 train/validation split over the rows of `df`.
# A dedicated, seeded generator keeps the partition (and therefore every score
# computed from it) identical across kernel restarts, without touching the
# global numpy random state.
split_rng = np.random.RandomState(42)
index = split_rng.permutation(df.shape[0])
alpha = int(0.8 * len(index))  # number of rows that go to the training part

df_train, df_valid = df.iloc[index[:alpha]], df.iloc[index[alpha:]]

In [11]:
# Training rows grouped once per axis, so each sweep can iterate over
# "all rows of one user" and "all rows of one item" without regrouping.
df_grouped = {
    "users": df_train.groupby(by="users"),
    "items": df_train.groupby(by="items"),
}

In [16]:
# Search grid: regularisation strengths 10^-2 .. 10^2 (one per decade) and the
# candidate numbers of latent factors.
C_range = 10.0 ** np.arange(-2, 3)
features_range = [5, 10, 15, 30, 50, 100]

[  1.00000000e-02   1.00000000e-01   1.00000000e+00   1.00000000e+01
   1.00000000e+02]
[5, 10, 15, 30, 50, 100]


In [13]:
print " " * (len("\rtrain: {:>4}".format(0)) - 1),
for C in C_range:
    print "{:>10}".format(C),
print

for feature_i, n_features in enumerate(features_range):
    weights = dict()
    weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
    weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
    bias = dict()
    bias["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, ))
    bias["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, ))
    
    weights_, bias_ = copy(weights), copy(bias)
    scores = []
    
    for C_i, C in enumerate(C_range):
        weights = copy(weights_)
        bias = copy(bias_)
        
        for i in range(20):
            weigths, bias = iter_step(weights, bias, df_grouped, C)
            
        matrix = bias["users"].reshape(-1, 1) + bias["items"].reshape(1, -1) \
            + np.dot(weights["users"], weights["items"].T)
        scores.append(
            (score(matrix, df_train.values),
             score(matrix, df_valid.values))
        )
        
        print "\r{} of {} iters passed...".format(C_i + 1, len(C_range)),
        
    print "\rtrain: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(0), scores):
        print "{:>10.5f}".format(scr),
    print
    
    print "\rvalid: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(1), scores):
        print "{:>10.5f}".format(scr),
    print
    

                  0.01        0.1        1.0       10.0      100.0
train:    5    0.54061    0.53079    0.54662    0.72215    2.26989
valid:    5    1.79758    1.30202    0.96363    0.90310    2.32198
train:   10    0.38761    0.37861    0.40183    0.63300    2.26989
valid:   10    2.49061    1.91308    1.16880    0.90813    2.32198
train:   15    0.28554    0.27345    0.29726    0.57174    2.26989
valid:   15    3.31811    2.23528    1.31672    0.91768    2.32199
train:   30    0.10498    0.09367    0.11582    0.46683    2.26989
valid:   30    3.71704    2.78308    1.65124    0.93212    2.32198
train:   50    0.01888    0.01357    0.02945    0.40912    2.26989
valid:   50    3.36446    2.77980    1.72317    0.92823    2.32197
train:  100    0.00005    0.00007    0.00645    0.38320    2.26989
valid:  100    1.77428    1.57048    1.17212    0.91852    2.32195


In [20]:
# Follow-up sweep over larger factor counts; C_range is not reassigned here.
features_range = [150, 200]




In [21]:
print " " * (len("\rtrain: {:>4}".format(0)) - 1),
for C in C_range:
    print "{:>10}".format(C),
print

for feature_i, n_features in enumerate(features_range):
    weights = dict()
    weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
    weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
    bias = dict()
    bias["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, ))
    bias["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, ))
    
    weights_, bias_ = copy(weights), copy(bias)
    scores = []
    
    for C_i, C in enumerate(C_range):
        weights = copy(weights_)
        bias = copy(bias_)
        
        for i in range(20):
            weigths, bias = iter_step(weights, bias, df_grouped, C)
            
        matrix = bias["users"].reshape(-1, 1) + bias["items"].reshape(1, -1) \
            + np.dot(weights["users"], weights["items"].T)
        scores.append(
            (score(matrix, df_train.values),
             score(matrix, df_valid.values))
        )
        
        print "\r{} of {} iters passed...".format(C_i + 1, len(C_range)),
        
    print "\rtrain: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(0), scores):
        print "{:>10.5f}".format(scr),
    print
    
    print "\rvalid: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(1), scores):
        print "{:>10.5f}".format(scr),
    print
    

                  0.01        0.1        1.0       10.0      100.0
train:  150    0.00000    0.00002    0.00538    0.38280    2.26989
valid:  150    1.30675    1.23441    1.04093    0.91844    2.32198
train:  200    0.00000    0.00002    0.00529    0.38281    2.26989
valid:  200    1.15587    1.13891    1.02496    0.91844    2.32198


In [22]:
# Narrower sweep: C between 1 and 50, and 3 to 9 latent factors.
C_range = [1.0, 5.0, 10.0, 25.0, 50.0]
features_range = range(3, 10)

In [23]:
print " " * (len("\rtrain: {:>4}".format(0)) - 1),
for C in C_range:
    print "{:>10}".format(C),
print

for feature_i, n_features in enumerate(features_range):
    weights = dict()
    weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
    weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
    bias = dict()
    bias["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, ))
    bias["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, ))
    
    weights_, bias_ = copy(weights), copy(bias)
    scores = []
    
    for C_i, C in enumerate(C_range):
        weights = copy(weights_)
        bias = copy(bias_)
        
        for i in range(20):
            weigths, bias = iter_step(weights, bias, df_grouped, C)
            
        matrix = bias["users"].reshape(-1, 1) + bias["items"].reshape(1, -1) \
            + np.dot(weights["users"], weights["items"].T)
        scores.append(
            (score(matrix, df_train.values),
             score(matrix, df_valid.values))
        )
        
        print "\r{} of {} iters passed...".format(C_i + 1, len(C_range)),
        
    print "\rtrain: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(0), scores):
        print "{:>10.5f}".format(scr),
    print
    
    print "\rvalid: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(1), scores):
        print "{:>10.5f}".format(scr),
    print
    

                   1.0        5.0       10.0       25.0       50.0
train:    3    0.63364    0.69832    0.77893    1.02610    1.46227
valid:    3    0.91217    0.87888    0.91705    1.11283    1.52057
train:    4    0.58679    0.65921    0.74693    1.01947    1.46227
valid:    4    0.92916    0.87004    0.90556    1.11032    1.52059
train:    5    0.55410    0.62663    0.72191    1.01447    1.46227
valid:    5    0.98047    0.87566    0.90454    1.10921    1.52057
train:    6    0.51829    0.60060    0.70018    1.01171    1.46227
valid:    6    0.99169    0.88615    0.90759    1.10849    1.52059
train:    7    0.48525    0.57163    0.68143    1.00961    1.46227
valid:    7    1.03303    0.89706    0.90440    1.10776    1.52059
train:    8    0.45615    0.54816    0.66438    1.00952    1.46227
valid:    8    1.07555    0.91253    0.90730    1.10772    1.52056
train:    9    0.42780    0.52463    0.64819    1.00948    1.46227
valid:    9    1.11756    0.92213    0.90745    1.10772    1.5