In [1]:
import numpy as np
import pandas as pd

In [2]:
import operator

from copy import copy

In [3]:
from sklearn.metrics import mean_squared_error

In [4]:
# Read the tab-separated ratings file; it has no header row, so the three
# column names are supplied explicitly.
df = pd.read_csv(
    "../data/train.txt",
    names=["users", "items", "rank"],
    header=None,
    sep="\t",
)
df.head()

Unnamed: 0,users,items,rank
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [5]:
# Distinct values present in the "rank" column, in order of first appearance.
df.loc[:, "rank"].unique()

array([5, 3, 4, 1, 2])

In [6]:
# The largest id in each column is used as the number of rows of the
# corresponding factor matrix.
# NOTE(review): this only equals the entity count while ids are still 1-based,
# i.e. it must run before the cell that subtracts 1 from the ids.
n_users, n_items = (df[column].max() for column in ("users", "items"))

In [7]:
# Shift user and item ids down by one so they can be used directly as
# zero-based row indices into the factor matrices.
# NOTE: this overwrites `df` in place, so re-running the cell shifts the ids
# again; run it exactly once per kernel session.
id_columns = ["users", "items"]
df[id_columns] = df[id_columns] - 1
df.head()

Unnamed: 0,users,items,rank
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3


In [8]:
def score(matrix, values):
    """Mean squared error between predicted and observed ratings.

    Parameters
    ----------
    matrix : 2-D array-like
        Predicted ratings, indexed as ``matrix[user, item]``.
    values : array-like of shape (n, 3)
        Rows of ``(user, item, rank)``, e.g. ``DataFrame.values``.

    Returns
    -------
    float
        Whatever ``mean_squared_error`` returns for the observed ranks versus
        the predictions looked up in ``matrix``.

    Raises
    ------
    ValueError
        If ``values`` is not a 2-D collection of 3-column rows.
    """
    rows = np.asarray(values)
    if rows.ndim != 2 or rows.shape[1] != 3:
        raise ValueError(
            "values must be a 2-D collection of (user, item, rank) rows"
        )

    # Cast ids to an index dtype so a float-typed `values` array still works.
    users = rows[:, 0].astype(np.intp)
    items = rows[:, 1].astype(np.intp)

    # One vectorised lookup instead of a Python-level loop over every row;
    # ravel() flattens the result if `matrix` is an np.matrix.
    y_pred = np.asarray(matrix[users, items]).ravel()
    # A concrete array (not a lazy `map` object) is handed to sklearn.
    y_true = rows[:, 2]
    return mean_squared_error(y_true, y_pred)

In [9]:
def iter_step(weights, df_grouped, C):
    """Run one alternating least-squares sweep: users first, then items.

    For every group of ratings the factor vector of the group's key is
    replaced by the solution of the regularised normal equations
    ``(F^T F + C * I) x = F^T r``, where ``F`` holds the current factor rows
    of the opposite side and ``r`` the group's ``"rank"`` values.

    Parameters
    ----------
    weights : dict
        ``{"users": array, "items": array}`` factor matrices, indexed by id.
    df_grouped : dict
        ``{"users": groupby, "items": groupby}``; iterating each yields
        ``(id, frame)`` pairs whose frames have "users", "items" and "rank"
        columns.
    C : float
        Coefficient of the identity matrix added to the Gram matrix.

    Returns
    -------
    dict
        A new dict with the same keys as ``weights``.

    Notes
    -----
    ``copy`` is shallow: the returned dict holds the *same* array objects as
    ``weights``, so the caller's arrays are updated in place.
    """
    updated = copy(weights)

    # (side being solved, side held fixed) -- the order matters because the
    # item pass uses the user factors produced by the user pass.
    for solved, fixed in (("users", "items"), ("items", "users")):
        for key, rows in df_grouped[solved]:
            factors = updated[fixed][rows[fixed].values]
            ratings = rows["rank"].values

            gram = np.dot(factors.T, factors) + C * np.eye(factors.shape[1])
            rhs = np.dot(factors.T, ratings)

            updated[solved][key] = np.linalg.solve(gram, rhs)

    return updated

In [10]:
# Shuffle the row positions with a fixed-seed local generator so that the
# train/validation split is identical on every Restart & Run All.
index = np.random.RandomState(0).permutation(df.shape[0])
# Position at which the shuffled index is cut: first 80% train, rest validation.
alpha = int(0.8 * len(index))

df_train, df_valid = df.iloc[index[:alpha]], df.iloc[index[alpha:]]

In [11]:
# Training ratings grouped once per side, keyed by the grouping column name.
df_grouped = {
    "users": df_train.groupby(by="users"),
    "items": df_train.groupby(by="items"),
}

In [12]:
# Coarse grid: C takes the powers of ten from 1e-2 to 1e2, paired with a
# handful of latent-feature counts.
C_range = 10.0 ** np.arange(-2, 3)
features_range = [
    5, 10, 15,
    30, 50, 100,
]

# Show the grid that is about to be searched.
print(C_range)
print(features_range)

[  1.00000000e-02   1.00000000e-01   1.00000000e+00   1.00000000e+01
   1.00000000e+02]
[5, 10, 15, 30, 50, 100]


In [13]:
print " " * (len("\rtrain: {:>4}".format(0)) - 1),
for C in C_range:
    print "{:>10}".format(C),
print

for feature_i, n_features in enumerate(features_range):
    weights = dict()
    weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
    weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
    weights_ = copy(weights)
    scores = []
    
    for C_i, C in enumerate(C_range):
        weights = copy(weights_)
        
        for i in range(20):
            weigths = iter_step(weights, df_grouped, C)
            
        matrix = np.dot(weights["users"], weights["items"].T)
        scores.append(
            (score(matrix, df_train.values),
             score(matrix, df_valid.values))
        )
        
        print "\r{} of {} iters passed...".format(C_i + 1, len(C_range)),
        
    print "\rtrain: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(0), scores):
        print "{:>10.5f}".format(scr),
    print
    
    print "\rvalid: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(1), scores):
        print "{:>10.5f}".format(scr),
    print
    

                  0.01        0.1        1.0       10.0      100.0
train:    5    0.58737    0.58827    0.58990    0.75122    2.84288
valid:    5    2.43197    1.23583    1.01461    0.96909    2.95030
train:   10    0.41589    0.41814    0.42926    0.65590    2.84288
valid:   10    2.35523    1.72507    1.19922    0.97509    2.95024
train:   15    0.30848    0.30611    0.31783    0.59157    2.84288
valid:   15    3.16124    2.22200    1.34095    0.98485    2.95018
train:   30    0.12324    0.11982    0.12863    0.48210    2.84288
valid:   30    4.50099    3.29638    1.74120    1.00395    2.95023
train:   50    0.02814    0.02747    0.03579    0.42184    2.84288
valid:   50    4.74406    4.31660    1.87589    1.00722    2.95015
train:  100    0.00035    0.00042    0.00682    0.39373    2.84288
valid:  100    4.55402    5.42915    1.51608    0.99673    2.95018


In [18]:
# Finer grid: C values between 1 and 50, and feature counts 3 through 9.
C_range = [
    1.0,
    5.0,
    10.0,
    25.0,
    50.0,
]
features_range = list(range(3, 10))

In [19]:
print " " * (len("\rtrain: {:>4}".format(0)) - 1),
for C in C_range:
    print "{:>10}".format(C),
print

for feature_i, n_features in enumerate(features_range):
    weights = dict()
    weights["users"] = np.random.normal(loc=0.0, scale=0.01, size=(n_users, n_features))
    weights["items"] = np.random.normal(loc=0.0, scale=0.01, size=(n_items, n_features))
    
    weights_ = copy(weights)
    scores = []
    
    for C_i, C in enumerate(C_range):
        weights = copy(weights_)
        
        for i in range(20):
            weigths = iter_step(weights, df_grouped, C)
            
        matrix = np.dot(weights["users"], weights["items"].T)
        scores.append(
            (score(matrix, df_train.values),
             score(matrix, df_valid.values))
        )
        
        print "\r{} of {} iters passed...".format(C_i + 1, len(C_range)),
        
    print "\rtrain: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(0), scores):
        print "{:>10.5f}".format(scr),
    print
    
    print "\rvalid: {:>4}".format(n_features),
    for scr in map(operator.itemgetter(1), scores):
        print "{:>10.5f}".format(scr),
    print
    

                   1.0        5.0       10.0       25.0       50.0
train:    3    0.68474    0.73406    0.81532    1.10634    1.67888
valid:    3    0.92070    0.91968    0.97979    1.24305    1.79474
train:    4    0.63142    0.69136    0.78005    1.09594    1.67888
valid:    4    0.94003    0.91039    0.97129    1.23906    1.79470
train:    5    0.58873    0.65635    0.75361    1.08883    1.67888
valid:    5    0.96774    0.91982    0.97250    1.23761    1.79476
train:    6    0.55143    0.62474    0.72914    1.08327    1.67889
valid:    6    1.00876    0.93919    0.96935    1.23507    1.79473
train:    7    0.51370    0.59475    0.70674    1.08070    1.67889
valid:    7    1.05804    0.94668    0.96799    1.23468    1.79474
train:    8    0.48677    0.57074    0.68892    1.07886    1.67888
valid:    8    1.10041    0.95950    0.97225    1.23440    1.79483
train:    9    0.45449    0.54621    0.67170    1.07880    1.67888
valid:    9    1.11350    0.96962    0.97711    1.23432    1.7