In [110]:
from collections import defaultdict
import pandas as pd
import scipy.optimize
import numpy 
from sklearn import svm
import random

In [126]:
# Read the three splits; each CSV is looked up relative to the working directory.
train_dataset, validation_dataset, test_dataset = (
    pd.read_csv(path) for path in ('train.csv', 'valid.csv', 'test.csv')
)

# Train and validation rows are merged into a single frame because
# similarity-based rating prediction does not need a separate validation set.
dataset = pd.concat([train_dataset, validation_dataset])
dataset.shape

(175869, 6)

In [127]:
# Preview the first rows of the training split to check the available columns.
train_dataset.head()

Unnamed: 0,item,user,paid,time,rating,review
0,Fisher_Price_Loving_Family_Sweet_Sounds_Dollhouse,karleigh,79.99,1071878400,4.0,i researched and looked at all the fisher pric...
1,Nokia_E62_Smartphone,mfw1982,199.0,1196294400,5.0,no it doesnt have a camera and yes the keys ar...
2,pr-Dell_DJ_15GB_MP3_Player,davydanger,199.0,1081987200,1.0,i was very excited to buy this product given a...
3,Blue_s_Clues_Bath_Time_Blue,kbmg,gift,990489600,2.0,my two kids ages 2 and 4 are big blues clues f...
4,Spider_Man_Gloves,pluckyduck,9.99,1031443200,3.0,dad was walking through toys r us the other we...


In [128]:
# Adjacency sets: which users rated an item, and which items a user rated.
usersPerItem, itemsPerUser = defaultdict(set), defaultdict(set)
# Per-user and per-item lists of review records.
reviewsPerUser, reviewsPerItem = defaultdict(list), defaultdict(list)
# Maps an (item, user) pair to its rating.
ratingDict = dict()

In [129]:
# Fill the lookup tables in a single pass over the merged data.
# Only three columns are needed, so they are walked in lockstep.
for item, user, rating in zip(dataset['item'], dataset['user'], dataset['rating']):
  usersPerItem[item].add(user)
  itemsPerUser[user].add(item)

  # Keyed by (item, user); a later row for the same pair replaces the earlier rating.
  ratingDict[(item, user)] = rating

  reviewsPerUser[user].append({'item': item, 'rating': rating})
  reviewsPerItem[item].append({'user': user, 'rating': rating})

In [130]:
# Global mean rating over the merged train + validation rows.
ratingMean = sum(dataset['rating'].tolist()) / len(dataset)

In [131]:
# Display the global mean rating.
ratingMean

3.629007954784527

In [132]:
# Ratings in dataset row order; passed to cost/derivative as the targets.
labels = dataset['rating'].tolist()

In [133]:
# Mean rating given by each user and received by each item.
# Unknown keys read back as 0.0 because both tables are defaultdict(float).
userAverages = defaultdict(float)
itemAverages = defaultdict(float)

for u, ratedItems in itemsPerUser.items():
    rs = [ratingDict[(i, u)] for i in ratedItems]
    userAverages[u] = sum(rs) / len(rs)

for i, raters in usersPerItem.items():
    rs = [ratingDict[(i, u)] for u in raters]
    itemAverages[i] = sum(rs) / len(rs)

In [134]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    The two sequences are paired with zip, so extra trailing entries of the
    longer one are ignored. Raises ZeroDivisionError when no pairs exist.
    """
    squaredErrors = []
    for predicted, actual in zip(predictions, labels):
        squaredErrors.append((predicted - actual) ** 2)
    return sum(squaredErrors) / len(squaredErrors)

Bias Only model

In [196]:
# Fixed orderings of users and items; they define the layout of the flat
# parameter vector handed to the optimizer.
users = list(reviewsPerUser)
items = list(reviewsPerItem)
nUsers, nItems = len(reviewsPerUser), len(reviewsPerItem)
# Number of rating rows in the merged data.
N = len(dataset)

In [197]:
# Start the global offset at the mean rating of the merged data.
alpha = ratingMean

In [198]:
# Per-user and per-item offsets; any key not yet set reads as 0.0.
userBiases, itemBiases = defaultdict(float), defaultdict(float)

In [199]:

def prediction(user, item):
    """Return the bias-only rating estimate for a (user, item) pair.

    Reads the module-level alpha, userBiases and itemBiases and returns
    their sum for the given keys.
    """
    userTerm = userBiases[user]
    itemTerm = itemBiases[item]
    return alpha + userTerm + itemTerm

In [200]:
def unpack(theta):
    """Split the flat parameter vector into the module-level model parameters.

    Layout of theta: [alpha, one bias per entry of users, one bias per entry
    of items]. Rebinds the globals alpha, userBiases and itemBiases; the two
    bias tables become plain dicts keyed in users / items order.
    """
    global alpha, userBiases, itemBiases
    itemStart = 1 + nUsers
    alpha = theta[0]
    userBiases = {u: b for u, b in zip(users, theta[1:itemStart])}
    itemBiases = {i: b for i, b in zip(items, theta[itemStart:])}

In [201]:
def cost(theta, labels, lamb):
    """Return the regularized objective of the bias-only model at theta.

    theta  -- flat parameter vector, installed into the globals via unpack
    labels -- target ratings, in the same row order as dataset
    lamb   -- weight of the squared-bias penalty

    The unregularized MSE is printed on every call; the returned value is
    that MSE plus lamb times each squared user and item bias.
    """
    unpack(theta)
    predictions = [prediction(u, i) for u, i in zip(dataset['user'], dataset['item'])]
    total = MSE(predictions, labels)
    print("MSE = " + str(total))
    for bias in userBiases.values():
        total += lamb*bias**2
    for bias in itemBiases.values():
        total += lamb*bias**2
    return total

In [202]:
def derivative(theta, labels, lamb):
    """Return the gradient of the bias-only objective at theta.

    theta  -- flat parameter vector, installed into the globals via unpack
    labels -- unused here; the targets are read from dataset['rating']
    lamb   -- weight of the squared-bias penalty

    The result is a numpy array laid out like theta:
    [d/dalpha, one entry per user in users, one entry per item in items].
    """
    unpack(theta)
    N = len(dataset)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    rows = zip(dataset['user'], dataset['item'], dataset['rating'])
    for user, item, rating in rows:
        diff = prediction(user, item) - rating
        # Every row contributes the same term to alpha and to both of its biases.
        step = 2/N*diff
        dalpha += step
        dUserBiases[user] += step
        dItemBiases[item] += step
    # Gradient of the L2 penalty on each bias.
    for user, bias in userBiases.items():
        dUserBiases[user] += 2*lamb*bias
    for item, bias in itemBiases.items():
        dItemBiases[item] += 2*lamb*bias
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[user] for user in users)
    dtheta.extend(dItemBiases[item] for item in items)
    return numpy.array(dtheta)

In [203]:
# Fit the bias-only model with L-BFGS-B.
# Start point: the current alpha followed by a zero for every user and item bias.
# args forwards (labels, lamb) to both cost and derivative; lamb is 0.000005 here.
# The run is capped at 100 iterations.
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                             derivative, args = (labels, 0.000005), maxiter=100)

MSE = 2.2419428837212503
MSE = 2.219513490604071
MSE = 2.139113881125881
MSE = 61.28241720504475
MSE = 2.1362176841846305
MSE = 1.9592247223578028
MSE = 1.9364304784473452
MSE = 1.8547870267018822
MSE = 1.5205513438642062
MSE = 1.377930392080828
MSE = 1.1815155745356969
MSE = 1.0644465322717513
MSE = 0.9802848785014515
MSE = 0.8837297694604103
MSE = 0.8053330445124265
MSE = 0.7490976716978598
MSE = 0.6867506824713819
MSE = 1.1592833515910728
MSE = 0.6856001680880784
MSE = 0.668070325959807
MSE = 0.6498474288029386
MSE = 0.6361525378447217
MSE = 0.6281797374907204
MSE = 0.6154312501241653
MSE = 0.5928245233005754
MSE = 0.5668392868466373
MSE = 0.5447020996745311
MSE = 0.5387616993018053
MSE = 0.5347486362848033
MSE = 0.5264809766514112
MSE = 0.5222837652455017
MSE = 0.5229410669875301
MSE = 0.5122930329483303
MSE = 0.5100455801456114
MSE = 0.5085480270410409
MSE = 0.5130078421506069
MSE = 0.5096115457632093
MSE = 0.5090016810380557
MSE = 0.5080736268959749
MSE = 0.5079092907939785
MSE =

(array([ 3.58714951,  0.88946612,  1.08661452, ..., -0.55120852,
        -1.27120264,  0.49108303]),
 0.8241355216727738,
 {'grad': array([ 4.66616407e-04,  3.41937470e-07, -6.58211634e-07, ...,
          3.41381856e-10, -4.25935373e-07,  1.30434004e-08]),
  'task': 'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT',
  'funcalls': 123,
  'nit': 100,
  'warnflag': 1})

In [204]:
def predictionBiasOnly(user, item):
    """Return the bias-only rating estimate, tolerating unseen users and items.

    A user or item without a learned bias contributes 0, so a pair that is
    entirely unknown is predicted as alpha. The lookups use dict.get, which
    leaves userBiases and itemBiases untouched: scoring a test set must not
    add entries to the fitted parameters.
    """
    userTerm = userBiases.get(user, 0.0)
    itemTerm = itemBiases.get(item, 0.0)
    return alpha + userTerm + itemTerm

In [205]:
# Score every test row with the bias-only model and collect the true ratings
# in the same row order.
testpred = [predictionBiasOnly(u, i) for u, i in zip(test_dataset['user'], test_dataset['item'])]
testlabels = test_dataset['rating'].tolist()

In [206]:
# Mean squared error of the bias-only predictions on the test split.
testMSE = MSE(testpred, testlabels)

Bias-only model: test-set MSE for each regularization strength (lambda)
lambda = 0.01, MSE = 2.2027241538606552;
lambda = 0.001, MSE = 2.096447470904356;
lambda = 0.005, MSE = 2.0302324437660197;
lambda = 0.0001, MSE = 1.8416213205671876;
lambda = 0.00001, MSE = 1.744941635391268;
lambda = 0.000005, MSE = 1.7920784925895148;

In [207]:
# Display the bias-only test MSE.
testMSE

1.7920784925895148

Complete latent factor model

In [208]:
# Reset the global offset to the mean rating before fitting the latent factor model.
alpha = ratingMean

In [209]:
# Fresh per-user and per-item offsets for the latent factor model; unset keys read as 0.0.
userBiases, itemBiases = defaultdict(float), defaultdict(float)

In [210]:
# Latent factor vectors, keyed by user and by item.
userGamma, itemGamma = dict(), dict()

In [211]:
# Number of latent dimensions in each user and item factor vector.
K = 2

In [212]:
# Give every known user K small random factors, each drawn as
# random.random() * 0.1 - 0.05, i.e. from [-0.05, 0.05).
userGamma.update(
    (u, [random.random() * 0.1 - 0.05 for _ in range(K)])
    for u in reviewsPerUser
)

In [213]:
# Give every known item K small random factors, each drawn as
# random.random() * 0.1 - 0.05, i.e. from [-0.05, 0.05).
itemGamma.update(
    (i, [random.random() * 0.1 - 0.05 for _ in range(K)])
    for i in reviewsPerItem
)

In [214]:
def unpack(theta):
    """Split the flat parameter vector into the latent factor model's globals.

    Layout of theta:
        [alpha,
         nUsers user biases (users order),
         nItems item biases (items order),
         K factors for each user (users order),
         K factors for each item (items order)]

    Rebinds alpha, userBiases and itemBiases, and overwrites the entries of
    the existing userGamma / itemGamma dicts.

    Each factor vector is stored as its own numpy array rather than as a
    slice of theta. A slice of a numpy array is a view, so if the caller
    later modifies theta in place the stored factors would change with it,
    while the biases (copied out as scalars) would not.
    """
    global alpha
    global userBiases
    global itemBiases
    global userGamma
    global itemGamma
    alpha = theta[0]
    index = 1
    userBiases = dict(zip(users, theta[index:index+nUsers]))
    index += nUsers
    itemBiases = dict(zip(items, theta[index:index+nItems]))
    index += nItems
    for u in users:
        # numpy.array copies its input by default, detaching the factors from theta.
        userGamma[u] = numpy.array(theta[index:index+K])
        index += K
    for i in items:
        itemGamma[i] = numpy.array(theta[index:index+K])
        index += K

In [215]:
def inner(x, y):
    """Return the dot product of x and y.

    Elements are paired with zip, so extra trailing entries of the longer
    sequence are ignored; two empty sequences give 0.
    """
    products = []
    for a, b in zip(x, y):
        products.append(a * b)
    return sum(products)

In [216]:
def prediction(user, item):
    """Return the latent factor model's rating estimate for (user, item).

    The estimate is alpha plus the user bias, the item bias and the inner
    product of the user's and the item's factor vectors, all read from the
    module-level parameters.
    """
    biasPart = alpha + userBiases[user] + itemBiases[item]
    interaction = inner(userGamma[user], itemGamma[item])
    return biasPart + interaction

In [217]:
def cost(theta, labels, lamb):
    """Return the regularized objective of the latent factor model at theta.

    theta  -- flat parameter vector, installed into the globals via unpack
    labels -- target ratings, in the same row order as dataset
    lamb   -- weight of the squared penalty on every bias and factor

    The unregularized MSE is printed on every call; the returned value adds
    lamb times the square of each user/item bias and each of the K factors.
    """
    unpack(theta)
    predictions = [prediction(u, i) for u, i in zip(dataset['user'], dataset['item'])]
    total = MSE(predictions, labels)
    print("MSE = " + str(total))
    for u in users:
        total += lamb*userBiases[u]**2
        factors = userGamma[u]
        for k in range(K):
            total += lamb*factors[k]**2
    for i in items:
        total += lamb*itemBiases[i]**2
        factors = itemGamma[i]
        for k in range(K):
            total += lamb*factors[k]**2
    return total

In [218]:
def derivative(theta, labels, lamb):
    """Return the gradient of the latent factor objective at theta.

    theta  -- flat parameter vector, installed into the globals via unpack
    labels -- unused here; the targets are read from dataset['rating']
    lamb   -- weight of the squared penalty on every bias and factor

    The result is a numpy array laid out like theta: alpha, user biases,
    item biases, then K factor entries per user and K per item.
    """
    unpack(theta)
    N = len(dataset)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    # One zeroed gradient slot per factor of every known user and item.
    dUserGamma = {u: [0.0] * K for u in reviewsPerUser}
    dItemGamma = {i: [0.0] * K for i in reviewsPerItem}
    rows = zip(dataset['user'], dataset['item'], dataset['rating'])
    for user, item, rating in rows:
        diff = prediction(user, item) - rating
        # Every row contributes the same term to alpha and to both of its biases.
        biasStep = 2/N*diff
        dalpha += biasStep
        dUserBiases[user] += biasStep
        dItemBiases[item] += biasStep
        # Each factor's gradient is scaled by the matching factor on the other side.
        for k in range(K):
            dUserGamma[user][k] += 2/N*itemGamma[item][k]*diff
            dItemGamma[item][k] += 2/N*userGamma[user][k]*diff
    # Gradient of the L2 penalty on biases and factors.
    for user in userBiases:
        dUserBiases[user] += 2*lamb*userBiases[user]
        for k in range(K):
            dUserGamma[user][k] += 2*lamb*userGamma[user][k]
    for item in itemBiases:
        dItemBiases[item] += 2*lamb*itemBiases[item]
        for k in range(K):
            dItemGamma[item][k] += 2*lamb*itemGamma[item][k]
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    for u in users:
        dtheta.extend(dUserGamma[u])
    for i in items:
        dtheta.extend(dItemGamma[i])
    return numpy.array(dtheta)

In [219]:
# Fit the latent factor model with L-BFGS-B, capped at 100 iterations.
# The start vector follows the layout unpack expects: alpha, then all biases,
# then K factors per user and per item. The factors are drawn fresh here, so
# the values placed in userGamma/itemGamma earlier are not the start point.
# args forwards (labels, lamb) to both cost and derivative; lamb is 0.00001 here.
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                   [0.0]*(nUsers+nItems) + # Initialize beta
                                   [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                             derivative, args = (labels, 0.00001), maxiter=100)

MSE = 2.2419508202401017
MSE = 2.219519726058573
MSE = 2.139107699328173
MSE = 62.882200144465976
MSE = 2.1363735226294924
MSE = 1.9591604220810357
MSE = 1.935147936427749
MSE = 1.8496979270715528
MSE = 1.5259250208960227
MSE = 1.3854002400314538
MSE = 1.1890253015445091
MSE = 1.0867165115795807
MSE = 1.0231670969453386
MSE = 0.9452989584815246
MSE = 0.8761462329082713
MSE = 0.836671070854695
MSE = 0.7941845965183796
MSE = 13.002218718601576
MSE = 0.7941016980730479
MSE = 0.7798570047611408
MSE = 0.7676290915136273
MSE = 0.7435050444680528
MSE = 0.7211520486257665
MSE = 0.6867237533167948
MSE = 0.6786636866875376
MSE = 0.6814754703363217
MSE = 0.6668077476183875
MSE = 0.6450861704505375
MSE = 0.6368128598079392
MSE = 0.6224700607978556
MSE = 0.6210023663494477
MSE = 0.6128007210297303
MSE = 0.604265779344633
MSE = 0.604484452365466
MSE = 0.6035550107538353
MSE = 0.602551743435006
MSE = 0.592463331551036
MSE = 0.5883592074806431
MSE = 0.5856020823671253
MSE = 0.584880394312809
MSE = 0.5

(array([ 3.60775563e+00,  6.34782359e-01,  7.74801621e-01, ...,
        -5.12239342e-02, -2.87230518e-04,  1.43175827e-04]),
 0.9078102161697278,
 {'grad': array([-8.27611233e-04,  2.87815293e-07, -1.55626572e-06, ...,
          2.45248344e-07, -7.05043423e-09,  2.49129281e-09]),
  'task': 'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT',
  'funcalls': 120,
  'nit': 100,
  'warnflag': 1})

In [224]:
def predictionLFM(user, item):
    """Return the latent factor model's estimate, tolerating unseen users and items.

    Fallbacks, in terms of the module-level parameters:
      * unknown user and unknown item -> alpha
      * only the item is known        -> alpha + item bias
      * only the user is known        -> alpha + user bias
      * both known                    -> alpha + both biases, plus the factor
        inner product when both factor vectors are available

    Lookups never insert into the parameter dicts. Unexpected errors
    propagate to the caller instead of opening a debugger session.
    """
    knownUser = user in userBiases
    knownItem = item in itemBiases

    estimate = alpha
    if knownUser:
        estimate = estimate + userBiases[user]
    if knownItem:
        estimate = estimate + itemBiases[item]

    # The interaction term needs learned factors on both sides. An all-zero
    # factor vector needs no special case: its inner product is already 0.
    if knownUser and knownItem and user in userGamma and item in itemGamma:
        estimate = estimate + inner(userGamma[user], itemGamma[item])
    return estimate

In [225]:
# Score every test row with the latent factor model and collect the true
# ratings in the same row order.
testpred = [predictionLFM(u, i) for u, i in zip(test_dataset['user'], test_dataset['item'])]
testlabels = test_dataset['rating'].tolist()

In [226]:
# Mean squared error of the latent factor predictions on the test split.
testMSELFM = MSE(testpred, testlabels)

In [227]:
# Display the latent factor model's test MSE.
testMSELFM

1.781005401382922

lambda = 0.00001, MSELFM = 1.781005401382922;