# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [9]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")

In [10]:

print(user_ratings_df)

   The Call of Cthulhu   Frankenstein   Dracula   Neuromancer   Space Odyssey
0                  8.0            2.0       NaN           5.0             4.0
1                  3.0            2.0       NaN           7.0             7.0
2                  9.0            NaN       7.0           8.0             5.0
3                  NaN            NaN       7.0           8.0             9.0
4                  NaN            1.0       8.0           3.0             7.0
5                  2.0            3.0       5.0           NaN             NaN
6                  4.0            2.0       NaN           2.0             7.0
7                  7.0            1.0       2.0           7.0             9.0
8                  3.0            3.0       NaN           7.0             3.0
9                  4.0            NaN       5.0           3.0             3.0


In [11]:
print(user_features_df)

   Sex   Over60
0  1.0      0.0
1  0.0      1.0
2  0.0      0.0
3  1.0      0.0
4  0.0      1.0
5  0.0      0.0
6  0.0      0.0
7  1.0      0.0
8  0.0      1.0
9  1.0      0.0


In [12]:
print(item_features_df)

   Critic0   Critic1
0      0.3       0.9
1      0.9       0.3
2      0.6       0.4
3      0.2       0.1
4      0.7       0.8


In [14]:
user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df,    item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]




merged_df["rating"] = list(map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples()))

In [15]:
merged_df

Unnamed: 0,Sex,Over60,key,user_id,Critic0,Critic1,item_id,rating
0,1.0,0.0,0,0,0.3,0.9,0,8.0
1,1.0,0.0,0,0,0.9,0.3,1,2.0
2,1.0,0.0,0,0,0.6,0.4,2,
3,1.0,0.0,0,0,0.2,0.1,3,5.0
4,1.0,0.0,0,0,0.7,0.8,4,4.0
0,0.0,1.0,0,1,0.3,0.9,0,3.0
1,0.0,1.0,0,1,0.9,0.3,1,2.0
2,0.0,1.0,0,1,0.6,0.4,2,
3,0.0,1.0,0,1,0.2,0.1,3,7.0
4,0.0,1.0,0,1,0.7,0.8,4,7.0


In [16]:
train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

In [17]:

train.head(), test.head()

(   Sex   Over60  key  user_id  Critic0   Critic1  item_id  rating
 0  1.0      0.0    0        0      0.3       0.9        0     8.0
 1  1.0      0.0    0        0      0.9       0.3        1     2.0
 3  1.0      0.0    0        0      0.2       0.1        3     5.0
 4  1.0      0.0    0        0      0.7       0.8        4     4.0
 0  0.0      1.0    0        1      0.3       0.9        0     3.0,
    Sex   Over60  key  user_id  Critic0   Critic1  item_id  rating
 2  1.0      0.0    0        0      0.6       0.4        2     NaN
 2  0.0      1.0    0        1      0.6       0.4        2     NaN
 1  0.0      0.0    0        2      0.9       0.3        1     NaN
 0  1.0      0.0    0        3      0.3       0.9        0     NaN
 1  1.0      0.0    0        3      0.9       0.3        1     NaN)

In [24]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

In [25]:
user_features = user_features_df.values
item_features = item_features_df.values

In [28]:
user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1], item_features.shape[1] ))

In [29]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print (mse)
                    
                    
    
                    
                    
    


In [30]:
for _ in range(0,10): 
    sgd()

0.28706190342743065
0.2779836113833252
0.2758470831451938
0.2748590151937973
0.2742834076789803
0.2739065416722693
0.2736424789034093
0.27344969844929196
0.27330560162535367
0.27319682024084574


In [32]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print (user_features_weights)
print (item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[ 2.874e+00  8.561e-01  5.160e-01  2.298e-01]
 [ 2.551e-01 -1.625e-01  9.546e-01  7.525e-01]
 [ 2.219e-03  3.761e-01  3.475e-01  2.196e+00]
 [ 1.039e+00  7.680e-01  7.699e-01  1.312e+00]
 [ 3.348e-01  5.607e-01  5.845e-01  2.026e-01]
 [ 7.180e-02  5.202e-01  3.576e-01 -1.758e+00]
 [ 7.973e-01  4.941e-01  9.666e-01  1.219e-01]
 [ 2.924e-02  5.550e-01  8.713e-02  4.074e-01]
 [ 6.513e-01  4.945e-01  3.758e-01 -1.837e-02]
 [ 2.464e-01  3.290e-01  4.652e-01 -2.833e-02]]
[[10.559  1.224 12.076 12.41 ]
 [ 0.406  0.05   0.529  0.532]
 [ 0.039  0.228  0.114  0.174]
 [ 1.49   1.462  0.203  1.268]
 [ 0.069  0.033  0.022  0.02 ]]


In [36]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns =user_ratings_df.columns
comparison_data
#applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.8744560369573495)","(2.0, 2.398254149110194)","(nan, 17.34409980171558)","(5.0, 4.729478804353823)","(4.0, 3.9969556805549966)"
1,"(3.0, 2.912954765029872)","(2.0, 2.278324974222111)","(nan, -22.210181814349788)","(7.0, 6.810207347929204)","(7.0, 6.998020077024275)"
2,"(9.0, 8.745026830706609)","(nan, 5.024195638186227)","(7.0, 7.041683146639411)","(8.0, 8.18324805215062)","(5.0, 5.022305168814409)"
3,"(nan, 8.97550413746282)","(nan, 5.000654555472591)","(7.0, 7.000065093990401)","(8.0, 7.998585723813456)","(9.0, 9.000044461838547)"
4,"(nan, 5.492180717789763)","(1.0, 0.6877395589492674)","(8.0, 8.011836499491384)","(3.0, 3.2772569912773415)","(7.0, 7.010724453798114)"
5,"(2.0, 2.010483832890259)","(3.0, 2.9907829104305357)","(5.0, 4.998455488209695)","(nan, 9.12417958015444)","(nan, -70.37575393269238)"
6,"(4.0, 4.542105164642131)","(2.0, 0.2884650909463826)","(nan, 3.1036960312562676)","(2.0, 3.1278452948797173)","(7.0, 7.011651113578623)"
7,"(7.0, 6.5204386376784464)","(1.0, 2.747540795267148)","(2.0, 2.0474382022738173)","(7.0, 5.814192214132145)","(9.0, 8.981788750441641)"
8,"(3.0, 3.1249461010822515)","(3.0, 2.6123551826791407)","(nan, -20.661192730225405)","(7.0, 7.248036990240664)","(3.0, 3.002390774368593)"
9,"(4.0, 4.262215036263545)","(nan, -0.03464030785832506)","(5.0, 4.9775110738874115)","(3.0, 2.825461444501965)","(3.0, 2.978828739766917)"


In [37]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.8744560369573495)","(2.0, 2.398254149110194)","(nan, 17.34409980171558)","(5.0, 4.729478804353823)","(4.0, 3.9969556805549966)"
1,"(3.0, 2.912954765029872)","(2.0, 2.278324974222111)","(nan, -22.210181814349788)","(7.0, 6.810207347929204)","(7.0, 6.998020077024275)"
2,"(9.0, 8.745026830706609)","(nan, 5.024195638186227)","(7.0, 7.041683146639411)","(8.0, 8.18324805215062)","(5.0, 5.022305168814409)"
3,"(nan, 8.97550413746282)","(nan, 5.000654555472591)","(7.0, 7.000065093990401)","(8.0, 7.998585723813456)","(9.0, 9.000044461838547)"
4,"(nan, 5.492180717789763)","(1.0, 0.6877395589492674)","(8.0, 8.011836499491384)","(3.0, 3.2772569912773415)","(7.0, 7.010724453798114)"
5,"(2.0, 2.010483832890259)","(3.0, 2.9907829104305357)","(5.0, 4.998455488209695)","(nan, 9.12417958015444)","(nan, -70.37575393269238)"
6,"(4.0, 4.542105164642131)","(2.0, 0.2884650909463826)","(nan, 3.1036960312562676)","(2.0, 3.1278452948797173)","(7.0, 7.011651113578623)"
7,"(7.0, 6.5204386376784464)","(1.0, 2.747540795267148)","(2.0, 2.0474382022738173)","(7.0, 5.814192214132145)","(9.0, 8.981788750441641)"
8,"(3.0, 3.1249461010822515)","(3.0, 2.6123551826791407)","(nan, -20.661192730225405)","(7.0, 7.248036990240664)","(3.0, 3.002390774368593)"
9,"(4.0, 4.262215036263545)","(nan, -0.03464030785832506)","(5.0, 4.9775110738874115)","(3.0, 2.825461444501965)","(3.0, 2.978828739766917)"


In [38]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()