# Hybrid Recommender System
## Load data

In [33]:
import pickle
import pandas as pd
import numpy as np


# Training ratings; only the rating, user id and item id columns are used below.
train = pd.read_pickle("files/train.pkl")
train = train[["overall", "reviewerID", "asin"]]

# Test-set ratings, renamed to the uid/iid naming used by the prediction frames.
df_test = pd.read_pickle("files/testset.pkl")[["reviewerID", "asin", "overall"]]
df_test = df_test.rename(columns={"reviewerID": "uid", "asin": "iid"})


# Load the predictions of both models.
# The SVD "score" column is renamed to "est" so it cannot clash with the
# content-based "score" column when the two frames are merged.
lf = pd.read_pickle("files/preds_svd.pkl").rename(columns={"score": "est"})
cont = pd.read_pickle("files/preds_content.pkl")

# Join on the (user, item) key explicitly instead of on whatever columns the
# two frames happen to share (assumes both frames carry "uid" and "iid").
df = (
    cont.merge(lf, how="left", on=["uid", "iid"])[["uid", "iid", "score", "est"]]
        .rename(columns={"score": "cont_score", "est": "lf_score"})
)

# CF is missing predictions for some rows of the left join: fall back to the
# mean training rating. Only lf_score is filled -- a frame-wide fillna would
# also overwrite missing content scores (cosine similarities) with a rating.
df["lf_score"] = df["lf_score"].fillna(train["overall"].mean())

# Optional: shuffle the rows so that rank ties are broken at random
# df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,uid,iid,cont_score,lf_score
0,A8D55T859ZQ6H,0321700945,0.462175,4.737528
1,A8D55T859ZQ6H,0321719816,0.437303,4.646644
2,A8D55T859ZQ6H,0321719824,0.400667,4.546008
3,A8D55T859ZQ6H,0763855553,0.502034,4.347595
4,A8D55T859ZQ6H,0982697813,0.466024,4.772099
...,...,...,...,...
1360952,AHQRU3MRORIWQ,B01F4PYHX6,0.418336,3.888339
1360953,AHQRU3MRORIWQ,B01FFVDY9M,0.672276,4.187779
1360954,AHQRU3MRORIWQ,B01H39M7ME,0.378600,3.663456
1360955,AHQRU3MRORIWQ,B01HAP47PQ,0.471874,3.835519


## Exercise 1
In this exercise, we try out different methods for combining the rankings produced by multiple models.

The dataframe below contains scores estimating how likely a user is to like each of 5 different items, as produced by 2 different models (rating predictions from a collaborative filtering model and cosine similarities from a content-based model).

### 1.1
Rank the 5 items according to the scores from model 1 and 2 respectively (in both cases higher score is better).

In [34]:
# --- Weighted sum of standardised scores -----------------------------------
def normalize(x):
    """Z-score a column: subtract its mean and divide by its standard deviation."""
    return (x - x.mean())/x.std()


# Mixing weights for the content-based and the latent-factor score.
alpha, beta = 0.75, 0.25
# Standardise over the whole frame so both scores are on a comparable scale.
df[["cont_norm", "lf_norm"]] = df[["cont_score", "lf_score"]].apply(normalize)
df["weighted_sum"] = alpha * df["cont_norm"] + beta * df["lf_norm"]


# --- Per-user rankings of the raw scores ------------------------------------
# Rank 1 is the highest score; method="first" breaks ties by row order.
# The `axis` keyword is deprecated for groupby rank (pandas >= 2.1) and is
# omitted; the default already ranks within each group along the rows.
df[["cont_rank", "lf_rank"]] = (
    df.groupby("uid")[["cont_score", "lf_score"]]
      .rank(method="first", ascending=False)
)

# --- Reciprocal Rank Fusion --------------------------------------------------
# Each model contributes 1 / (k + rank); k damps the influence of top ranks.
k = 20
df["rrf"] = 1 / (k + df["cont_rank"]) + 1 / (k + df["lf_rank"])

df.head()


Unnamed: 0,uid,iid,cont_score,lf_score,cont_norm,lf_norm,weighted_sum,cont_rank,lf_rank,rrf
0,A8D55T859ZQ6H,321700945,0.462175,4.737528,0.192756,1.341746,0.480003,75.0,46.0,0.025678
1,A8D55T859ZQ6H,321719816,0.437303,4.646644,-0.000354,1.210854,0.302448,120.0,77.0,0.017452
2,A8D55T859ZQ6H,321719824,0.400667,4.546008,-0.2848,1.065919,0.052879,223.0,141.0,0.010326
3,A8D55T859ZQ6H,763855553,0.502034,4.347595,0.502226,0.780165,0.571711,26.0,286.0,0.025007
4,A8D55T859ZQ6H,982697813,0.466024,4.772099,0.222639,1.391535,0.514863,71.0,35.0,0.029171


## Switching strategy

In [25]:
# Support of an item = number of training rows for it; items absent from the
# training set get a support of 0.
df["support"] = df["iid"].map(train["asin"].value_counts()).fillna(0)

def switch(m1, m2, support, threshold):
    """Choose between two model outputs based on support.

    Returns `m1` when `support` is strictly below `threshold`,
    otherwise `m2`.
    """
    return m1 if support < threshold else m2
        

# Support thresholds to try: below the threshold the content-based score is
# used, otherwise the collaborative-filtering score (same rule as switch()).
supports = [25, 50, 100]
for s in supports:
    # Switch between the z-scored columns (cont_norm / lf_norm, computed in the
    # score-fusion cell above) rather than the raw scores: raw cosine
    # similarities and raw rating predictions live on different scales, so
    # mixing them in one column would rank every CF-scored item above every
    # content-scored item. np.where applies the rule to all rows at once.
    df[f"switch{s}_score"] = np.where(df["support"] < s, df["cont_norm"], df["lf_norm"])


In [32]:
from metrics import PatK, MAPatK, MRRatK, HRatK

# Column of `df` to evaluate as the ranking score.
# name = f"switch{supports[2]}_score"
name = "weighted_sum"
preds = df[["uid", "iid", name]].rename(columns={name:"score"})

# Cut-offs at which the ranking metrics are reported.
ks = [5, 15]

for k in ks:
    # Evaluate the four metrics in the same order as they are printed.
    P, MAP, MRR, HR = (
        metric(preds, df_test, k) for metric in (PatK, MAPatK, MRRatK, HRatK)
    )
    print(
        f"  P@{k:2g} = {P  :.4f}",
        f"MAP@{k:2g} = {MAP:.4f}",
        f"MRR@{k:2g} = {MRR:.4f}",
        f" HR@{k:2g} = {HR :.4f}\n",
        sep="\n",
    )



  P@ 5 = 0.033
MAP@ 5 = 0.125
MRR@ 5 = 0.125
 HR@ 5 = 0.164

  P@15 = 0.016
MAP@15 = 0.134
MRR@15 = 0.134
 HR@15 = 0.241



## Meta-level strategy

## Exercise 2
In this exercise, we are going to predict the rating of a single user-item pair using a hybrid method, where we use the user profiles from a content-based method as input to a (neighborhood-based) collaborative filtering method.

The dataframe below contains the content-based user profiles of the user with 'reviewerID'='A25C2M3QF9G7OQ' and of all users who have rated the item with 'asin'='B00EYZY6LQ'.

Compute the cosine similarities between user 'A25C2M3QF9G7OQ' and the other users based on their user profiles.<br>
What are the similarities and what are the ratings given by these users on item 'B00EYZY6LQ'?

Predict the rating for user 'A25C2M3QF9G7OQ' on item 'B00EYZY6LQ' based on the ratings from the $3$ most similar users, using a weighted (by similarity) average.<br>
What is the prediction?

In [35]:
# Content-based user profiles loaded from a pickle file. The following cell
# feeds this frame to cosine_similarity and uses its index as the user ids.
# NOTE(review): presumably one row per reviewerID and one column per feature,
# as the display below suggests — confirm against the file that produced it.
user_profile = pd.read_pickle("files/user_profiles.pkl")
user_profile

Unnamed: 0_level_0,access,account,acroni,activ,address,adob,advanc,advantag,advisor,aftershot,...,290,291,292,293,294,295,296,297,298,299
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.484395,0.176640,0.091954,0.109714,0.450016,-0.309309,-0.076063,-0.431012,-0.270338,0.175346
A105S56ODHGJEK,0.000000,0.035776,0.000000,0.0,0.0,0.0,0.0,0.0,0.090065,0.0,...,-0.308737,0.109594,0.004872,-0.097996,0.356185,-0.282419,-0.122896,-0.237245,-0.145221,0.210940
A1075X1Q4M3S78,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.510763,-0.698051,0.042386,0.640352,0.732659,-0.548418,-0.543007,0.040734,-0.161328,0.274862
A10C5CJK1YKGV0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.352237,0.178072,-0.012605,-0.128421,0.334304,-0.231970,-0.086603,-0.196387,-0.125653,0.284422
A10CRW7XRJBJ2G,0.000000,0.000000,0.116844,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.162928,0.028587,-0.099716,-0.029412,0.349292,-0.382111,-0.252292,-0.244036,-0.007735,0.017381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZD8SMNGQI98O,0.000000,0.178945,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.060097,0.201214,0.084046,-0.152527,0.176422,-0.286704,-0.006282,-0.078435,-0.155033,0.280384
AZQGJ5CEAJGXB,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.263689,0.221452,0.021396,-0.065379,-0.043327,-0.369486,-0.088769,-0.198493,-0.193061,0.213204
AZW10G02DNJI4,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.247090,0.153326,-0.180114,0.100705,0.301591,-0.417238,-0.302458,-0.442512,-0.185656,0.152711
AZYJE40XW6MFG,0.073855,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,-0.062859,0.112332,-0.062938,0.014893,0.048779,-0.213027,-0.056600,-0.182202,-0.080046,0.084134


In [36]:
from sklearn.metrics.pairwise import cosine_similarity

def weighted_avg(data, weights):
    """Return the average of `data` weighted by `weights`.

    The weights are rescaled to sum to one and then combined with the
    (transposed) data through a matrix product.
    """
    unit_weights = weights / weights.sum()
    return data.T @ unit_weights

# Pairwise cosine similarities between all rows of user_profile, labelled with
# the user ids on both axes so that all_sims.loc[a, b] is the similarity of a and b.
all_sims = pd.DataFrame(cosine_similarity(user_profile), index=user_profile.index, columns=user_profile.index)

# Global call counter; predict() increments it to print progress.
i = 0
# Example (item, user) pair used for the single sanity-check prediction below.
# NOTE(review): these ids differ from the ones named in the exercise text
# ('B00EYZY6LQ' / 'A25C2M3QF9G7OQ') — confirm which pair is intended.
iid = 'B00MILM2FA'
uid = "A1E9UHMG4RSKUA"
def predict(uid, iid, all_sims, ratings=None, k=3, total=1360957):
    """Predict the rating of user `uid` for item `iid` from similar users.

    The prediction is the similarity-weighted average of the ratings given to
    `iid` by the `k` users most similar to `uid` among those who rated it.

    Parameters
    ----------
    uid : user id; looked up in the index of `all_sims`.
    iid : item id; matched against the "asin" column of `ratings`.
    all_sims : square DataFrame of user-user similarities, labelled by user id
        on both axes.
    ratings : DataFrame with "reviewerID", "asin" and "overall" columns.
        Defaults to the notebook-level `train` frame.
    k : number of nearest neighbours to average over (default 3).
    total : number of calls expected overall; only used in the progress line.

    Returns
    -------
    float
        The predicted rating, or NaN when the item has no usable ratings, the
        user has no similarity row, or the neighbour weights sum to zero.
    """
    # Progress reporting through the notebook-level counter `i`
    # (starts from 0 if the counter has not been initialised yet).
    global i
    i = globals().get("i", 0) + 1
    if i % 1000 == 0:
        print(f"{i}/{total}, {i*100/total:.3f}%", end='\r')

    if ratings is None:
        ratings = train

    item_ratings = ratings.loc[ratings["asin"] == iid, ["reviewerID", "overall"]]
    if len(item_ratings) == 0:
        return np.nan

    # One rating per reviewer, indexed by reviewer id. Averaging collapses
    # repeated ratings of the same item so ratings and weights stay 1:1.
    by_user = item_ratings.groupby("reviewerID")["overall"].mean()

    # Only raters that have a similarity entry can act as neighbours.
    by_user = by_user[by_user.index.isin(all_sims.columns)]
    if uid not in all_sims.index or len(by_user) == 0:
        return np.nan

    # Similarities between uid and the users having rated iid, top k only.
    top_k_sims = all_sims.loc[uid, by_user.index].nlargest(k)

    weights = top_k_sims.to_numpy(dtype=float)
    weight_sum = weights.sum()
    if weight_sum == 0:
        return np.nan

    # Fetch the ratings in the same reviewer order as the weights; taking them
    # in the row order of the ratings frame would pair weights with the wrong
    # ratings.
    neighbour_ratings = by_user.loc[top_k_sims.index].to_numpy(dtype=float)
    return float(neighbour_ratings @ (weights / weight_sum))
    
# Sanity check: a single prediction for the example (uid, iid) pair set above.
predict(uid, iid, all_sims)

4.665511228356268

In [37]:
# Restart the progress counter used by predict() so the printed percentage is
# correct for this run, including when the cell is re-executed.
i = 0

# Meta-level score for every candidate (user, item) row of df. The two id
# columns are iterated directly; df.apply(axis=1) would build a Series per row
# only to read these two values back out of it.
result = pd.Series(
    [predict(u, it, all_sims) for u, it in zip(df["uid"], df["iid"])],
    index=df.index,
    dtype=float,
)
df["meta"] = result
# Persist the scores so the expensive run does not have to be repeated.
result.to_pickle("files/meta_score.pkl")

result

850000/1360957, 62.456%

KeyboardInterrupt: 

## Evaluation

In [602]:

# Evaluate the meta-level scores with the same ranking metrics as above.
name = "meta"
preds = df.loc[:, ["uid", "iid", name]].rename(columns={name:"score"})

ks = [5, 15]

for k in ks:
    P, MAP = PatK(preds, df_test, k), MAPatK(preds, df_test, k)
    MRR, HR = MRRatK(preds, df_test, k), HRatK(preds, df_test, k)
    # One line per metric; the trailing newline leaves a blank line between
    # the blocks of different cut-offs.
    report = [
        f"  P@{k:2g} = {P  :.4f}",
        f"MAP@{k:2g} = {MAP:.4f}",
        f"MRR@{k:2g} = {MRR:.4f}",
        f" HR@{k:2g} = {HR :.4f}\n",
    ]
    for line in report:
        print(line)



  P@ 5 = 0.0014
MAP@ 5 = 0.0022
MRR@ 5 = 0.0022
 HR@ 5 = 0.0070

  P@15 = 0.0009
MAP@15 = 0.0029
MRR@15 = 0.0029
 HR@15 = 0.0140

