# Hybrid Recommender System
## Load data

In [1]:
import pickle
import pandas as pd
import numpy as np


train = pd.read_pickle("files/train.pkl")
train = train[["overall", "reviewerID", "asin"]]

df_test = pd.read_pickle("files/testset.pkl")[["reviewerID", "asin", "overall"]]
df_test = df_test.rename(columns={"reviewerID": "uid", "asin": "iid"})


#load preds
lf = pd.read_pickle("files/preds_svd.pkl").rename(columns={"score":"est"})
cont = pd.read_pickle("files/preds_content.pkl")
df = cont.merge(lf, how="left")[["uid","iid","score","est"]].rename(columns={"score":"cont_score","est":"lf_score"})

#CF is missing preds for an item 
df = df.fillna(train["overall"].mean())

#shuffle to make coin-toss ties (commented out since meta-model spends a longer time caching)
# df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,uid,iid,cont_score,lf_score
0,A2OL5WLCNZVD9K,0321700945,0.209099,3.939641
1,A2OL5WLCNZVD9K,0321719816,0.199832,3.822292
2,A2OL5WLCNZVD9K,0321719824,0.225583,3.814823
3,A2OL5WLCNZVD9K,0763855553,0.291439,3.530590
4,A2OL5WLCNZVD9K,0982697813,0.276935,4.227161
...,...,...,...,...
1360952,A10EIJM2C94M14,B01F7RJHIQ,0.435724,5.000000
1360953,A10EIJM2C94M14,B01FFVDY9M,0.493548,4.467216
1360954,A10EIJM2C94M14,B01H39M7ME,0.509866,3.793774
1360955,A10EIJM2C94M14,B01HAP47PQ,0.333174,4.144317


## Weighing strategies 


In [31]:
# weighted sum score
normalize = lambda x: (x - x.mean())/x.std()
df[["cont_norm", "lf_norm"]] = df[["cont_score", "lf_score"]].apply(normalize)

# weighted strategies
alpha, beta = 1, 2
df["weighted_sum"] =  alpha * df["lf_norm"] + beta * df["cont_norm"]
df["norm_sum"]     =          df["lf_norm"] +        df["cont_norm"]

# rankings of scores
df[["cont_rank", "lf_rank"]] = df.groupby("uid")[["cont_score", "lf_score"]]\
                                 .rank(axis=0, method="first", ascending=False)

# Reciprocal Rank Fusion
k = 20
df["rrf"] =  1 / (k + df["cont_rank"])
df["rrf"] += 1 / (k + df["lf_rank"])

df.head()


Unnamed: 0,uid,iid,cont_score,lf_score,cont_norm,lf_norm,weighted_sum,norm_sum,cont_rank,lf_rank,rrf,support,switch25_score,switch50_score,switch100_score,support_u,switch_u_score,meta
0,A2OL5WLCNZVD9K,321700945,0.209099,3.939641,-1.772148,0.192631,-3.351666,-1.579517,705.0,25.0,0.013072,7.0,-1.772148,-1.772148,-1.772148,4,-1.772148,5.0
1,A2OL5WLCNZVD9K,321719816,0.199832,3.822292,-1.844092,0.023625,-3.664558,-1.820467,723.0,56.0,0.009898,12.0,-1.844092,-1.844092,-1.844092,4,-1.844092,4.481068
2,A2OL5WLCNZVD9K,321719824,0.225583,3.814823,-1.644161,0.012868,-3.275454,-1.631293,662.0,59.0,0.009788,9.0,-1.644161,-1.644161,-1.644161,4,-1.644161,4.298179
3,A2OL5WLCNZVD9K,763855553,0.291439,3.53059,-1.132848,-0.396484,-2.66218,-1.529332,415.0,172.0,0.006416,30.0,-0.396484,-1.132848,-1.132848,4,-0.396484,4.363387
4,A2OL5WLCNZVD9K,982697813,0.276935,4.227161,-1.24546,0.606717,-1.884204,-0.638743,476.0,5.0,0.01725,9.0,-1.24546,-1.24546,-1.24546,4,-1.24546,4.740075


## Switching strategy

In [3]:
df["support"] = df.merge(train.groupby("asin")["asin"].count(), how="left", left_on="iid", right_index=True)["asin"].fillna(0)

def switch(m1, m2, support, threshold): 
    if support < threshold:
        return m1
    return m2
        

supports = [25, 50, 100]
for s in supports:
    df[f"switch{s}_score"] = df.apply(lambda x: switch(x["cont_norm"], x["lf_norm"], x["support"], s), axis=1)


In [4]:
u_threshold = 25
df["support_u"] = df.merge(train.groupby("reviewerID")["reviewerID"].count(), how="left", left_on="uid", right_index=True)["reviewerID"].fillna(0)
df["switch_u_score"] = df.apply(lambda x: switch(x["cont_norm"], x["lf_norm"], x["support"], u_threshold), axis=1)

## Meta-level strategy
User profiles from a content-based method as input to a (neighborhood-based) collaborative filtering method.

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# precompute all user similarities 
user_profile = pd.read_pickle("files/user_profiles.pkl").iloc[:,:646]
all_sims = pd.DataFrame(cosine_similarity(user_profile), index=user_profile.index, columns=user_profile.index)
all_sims

reviewerID,A100UD67AHFODS,A105S56ODHGJEK,A1075X1Q4M3S78,A10C5CJK1YKGV0,A10CRW7XRJBJ2G,A10EIJM2C94M14,A10G3LHNAK4GEH,A10G4BPT5MGBHY,A10GU5NVTA5I67,A10NC6ZVVMBHNH,...,AZ24FJKEJNSDX,AZ515FFZ7I2P7,AZ61PR2UGT3IP,AZ8NQPFR743F0,AZAIARFUW642T,AZD8SMNGQI98O,AZQGJ5CEAJGXB,AZW10G02DNJI4,AZYJE40XW6MFG,AZZ5ASC403N74
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,1.000000,0.146463,0.052788,0.110262,0.388075,0.055200,0.398965,0.061431,0.024957,0.125463,...,0.199157,0.116129,0.448898,0.108691,0.000000,0.095105,0.050915,0.393792,0.105584,0.087878
A105S56ODHGJEK,0.146463,1.000000,0.164345,0.525220,0.169406,0.147458,0.188130,0.308182,0.072684,0.135667,...,0.295615,0.452332,0.205975,0.207137,0.000000,0.418852,0.240531,0.116702,0.212105,0.329222
A1075X1Q4M3S78,0.052788,0.164345,1.000000,0.068598,0.303011,0.025660,0.360428,0.047738,0.015377,0.057087,...,0.178698,0.087438,0.606594,0.120891,0.000000,0.062593,0.031370,0.112238,0.068843,0.085237
A10C5CJK1YKGV0,0.110262,0.525220,0.068598,1.000000,0.132615,0.142269,0.167978,0.139645,0.044326,0.137862,...,0.366484,0.379346,0.168280,0.257237,0.000000,0.392438,0.138356,0.113502,0.257399,0.195970
A10CRW7XRJBJ2G,0.388075,0.169406,0.303011,0.132615,1.000000,0.098255,0.745823,0.086511,0.030017,0.248899,...,0.493949,0.147425,0.627532,0.193529,0.000000,0.156021,0.061237,0.560024,0.134117,0.121584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZD8SMNGQI98O,0.095105,0.418852,0.062593,0.392438,0.156021,0.214484,0.193592,0.473930,0.035383,0.117133,...,0.289748,0.566166,0.134327,0.328132,0.000000,1.000000,0.420209,0.143158,0.360340,0.290046
AZQGJ5CEAJGXB,0.050915,0.240531,0.031370,0.138356,0.061237,0.197932,0.074220,0.534284,0.020468,0.180066,...,0.072523,0.390495,0.077705,0.232724,0.000000,0.420209,1.000000,0.171582,0.468344,0.086887
AZW10G02DNJI4,0.393792,0.116702,0.112238,0.113502,0.560024,0.310212,0.364518,0.123387,0.020201,0.314697,...,0.295501,0.136407,0.295014,0.373010,0.000000,0.143158,0.171582,1.000000,0.208150,0.094487
AZYJE40XW6MFG,0.105584,0.212105,0.068843,0.257399,0.134117,0.261220,0.160378,0.166543,0.031422,0.306763,...,0.148223,0.215540,0.119289,0.411474,0.000000,0.360340,0.468344,0.208150,1.000000,0.248425


In [6]:

def weighted_avg(data, weights):
    return data.T @ (weights/weights.sum())

i = 0
iid = 'B00MILM2FA'
uid = "A100UD67AHFODS"
def predict(uid, iid, all_sims): 
    # progress bar
    global i; i += 1
    if i%1000 == 0: print(f"{i}/{1360957}, {i*100/1360957:.3f}%", end='\r')
    
    indices = train[train["asin"] == iid].reviewerID
    if len(indices) == 0: return np.nan
    
    # similarities between uid and users having rated iid 
    top_k_sims = all_sims.loc[uid].loc[indices].nlargest(3) 
    if top_k_sims[0]==0: return 0
    
    ids = (train["asin"] == iid) & (train["reviewerID"].isin(top_k_sims.index))  

    return weighted_avg(train[ids]["overall"], top_k_sims.values)
    
predict(uid, iid, all_sims)

3.620056120673675

In [7]:
# set true to run; takes around half an hour 
if False:
    result = df.apply(lambda x: predict(x["uid"], x["iid"], all_sims), axis=1)
    df["meta"] = result 
    result.to_pickle("files/meta_score.pkl", compression={'method': 'gzip', 'compresslevel': 8})

In [8]:
result = pd.read_pickle("files/meta_score.pkl", compression={'method': 'gzip'})
df["meta"] = result

## Evaluation

In [33]:
from metrics import PatK, MAPatK, MRRatK, HRatK

# change name to whichever score column to get results
# name = f"switch{supports[2]}_score"
name = "weighted_sum"
preds = df[["uid", "iid", name]].rename(columns={name:"score"})

ks = [5, 15]

for k in ks:
    P   = PatK(preds, df_test, k)  
    MAP = MAPatK(preds, df_test, k)
    MRR = MRRatK(preds, df_test, k)
    HR = HRatK(preds, df_test, k)
    print(f"  P@{k:2g} = {P  :.4f}")
    print(f"MAP@{k:2g} = {MAP:.4f}")
    print(f"MRR@{k:2g} = {MRR:.4f}")
    print(f" HR@{k:2g} = {HR :.4f}\n")



  P@ 5 = 0.0330
MAP@ 5 = 0.1244
MRR@ 5 = 0.1244
 HR@ 5 = 0.1648

  P@15 = 0.0159
MAP@15 = 0.1325
MRR@15 = 0.1325
 HR@15 = 0.2390



In [24]:
meta_merge = df[["uid","iid","meta"]].merge(df_test, how="inner", on=["uid", "iid"])
print("RMSE:", np.sqrt(np.mean((meta_merge["meta"] - meta_merge["overall"])**2)))

RMSE: 1.2053820228892569


## Time-sorted testset performance

In [37]:
from metrics import relevant_column, first

dft = pd.read_pickle("files/testset.pkl")[["reviewerID", "asin", "overall", "unixReviewTime"]] \
        .sort_values("unixReviewTime") \
        .reset_index(drop=True)

name, name2, name3 = "cont_score", "switch100_score", "lf_score"
preds = df[["uid", "iid", name]].rename(columns={name: "score"})
preds2 = df[["uid", "iid", name2]].rename(columns={name2: "score"})
preds3 = df[["uid", "iid", name3]].rename(columns={name3: "score"})


merged = relevant_column(preds, df_test, 5)
scores = merged[["uid", "iid", "relevant"]].groupby(by="uid")["relevant"].apply(lambda x: x.any()*1)

merged = relevant_column(preds2, df_test, 5)
scores2 = merged[["uid", "iid", "relevant"]].groupby(by="uid")["relevant"].apply(lambda x: x.any()*1)

merged = relevant_column(preds3, df_test, 5)
scores3 = merged[["uid", "iid", "relevant"]].groupby(by="uid")["relevant"].apply(lambda x: x.any()*1)
# scores2 = scores2/scores2.mean() * scores.mean()


In [42]:
import plotly.express as px
x  =  scores[dft.reviewerID].rolling(window=100, center=True).mean().dropna().rename("TFIDF-W2V")
x2 = scores2[dft.reviewerID].rolling(window=100, center=True).mean().dropna().rename("ISwitch100")
x3 = scores3[dft.reviewerID].rolling(window=100, center=True).mean().dropna().rename("SVD")
dfp = pd.concat([x2,x,x3], axis=1).reset_index(drop=True)

fig = px.line(dfp)
fig.update_layout(
    width=600,
    height=400,
    yaxis_title="Mean hit rate over window",
    xaxis_title="User window",
)

fig.show()
fig.write_image("images/testset.pdf")
