# Content-based recommendation

## Predict items for each user by comparing user and item profiles in a shared vector space

In [2]:
import pickle
import pandas as pd
import numpy as np

# NOTE: pickle files can execute arbitrary code when loaded; only read files
# produced by this project.

# Item feature matrix. Earlier experiments (since disabled) sliced it as
# `item_profiles.iloc[:, :-303]` for the TF-IDF part and
# `item_profiles.iloc[:, -300:]` for the Word2Vec part.
item_profiles = pd.read_pickle("files/asin_tfidf3.pkl")

# `currentI` is the item matrix used by the rest of the notebook. It is an
# alias of `item_profiles`, not a copy.
currentI = item_profiles

# Training ratings, reduced to the three columns used downstream.
train = pd.read_pickle("files/train.pkl")[["overall", "reviewerID", "asin"]]

# Held-out ratings, with the id columns renamed to uid / iid.
df_test = (
    pd.read_pickle("files/testset.pkl")[["reviewerID", "asin", "overall"]]
    .rename(columns={"reviewerID": "uid", "asin": "iid"})
)

currentI.shape

(801, 949)

In [3]:
def mergeItemVectors(train, item_profiles):
    """Build one profile vector per reviewer from rated items.

    Each training row is joined to its item's feature vector, the vector is
    scaled by that row's rating, and the scaled vectors are averaged per
    reviewer.

    Parameters
    ----------
    train : pandas.DataFrame
        Ratings with the columns "overall" (rating), "reviewerID" and "asin".
    item_profiles : pandas.DataFrame
        Numeric item features that can be merged with ``train`` on "asin".

    Returns
    -------
    pandas.DataFrame
        Indexed by "reviewerID", one column per item feature. Rows of the
        merged table containing any NaN (for example ratings of items that
        have no profile) are dropped before averaging.
    """
    meta_cols = ["overall", "reviewerID", "asin"]

    # Left join keeps every rating; dropna then removes the unmatched ones.
    joined = train.merge(item_profiles, on="asin", how="left").dropna()

    # Everything that is not rating metadata is an item feature.
    feature_cols = [col for col in joined.columns if col not in meta_cols]

    # Scale each item vector row-wise by the rating the reviewer gave it.
    weighted = joined[feature_cols].mul(joined["overall"], axis=0)

    # Group by the aligned reviewerID Series; the result index keeps that name.
    return weighted.groupby(joined["reviewerID"]).mean()

# Build one profile vector per reviewer from the training ratings and the
# currently selected item vectors.
full = mergeItemVectors(train, currentI)
# Persist the user profiles to disk.
full.to_pickle("files/user_profiles.pkl")


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every candidate item for every test user with the cosine similarity
# between the item's profile vector and the user's profile vector.
pred_list = []
skipped_users = []

# Hoisted out of the loop: one pass over `train` instead of a full boolean
# scan of the ratings table for every user.
seen_by_user = train.groupby("reviewerID")["asin"].unique()

# `unique()` keeps first-appearance order, so the row order of `preds` is
# reproducible across kernel restarts (iterating a `set` of strings is not).
for u in df_test["uid"].unique():
    if u not in full.index:
        # No profile vector for this user (not in `train`, or every one of
        # their rows was removed by the dropna in mergeItemVectors).
        # cosine_similarity raises on an empty matrix, so skip and report.
        skipped_users.append(u)
        continue

    # One similarity per item, labelled with the item index of `currentI`.
    sim = pd.Series(
        cosine_similarity(currentI, full.loc[[u]]).ravel(),
        index=currentI.index,
        name="sim",
    )

    # Do not recommend items the user already rated in training.
    # errors="ignore" tolerates rated items that have no row in `currentI`.
    sim = sim.drop(seen_by_user.get(u, []), errors="ignore")

    user_preds = sim.reset_index()
    user_preds["uid"] = u
    pred_list.append(user_preds)

if skipped_users:
    # Make the omission visible: these users get no predictions at all.
    print(f"Skipped {len(skipped_users)} test user(s) without a profile vector")

preds = pd.concat(pred_list).reset_index(drop=True)
preds = preds.rename(columns={"asin": "iid", "sim": "score"})
preds.to_pickle("files/preds_content.pkl")
preds

Unnamed: 0,iid,score,uid
0,0321700945,0.209099,A2OL5WLCNZVD9K
1,0321719816,0.199832,A2OL5WLCNZVD9K
2,0321719824,0.225583,A2OL5WLCNZVD9K
3,0763855553,0.291439,A2OL5WLCNZVD9K
4,0982697813,0.276935,A2OL5WLCNZVD9K
...,...,...,...
1360952,B01F7RJHIQ,0.435724,A10EIJM2C94M14
1360953,B01FFVDY9M,0.493548,A10EIJM2C94M14
1360954,B01H39M7ME,0.509866,A10EIJM2C94M14
1360955,B01HAP47PQ,0.333174,A10EIJM2C94M14


## Performance metrics



In [7]:
# commonly used functions (see metrics.py)
from metrics import PatK, MAPatK, MRRatK, HRatK

# Cut-offs at which the ranking metrics are reported.
ks = [5, 15]

for k in ks:
    # Evaluate the four metrics in a fixed order: precision, MAP, MRR, hit rate.
    P, MAP, MRR, HR = (
        metric(preds, df_test, k) for metric in (PatK, MAPatK, MRRatK, HRatK)
    )

    # Labels are right-aligned to three characters so the '@' signs line up;
    # the trailing "\n" leaves a blank line between cut-offs.
    report_lines = [
        f"{label:>3}@{k:2g} = {value:.4f}"
        for label, value in (("P", P), ("MAP", MAP), ("MRR", MRR), ("HR", HR))
    ]
    print("\n".join(report_lines) + "\n")


  P@ 5 = 0.0309
MAP@ 5 = 0.0779
MRR@ 5 = 0.0779
 HR@ 5 = 0.1543

  P@15 = 0.0153
MAP@15 = 0.0863
MRR@15 = 0.0863
 HR@15 = 0.2297

