### 0. Import packages

In [1]:
import logging
import numpy as np
import os
import pandas as pd
import surprise

from collections import defaultdict
from pathlib import Path
from tqdm import tqdm

### 1. Load processed data (CSV)

In [2]:
# global variables
# DATA_PATH: directory holding the processed train/test CSV files read below.
# CATEGORY: file-name prefix shared by every per-category artefact in this notebook.
DATA_PATH = Path('data/processed/')
CATEGORY = 'Sports_and_Outdoors'

In [42]:
# Resolve both CSV locations first, then read each split into its own frame.
train_csv = f"{DATA_PATH}/{CATEGORY}_train.csv"
test_csv = f"{DATA_PATH}/{CATEGORY}_test.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [4]:
# Report (rows, columns) of each split, one per line.
print(
    f"Train: {train.shape}",
    f"Test: {test.shape}",
    sep="\n",
)

Train: (2242666, 8)
Test: (434692, 8)


In [5]:
# summary
print(f"We have {train.shape[0]} ratings.")
print(f"The number of unique users we have is: {train['reviewerID'].nunique()}.")
print(f"The number of unique items we have is: {train['asin'].nunique()}")
print(f"The median user rated {train['reviewerID'].value_counts().median()} items.")
print(f"The max rating is {max(train['overall'])}, the min rating is {min(train['overall'])}.")

We have 2242666 ratings.
The number of unique users we have is: 332231.
The number of unique items we have is: 104536
The median user rated 5.0 items.
The max rating is 5.0, the min rating is 1.0.


### 2. Preparing the dataset for training with `Surprise`

In [6]:
# Surprise expects exactly three columns, in the order: user, item, rating
# (`reviewerID`, `asin`, `overall`).
# The selection is bound to a new name so that `train` keeps all of its
# original columns: later cells still select columns such as `title` and
# `brand` from `train`, which would raise a KeyError if it were overwritten here.
train_ratings = train[['reviewerID', 'asin', 'overall']]
reader = surprise.Reader(rating_scale=(1.0, 5.0))
data = surprise.Dataset.load_from_df(train_ratings, reader)
# Use every available rating for training (no hold-out split at this stage).
trainset = data.build_full_trainset()

In [7]:
MAP_PATH = Path("data/processed/mappings/")


def _load_pickled_dict(npy_path):
    """Load a `.npy` file that stores a single pickled object and unwrap it.

    NOTE(review): `allow_pickle=True` executes pickle on load, so these files
    must come from a trusted source (presumably this project's own pipeline).
    """
    return np.load(npy_path, allow_pickle=True).item()


# aggregated user/item vectors
agg_user_vectors = _load_pickled_dict(f'{MAP_PATH}/{CATEGORY}_agg_user_vectors.npy')
agg_prod_vectors = _load_pickled_dict(f'{MAP_PATH}/{CATEGORY}_agg_prod_vectors.npy')

# user/item index mappings
train_user_index_map = _load_pickled_dict(f'{MAP_PATH}/{CATEGORY}_train_user_index_map.npy')
train_prod_index_map = _load_pickled_dict(f'{MAP_PATH}/{CATEGORY}_train_prod_index_map.npy')

In [8]:
# One-column frame: map key -> `reviewerID` (keys of the mapping become the index).
train_user_index = pd.DataFrame.from_dict(train_user_index_map, orient='index')
train_user_index.columns = ['reviewerID']

# One row per key of `agg_user_vectors`, one column per vector component.
train_user_vectors = pd.DataFrame.from_dict(agg_user_vectors, orient='index')

# Inner-join ids and vectors on their shared index, then key the rows by `reviewerID`.
# NOTE(review): later cells address these rows by position using the map keys,
# which presumably requires the joined rows to stay in key order 0..n-1 with
# none dropped by the join - confirm.
user_factors = pd.merge(train_user_index, train_user_vectors, left_index=True, right_index=True).set_index('reviewerID')
# user_factors.to_csv("user_factors.csv")
user_factors.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2ERDJDHL5250E,-0.001393,0.010706,0.057688,0.032961,0.055388,0.004883,-0.008693,-0.046846,0.082394,-0.010865,...,0.008918,-0.072811,0.075028,-0.031598,0.01331,-0.084575,-0.052641,0.13303,-0.047659,-0.11466
A3OT1IGGIRP4SX,0.042955,0.003859,-0.021228,0.035249,-0.021593,-0.009716,-0.072479,0.05478,0.042466,0.015016,...,0.03144,0.022644,0.037963,-0.000707,0.071702,0.025748,0.004151,0.015087,0.005266,-0.011319
AV7OE2XLHXNAX,-0.017765,-0.024089,-0.00395,-0.007635,-0.019528,-0.00637,-0.022975,0.025297,-0.015457,0.00804,...,-0.005269,0.039201,0.02133,0.016538,0.047936,-0.010439,-0.018338,0.020829,-0.014701,-0.012528
A3JXYXZHH8MXEL,0.029713,-0.093453,0.094964,-0.079168,-0.044471,0.01214,-0.125511,0.029961,-0.088458,0.018941,...,-0.009258,0.093152,-0.051942,0.03354,0.190262,0.025575,-0.043788,0.063773,-0.012849,-0.12054
A2W6QWMWVYS5JR,-0.000132,-0.168407,0.001205,0.029849,0.100621,0.006307,-0.093841,0.075829,-0.025472,0.21236,...,0.086917,0.055823,-0.030929,0.055677,0.049179,0.067016,0.046185,0.052996,0.04827,-0.025873


In [9]:
# One-column frame: map key -> `asin` (keys of the mapping become the index).
train_prod_index = pd.DataFrame.from_dict(train_prod_index_map, orient='index')
train_prod_index.columns = ['asin']

# One row per key of `agg_prod_vectors`, one column per vector component.
train_prod_vectors = pd.DataFrame.from_dict(agg_prod_vectors, orient='index')

# Inner-join ids and vectors on their shared index, then key the rows by `asin`.
# NOTE(review): later cells address these rows by position using the map keys,
# which presumably requires the joined rows to stay in key order 0..n-1 with
# none dropped by the join - confirm.
prod_factors = pd.merge(train_prod_index, train_prod_vectors, left_index=True, right_index=True).set_index('asin')
# prod_factors.to_csv("prod_factors.csv")
prod_factors.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00XVGXGB0,-0.028735,-0.056108,-0.03246,0.008095,-0.011243,0.003498,-0.037507,0.033322,-0.015832,0.026648,...,-0.000462,0.039632,0.029524,0.015292,0.077395,0.001761,-0.030874,0.033687,0.011769,0.000154
B004R7G33K,-0.03316,-0.049536,-0.007573,-0.036987,-0.009941,0.000644,-0.023759,0.044294,-0.009924,0.006038,...,-0.037824,0.05473,-0.004647,0.061898,0.057449,0.01652,-0.037705,0.024926,-0.016702,-0.019501
B00TK31ZY8,-0.090267,-0.073928,0.045832,-0.022734,-0.01038,-0.085362,-0.188818,0.041648,0.032018,0.029686,...,0.094035,0.07764,0.067982,0.06032,0.172293,-0.024297,-0.102928,0.085987,0.109246,0.005954
B0016PM67I,-0.037176,-0.0543,0.00329,0.000629,-0.018531,-0.015814,-0.044274,0.026129,-0.034617,0.050554,...,-0.019872,0.055698,0.017198,0.017568,0.084163,-0.0093,-0.037548,0.04907,-0.013736,-0.005158
B01D3ID6S8,-0.026588,-0.058351,0.032157,-0.00373,-0.002862,0.005076,0.023442,0.012216,-0.044792,0.043934,...,-0.012459,0.100076,0.042812,0.040686,0.058932,-0.008527,-0.028917,0.047138,-0.032607,0.025038


In [10]:
# Convert the factor tables to plain NumPy arrays (rows are addressed by
# position from here on; the id index of the DataFrame is discarded).
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is harmless; calling `.to_numpy()` a second time would fail because the name
# would already be bound to an ndarray.
user_factors = np.asarray(user_factors)
prod_factors = np.asarray(prod_factors)

In [11]:
# first user vector, kept two-dimensional
user_factors[:1]

array([[-0.00139326,  0.01070629,  0.05768845,  0.0329612 ,  0.0553879 ,
         0.00488296, -0.00869327, -0.04684592,  0.0823941 , -0.0108649 ,
         0.07763122, -0.02126908, -0.00717102, -0.04491281, -0.0852215 ,
        -0.00361429,  0.00978789,  0.02764019,  0.0751814 ,  0.03089333,
         0.04117684, -0.02936736, -0.03353109, -0.0487733 , -0.02659133,
         0.02188961,  0.05667667, -0.07675944, -0.0991408 , -0.04386722,
        -0.0236766 , -0.03231909,  0.01497382,  0.06736004, -0.1045417 ,
         0.06539939, -0.01146153, -0.08201144, -0.01348285, -0.10803379,
        -0.00763431, -0.01746873,  0.09882042,  0.07719737,  0.02758413,
        -0.00700441, -0.06778047, -0.0578411 ,  0.05345424,  0.01154411,
        -0.0755935 , -0.01014541,  0.10010977,  0.02350248, -0.03878367,
        -0.07902477, -0.05026945, -0.05100492, -0.01060421, -0.03957319,
        -0.02523186,  0.00523646, -0.07418991,  0.04406072,  0.00268527,
         0.05271197, -0.0352231 ,  0.00733327,  0.0

In [12]:
# first product vector, kept two-dimensional
prod_factors[:1]

array([[-0.028735  , -0.05610809, -0.03246031,  0.00809488, -0.01124282,
         0.00349831, -0.03750717,  0.03332187, -0.01583213,  0.0266481 ,
        -0.05674835,  0.02126554, -0.02077972, -0.00088326, -0.04692468,
        -0.03824457,  0.06779764, -0.00106158, -0.00204537,  0.03976614,
        -0.0072909 ,  0.00621938, -0.00539656, -0.0202763 ,  0.03027143,
        -0.00676155, -0.02396085,  0.01934781, -0.04280524, -0.00847415,
         0.02121852, -0.011075  ,  0.00964169, -0.00404767, -0.03746314,
        -0.04091518, -0.02265925,  0.00279981, -0.01768054, -0.05730085,
        -0.00446327,  0.00063171, -0.00562225,  0.0249576 , -0.02717521,
         0.03735483, -0.00190488, -0.01923786, -0.0243962 ,  0.03877954,
        -0.04356486,  0.04290394, -0.01410717, -0.00746643,  0.00715857,
         0.0245075 , -0.04210267,  0.00834428,  0.00451602, -0.00280327,
        -0.02343168, -0.03064212, -0.05588719,  0.03104548,  0.04021582,
         0.0024844 ,  0.03063204,  0.01558907, -0.0

#### Retrieving user and item embeddings as initialized latent factors

In [None]:
# Send DEBUG-and-above records both to `epmf.log` and to the notebook output.
log_handlers = [
    logging.FileHandler("epmf.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(message)s',
    handlers=log_handlers,
)

In [None]:
class EmbeddedPMF(surprise.AlgoBase):
    """Matrix factorisation whose latent factors start from D2V embeddings.

    Instead of initialising `P` (users) and `Q` (items) at random, both are
    seeded with the aggregated Doc2Vec review embeddings and then refined with
    plain, unregularised SGD on the observed ratings.
    Based on the idea in https://doi.org/10.1145/3383313.3412207, where the
    latent factors are initialised from NMF topic vectors.

    Args:
        user_map (dict): embedding-row index -> raw user id.
        item_map (dict): embedding-row index -> raw item id.
        user_factor (np.ndarray): user embeddings, addressed by `user_map` index.
        item_factor (np.ndarray): item embeddings, addressed by `item_map` index.
        learning_rate (float): SGD step size.
        num_epochs (int): number of passes over the training ratings.
        num_factors (int): stored only; the full embedding width is used.
    """
    def __init__(self, user_map, item_map, user_factor, item_factor, learning_rate, num_epochs, num_factors):
        surprise.AlgoBase.__init__(self)
        # Invert the maps so a raw id can be resolved to its embedding row.
        self.user_map = {raw_id: row for row, raw_id in user_map.items()}
        self.item_map = {raw_id: row for row, raw_id in item_map.items()}
        self.user_embedding = user_factor
        self.item_embedding = item_factor
        self.alpha = learning_rate
        self.num_epochs = num_epochs
        self.num_factors = num_factors

    def fit(self, train):
        """Train the factors on a Surprise trainset and return `self`.

        `P` and `Q` are built as fresh arrays whose row `k` belongs to the
        trainset's inner id `k`, so:
          * the embeddings passed to the constructor are never modified, which
            means every fit (and every cross-validation fold) starts from the
            same initial vectors;
          * `estimate`, which receives inner ids, reads the same rows that were
            trained here.

        Raises:
            KeyError: if a raw id of the trainset is missing from the maps.
        """
        surprise.AlgoBase.fit(self, train)

        # inner id -> embedding row, resolved through the raw ids
        user_rows = np.array(
            [self.user_map[train.to_raw_uid(u)] for u in train.all_users()],
            dtype=np.intp,
        )
        item_rows = np.array(
            [self.item_map[train.to_raw_iid(i)] for i in train.all_items()],
            dtype=np.intp,
        )

        # Integer-array indexing copies, so the source embeddings stay untouched.
        # float64 is used throughout: np.float128 does not exist on every
        # platform and the results were stored back as float64 anyway.
        P = np.asarray(self.user_embedding, dtype=np.float64)[user_rows]
        Q = np.asarray(self.item_embedding, dtype=np.float64)[item_rows]

        for _ in tqdm(range(self.num_epochs)):
            for u, i, r_ui in train.all_ratings():
                # keep the pre-update user vector for the item update
                p_u = P[u].copy()
                residual = r_ui - np.dot(p_u, Q[i])
                P[u] += self.alpha * residual * Q[i]
                Q[i] += self.alpha * residual * p_u

        self.P = P
        self.Q = Q
        return self

    def estimate(self, u, i):
        """Returns estimated rating for user u, and item i (inner ids).

           Falls back to the global mean when the user or item is unknown, or
           when the factors have diverged to a non-finite value.

           Prerequisite: Algorithm must be fit to training set.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            return self.trainset.global_mean

        est = np.dot(self.P[u], self.Q[i])
        if not np.isfinite(est):
            return self.trainset.global_mean
        return est

In [None]:
# Build the embedding-initialised PMF model and train it on the full trainset.
epmf = EmbeddedPMF(
    user_map=train_user_index_map,
    item_map=train_prod_index_map,
    user_factor=user_factors,
    item_factor=prod_factors,
    learning_rate=0.025,
    num_epochs=5,
    num_factors=10,
)
epmf.fit(trainset)

In [None]:
# generate a random rating for a user given a product
# NOTE(review): assumes both maps are keyed 0..len-1 - confirm.
random_user_index = np.random.randint(0, len(train_user_index_map))
random_prod_index = np.random.randint(0, len(train_prod_index_map))
# The user must be looked up with the *user* index; using the product index
# here left `random_user_index` unused.
random_user = train_user_index_map[random_user_index]
random_prod = train_prod_index_map[random_prod_index]

# Surprise's estimate() works on inner ids, so translate the raw ids once.
inner_uid = trainset.to_inner_uid(random_user)
inner_iid = trainset.to_inner_iid(random_prod)

print(f"User: {random_user}, {inner_uid}")
print(f"Product: {random_prod}, {inner_iid}")

print(f"Estimated rating is: {epmf.estimate(inner_uid, inner_iid)}")
print(f"Estimated rating is: {epmf.estimate(5000000, 5000000000)}\n")    # not available, hence return global mean
# train[train['asin'] == random_prod][['asin', 'title', 'brand', 'overall',]].groupby(['asin', 'title', 'brand']).agg({'overall': 'mean'})

In [None]:
# Ratings of one specific reviewer, restricted to the product columns.
# Needs `train` to still carry the `title` and `brand` columns.
train.loc[
    train['reviewerID'] == 'A3MC5PWXS2FG5M',
    ['asin', 'title', 'brand', 'overall'],
]

#### Running cross-validation

In [None]:
# 5-fold cross-validation of the embedding-initialised PMF model.
surprise.model_selection.cross_validate(
    algo=epmf,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

#### Running baseline `SVD` model

In [13]:
# Baseline: Surprise's built-in SVD, trained for a single epoch.
svd_params = {
    "n_factors": 150,
    "lr_all": 0.05,
    "reg_all": 0.01,
    "n_epochs": 1,
    "verbose": True,
}
svd = surprise.SVD(**svd_params)
svd.fit(trainset)

Processing epoch 0


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14e3e2fd0>

In [14]:
# generate a random rating for a user given a product
# NOTE(review): assumes both maps are keyed 0..len-1 - confirm.
random_user_index = np.random.randint(0, len(train_user_index_map))
random_prod_index = np.random.randint(0, len(train_prod_index_map))
# The user must be looked up with the *user* index; using the product index
# here left `random_user_index` unused.
random_user = train_user_index_map[random_user_index]
random_prod = train_prod_index_map[random_prod_index]

# Surprise's estimate() works on inner ids, so translate the raw ids once.
inner_uid = trainset.to_inner_uid(random_user)
inner_iid = trainset.to_inner_iid(random_prod)

print(inner_uid)
print(inner_iid)

print(f"Estimated rating is: {svd.estimate(inner_uid, inner_iid)}")

281666
19448
Estimated rating is: 4.9303075092797854


In [None]:
# 3-fold cross-validation of the SVD baseline.
surprise.model_selection.cross_validate(
    algo=svd,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

#### Defining matrix factorization with `regularisation` and `biases`

In [None]:
# TODO: check if I can numba this class with @jitclass or @jit
class EmbeddedMF(surprise.AlgoBase):
    """Biased, L2-regularised matrix factorisation initialised from D2V embeddings.

    Latent factors of users and items start from the aggregated Doc2Vec
    embedding vectors, which allows `P` and `Q` to be created without random
    initialisation. The prediction is
    `global_mean + bias_u + bias_i + <P_u, Q_i>`.
    Based on the idea in https://doi.org/10.1145/3383313.3412207, where the
    latent factors are initialised from NMF topic vectors.

    Args:
        user_map (dict): embedding-row index -> raw user id.
        item_map (dict): embedding-row index -> raw item id.
        user_factor (np.ndarray): user embeddings, addressed by `user_map` index.
        item_factor (np.ndarray): item embeddings, addressed by `item_map` index.
        learning_rate (float): SGD step size.
        beta (float): L2 regularisation weight applied to biases and factors.
        num_epochs (int): number of passes over the training ratings.
        num_factors (int): number of leading embedding columns used as factors.
    """

    def __init__(
        self,
        user_map,
        item_map,
        user_factor,
        item_factor,
        learning_rate,
        beta,
        num_epochs,
        num_factors
    ):
        surprise.AlgoBase.__init__(self)
        # Invert the maps so a raw id can be resolved to its embedding row.
        self.user_map = {v: k for k, v in user_map.items()}
        self.item_map = {v: k for k, v in item_map.items()}
        self.user_embedding = user_factor
        self.item_embedding = item_factor
        self.alpha = learning_rate
        self.beta = beta
        self.num_epochs = num_epochs
        self.num_factors = num_factors

    def fit(self, train):
        """Train biases and factors on a Surprise trainset and return `self`.

        `P`, `Q` and both bias vectors are laid out by the trainset's inner ids
        (row `k` belongs to inner id `k`). This keeps `estimate`, which is
        called with inner ids, consistent with training, and it leaves the
        embeddings passed to the constructor unmodified so that every fit (and
        every cross-validation fold) starts from the same initial vectors.

        Raises:
            KeyError: if a raw id of the trainset is missing from the maps.
        """
        surprise.AlgoBase.fit(self, train)

        # inner id -> embedding row, resolved through the raw ids
        user_rows = np.array(
            [self.user_map[train.to_raw_uid(u)] for u in train.all_users()],
            dtype=np.intp,
        )
        item_rows = np.array(
            [self.item_map[train.to_raw_iid(i)] for i in train.all_items()],
            dtype=np.intp,
        )

        # Integer-array indexing copies the selected rows; only the first
        # `num_factors` columns are kept so fit and estimate use the same width.
        k = self.num_factors
        P = np.asarray(self.user_embedding, dtype=np.float64)[user_rows, :k]
        Q = np.asarray(self.item_embedding, dtype=np.float64)[item_rows, :k]
        bias_u = np.zeros(train.n_users)
        bias_i = np.zeros(train.n_items)
        bias_global = train.global_mean

        alpha, beta = self.alpha, self.beta
        for _ in tqdm(range(self.num_epochs)):
            for u, i, r_ui in train.all_ratings():
                # snapshot both vectors: each update must use the other's old value
                p_u = P[u].copy()
                q_i = Q[i].copy()

                # compute current error
                err = r_ui - (bias_global + bias_u[u] + bias_i[i] + np.dot(p_u, q_i))

                # update biases
                bias_u[u] += alpha * (err - beta * bias_u[u])
                bias_i[i] += alpha * (err - beta * bias_i[i])

                # update user and item latent feature vectors
                P[u] += alpha * (err * q_i - beta * p_u)
                Q[i] += alpha * (err * p_u - beta * q_i)

        self.P = P
        self.Q = Q
        self.bias_u = bias_u
        self.bias_i = bias_i
        return self

    def estimate(self, u, i, clip=True):
        """Returns estimated rating for user u, and item i (inner ids).

           Unknown users/items contribute no bias and no factor term, so a
           fully unknown pair yields the global mean. With `clip=True` the
           result is limited to the trainset's rating scale.

           Prerequisite: Algorithm must be fit to training set.
        """
        known_user = self.trainset.knows_user(u)
        known_item = self.trainset.knows_item(i)

        est = self.trainset.global_mean

        if known_user:
            est += self.bias_u[u]

        if known_item:
            est += self.bias_i[i]

        if known_user and known_item:
            est += np.dot(self.P[u, :], self.Q[i, :])

        if clip:
            min_rating, max_rating = self.trainset.rating_scale
            est = max_rating if est > max_rating else est
            est = min_rating if est < min_rating else est

        return est

In [None]:
# Biased, regularised MF seeded with the Doc2Vec factors.
emf = EmbeddedMF(
    user_map=train_user_index_map,
    item_map=train_prod_index_map,
    user_factor=user_factors,
    item_factor=prod_factors,
    learning_rate=0.05,
    beta=0.01,
    num_epochs=5,
    num_factors=150,
)

In [None]:
# Train on the full trainset; takes roughly 10 minutes per epoch.
emf.fit(trainset)

In [None]:
# 3-fold cross-validation of the regularised model (~30mins per cv-fold)
surprise.model_selection.cross_validate(
    algo=emf,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

In [None]:
# generate a random rating for a user given a product
# NOTE(review): assumes both maps are keyed 0..len-1 - confirm.
random_user_index = np.random.randint(0, len(train_user_index_map))
random_prod_index = np.random.randint(0, len(train_prod_index_map))
# The user must be looked up with the *user* index; using the product index
# here left `random_user_index` unused.
random_user = train_user_index_map[random_user_index]
random_prod = train_prod_index_map[random_prod_index]

# Surprise's estimate() works on inner ids, so translate the raw ids once.
inner_uid = trainset.to_inner_uid(random_user)
inner_iid = trainset.to_inner_iid(random_prod)

print(f"User: {random_user}, {inner_uid}")
print(f"Product: {random_prod}, {inner_iid}")

print(f"Estimated rating is: {emf.estimate(inner_uid, inner_iid)}")
print(f"Estimated rating is: {emf.estimate(5000000, 5000000000)}\n")    # not available, hence return global mean
# Mean observed rating of the sampled product, for comparison with the estimate.
# NOTE(review): needs `train` to still carry the `title` and `brand` columns.
train[train['asin'] == random_prod][['asin', 'title', 'brand', 'overall',]].groupby(['asin', 'title', 'brand']).agg({'overall': 'mean'})

### Generating top-N recommendations

In [15]:
def retrieve_top_n(predictions, n=5):
    """Return the `n` highest-estimated items for every user.

    Args:
        predictions: iterable of 5-tuples `(uid, iid, true_r, est, details)`.
        n (int): maximum number of items kept per user.

    Returns:
        defaultdict(list): `uid -> [(iid, est), ...]`, sorted by `est` in
        descending order and truncated to `n` entries.
    """
    # first map the predictions to the user
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # sort predictions for each user and retrieve the n highest ones
    # (re-assigning existing keys while iterating is safe: the dict size is unchanged)
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [49]:
# get total unique items in train set
uniq_train_items = set(train['asin'])
# load test users
uniq_test_users = set(test['reviewerID'])

# NOTE(review): zip() pairs the two sets element by element and stops at the
# shorter one, so each row is an arbitrary (user, item) pairing rather than a
# user-item cross product - confirm this is what is wanted here.
pd.DataFrame(list(zip(uniq_test_users, uniq_train_items)), columns=['reviewerID', 'asin'])

Unnamed: 0,reviewerID,asin
0,A1UUUZI6UL8LR,B00APXK61A
1,A3RSKVYE1PV3M4,B0043M4MPK
2,A1BHQ6GAA60YPB,B00943971M
3,A2OK32J1KDQ7F7,B00GL9WKE8
4,A10N7L0GMRODUO,B0055Q43XO
...,...,...
104531,AW1PCGDQSGC1B,B00B2PNHWK
104532,AMIP1FQNUYZ9S,B00070QE1W
104533,A1LR49AAUE0JIS,B00398FWLW
104534,AQPR0YGOZ6NXZ,B01CNDFS5I


In [None]:
# Every (test user, train item) combination. The full cross product has
# len(uniq_test_users) * len(uniq_train_items) entries, far too many to hold
# in a list, so the pairs are produced lazily and only a small preview is shown.
candidate_pairs = (
    (user, item)
    for user in uniq_test_users
    for item in uniq_train_items
)
# `range` comes first in zip() so no extra pair is pulled off the generator.
[pair for _, pair in zip(range(5), candidate_pairs)]