In [1]:
from pathlib import Path
import pickle
import warnings

import gensim
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from tqdm import tqdm

from src.models import cf, evaluate_model, lda
from src.utilities import utilities

# register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects
tqdm.pandas()
# silence every warning category for the rest of the session
# NOTE(review): this blanket filter also hides pandas deprecation warnings — consider narrowing it
warnings.simplefilter("ignore")



# Load Data

In [2]:
# global variables
DATA_PATH = "data/evaluation"
CATEGORY = "Grocery_and_Gourmet_Food"
MODEL_PATH = Path(f"models/ti_mf/ti_mf_{CATEGORY}.pkl")

# LDA parameters
EPOCHS = 10  # handed to lda.LDA as `n_epochs`

# matrix-factorisation training parameters
N_EPOCHS = 15  # handed to cf.PreInitialisedMF as `num_epochs`
LR_ALL = 0.01  # handed to cf.PreInitialisedMF as `learning_rate`
BETA = 0.1  # handed to cf.PreInitialisedMF as `beta`

# reproducibility: seed NumPy's global RNG from the single SEED constant so the
# value cannot silently drift from the one declared here
SEED = 42
np.random.seed(SEED)

# read the training split for the selected category
train_csv = f"{DATA_PATH}/{CATEGORY}_train.csv"
train = pd.read_csv(train_csv)

In [3]:
# checking train dataframe: show the first and last five rows together.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the two
# slices are stacked with pd.concat instead.
pd.concat([train.head(), train.tail()])

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...
47769,77420,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A192LQZWDYPR4U,5.0,Another quality Reese Peanut Butter Cup produc...,2014-02-27,quality reese peanut butter cup product great ...
47770,77421,B00I33696K,Reese's Miniature Peanut Butter Cups .31oz - 1...,"['Grocery & Gourmet Food', 'Candy & Chocolate'...",A2QKXW3LDQ66P5,5.0,I purchased these for my husband who has every...,2013-02-20,purchase husband love reeses valentine day pre...
47771,77430,B00ID9VSOM,"Viva Labs Organic Coconut Sugar: Non-GMO, Low-...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2P3TGJU301KXD,5.0,this stuff is INCREDIBILY yummy! SO much bette...,2014-07-15,stuff incredibily yummy good regular brown sug...
47772,77456,B00IRL93SY,Barrie House Kenya Estate - AA Single Cup Caps...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AEFE9VDHTQ199,5.0,"Very nice aroma, body and taste! Will buy this...",2014-05-24,nice aroma body taste buy coffee good coffee a...
47773,77508,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2AEZQ3DGBBLPR,2.0,This is a no go for diabetics according to my ...,2014-06-26,diabetic accord wife doctor order intention us...


# Preparing Topic Vectors

In [4]:
# split each pre-processed review on whitespace to obtain its token list
# NOTE(review): assumes `processedReviewText` has no missing values — confirm for other categories
processed_reviews = train["processedReviewText"].map(str.split)

In [5]:
# build the project's LDA wrapper over the tokenized training reviews
lda_model = lda.LDA(
    reviews=processed_reviews,
    n_epochs=EPOCHS,
)

In [6]:
%%time
# training the LDA model on the tokenized reviews it was constructed with;
# the %%time cell magic reports the CPU and wall-clock cost of this step
lda_model.train()



CPU times: user 1min 27s, sys: 8.66 s, total: 1min 35s
Wall time: 1min 36s


# Generating User/Item Topic Vectors

In [7]:
# derive the user and item topic vectors, plus their index maps, from the
# trained LDA model and the training reviews
(user_idx_map, user_vecs,
 item_idx_map, item_vecs) = utilities.generate_user_item_vectors(lda_model, train)

100%|████████████████████████████████████████████| 13397/13397 [00:09<00:00, 1431.73it/s]
100%|███████████████████████████████████████████████| 4729/4729 [00:04<00:00, 964.75it/s]


In [8]:
# materialise the user/item topic vectors as NumPy arrays
user_factors, item_factors = user_vecs.to_numpy(), item_vecs.to_numpy()

In [9]:
# inspect the topic vector of the first user (row 0)
user_factors[0]

array([0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.12503904,
       0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 ,
       0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 ,
       0.0013339 , 0.09556343, 0.0013339 , 0.0013339 , 0.0013339 ,
       0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.4521291 ,
       0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 ,
       0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 ,
       0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 ,
       0.0013339 , 0.0013339 , 0.0013339 , 0.2659092 , 0.0013339 ,
       0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 , 0.0013339 ],
      dtype=float32)

In [10]:
# inspect the topic vector of the first item (row 0)
item_factors[0]

array([4.4113447e-05, 1.7057571e-01, 4.4113447e-05, 4.4113447e-05,
       7.9999510e-03, 2.4941336e-02, 4.4113447e-05, 4.4113447e-05,
       4.4113447e-05, 4.4113447e-05, 8.2395216e-03, 4.4113447e-05,
       1.8790852e-01, 4.4113447e-05, 2.0059188e-01, 3.9089434e-03,
       1.3248248e-02, 4.4113447e-05, 4.4113447e-05, 4.4113447e-05,
       4.4113447e-05, 4.4113447e-05, 4.4113447e-05, 1.7656349e-02,
       4.4113447e-05, 4.1113984e-02, 4.4113447e-05, 4.4113447e-05,
       3.0868517e-02, 4.4113447e-05, 4.4113447e-05, 4.4113447e-05,
       4.4113447e-05, 4.4113447e-05, 1.8986599e-01, 1.8907048e-02,
       5.6289136e-02, 4.4113447e-05, 4.4113447e-05, 4.4113447e-05,
       4.4113447e-05, 4.4113447e-05, 4.4113447e-05, 4.4113447e-05,
       2.6340883e-02, 4.4113447e-05, 4.4113447e-05, 4.4113447e-05,
       4.4113447e-05, 4.4113447e-05], dtype=float32)

# Utility Functions

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [11]:
# read the held-out test split for the selected category
test_csv = f"{DATA_PATH}/{CATEGORY}_test.csv"
test = pd.read_csv(test_csv)

In [12]:
# show the first and last five rows of the test split together.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the two
# slices are stacked with pd.concat instead.
pd.concat([test.head(), test.tail()])

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,2,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1TCSC0YWT82Q0,5.0,I love ethnic foods and to cook them. I recent...,2013-08-03,love ethnic food cook recently purchase produc...
1,8,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A1Z7Y2GMAP9SRY,5.0,I like to make my own curry but this is a tast...,2014-06-27,like curry tasty alternative use base kind dif...
2,23,B00004S1C5,"Ateco Food Coloring Kit, 6 colors","['Grocery & Gourmet Food', 'Cooking & Baking',...",A14YSMLYLJEMET,1.0,This product is no where near natural / organi...,2013-03-29,product near natural organic wish review purch...
3,31,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",A2F488C4PLWGEI,5.0,If my wife drinks a cup of this tea when she f...,2014-03-23,wife drink cup tea feel attack come help avoid...
4,32,B00005344V,Traditional Medicinals Organic Breathe Easy Se...,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",AO1HXV7DWZZIR,5.0,I don't know about the medicinal aspects of th...,2014-02-06,know medicinal aspect tea flavor downright scr...
28001,77519,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A1WT3TVHANP7ZF,3.0,Hmmm. I really wanted to love this sweetener. ...,2014-07-22,hmmm want love sweetener half sugar half stevi...
28002,77520,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A3NEAETOSXDBOM,5.0,"I confess I have a sweet tooth, and love the t...",2014-06-30,confess sweet tooth love taste sugar recognize...
28003,77521,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",AD1ZOPB0BBEHB,4.0,"It has a little of the stevia aftertaste, but ...",2014-07-17,little stevia aftertaste fair compromise able ...
28004,77522,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A18ECVX2RJ7HUE,5.0,i love marinade for grilled flank steak or lon...,2014-05-30,love marinade grilled flank steak london broil...
28005,77523,B00ISVHJ3Y,"Wholesome Sweeteners, Organic Sweet and Lite S...","['Grocery & Gourmet Food', 'Cooking & Baking',...",A2G04D4QZAXL15,3.0,I've been using Truvia (a form of stevia) on m...,2014-05-27,use truvia form stevia cereal greek yogurt yea...


In [13]:
# collect, for every reviewer in the test split, the list of items they reviewed
test_user_history = (
    test.groupby(["reviewerID"])["asin"]
    .apply(list)
    .reset_index()
)

In [14]:
# preview the first and last rows through pandas' rich display rather than
# print(), using the same head/tail preview as for the train and test frames
pd.concat([test_user_history.head(), test_user_history.tail()])

                  reviewerID  \
0      A00177463W0XWB16A9O05   
1      A022899328A0QROR32DCT   
2      A068255029AHTHDXZURNU   
3      A06944662TFWOKKV4GJKX   
4             A1004703RC79J9   
...                      ...   
13274          AZWRZZAMX90VT   
13275          AZXKAH2DE6C8A   
13276          AZXON596A1VXC   
13277          AZYXC63SS008M   
13278          AZZ5ASC403N74   

                                                    asin  
0                               [B00474OR8G, B00BFM6OAW]  
1                                           [B00CMQDKES]  
2                                           [B001FA1K2G]  
3                                           [B000GFYRHG]  
4                                           [B003GTR8IO]  
...                                                  ...  
13274  [B0007R9L4M, B000CN7BMA, B001EQ5D1K, B002VT3GX...  
13275   [B000MAK41I, B004X8TJP2, B006H34CUS, B007W14RMM]  
13276                           [B001EO5S0I, B00271QQ7Q]  
13277                    

## Preparing Dataset for Surprise's Algorithm

In [15]:
# Surprise needs a Reader describing the rating scale and a frame holding
# (user, item, rating) columns in that order
reader = Reader(rating_scale=(1, 5))
ratings_frame = train[["reviewerID", "asin", "overall"]]
data = Dataset.load_from_df(ratings_frame, reader)
# build the trainset from every row of the training split (no hold-out)
trainset = data.build_full_trainset()

## Instantiate Pre-Initialised Matrix Factorization (Topic Modelling)

In [16]:
# instantiating ti_mf with the LDA-derived user/item vectors as its factors.
# The factor count was hard-coded to 50, which is the length of the topic
# vectors displayed above; reading it from the arrays keeps the two in sync if
# the LDA topic count ever changes.
num_topics = user_factors.shape[1]
# sanity check: user and item vectors are expected to share one topic space
assert item_factors.shape[1] == num_topics, (
    "user and item topic vectors must have the same width"
)
ti_mf = cf.PreInitialisedMF(user_map=user_idx_map,
                            item_map=item_idx_map,
                            user_factor=user_factors,
                            item_factor=item_factors,
                            learning_rate=LR_ALL,
                            beta=BETA,
                            num_epochs=N_EPOCHS,
                            num_factors=num_topics)

In [17]:
%%time
# fitting to training data; verbose=True prints a "Processing epoch k" line per
# epoch, and %%time reports the total CPU and wall-clock cost of the fit
ti_mf.fit(trainset, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
CPU times: user 6min 16s, sys: 1.1 s, total: 6min 17s
Wall time: 6min 18s


In [18]:
%%time
# generate the candidate (user, item) pairs whose ratings will be predicted:
# the pairs that do NOT appear in the training set
# NOTE(review): presumably close to users x items pairs, hence the runtime — confirm
testset = trainset.build_anti_testset()

CPU times: user 31 s, sys: 1.41 s, total: 32.4 s
Wall time: 32.5 s


In [19]:
%%time
# predict ratings for all pairs (u, i) that are NOT in the training set;
# the predictions are later ranked per user by ti_mf.get_top_n
candidate_items = ti_mf.test(testset, verbose=False)

CPU times: user 8min 4s, sys: 1min 52s, total: 9min 56s
Wall time: 10min 33s


## Save Model
* Saving the model is not advised, because persisting it takes a long time.

In [20]:
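# NOTE: `dump` is not imported in this notebook (presumably `from surprise import dump`);
# add that import before re-enabling this cell.
# MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
# dump.dump(MODEL_PATH, algo=ti_mf)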

In [21]:
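# load model
# NOTE: `dump` is not imported in this notebook (presumably `from surprise import dump`);
# add that import before re-enabling this cell.
# _, ti_mf = dump.load(MODEL_PATH)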

## Loop through N = {10, 25, 30, 45}

In [22]:
# generate item popularity from the training split; the result is passed to
# evaluate_recommendations alongside each top-N list when the novelty@N metric is reported
item_popularity = evaluate_model.generate_item_popularity(train)

In [23]:
# For every cut-off N: rank the predicted candidates per user, score the
# resulting lists against the held-out test purchases, and keep both results.
n_recommendations = {}
for n in (10, 25, 30, 45):
    # top-n predicted items for every user
    top_ns = ti_mf.get_top_n(candidate_items, n)
    # recall / novelty of those lists against the test-period history
    n_recommended_items = evaluate_model.evaluate_recommendations(
        model_name="TI-MF",
        top_ns=top_ns,
        user_rating_history=test_user_history,
        item_popularity=item_popularity,
        n=n,
        mf_based=True,
    )
    # keyed by n: (per-user top-n lists, evaluation result)
    n_recommendations[n] = (top_ns, n_recommended_items)

The TI-MF has an average recall@10: 0.00200, average novelty@10: 0.96868
The TI-MF has an average recall@25: 0.00681, average novelty@25: 0.96706
The TI-MF has an average recall@30: 0.00887, average novelty@30: 0.96597
The TI-MF has an average recall@45: 0.01622, average novelty@45: 0.96422


# Evaluate N-Recommendations

## N=10

In [24]:
# top-10 lists: first element of the (top_ns, evaluation) tuple stored for N=10
top_ns_10 = n_recommendations[10][0]
# prints one user's purchase history from `train` followed by the items recommended to them
utilities.retrieve_recommendations(train, top_ns_10, mf_based=True)

For user: A28GRFVAA4DU5P:
Purchase History:
            asin                                              title
5655  B000EMK4GY  Betty Crocker Warm Delights, Molten Caramel Ca...
6126  B000EVMNMI        Haribo Gummi Candy, Happy-Cola, 5-Pound Bag
6165  B000EVOSDU  Haribo Gummy Candy, Licorice Allsorts, 6.6-Pou...
8370  B000G7TBSO  Snyder's of Hanover Pretzel Sticks, 10-Ounce P...

Recommending:

         asin                                              title
0  B000JMAXMY                   Mustard Seeds 7oz by Spicy World
1  B001PEWJWC  Garbanzo Beans aka Chickpeas or Ceci Beans | N...
2  B000EDBPO8  Bob's Red Mill White Rice Flour, Organic, 24-O...
3  B003WEC9PU  JOLLY RANCHER Hard Candy, Watermelon, 160 Coun...
4  B0029JHHO2  Ricochet Candies with Xylitol, Grape Escape, 1...
5  B000G82L62  Lundberg Family Farms Wild Blend Rice, 16 Ounc...
6  B001PF1846  Green Split Peas | Non-GMO Project Verified | ...
7  B000BD0SDU  REDMOND Real Sea Salt - Natural Unrefined Orga...
8  B000EDK774  

## N=25

In [25]:
# top-25 lists: first element of the (top_ns, evaluation) tuple stored for N=25
top_ns_25 = n_recommendations[25][0]
# prints one user's purchase history from `train` followed by the items recommended to them
utilities.retrieve_recommendations(train, top_ns_25, mf_based=True)

For user: AW81W6T30JG8N:
Purchase History:
             asin                                              title
6338   B000EZOP0C  SnackMasters Salmon Jerky Original, 2-Ounce Pa...
13839  B000ZSZ5S4  Blue Diamond Almonds, Bold Salt &amp; Vinegar,...

Recommending:

          asin                                              title
0   B0000IJYK4  Maseca Instant Yellow Corn Masa Flour 4.84lb |...
1   B000FDMLUO  Hodgson Mill Whole Wheat Wild Blueberry Muffin...
2   B000JMAXMY                   Mustard Seeds 7oz by Spicy World
3   B0016BS3BK  Ghirardelli Double Chocolate Brownie Mix, 18-O...
4   B004AFODLI  Kodiak Cakes All Natural Frontier Pancake, Fla...
5   B000EDK6FM  Bob's Red Mill Whole Grain Organic Quinoa Flou...
6   B000HDK0DC  YumEarth Organic Lollipops, Assorted Flavors, ...
7   B000EDM6KU  Bob's Red Mill Arrowroot Starch/Flour, 16-ounc...
8   B000EDG598  Bob's Red Mill Super Fine Almond Flour, 16 Oun...
9   B004KUV136  GF Harvest Gluten Free Organic Rolled Oats, 41...
10  B001

## N=30

In [26]:
# top-30 lists: first element of the (top_ns, evaluation) tuple stored for N=30
top_ns_30 = n_recommendations[30][0]
# prints one user's purchase history from `train` followed by the items recommended to them
utilities.retrieve_recommendations(train, top_ns_30, mf_based=True)

For user: A3EAAFGS0DU8R6:
Purchase History:
             asin                                              title
16531  B001ACMCNA  Chebe Bread Pizza Crust Mix, Gluten Free, 7.5-...
16562  B001ACNWUC   Chebe Bread Focaccia Flat Bread Mix, Gluten F...
34472  B004AHCGI8  Pomi Tomatoes, Strained, 26.46-Ounce Carton (P...

Recommending:

          asin                                              title
0   B000FDOSN2  Jelly Belly Jelly Beans, 49 Flavors, 2-lb Stan...
1   B000JMAXMY                   Mustard Seeds 7oz by Spicy World
2   B00135XQCK         Barilla Plus Penna Pasta~2pk~14.5oz Boxes~
3   B001PEWJWC  Garbanzo Beans aka Chickpeas or Ceci Beans | N...
4   B003S1WSD0  FiberGourmet Light Lasagna, 8-Ounce Boxes (Pac...
5   B001PF1846  Green Split Peas | Non-GMO Project Verified | ...
6   B000HJNAXI  RiceSelect Couscous Variety Pack, 26.5 oz, 4-C...
7   B000EDG4V2       Bob's Red Mill Guar Gum, 8 Ounce (Case of 8)
8   B004YN7XKY  Dreamfields Pasta Angel Hair, 13.25 Ounce Boxe...
9   

## N=45

In [27]:
# top-45 lists: first element of the (top_ns, evaluation) tuple stored for N=45
top_ns_45 = n_recommendations[45][0]
# prints one user's purchase history from `train` followed by the items recommended to them
utilities.retrieve_recommendations(train, top_ns_45, mf_based=True)

For user: AF5AJ5V9T6LTS:
Purchase History:
             asin                                              title
15909  B0016CMVSK  ST. DALFOUR Strawberry Conserves, 1 Ounce Jars...
37122  B004LWP5YE  Made In Nature Organic Club Pack, Raisin, 48-O...
40196  B005C3IVN8  Anderson's Pure Maple Syrup, Grade A Very Dark...

Recommending:

          asin                                              title
0   B000JMAXMY                   Mustard Seeds 7oz by Spicy World
1   B001E50UEQ  Hormel Compleats Chicken &amp; Rice, 6-10-Ounc...
2   B001VNEHXG  Frontier Co-op Organic White Onion Powder, 2 1...
3   B003WEC9PU  JOLLY RANCHER Hard Candy, Watermelon, 160 Coun...
4   B004VLVOJ0  Bob's Red Mill Organic Oats Whole Groats, 29 O...
5   B0029JHHO2  Ricochet Candies with Xylitol, Grape Escape, 1...
6   B001PEWJWC  Garbanzo Beans aka Chickpeas or Ceci Beans | N...
7   B000EDBPO8  Bob's Red Mill White Rice Flour, Organic, 24-O...
8   B0029JASWA  Dove Dark Chocolate Promises, 9.5-Ounce Packag...
9   B