In [32]:
from pathlib import Path
import pickle
import warnings

import gensim
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from surprise import Dataset, Reader
from tqdm import tqdm

from src.models import cf, evaluate_model, lda
from src.utilities import utilities

tqdm.pandas()
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# global variables
DATA_PATH = "data/evaluation"
CATEGORY = "Pet_Supplies"
MODEL_PATH = Path(f"models/saved_lda/{CATEGORY}_lda.model")

# LDA parameters
EPOCHS = 10

# training parameters
N_EPOCHS = 5
LR_ALL = 0.005
BETA = 0.1

# reproducibility checks
SEED = 42
np.random.seed(42)

train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [3]:
# checking train dataframe
train.head().append(train.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A14CK12J7C7JRK,3.0,I purchased the Trilogy with hoping my two cat...,2011-01-12,purchase trilogy hop cat age interested yr old...
1,2,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2CR37UY3VR7BN,4.0,I bought the triliogy and have tested out all ...,2012-12-19,buy triliogy test dvd appear volume receive re...
2,3,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2A4COGL9VW2HY,4.0,My female kitty could care less about these vi...,2011-05-12,female kitty care video care little male dig a...
3,4,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2UBQA85NIGLHA,3.0,"If I had gotten just volume two, I would have ...",2012-03-05,volume star trilogy star read review know vol ...
4,5,B00005MF9U,LitterMaid LM900 Mega Self-Cleaning Litter Box,"['Pet Supplies', 'Cats', 'Litter &amp; Housebr...",A2BH04B9G9LOYA,1.0,"First off, it seems that someone is spamming t...",2006-12-31,spamming review glow reviewer review amazon ba...
68865,111581,B00K3YPOO0,Brightest Black Light Flashlight on Amazon- UV...,[],A11J1FHCK5U06J,4.0,Now I know exactly where the trouble spots are...,2014-05-23,know exactly trouble spot sniffing guess invis...
68866,111585,B00K3YPOO0,Brightest Black Light Flashlight on Amazon- UV...,[],A18JF0T0GOCORW,4.0,I use this light to help me find stains when I...,2014-05-24,use light help stain carpet clean pre treat ca...
68867,111595,B00K7EG97C,Nutro Crunchy Dog Treats with Real Mixed Berri...,"['Pet Supplies', 'Dogs', 'Treats', 'Cookies, B...",A3GRPCW9DG427Z,5.0,We are owned by the 3 pickiest pooches in the ...,2013-07-27,pickiest pooch world love fool reject doggie t...
68868,111598,B00K7EG97C,Nutro Crunchy Dog Treats with Real Mixed Berri...,"['Pet Supplies', 'Dogs', 'Treats', 'Cookies, B...",A2X6TLAX3JEO1A,5.0,My highly allergic white boxer loves these tre...,2014-05-09,highly allergic white boxer love treat meat co...
68869,111602,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A9PG9ODPPP31N,5.0,Works great on my medium sized dog. She has ve...,2014-07-09,work great medium size dog coarse hair work gr...


# Preparing Topic Vectors [Train/Load]

In [4]:
# # generating tokenized reviews
# processed_reviews = train["processedReviewText"].apply(lambda x: x.split())

In [5]:
# # instantiate lda model
# lda_model = lda.LDA(reviews=processed_reviews, n_epochs=EPOCHS)

In [6]:
# %%time
# # training the LDA model
# lda_model.train()

In [7]:
# # save model
# pickle.dump(lda_model, open(MODEL_PATH, "wb"))

## Load Trained LDA Model

In [8]:
lda_model = pickle.load(open(MODEL_PATH, "rb"))

# Generating User/Item Topic Vectors

In [9]:
user_idx_map, user_vecs, item_idx_map, item_vecs = utilities.generate_user_item_vectors(lda_model, train)

100%|████████████████████████████████████████████| 19058/19058 [00:14<00:00, 1336.99it/s]
100%|███████████████████████████████████████████████| 4878/4878 [00:05<00:00, 909.56it/s]


In [10]:
# converting factors into numpy obj
user_factors = user_vecs.to_numpy()
item_factors = item_vecs.to_numpy()

In [11]:
# check user factors
user_factors[0,:]

array([0.00036381, 0.00036381, 0.00036381, 0.00036381, 0.14345573,
       0.00036381, 0.00036381, 0.00036381, 0.00036381, 0.00036381,
       0.00036381, 0.00036381, 0.00036381, 0.14020865, 0.00036381,
       0.00036381, 0.00036381, 0.16489097, 0.00036381, 0.24272475,
       0.08820931, 0.10163331, 0.00036381, 0.00036381, 0.00036381,
       0.00036381, 0.00036381, 0.00036381, 0.00036381, 0.00036381,
       0.00036381, 0.00036381, 0.00036381, 0.00036381, 0.00036381,
       0.00036381, 0.00036381, 0.00036381, 0.00036381, 0.00036381,
       0.00036381, 0.00036381, 0.10323327, 0.00036381, 0.00036381,
       0.00036381, 0.00036381, 0.00036381, 0.00036381, 0.00036381],
      dtype=float32)

In [12]:
# check item factors
item_factors[0,:]

array([9.5528401e-02, 1.4507551e-04, 1.4507551e-04, 1.4507551e-04,
       1.4507551e-04, 1.4507551e-04, 1.4507551e-04, 1.4507551e-04,
       1.4507551e-04, 2.3869988e-02, 1.4507551e-04, 1.4507551e-04,
       1.4507551e-04, 1.4507551e-04, 1.4507551e-04, 1.4507551e-04,
       2.5293065e-02, 1.4507551e-04, 5.4683823e-02, 1.4507551e-04,
       1.4507551e-04, 1.2912405e-01, 1.4507551e-04, 1.4507551e-04,
       1.4507551e-04, 1.4507551e-04, 4.9176782e-02, 1.8134082e-02,
       4.2473695e-01, 1.4507551e-04, 1.4507551e-04, 1.4507551e-04,
       1.4507551e-04, 1.4507551e-04, 1.4507551e-04, 7.6067477e-02,
       7.1464084e-02, 1.4507551e-04, 1.4507551e-04, 1.4507551e-04,
       1.4507551e-04, 1.4507551e-04, 1.4507551e-04, 1.4507551e-04,
       1.4507551e-04, 1.4507551e-04, 1.4507551e-04, 2.6263399e-02,
       1.4507551e-04, 1.4507551e-04], dtype=float32)

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [13]:
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [14]:
test.head().append(test.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,1,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A39QHP5WLON5HV,5.0,There are usually one or more of my cats watch...,2013-09-14,usually cat watch tv stay trouble dvd play lik...
1,104,B00005MF9V,LitterMaid Universal Cat Privacy Tent (LMT100),"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A366V0GCEPH5CX,5.0,My cats love it and so do I. I no longer have ...,2013-02-02,cat love longer cat litter fly floor litter fl...
2,133,B00005MF9T,LitterMaid LM500 Automated Litter Box,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",ALWWS8QBYN80B,1.0,I have one female cat that weighs under 10 pou...,2004-11-17,female cat weigh pound year old use everclean ...
3,153,B00005MF9W,LitterMaid Waste Receptacles Automatic Litter ...,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A3PVI3NE7OY1SP,5.0,I love these. They make the clean up so much e...,2013-09-26,love clean easy clean box manually use issue w...
4,154,B00005MF9W,LitterMaid Waste Receptacles Automatic Litter ...,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A2H83XMHUVDLJY,4.0,"I love this litter box. I do not use the lids,...",2014-06-26,love litter box use lid use receptacle tear cr...
41564,111601,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],AV34KNYW82YSS,4.0,Pulled lots of hair out of my Labs coat. Didn'...,2014-07-18,pulled lot hair labs coat think prove wrong co...
41565,111603,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A1YMNTFLNDYQ1F,5.0,I have been trying to find a rubber bristle br...,2014-07-16,try rubber bristle brush persian year lose glo...
41566,111604,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A1FQ3HRVXA4A5B,5.0,Great product to use on your pets knowing this...,2014-07-11,great product use pet know gentle rubber damag...
41567,111605,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A3OP6CI0XCRQXO,5.0,I bought a second one because I have two cats ...,2014-07-22,buy second cat american short hair buy brush m...
41568,111606,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A11LC938XF35XN,5.0,Our dogs love getting brushed with this. It m...,2014-07-17,dog love brush massage remove heavy undercoat ...


In [15]:
# generating test history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [16]:
print(test_user_history)

                  reviewerID                                  asin
0      A04173782GDZSQ91AJ7OD              [B0090Z9AYS, B00CPDWT2M]
1      A042274212BJJVOBS4Q85              [B005AZ4M3Q, B00771WQIY]
2       A0436342QLT4257JODYJ  [B0018CDR68, B003SJTM8Q, B00474A3DY]
3      A04795073FIBKY8GSLZYI              [B001PKT30M, B005DGI2RY]
4      A06658082A27F4VB5UG8E              [B000TZ1TTM, B0019VUHH0]
...                      ...                                   ...
18993          AZYJE40XW6MFG              [B00HVAKJZS, B00IDZT294]
18994          AZZ56WF4X19G2                          [B004A7X218]
18995          AZZNK89PXD006  [B0002DHV16, B005BP8MQ8, B009RTX4SU]
18996          AZZV9PDNMCOZW              [B007EQL390, B00ISBWVT6]
18997          AZZYW4YOE1B6E  [B0002AQPA2, B0002AQPA2, B0002ARQV4]

[18998 rows x 2 columns]


## Preparing Dataset for Surprise's Algorithm

In [17]:
# create reader
reader = Reader(rating_scale=(1,5))
# generate data required for surprise
data = Dataset.load_from_df(train[["reviewerID", "asin", "overall"]], reader)
# generating trainset
trainset = data.build_full_trainset()

## Instantiate Pre-Initialised Matrix Factorization (Topic Modelling)

In [18]:
# instantiating ti_mf
ti_mf = cf.PreInitialisedMF(user_map=user_idx_map,
                            item_map=item_idx_map,
                            user_factor=user_factors,
                            item_factor=item_factors,
                            learning_rate=LR_ALL,
                            beta=BETA,
                            num_epochs=N_EPOCHS,
                            num_factors=50)

In [19]:
%%time
# fitting to training data
ti_mf.fit(trainset, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
CPU times: user 2min 54s, sys: 898 ms, total: 2min 55s
Wall time: 2min 56s


In [20]:
%%time
# generate candidate items for user to predict rating
testset = trainset.build_anti_testset()

CPU times: user 45.9 s, sys: 2.05 s, total: 48 s
Wall time: 48.4 s


In [21]:
%%time
# predict ratings for all pairs (u, i) that are NOT in the training set
candidate_items = ti_mf.test(testset, verbose=False)

CPU times: user 11min 46s, sys: 4min 6s, total: 15min 53s
Wall time: 17min 18s


## Save Model
* Not adviced to save model due to long persistence time required to save model.

## Loop through N = {10, 25, 30, 45}

In [22]:
# generate item popularity
item_popularity = evaluate_model.generate_item_popularity(train)

In [23]:
n_recommendations = {}
for n in [10, 25, 30, 45]:
    # retrieve the top-n items based on similarities
    # top_ns = get_top_n(candidate_items, n)
    top_ns = ti_mf.get_top_n(candidate_items, n)
    # evaluate how well the recommended items predicted the future purchases
    n_recommended_items = (evaluate_model.
                           evaluate_recommendations(model_name = 'TI-MF',
                                                    top_ns = top_ns,
                                                    user_rating_history = test_user_history, 
                                                    item_popularity = item_popularity, 
                                                    n = n,
                                                    mf_based = True))
    # saving the n-value and recommended items
    n_recommendations[n] = (top_ns, n_recommended_items)

The TI-MF has an average recall@10: 0.00498, average novelty@10: 0.92980
The TI-MF has an average recall@25: 0.01086, average novelty@25: 0.93145
The TI-MF has an average recall@30: 0.01300, average novelty@30: 0.93102
The TI-MF has an average recall@45: 0.01919, average novelty@45: 0.93271


# Evaluate N-Recommendations

## N=10

In [24]:
top_ns_10 = n_recommendations[10][0]
utilities.retrieve_recommendations(train, top_ns_10, mf_based=True)

For user: A3T87QAUUPTMZK:
Purchase History:
             asin                                              title
43366  B0010OSIHW  Zoo Med Eco Earth Compressed Coconut Fiber Sub...
45100  B00167VVP4   Zoo Med Eco Earth Loose Coconut Fiber Substra...
46998  B0019IJXD2       Zoo Med Reptile Fogger Terrarium Humidifier 

Recommending:

         asin                                              title
0  B000MLHDS4  Wellness Pure Rewards Natural Grain Free Dog T...
1  B000255P9E                          Seachem Neutral Regulator
2  B001HN5Z4K  Bit-O-Luv Bistro Beef Recipe Dog Treats, 4.0-O...
3  B000ILEIUE  Blue Dog Bakery | Dog Treats | All-Natural | P...
4  B0012KB4D4  Purina Friskies Gravy Sensations Wet Cat Food ...
5  B000MLG4K2  Wellness Wellbites Soft Natural Dog Treats, Tu...
6  B000I82DU4                 Milk-Bone Flavor Snacks Dog Treats
7  B0002QX3SS          Bamboo  Combat Extreme Flying-Cow Dog Toy
8  B000N5Z5YI                  Merrick Lamb Hold EMS Filets 8 Oz
9  B00BC3ZB2G 

## N=25

In [25]:
top_ns_25 = n_recommendations[25][0]
utilities.retrieve_recommendations(train, top_ns_25, mf_based=True)

For user: A1XEZIHQIUAOR1:
Purchase History:
             asin                                              title
1487   B000084E6V                      Nylabone Dental Dinosaur Chew
1727   B000084E6V                      Nylabone Dental Dinosaur Chew
28789  B0006VMN4O  Pioneer Pet SmartCat Peek-A-Prize Toy Box with...
37658  B000JZ1WSU                     SmartCat 3836 Tick Tock Teaser
42603  B000XZDV44  Hill'S Science Diet Kitten Savory Salmon Entre...
43867  B0012KCUOG  Whisker Lickin'S Soft &amp; Delicious Chicken ...

Recommending:

          asin                                              title
0   B0002AQL5G  API REPLACEMENT TEST TUBES WITH CAPS For Any A...
1   B000HHSLEI                        Pet Stages Mini Jingle Cage
2   B000255MZG         API STRESS COAT Aquarium Water Conditioner
3   B0002DI1W4  CO2 Natural Plant System Bubble Counter with S...
4   B000YIYSH4  Acurel Premium Activated Filter Carbon Granule...
5   B0002563MM               Clear &amp; Flexible Air Line Tu

## N=30

In [26]:
top_ns_30 = n_recommendations[30][0]
utilities.retrieve_recommendations(train, top_ns_30, mf_based=True)

For user: A89LQAXW1IY6S:
Purchase History:
             asin                                              title
12819  B0002ARP2O  Marshall Ferret Deluxe Leisure Lounge, Pattern...
12825  B0002ARP2O  Marshall Ferret Deluxe Leisure Lounge, Pattern...
38837  B000MD3NLS  MidWest Homes for Pets Snap'y Fit Stainless St...
42683  B000Y8UNAU  Pro Select Fleece Cat Perch Covers - Comfortab...
46465  B0018CJZ32                         SmartCat Corner Litter Box

Recommending:

          asin                                              title
0   B00290K0C2                LitterLocker Refill Cartridge 10 pk
1   B001OQXEHK  Fresh Step Crystals, Premium Cat Litter, Scent...
2   B001U8FOES                       Curvations Litter Scoop Size
3   B004U8Z2YW  Arm &amp; Hammer Double Duty Clumping Litter, ...
4   B008W8IC4I  Cat's Pride Fresh and Light Multi-Cat Premium ...
5   B0014CHDYO  Breeze Tidy Cat Litter Pads 16.9&quot;x11.4&qu...
6   B001HSMYSU   LitterMaid Ultimate Accessories Kit for Elite...

## N=45

In [27]:
top_ns_45 = n_recommendations[45][0]
utilities.retrieve_recommendations(train, top_ns_45, mf_based=True)

For user: A3C2ECIXEQ0YFQ:
Purchase History:
             asin                                              title
30751  B0009YS4P0  Nutri-Vet Hip &amp; Joint Extra Strength Chewa...
37067  B000IBRI2Y              Dog Dazer II Ultrasonic Dog Deterrent
40175  B000OV4VAU                        Nutri-Vet Alaska Salmon Oil
53623  B0029NQTI8  Pedigree Choice Cuts Variety Pack Lamb/Vegetab...

Recommending:

          asin                                              title
0   B000F4AVPA                                Chuckit! Ultra Ball
1   B001LNUKE6                Purebites Cheddar Cheese Dog Treats
2   B000255MZG         API STRESS COAT Aquarium Water Conditioner
3   B003JFRQQ4  Scaredy Cut Tiny Trim by Small Pet Grooming Sa...
4   B0002DJVQY   JW Pet Company Activitoys Triple Mirror Bird Toy
5   B0002AROVQ                         Marshall Ferret Litter Pan
6   B00006IX59                         Chuckit! Dog Ball Launcher
7   B0006JKCN0              KONG Frog Dog Toy, Extra Small, Green
8

# Cross-Analysis for Cold-Start Users (<= 2 Purchased Items)

In [28]:
cold_start_users = utilities.generate_cold_start_users(train)

In [29]:
for n in tuple(zip([10, 25, 30, 45], [top_ns_10, top_ns_25, top_ns_30, top_ns_45])):
    cold_start_top_ns = dict(filter(lambda x: x[0] in cold_start_users, n[1].items()))
    # evaluate how well the recommended items predicted the future purchases
    # on cold start users
    n_recommended_items = (evaluate_model.
                           evaluate_recommendations(model_name = 'TI-MF',
                                                    top_ns = cold_start_top_ns,
                                                    user_rating_history = test_user_history, 
                                                    item_popularity = item_popularity, 
                                                    n = n[0],
                                                    mf_based = True))

The TI-MF has an average recall@10: 0.00452, average novelty@10: 0.92967
The TI-MF has an average recall@25: 0.01138, average novelty@25: 0.93145
The TI-MF has an average recall@30: 0.01367, average novelty@30: 0.93100
The TI-MF has an average recall@45: 0.02107, average novelty@45: 0.93273


# Generating Recommended Items DataFrame

In [30]:
max_recommendations = (utilities
                       .generate_recommendations_df(
                           train, 
                           n_recommendations, 
                           "TI-MF", mf_based=True))

In [31]:
max_recommendations

Unnamed: 0,reviewerID,item_rank,asin,algorithm,title
0,A04173782GDZSQ91AJ7OD,0,B000255MZG,TI-MF,API STRESS COAT Aquarium Water Conditioner
1,A04173782GDZSQ91AJ7OD,1,B00025YUR2,TI-MF,Marineland Magnum Dual Purpose Canister Filter
2,A04173782GDZSQ91AJ7OD,2,B0002DJVQY,TI-MF,JW Pet Company Activitoys Triple Mirror Bird Toy
3,A04173782GDZSQ91AJ7OD,3,B000F4AVPA,TI-MF,Chuckit! Ultra Ball
4,A04173782GDZSQ91AJ7OD,4,B003JFRQQ4,TI-MF,Scaredy Cut Tiny Trim by Small Pet Grooming Sa...
...,...,...,...,...,...
857605,AZZYW4YOE1B6E,40,B001OCJXRO,TI-MF,Forever Litter Tray Reusable Replacement for S...
857606,AZZYW4YOE1B6E,41,B0002AROVQ,TI-MF,Marshall Ferret Litter Pan
857607,AZZYW4YOE1B6E,42,B0002AQSAY,TI-MF,Premium Choice All Natural Unscented Scoopable...
857608,AZZYW4YOE1B6E,43,B001B4H6HY,TI-MF,Pet Buddies PB6510 Cat Litter Buster Mat 18&qu...


# Store in `SQLite` DB

In [33]:
engine = create_engine("sqlite:///recommender.db", echo=True)

In [34]:
max_recommendations.to_sql(f"{CATEGORY}", con=engine, if_exists="append")

2021-09-29 02:19:52,591 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Pet_Supplies")
2021-09-29 02:19:52,597 INFO sqlalchemy.engine.Engine [raw sql] ()
2021-09-29 02:19:52,889 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-29 02:19:56,411 INFO sqlalchemy.engine.Engine INSERT INTO "Pet_Supplies" ("index", "reviewerID", item_rank, asin, algorithm, title) VALUES (?, ?, ?, ?, ?, ?)
2021-09-29 02:19:56,412 INFO sqlalchemy.engine.Engine [generated in 2.88443s] ((0, 'A04173782GDZSQ91AJ7OD', 0, 'B000255MZG', 'TI-MF', 'API STRESS COAT Aquarium Water Conditioner'), (1, 'A04173782GDZSQ91AJ7OD', 1, 'B00025YUR2', 'TI-MF', 'Marineland Magnum Dual Purpose Canister Filter'), (2, 'A04173782GDZSQ91AJ7OD', 2, 'B0002DJVQY', 'TI-MF', 'JW Pet Company Activitoys Triple Mirror Bird Toy'), (3, 'A04173782GDZSQ91AJ7OD', 3, 'B000F4AVPA', 'TI-MF', 'Chuckit! Ultra Ball'), (4, 'A04173782GDZSQ91AJ7OD', 4, 'B003JFRQQ4', 'TI-MF', 'Scaredy Cut Tiny Trim by Small Pet Grooming Safety Scissor - 4.5&quot; Ea