In [1]:
from collections import defaultdict
import logging
from pprint import pprint

import gensim
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from tqdm import tqdm

tqdm.pandas()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



# Load Data

In [2]:
# global variable
CATEGORY = "Grocery_and_Gourmet_Food"
DATA_PATH = "data/evaluation"

In [3]:
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [4]:
train.head()

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...


# Preparing Review Text for LDA

In [5]:
# tokenization
processed_reviews = train["processedReviewText"].progress_apply(lambda x: x.split())

# creating a bag-of-words
dictionary = gensim.corpora.Dictionary(processed_reviews)

# filtering out tokens that appear in less than 15 reviews
# or more than 0.5 of the corpus
dictionary.filter_extremes(no_below=5, no_above=0.75)

# creating dict how many words and time the word appear
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_reviews]

100%|█████████████████████████████████████████████████████| 47774/47774 [00:00<00:00, 139790.53it/s]
2021-08-24 16:16:41,209 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-08-24 16:16:41,568 : INFO : adding document #10000 to Dictionary(14898 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-24 16:16:41,956 : INFO : adding document #20000 to Dictionary(21785 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-24 16:16:42,360 : INFO : adding document #30000 to Dictionary(27143 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-24 16:16:42,779 : INFO : adding document #40000 to Dictionary(31844 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-24 16:16:43,143 : INFO : built Dictionary(34875 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...) from 47774 documents (total 1942617 corpus positions)
2021-08-24 16:16:43,164 : INFO : Dictionary

# Training LDA Model

In [6]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=50, id2word=dictionary, passes=20, workers=8)

2021-08-24 16:16:44,632 : INFO : using symmetric alpha at 0.02
2021-08-24 16:16:44,633 : INFO : using symmetric eta at 0.02
2021-08-24 16:16:44,634 : INFO : using serial LDA version on this node
2021-08-24 16:16:44,674 : INFO : running online LDA training, 50 topics, 20 passes over the supplied corpus of 47774 documents, updating every 16000 documents, evaluating every ~47774 documents, iterating 50x with a convergence threshold of 0.001000
2021-08-24 16:16:44,676 : INFO : training LDA model using 8 processes
2021-08-24 16:16:49,427 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-24 16:16:49,441 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-24 16:16:49,443 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/47774, outstanding queue size 3
2021-08-24 16:16:49,445 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/47774, ou

2021-08-24 16:16:52,052 : INFO : topic diff=29.581743, rho=1.000000
2021-08-24 16:16:55,107 : INFO : -8.107 per-word bound, 275.7 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:16:55,224 : INFO : merging changes from 18000 documents into a model of 47774 documents
2021-08-24 16:16:55,262 : INFO : topic #8 (0.020): 0.016*"good" + 0.015*"like" + 0.015*"flavor" + 0.014*"coffee" + 0.012*"taste" + 0.012*"use" + 0.011*"product" + 0.010*"buy" + 0.008*"great" + 0.008*"love"
2021-08-24 16:16:55,264 : INFO : topic #49 (0.020): 0.015*"tea" + 0.015*"taste" + 0.014*"flavor" + 0.014*"like" + 0.013*"sauce" + 0.012*"honey" + 0.012*"good" + 0.012*"product" + 0.011*"use" + 0.009*"buy"
2021-08-24 16:16:55,266 : INFO : topic #25 (0.020): 0.018*"flavor" + 0.017*"good" + 0.012*"product" + 0.012*"like" + 0.011*"use" + 0.011*"taste" + 0.010*"try" + 0.008*"rice" + 0.006*"tea" + 0.006*"pasta"
2021-08-24 16:16:55,275 : INFO : topic #35 (0.020): 0.017*"good" + 0.01

2021-08-24 16:17:04,525 : INFO : topic #42 (0.020): 0.021*"taste" + 0.016*"oil" + 0.015*"like" + 0.014*"flavor" + 0.012*"good" + 0.011*"water" + 0.010*"coconut" + 0.010*"use" + 0.008*"fat" + 0.007*"product"
2021-08-24 16:17:04,527 : INFO : topic diff=0.507813, rho=0.196544
2021-08-24 16:17:06,928 : INFO : -7.693 per-word bound, 206.9 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:17:06,994 : INFO : merging changes from 15774 documents into a model of 47774 documents
2021-08-24 16:17:07,017 : INFO : topic #48 (0.020): 0.028*"like" + 0.025*"taste" + 0.019*"drink" + 0.014*"good" + 0.012*"sugar" + 0.011*"use" + 0.011*"flavor" + 0.010*"product" + 0.009*"water" + 0.009*"try"
2021-08-24 16:17:07,018 : INFO : topic #49 (0.020): 0.036*"honey" + 0.016*"sauce" + 0.016*"flavor" + 0.016*"taste" + 0.013*"like" + 0.013*"use" + 0.011*"good" + 0.011*"mango" + 0.011*"product" + 0.010*"pork"
2021-08-24 16:17:07,019 : INFO : topic #18 (0.020): 0.020*"good" 

2021-08-24 16:17:16,682 : INFO : topic #13 (0.020): 0.031*"cereal" + 0.021*"like" + 0.021*"taste" + 0.015*"sugar" + 0.015*"good" + 0.011*"eat" + 0.011*"fiber" + 0.011*"flavor" + 0.009*"cinnamon" + 0.009*"calorie"
2021-08-24 16:17:16,683 : INFO : topic #20 (0.020): 0.051*"milk" + 0.016*"use" + 0.015*"taste" + 0.014*"like" + 0.013*"sugar" + 0.010*"good" + 0.009*"try" + 0.009*"little" + 0.009*"drink" + 0.008*"tea"
2021-08-24 16:17:16,684 : INFO : topic diff=0.585516, rho=0.192854
2021-08-24 16:17:17,776 : INFO : -7.485 per-word bound, 179.1 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:17:17,778 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-24 16:17:17,787 : INFO : PROGRESS: pass 3, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-24 16:17:17,788 : INFO : PROGRESS: pass 3, dispatched chunk #2 = documents up to #6000/47774, outstanding queue 

2021-08-24 16:17:27,176 : INFO : PROGRESS: pass 4, dispatched chunk #4 = documents up to #10000/47774, outstanding queue size 5
2021-08-24 16:17:27,177 : INFO : PROGRESS: pass 4, dispatched chunk #5 = documents up to #12000/47774, outstanding queue size 6
2021-08-24 16:17:27,185 : INFO : PROGRESS: pass 4, dispatched chunk #6 = documents up to #14000/47774, outstanding queue size 7
2021-08-24 16:17:27,186 : INFO : PROGRESS: pass 4, dispatched chunk #7 = documents up to #16000/47774, outstanding queue size 8
2021-08-24 16:17:27,187 : INFO : PROGRESS: pass 4, dispatched chunk #8 = documents up to #18000/47774, outstanding queue size 9
2021-08-24 16:17:27,188 : INFO : PROGRESS: pass 4, dispatched chunk #9 = documents up to #20000/47774, outstanding queue size 10
2021-08-24 16:17:27,197 : INFO : PROGRESS: pass 4, dispatched chunk #10 = documents up to #22000/47774, outstanding queue size 11
2021-08-24 16:17:27,198 : INFO : PROGRESS: pass 4, dispatched chunk #11 = documents up to #24000/4777

2021-08-24 16:17:36,217 : INFO : PROGRESS: pass 5, dispatched chunk #13 = documents up to #28000/47774, outstanding queue size 14
2021-08-24 16:17:36,233 : INFO : PROGRESS: pass 5, dispatched chunk #14 = documents up to #30000/47774, outstanding queue size 15
2021-08-24 16:17:36,236 : INFO : PROGRESS: pass 5, dispatched chunk #15 = documents up to #32000/47774, outstanding queue size 16
2021-08-24 16:17:36,240 : INFO : PROGRESS: pass 5, dispatched chunk #16 = documents up to #34000/47774, outstanding queue size 17
2021-08-24 16:17:36,242 : INFO : PROGRESS: pass 5, dispatched chunk #17 = documents up to #36000/47774, outstanding queue size 18
2021-08-24 16:17:36,245 : INFO : PROGRESS: pass 5, dispatched chunk #18 = documents up to #38000/47774, outstanding queue size 19
2021-08-24 16:17:36,273 : INFO : PROGRESS: pass 5, dispatched chunk #19 = documents up to #40000/47774, outstanding queue size 20
2021-08-24 16:17:36,300 : INFO : PROGRESS: pass 5, dispatched chunk #20 = documents up to 

2021-08-24 16:17:45,946 : INFO : PROGRESS: pass 6, dispatched chunk #21 = documents up to #44000/47774, outstanding queue size 22
2021-08-24 16:17:45,971 : INFO : PROGRESS: pass 6, dispatched chunk #22 = documents up to #46000/47774, outstanding queue size 23
2021-08-24 16:17:45,990 : INFO : PROGRESS: pass 6, dispatched chunk #23 = documents up to #47774/47774, outstanding queue size 24
2021-08-24 16:17:47,918 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-24 16:17:47,956 : INFO : topic #48 (0.020): 0.029*"taste" + 0.028*"like" + 0.020*"water" + 0.015*"product" + 0.013*"drink" + 0.012*"good" + 0.011*"bottle" + 0.011*"use" + 0.010*"flavor" + 0.009*"try"
2021-08-24 16:17:47,957 : INFO : topic #17 (0.020): 0.070*"chocolate" + 0.028*"candy" + 0.025*"bar" + 0.022*"caramel" + 0.021*"dark" + 0.016*"good" + 0.012*"flavor" + 0.012*"taste" + 0.011*"like" + 0.011*"piece"
2021-08-24 16:17:47,958 : INFO : topic #29 (0.020): 0.017*"buy" + 0.016*"good" + 0.016*"

2021-08-24 16:17:56,447 : INFO : topic #5 (0.020): 0.094*"salt" + 0.053*"popcorn" + 0.022*"taste" + 0.020*"good" + 0.017*"pop" + 0.017*"like" + 0.016*"use" + 0.016*"flavor" + 0.015*"sea" + 0.014*"great"
2021-08-24 16:17:56,448 : INFO : topic #10 (0.020): 0.080*"free" + 0.074*"gluten" + 0.025*"good" + 0.024*"bread" + 0.020*"tuna" + 0.017*"pasta" + 0.016*"taste" + 0.016*"like" + 0.015*"product" + 0.015*"eat"
2021-08-24 16:17:56,451 : INFO : topic diff=0.560817, rho=0.177090
2021-08-24 16:17:59,482 : INFO : -7.298 per-word bound, 157.4 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:17:59,601 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-24 16:17:59,649 : INFO : topic #34 (0.020): 0.032*"fat" + 0.023*"calorie" + 0.019*"high" + 0.019*"protein" + 0.017*"seed" + 0.014*"low" + 0.014*"consume" + 0.014*"food" + 0.011*"nutrient" + 0.011*"diet"
2021-08-24 16:17:59,654 : INFO : topic #43 (0.020): 0.055*"seed" +

2021-08-24 16:18:07,481 : INFO : topic #1 (0.020): 0.042*"add" + 0.024*"water" + 0.020*"minute" + 0.017*"chicken" + 0.016*"cook" + 0.014*"microwave" + 0.014*"soup" + 0.012*"stir" + 0.012*"use" + 0.011*"meal"
2021-08-24 16:18:07,491 : INFO : topic #38 (0.020): 0.031*"buy" + 0.026*"bag" + 0.019*"jar" + 0.018*"container" + 0.015*"open" + 0.015*"good" + 0.015*"use" + 0.014*"package" + 0.013*"plastic" + 0.012*"store"
2021-08-24 16:18:07,516 : INFO : topic #36 (0.020): 0.080*"coffee" + 0.078*"vanilla" + 0.043*"flavor" + 0.025*"taste" + 0.022*"starbucks" + 0.016*"like" + 0.015*"flavored" + 0.014*"bean" + 0.013*"good" + 0.011*"smell"
2021-08-24 16:18:07,533 : INFO : topic diff=0.479213, rho=0.174376
2021-08-24 16:18:09,420 : INFO : -7.282 per-word bound, 155.6 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:18:09,499 : INFO : merging changes from 15774 documents into a model of 47774 documents
2021-08-24 16:18:09,523 : INFO : topic #33 (0.020): 0

2021-08-24 16:18:17,740 : INFO : topic #40 (0.020): 0.066*"almond" + 0.043*"flavor" + 0.017*"taste" + 0.017*"good" + 0.014*"try" + 0.014*"like" + 0.013*"lemon" + 0.012*"nut" + 0.009*"product" + 0.009*"fruit"
2021-08-24 16:18:17,741 : INFO : topic #27 (0.020): 0.046*"cherry" + 0.046*"mix" + 0.036*"pancake" + 0.024*"like" + 0.018*"tart" + 0.017*"taste" + 0.015*"muffin" + 0.015*"waffle" + 0.014*"pumpkin" + 0.012*"pie"
2021-08-24 16:18:17,742 : INFO : topic #32 (0.020): 0.133*"organic" + 0.025*"product" + 0.024*"food" + 0.022*"use" + 0.018*"company" + 0.016*"spice" + 0.015*"buy" + 0.010*"love" + 0.010*"brand" + 0.010*"great"
2021-08-24 16:18:17,743 : INFO : topic #12 (0.020): 0.050*"fruit" + 0.042*"oatmeal" + 0.030*"bean" + 0.024*"like" + 0.018*"dry" + 0.017*"dried" + 0.013*"cranberry" + 0.013*"great" + 0.012*"add" + 0.012*"taste"
2021-08-24 16:18:17,744 : INFO : topic diff=0.406959, rho=0.171784
2021-08-24 16:18:18,709 : INFO : -7.237 per-word bound, 150.8 perplexity estimate based on a h

2021-08-24 16:18:29,187 : INFO : PROGRESS: pass 11, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-24 16:18:29,198 : INFO : PROGRESS: pass 11, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-24 16:18:29,199 : INFO : PROGRESS: pass 11, dispatched chunk #2 = documents up to #6000/47774, outstanding queue size 3
2021-08-24 16:18:29,200 : INFO : PROGRESS: pass 11, dispatched chunk #3 = documents up to #8000/47774, outstanding queue size 4
2021-08-24 16:18:29,201 : INFO : PROGRESS: pass 11, dispatched chunk #4 = documents up to #10000/47774, outstanding queue size 5
2021-08-24 16:18:29,202 : INFO : PROGRESS: pass 11, dispatched chunk #5 = documents up to #12000/47774, outstanding queue size 6
2021-08-24 16:18:29,213 : INFO : PROGRESS: pass 11, dispatched chunk #6 = documents up to #14000/47774, outstanding queue size 7
2021-08-24 16:18:29,215 : INFO : PROGRESS: pass 11, dispatched chunk #7 = documents up to #16000/4777

2021-08-24 16:18:38,073 : INFO : PROGRESS: pass 12, dispatched chunk #8 = documents up to #18000/47774, outstanding queue size 9
2021-08-24 16:18:38,075 : INFO : PROGRESS: pass 12, dispatched chunk #9 = documents up to #20000/47774, outstanding queue size 10
2021-08-24 16:18:38,094 : INFO : PROGRESS: pass 12, dispatched chunk #10 = documents up to #22000/47774, outstanding queue size 11
2021-08-24 16:18:38,095 : INFO : PROGRESS: pass 12, dispatched chunk #11 = documents up to #24000/47774, outstanding queue size 12
2021-08-24 16:18:38,097 : INFO : PROGRESS: pass 12, dispatched chunk #12 = documents up to #26000/47774, outstanding queue size 13
2021-08-24 16:18:38,099 : INFO : PROGRESS: pass 12, dispatched chunk #13 = documents up to #28000/47774, outstanding queue size 14
2021-08-24 16:18:38,103 : INFO : PROGRESS: pass 12, dispatched chunk #14 = documents up to #30000/47774, outstanding queue size 15
2021-08-24 16:18:38,105 : INFO : PROGRESS: pass 12, dispatched chunk #15 = documents u

2021-08-24 16:18:46,932 : INFO : PROGRESS: pass 13, dispatched chunk #16 = documents up to #34000/47774, outstanding queue size 17
2021-08-24 16:18:46,935 : INFO : PROGRESS: pass 13, dispatched chunk #17 = documents up to #36000/47774, outstanding queue size 18
2021-08-24 16:18:46,938 : INFO : PROGRESS: pass 13, dispatched chunk #18 = documents up to #38000/47774, outstanding queue size 19
2021-08-24 16:18:46,961 : INFO : PROGRESS: pass 13, dispatched chunk #19 = documents up to #40000/47774, outstanding queue size 20
2021-08-24 16:18:46,974 : INFO : PROGRESS: pass 13, dispatched chunk #20 = documents up to #42000/47774, outstanding queue size 21
2021-08-24 16:18:47,004 : INFO : PROGRESS: pass 13, dispatched chunk #21 = documents up to #44000/47774, outstanding queue size 22
2021-08-24 16:18:47,044 : INFO : PROGRESS: pass 13, dispatched chunk #22 = documents up to #46000/47774, outstanding queue size 23
2021-08-24 16:18:47,071 : INFO : PROGRESS: pass 13, dispatched chunk #23 = document

2021-08-24 16:18:56,990 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-24 16:18:57,029 : INFO : topic #34 (0.020): 0.035*"fat" + 0.028*"calorie" + 0.022*"high" + 0.022*"low" + 0.022*"protein" + 0.017*"consume" + 0.016*"food" + 0.016*"diet" + 0.014*"seed" + 0.013*"nutrient"
2021-08-24 16:18:57,030 : INFO : topic #26 (0.020): 0.043*"olive" + 0.035*"salad" + 0.025*"use" + 0.024*"garlic" + 0.023*"oil" + 0.021*"good" + 0.020*"flavor" + 0.018*"mix" + 0.014*"like" + 0.013*"cheese"
2021-08-24 16:18:57,031 : INFO : topic #30 (0.020): 0.030*"grey" + 0.027*"earl" + 0.017*"flavor" + 0.015*"bring" + 0.014*"like" + 0.010*"bergamot" + 0.010*"memory" + 0.009*"stash" + 0.008*"law" + 0.008*"lady"
2021-08-24 16:18:57,033 : INFO : topic #49 (0.020): 0.138*"honey" + 0.051*"ginger" + 0.023*"raw" + 0.022*"mango" + 0.019*"pork" + 0.017*"taste" + 0.014*"sweet" + 0.014*"use" + 0.011*"flavor" + 0.011*"like"
2021-08-24 16:18:57,034 : INFO : topic #7 (0.020): 0.068*"price" + 

2021-08-24 16:19:05,351 : INFO : topic diff=0.199053, rho=0.158338
2021-08-24 16:19:08,077 : INFO : -7.206 per-word bound, 147.7 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:19:08,223 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-24 16:19:08,279 : INFO : topic #5 (0.020): 0.122*"salt" + 0.058*"popcorn" + 0.023*"taste" + 0.022*"pop" + 0.021*"salty" + 0.020*"sea" + 0.020*"good" + 0.018*"use" + 0.017*"like" + 0.017*"flavor"
2021-08-24 16:19:08,291 : INFO : topic #3 (0.020): 0.130*"mint" + 0.086*"oreo" + 0.032*"creme" + 0.021*"scout" + 0.018*"mio" + 0.015*"peppermint" + 0.011*"flavor" + 0.010*"sandwich" + 0.010*"similar" + 0.009*"year"
2021-08-24 16:19:08,318 : INFO : topic #47 (0.020): 0.091*"use" + 0.047*"recipe" + 0.047*"bake" + 0.046*"flour" + 0.034*"bread" + 0.023*"powder" + 0.017*"add" + 0.016*"mix" + 0.014*"work" + 0.013*"yeast"
2021-08-24 16:19:08,332 : INFO : topic #35 (0.020): 0.061*"cake" 

2021-08-24 16:19:16,989 : INFO : topic #40 (0.020): 0.087*"almond" + 0.050*"flavor" + 0.029*"nut" + 0.017*"lemon" + 0.016*"taste" + 0.014*"good" + 0.014*"like" + 0.013*"try" + 0.011*"peach" + 0.011*"roast"
2021-08-24 16:19:16,992 : INFO : topic diff=0.169256, rho=0.156389
2021-08-24 16:19:19,096 : INFO : -7.212 per-word bound, 148.2 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:19:19,188 : INFO : merging changes from 15774 documents into a model of 47774 documents
2021-08-24 16:19:19,217 : INFO : topic #47 (0.020): 0.092*"use" + 0.051*"bake" + 0.048*"flour" + 0.048*"recipe" + 0.035*"bread" + 0.022*"powder" + 0.017*"add" + 0.016*"mix" + 0.015*"shake" + 0.014*"work"
2021-08-24 16:19:19,218 : INFO : topic #10 (0.020): 0.142*"free" + 0.107*"gluten" + 0.028*"bread" + 0.024*"good" + 0.023*"tuna" + 0.019*"eat" + 0.018*"product" + 0.018*"like" + 0.017*"taste" + 0.016*"try"
2021-08-24 16:19:19,219 : INFO : topic #41 (0.020): 0.085*"drink" + 0.03

2021-08-24 16:19:28,502 : INFO : topic #21 (0.020): 0.199*"tea" + 0.036*"green" + 0.018*"taste" + 0.017*"drink" + 0.017*"bag" + 0.015*"flavor" + 0.013*"like" + 0.013*"cup" + 0.012*"good" + 0.010*"leaf"
2021-08-24 16:19:28,503 : INFO : topic #9 (0.020): 0.042*"star" + 0.036*"licorice" + 0.025*"good" + 0.011*"black" + 0.010*"rating" + 0.010*"tasteless" + 0.008*"assortment" + 0.008*"peppercorn" + 0.008*"twist" + 0.007*"clam"
2021-08-24 16:19:28,504 : INFO : topic diff=0.153651, rho=0.154511
2021-08-24 16:19:29,793 : INFO : -7.189 per-word bound, 145.9 perplexity estimate based on a held-out corpus of 1774 documents with 80239 words
2021-08-24 16:19:29,794 : INFO : PROGRESS: pass 18, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-24 16:19:29,805 : INFO : PROGRESS: pass 18, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-24 16:19:29,806 : INFO : PROGRESS: pass 18, dispatched chunk #2 = documents up to #6000/47774, outs

2021-08-24 16:19:37,945 : INFO : PROGRESS: pass 19, dispatched chunk #3 = documents up to #8000/47774, outstanding queue size 4
2021-08-24 16:19:37,946 : INFO : PROGRESS: pass 19, dispatched chunk #4 = documents up to #10000/47774, outstanding queue size 5
2021-08-24 16:19:37,954 : INFO : PROGRESS: pass 19, dispatched chunk #5 = documents up to #12000/47774, outstanding queue size 6
2021-08-24 16:19:37,958 : INFO : PROGRESS: pass 19, dispatched chunk #6 = documents up to #14000/47774, outstanding queue size 7
2021-08-24 16:19:37,959 : INFO : PROGRESS: pass 19, dispatched chunk #7 = documents up to #16000/47774, outstanding queue size 8
2021-08-24 16:19:37,960 : INFO : PROGRESS: pass 19, dispatched chunk #8 = documents up to #18000/47774, outstanding queue size 9
2021-08-24 16:19:37,961 : INFO : PROGRESS: pass 19, dispatched chunk #9 = documents up to #20000/47774, outstanding queue size 10
2021-08-24 16:19:37,974 : INFO : PROGRESS: pass 19, dispatched chunk #10 = documents up to #22000

In [7]:
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx}, Words: {topic}")

2021-08-24 16:19:46,288 : INFO : topic #0 (0.020): 0.073*"box" + 0.022*"day" + 0.021*"packet" + 0.019*"time" + 0.018*"open" + 0.016*"arrive" + 0.015*"good" + 0.015*"package" + 0.014*"pack" + 0.011*"inside"
2021-08-24 16:19:46,289 : INFO : topic #1 (0.020): 0.052*"add" + 0.034*"water" + 0.028*"minute" + 0.019*"cook" + 0.017*"microwave" + 0.016*"chicken" + 0.015*"cup" + 0.014*"stir" + 0.014*"meal" + 0.013*"use"
2021-08-24 16:19:46,290 : INFO : topic #2 (0.020): 0.065*"flavor" + 0.060*"tea" + 0.032*"like" + 0.025*"taste" + 0.017*"orange" + 0.017*"bag" + 0.016*"blackberry" + 0.015*"vanilla" + 0.013*"chai" + 0.013*"try"
2021-08-24 16:19:46,292 : INFO : topic #3 (0.020): 0.146*"mint" + 0.099*"oreo" + 0.034*"creme" + 0.026*"scout" + 0.023*"peppermint" + 0.021*"mio" + 0.011*"flavor" + 0.011*"year" + 0.011*"similar" + 0.010*"sandwich"
2021-08-24 16:19:46,293 : INFO : topic #4 (0.020): 0.025*"pu" + 0.024*"costco" + 0.023*"erh" + 0.017*"english" + 0.016*"tea" + 0.016*"breakfast" + 0.011*"try" + 0

2021-08-24 16:19:46,331 : INFO : topic #40 (0.020): 0.086*"almond" + 0.055*"flavor" + 0.034*"nut" + 0.019*"lemon" + 0.015*"taste" + 0.014*"peach" + 0.014*"like" + 0.013*"good" + 0.013*"aid" + 0.012*"try"
2021-08-24 16:19:46,332 : INFO : topic #41 (0.020): 0.087*"drink" + 0.039*"energy" + 0.038*"taste" + 0.030*"like" + 0.027*"juice" + 0.024*"flavor" + 0.017*"good" + 0.015*"caffeine" + 0.012*"apple" + 0.012*"try"
2021-08-24 16:19:46,333 : INFO : topic #42 (0.020): 0.126*"oil" + 0.104*"coconut" + 0.036*"use" + 0.012*"skin" + 0.012*"palm" + 0.011*"fat" + 0.010*"hair" + 0.009*"cook" + 0.008*"food" + 0.008*"product"
2021-08-24 16:19:46,334 : INFO : topic #43 (0.020): 0.078*"seed" + 0.022*"raw" + 0.020*"eat" + 0.019*"nut" + 0.019*"chia" + 0.019*"add" + 0.015*"product" + 0.014*"flax" + 0.014*"salmon" + 0.014*"good"
2021-08-24 16:19:46,334 : INFO : topic #44 (0.020): 0.045*"gift" + 0.043*"box" + 0.031*"wrap" + 0.022*"basket" + 0.020*"individually" + 0.015*"item" + 0.015*"nice" + 0.014*"look" + 

Topic: 0, Words: 0.073*"box" + 0.022*"day" + 0.021*"packet" + 0.019*"time" + 0.018*"open" + 0.016*"arrive" + 0.015*"good" + 0.015*"package" + 0.014*"pack" + 0.011*"inside"
Topic: 1, Words: 0.052*"add" + 0.034*"water" + 0.028*"minute" + 0.019*"cook" + 0.017*"microwave" + 0.016*"chicken" + 0.015*"cup" + 0.014*"stir" + 0.014*"meal" + 0.013*"use"
Topic: 2, Words: 0.065*"flavor" + 0.060*"tea" + 0.032*"like" + 0.025*"taste" + 0.017*"orange" + 0.017*"bag" + 0.016*"blackberry" + 0.015*"vanilla" + 0.013*"chai" + 0.013*"try"
Topic: 3, Words: 0.146*"mint" + 0.099*"oreo" + 0.034*"creme" + 0.026*"scout" + 0.023*"peppermint" + 0.021*"mio" + 0.011*"flavor" + 0.011*"year" + 0.011*"similar" + 0.010*"sandwich"
Topic: 4, Words: 0.025*"pu" + 0.024*"costco" + 0.023*"erh" + 0.017*"english" + 0.016*"tea" + 0.016*"breakfast" + 0.011*"try" + 0.010*"cardamom" + 0.010*"brand" + 0.009*"american"
Topic: 5, Words: 0.124*"salt" + 0.066*"popcorn" + 0.023*"pop" + 0.023*"salty" + 0.023*"taste" + 0.021*"sea" + 0.020*"go

In [8]:
# Compute Coherence Score
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

2021-08-24 16:19:46,395 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2021-08-24 16:19:46,412 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2021-08-24 16:19:46,434 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2021-08-24 16:19:46,453 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2021-08-24 16:19:46,478 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2021-08-24 16:19:46,497 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2021-08-24 16:19:46,516 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2021-08-24 16:19:46,538 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2021-08-24 16:19:46,560 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2021-08-24 16:19:46,579 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2021-08-24 16:19:46,596 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2021-08-24 16:19:46


Coherence Score:  -2.773395173084744


# Retrieving User/Item Topic Vectors

In [9]:
user_reviews = train.groupby(["reviewerID"])['processedReviewText'].apply(lambda x: ' '.join(x))
item_reviews = train.groupby(["asin"])["processedReviewText"].apply(lambda x: ' '.join(x))

# get unique users and items
unique_users = user_reviews.index.tolist()
unique_items = item_reviews.index.tolist()

# tokenize reviews
user_reviews_list = user_reviews.apply(lambda x: x.split()).tolist()
item_reviews_list = item_reviews.apply(lambda x: x.split()).tolist()

In [10]:
print(f"Random user:\n{user_reviews[np.random.randint(0, user_reviews.shape[0])]}")
print(f"\nRandom item:\n{item_reviews[np.random.randint(0, item_reviews.shape[0])]}")

Random user:
happy amazon com store area buy year come price right peanut addict love product lightly salt ounce jar cheap big discount store handy way sparkling water ups man product truly delicious taste like plain carbonated water add flavor thoroughly mix little serve package wyler crystal light miro little plain water pour talkingrain orange lemon calorie light soda shipment package properly damage leak amazon care send replacement shipment right away arrive thoroughly bubble wrapped proper size box reorder fine product

Random item:
start love annie chun udon soup march amazon grocery list month read bad review guy know great order chun soup noodle bowl order future annie chun cute talented woman st asian female admire read story soup noodle second time amazon com normally like udon noodle noodle annie chun kim chi soup noodle bowl good broth sour spicy taste like kim chi great udon noodle noodle dehydrate like instant soup pretty good consider bowl require refrigeration hotness 

In [11]:
user_corpus = [dictionary.doc2bow(doc) for doc in user_reviews_list]
item_corpus = [dictionary.doc2bow(doc) for doc in item_reviews_list]

In [12]:
def get_topic_vectors(model, corpus, n_topics=50):
    """
    """
    topic_vecs = []
    for i in tqdm(range(len(corpus))):
        top_topics = lda_model.get_document_topics(corpus[i], minimum_probability=0.0)
        topic_vecs.append([top_topics[i][1] for i in range(n_topics)])
        
    return topic_vecs

In [13]:
user_vecs = get_topic_vectors(lda_model, user_corpus)
item_vecs = get_topic_vectors(lda_model, item_corpus)

100%|███████████████████████████████████████████████████████| 13397/13397 [00:10<00:00, 1281.27it/s]
100%|██████████████████████████████████████████████████████████| 4729/4729 [00:05<00:00, 832.46it/s]


In [14]:
# checking topic vector
pprint(item_vecs[:1])

[[4.412026e-05,
  0.037068404,
  4.412026e-05,
  4.412026e-05,
  0.01118332,
  4.412026e-05,
  4.412026e-05,
  0.056985307,
  0.37342215,
  4.412026e-05,
  4.412026e-05,
  0.011684398,
  0.0029754133,
  4.412026e-05,
  4.412026e-05,
  0.015933922,
  0.03278494,
  4.412026e-05,
  0.018550407,
  4.412026e-05,
  0.021376897,
  0.05969913,
  4.412026e-05,
  0.041945748,
  0.013643295,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  4.412026e-05,
  0.040703386,
  4.412026e-05,
  4.412026e-05,
  0.17367385,
  0.0230405,
  4.412026e-05,
  4.412026e-05,
  0.016442873,
  4.412026e-05,
  0.0057348763,
  4.412026e-05,
  0.012069327,
  0.019473264,
  0.010329086,
  4.412026e-05]]


# Create Topic Vector Mappings

In [15]:
user_idx_map = {k: unique_users[k] for k in range(len(unique_users))}
item_idx_map = {k: unique_items[k] for k in range(len(unique_items))}

user_vec_map = {k: v for k, v in zip(unique_users, user_vecs)}
item_vec_map = {k: v for k, v in zip(unique_items, item_vecs)}

In [16]:
# loading user topic vectors into DF
user_vecs = pd.DataFrame.from_dict(user_vec_map, orient='index')
user_vecs.index.name = 'reviewerID'
# loading item topic vectors into DF
item_vecs = pd.DataFrame.from_dict(item_vec_map, orient='index')
item_vecs.index.name = 'asin'

In [17]:
user_vecs

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00177463W0XWB16A9O05,0.001334,0.001334,0.001334,0.001334,0.001334,0.321427,0.001334,0.001334,0.001334,0.001334,...,0.001334,0.001334,0.001334,0.001334,0.001334,0.001334,0.001334,0.001334,0.001334,0.001334
A022899328A0QROR32DCT,0.150856,0.000239,0.000239,0.013963,0.000239,0.000239,0.000239,0.079691,0.024830,0.000239,...,0.019680,0.000239,0.000239,0.000239,0.000239,0.000239,0.011307,0.000239,0.000239,0.000239
A068255029AHTHDXZURNU,0.099152,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,...,0.000572,0.000572,0.000572,0.000572,0.149055,0.000572,0.000572,0.000572,0.236662,0.000572
A06944662TFWOKKV4GJKX,0.000800,0.246271,0.166885,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800,...,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800
A1004703RC79J9,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.532996,0.001429,0.001429,...,0.001429,0.059955,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZWRZZAMX90VT,0.000080,0.224511,0.000080,0.000080,0.008113,0.000080,0.014185,0.000080,0.183947,0.086676,...,0.000080,0.026617,0.018546,0.000080,0.000080,0.000080,0.050363,0.010761,0.047481,0.000080
AZXKAH2DE6C8A,0.000147,0.000147,0.000147,0.000147,0.000147,0.000147,0.040951,0.190155,0.000147,0.009686,...,0.000147,0.000147,0.010761,0.000147,0.000147,0.073099,0.000147,0.058788,0.000147,0.000147
AZXON596A1VXC,0.000173,0.029897,0.000173,0.012432,0.000173,0.000173,0.000173,0.000173,0.104411,0.000173,...,0.000173,0.022724,0.000173,0.111096,0.000173,0.000173,0.000173,0.143816,0.000173,0.000173
AZYXC63SS008M,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.111081,0.001667,...,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.195631,0.001667,0.001667,0.001667


In [18]:
item_vecs

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9742356831,0.000044,0.037068,0.000044,0.000044,0.011183,0.000044,0.000044,0.056985,0.373422,0.000044,...,0.000044,0.000044,0.016443,0.000044,0.005735,0.000044,0.012069,0.019473,0.010329,0.000044
B00004S1C5,0.000056,0.000056,0.000056,0.000056,0.000056,0.016450,0.000056,0.000056,0.000056,0.000056,...,0.000056,0.000056,0.000056,0.000056,0.017596,0.000056,0.221324,0.275595,0.166724,0.000056
B00005344V,0.000076,0.000076,0.310664,0.000076,0.000076,0.000076,0.000076,0.000076,0.000076,0.018822,...,0.000076,0.000076,0.000076,0.000076,0.000076,0.000076,0.012768,0.018308,0.000076,0.018326
B0000CDEPD,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.000128,0.270056,0.000128,0.000128,...,0.000128,0.000128,0.000128,0.021182,0.000128,0.000128,0.000128,0.000128,0.100708,0.000128
B0000CFPI2,0.000082,0.000082,0.000082,0.000082,0.000082,0.053972,0.000082,0.067632,0.491923,0.000082,...,0.000082,0.000082,0.000082,0.000082,0.067539,0.000082,0.000082,0.097631,0.000348,0.018163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B00I08JNWU,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,...,0.000455,0.196998,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455
B00I33696K,0.118955,0.000303,0.000303,0.000303,0.000303,0.000303,0.127053,0.188531,0.000303,0.000303,...,0.000303,0.000303,0.000303,0.000303,0.155561,0.000303,0.000303,0.000303,0.000303,0.000303
B00ID9VSOM,0.000370,0.068766,0.000370,0.000370,0.000370,0.000370,0.000370,0.000370,0.000370,0.039394,...,0.000370,0.000370,0.033863,0.106754,0.000370,0.000370,0.000370,0.302539,0.000370,0.000370
B00IRL93SY,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,...,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667


In [19]:
# converting factors into numpy obj
user_factors = user_vecs.to_numpy()
item_factors = item_vecs.to_numpy()

In [20]:
user_factors[0,:]

array([0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ,
       0.3214275 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.53154725, 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.08434129, 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ,
       0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 , 0.0013337 ],
      dtype=float32)

In [21]:
item_factors[0,:]

array([4.4120261e-05, 3.7068404e-02, 4.4120261e-05, 4.4120261e-05,
       1.1183320e-02, 4.4120261e-05, 4.4120261e-05, 5.6985307e-02,
       3.7342215e-01, 4.4120261e-05, 4.4120261e-05, 1.1684398e-02,
       2.9754133e-03, 4.4120261e-05, 4.4120261e-05, 1.5933922e-02,
       3.2784939e-02, 4.4120261e-05, 1.8550407e-02, 4.4120261e-05,
       2.1376897e-02, 5.9699129e-02, 4.4120261e-05, 4.1945748e-02,
       1.3643295e-02, 4.4120261e-05, 4.4120261e-05, 4.4120261e-05,
       4.4120261e-05, 4.4120261e-05, 4.4120261e-05, 4.4120261e-05,
       4.4120261e-05, 4.4120261e-05, 4.4120261e-05, 4.0703386e-02,
       4.4120261e-05, 4.4120261e-05, 1.7367385e-01, 2.3040500e-02,
       4.4120261e-05, 4.4120261e-05, 1.6442873e-02, 4.4120261e-05,
       5.7348763e-03, 4.4120261e-05, 1.2069327e-02, 1.9473264e-02,
       1.0329086e-02, 4.4120261e-05], dtype=float32)

# Train `EmbeddedMF` Class w/ Initialized Topic Vectors

In [64]:
from src.models import emf

In [78]:
ti_mf = emf.EmbeddedMF(user_map=user_idx_map,
                       item_map=item_idx_map,
                       user_factor=user_factors,
                       item_factor=item_factors,
                       learning_rate=.01,
                       beta=.02,
                       num_epochs=20,
                       num_factors=50)

In [79]:
# generate data required for surprise

# create reader
reader = Reader(rating_scale=(1,5))
# generate data required for surprise
data = Dataset.load_from_df(train[['reviewerID', 'asin', 'overall']], reader)
# generating training set
trainset = data.build_full_trainset()

In [80]:
%%time
# fitting algo to training set
ti_mf.fit(trainset)

100%|███████████████████████████████████████████████████████████████| 20/20 [08:11<00:00, 24.60s/it]

CPU times: user 8min 7s, sys: 1.7 s, total: 8min 9s
Wall time: 8min 12s





In [81]:
%%time
# generate candidate items for user to predict rating
testset = trainset.build_anti_testset()

CPU times: user 33.6 s, sys: 11.5 s, total: 45.1 s
Wall time: 54.9 s


In [82]:
%%time
# predict ratings for all pairs (u, i) that are NOT in the training set
predictions = ti_mf.test(testset, verbose=False)

CPU times: user 8min 36s, sys: 6min 18s, total: 14min 55s
Wall time: 18min 43s


# Evaluate Top-N Recommendations

### Defining Evaluation Metrics

In [83]:
def precision_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating precision@K - relevant / total recommended
    precision_at_k = num_relevant / k
    
    return precision_at_k

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

In [84]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [85]:
# loading test dataset
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

# generating test rating history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [57]:
top_ns = get_top_n(predictions)

In [58]:
# generating a random user
random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
print(f"For user: {random_user}:")
print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

# find the recommendations
# NOTE: This is not sorted in ranking
print(f"\nRecommending:\n")
print(f"{train[train['asin'].isin([i[0] for i in top_ns[random_user]])][['asin', 'title']].drop_duplicates(subset='asin')}")
print(f"\n{top_ns[random_user]}")

For user: AWY8ZY16DPTVE:
Purchase History:
             asin                                              title
19630  B001E6K6B2  Kellogg's Raisin Bran Crunch, Breakfast Cereal...
33274  B0040J01KQ  Pop-Tarts Toaster Pastries Variety Pack, 48-Co...

Recommending:

             asin                                              title
344    B00014HS2S  Prince of Peace Oolong Tea - 100 Tea Bags net ...
1336   B00028PVA4  Tazo Refresh Mint Herbal Tea Filterbags (20 co...
2542   B000CMD64S  Yogi Rejuvenation Green Tea, 16 Tea Bags (Pack...
14852  B0013L0C6W                      Soda Stream Tonic Syrup 440ml
19011  B001E5E10K   White Gold Honey, 23-Ounce Container (Pack of 2)
19038  B001E5E12I   Maine Coast Sea Vegetables Kombu, Wild Atlant...
21967  B001EQ5IPQ  O.N.E. 100% Natural Coconut Water, 33.8 Ounce ...
21970  B001EQ5IBU  NOW Foods Macadamia Nuts Roasted-salted, 12-Ou...
25988  B001TNXSZG   Guayaki Traditional Organic Yerba Mate, Loose...
27609  B00286KM8E  Lipton Black Tea Bags, Am

### N=10

In [86]:
top_ns = get_top_n(predictions, n=10)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 10
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 41617.66it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 48716.33it/s]

The MEM-ECF has a average precision@10: 0.00060, average recall@10: 0.00321.





### N=25

In [87]:
top_ns = get_top_n(predictions, n=25)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 25
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 45502.25it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 45615.50it/s]

The MEM-ECF has a average precision@25: 0.00061, average recall@25: 0.00789.





### N=30

In [88]:
top_ns = get_top_n(predictions, n=30)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 30
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 32080.60it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 28881.46it/s]

The MEM-ECF has a average precision@30: 0.00059, average recall@30: 0.00911.





### N=45

In [89]:
top_ns = get_top_n(predictions, n=45)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 45
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 37503.26it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 39617.26it/s]

The MEM-ECF has a average precision@45: 0.00061, average recall@45: 0.01380.





In [90]:
# looking at how many get correct
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
42,A10BD0288TGRVS,"[B00017LEXE, B00017LEXE, B001D3K2GA, B0029XLH4Y]","[B000JMAXMY, B000EDG4V2, B001G2F5R4, B0029JHHO...",0.022222,0.250000
94,A10UIIZS7YW78Y,[B005C3IVN8],"[B000EDG4V2, B000EDM6PA, B000H1195C, B000HVX6N...",0.022222,1.000000
195,A11WS6V544IWEM,"[B0033HGLTG, B005K4Q1YA]","[B000CQ01GU, B000E123H8, B000E1BL5S, B000E1FXR...",0.022222,0.500000
202,A11YOTONCPRQ9S,"[B000CQBZQK, B000EVLZCW, B001SB4OV6, B002YR7A7...","[B001PEWJWC, B00271OPVU, B000EDG4V2, B000JMAXM...",0.022222,0.166667
251,A12KUNIKXGX3U1,[B005HGAVD8],"[B001E5E12I, B001E5E10A, B002859GAU, B00286KM8...",0.022222,1.000000
...,...,...,...,...,...
13093,AXVNVV5VH5XZY,"[B000EIZ8FA, B000LKV2KQ, B000O3QD2C]","[B001ET5XVW, B001PF1846, B001PEWJWC, B003VTG7R...",0.022222,0.333333
13144,AYB4ELCS5AM8P,"[B005GNXIYQ, B009M516HU, B009M515HQ, B00BNR7I1...","[B000GAT6NG, B003OGKCDC, B000F4D5GC, B00DS842H...",0.022222,0.090909
13236,AZFHSPEZUPGD2,"[B002DM62BY, B00A64NLOM, B00BHIS6MI, B00FPNVTU...","[B0013L0C6W, B0014ET2MI, B000UXDHGQ, B001LO37P...",0.022222,0.200000
13255,AZOTVJHNSAQXG,"[B001EQ4RWQ, B005CULMWI]","[B000216O16, B000DZDJ0K, B000DZKKKC, B000ED9LS...",0.022222,0.500000
