In [1]:
from collections import defaultdict
import logging
from pprint import pprint

import gensim
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from tqdm import tqdm

tqdm.pandas()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



# Load Data

In [2]:
# global variable
CATEGORY = "Grocery_and_Gourmet_Food"
DATA_PATH = "data/evaluation"

In [3]:
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [4]:
train.head()

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A23RYWDS884TUL,5.0,This curry paste makes a delicious curry. I j...,2013-05-28,curry paste delicious curry fry chicken vegeta...
1,1,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A945RBQWGZXCK,5.0,I've purchased different curries in the grocer...,2012-09-17,purchase different curry grocery store complet...
2,3,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3AMNY44OP8AOU,4.0,I started a new diet restricting all added sug...,2014-01-23,start new diet restrict added sugar brand suga...
3,4,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",A3IB4CQ2QEJLJ8,5.0,So many flavors. I can't begin to tell you how...,2014-04-27,flavor begin tell love mae ploy curry ask reci...
4,5,9742356831,"Mae Ploy Green Curry Paste, 14 oz","['Grocery & Gourmet Food', 'Sauces, Gravies & ...",AQA5DF3RWKETQ,5.0,I've used this a lot recently in some of my ch...,2012-11-27,use lot recently chicken dish use lot like spi...


# Preparing Review Text for LDA

In [5]:
# tokenization
processed_reviews = train["processedReviewText"].progress_apply(lambda x: x.split())

# creating a bag-of-words
dictionary = gensim.corpora.Dictionary(processed_reviews)

# filtering out tokens that appear in less than 15 reviews
# or more than 0.5 of the corpus
dictionary.filter_extremes(no_below=15, no_above=0.75)

# creating dict how many words and time the word appear
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_reviews]

100%|█████████████████████████████████████████████████████| 47774/47774 [00:00<00:00, 120004.19it/s]
2021-08-23 22:45:55,002 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-08-23 22:45:55,399 : INFO : adding document #10000 to Dictionary(14898 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-23 22:45:55,827 : INFO : adding document #20000 to Dictionary(21785 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-23 22:45:56,278 : INFO : adding document #30000 to Dictionary(27143 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-23 22:45:56,883 : INFO : adding document #40000 to Dictionary(31844 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...)
2021-08-23 22:45:57,268 : INFO : built Dictionary(34875 unique tokens: ['add', 'chicken', 'coconut', 'curry', 'delicious']...) from 47774 documents (total 1942617 corpus positions)
2021-08-23 22:45:57,309 : INFO : Dictionary

# Training LDA Model

In [6]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=50, id2word=dictionary, passes=20, workers=8)

2021-08-23 22:45:58,834 : INFO : using symmetric alpha at 0.02
2021-08-23 22:45:58,835 : INFO : using symmetric eta at 0.02
2021-08-23 22:45:58,837 : INFO : using serial LDA version on this node
2021-08-23 22:45:58,862 : INFO : running online LDA training, 50 topics, 20 passes over the supplied corpus of 47774 documents, updating every 16000 documents, evaluating every ~47774 documents, iterating 50x with a convergence threshold of 0.001000
2021-08-23 22:45:58,865 : INFO : training LDA model using 8 processes
2021-08-23 22:46:04,068 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-23 22:46:04,080 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-23 22:46:04,082 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/47774, outstanding queue size 3
2021-08-23 22:46:04,092 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/47774, ou

2021-08-23 22:46:07,668 : INFO : topic diff=20.570000, rho=1.000000
2021-08-23 22:46:11,335 : INFO : -7.644 per-word bound, 200.1 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:46:11,399 : INFO : merging changes from 18000 documents into a model of 47774 documents
2021-08-23 22:46:11,421 : INFO : topic #43 (0.020): 0.018*"use" + 0.016*"good" + 0.016*"like" + 0.014*"great" + 0.012*"taste" + 0.011*"love" + 0.011*"tea" + 0.011*"price" + 0.010*"product" + 0.009*"coffee"
2021-08-23 22:46:11,423 : INFO : topic #42 (0.020): 0.019*"good" + 0.013*"taste" + 0.013*"buy" + 0.012*"tea" + 0.010*"chocolate" + 0.010*"soup" + 0.010*"like" + 0.009*"flavor" + 0.009*"try" + 0.008*"use"
2021-08-23 22:46:11,425 : INFO : topic #44 (0.020): 0.019*"good" + 0.016*"taste" + 0.014*"product" + 0.013*"tea" + 0.012*"like" + 0.012*"use" + 0.011*"flavor" + 0.009*"buy" + 0.008*"try" + 0.007*"chocolate"
2021-08-23 22:46:11,427 : INFO : topic #6 (0.020): 0.023*"use" + 0.01

2021-08-23 22:46:21,942 : INFO : topic #7 (0.020): 0.028*"like" + 0.026*"good" + 0.019*"taste" + 0.017*"cheese" + 0.016*"flavor" + 0.012*"chip" + 0.011*"sauce" + 0.011*"cracker" + 0.009*"little" + 0.009*"love"
2021-08-23 22:46:21,943 : INFO : topic diff=0.526771, rho=0.196544
2021-08-23 22:46:24,608 : INFO : -7.346 per-word bound, 162.7 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:46:24,646 : INFO : merging changes from 15774 documents into a model of 47774 documents
2021-08-23 22:46:24,659 : INFO : topic #15 (0.020): 0.040*"sugar" + 0.019*"like" + 0.019*"taste" + 0.016*"sweet" + 0.015*"flavor" + 0.015*"use" + 0.012*"good" + 0.011*"drink" + 0.011*"product" + 0.007*"cooky"
2021-08-23 22:46:24,660 : INFO : topic #49 (0.020): 0.072*"coffee" + 0.022*"taste" + 0.019*"like" + 0.018*"flavor" + 0.015*"good" + 0.014*"cup" + 0.014*"vanilla" + 0.011*"try" + 0.010*"use" + 0.010*"roast"
2021-08-23 22:46:24,661 : INFO : topic #17 (0.020): 0.023*"tas

2021-08-23 22:46:33,902 : INFO : topic #8 (0.020): 0.025*"flavor" + 0.025*"salt" + 0.024*"chocolate" + 0.020*"taste" + 0.017*"like" + 0.017*"bar" + 0.016*"good" + 0.014*"dark" + 0.012*"little" + 0.011*"great"
2021-08-23 22:46:33,903 : INFO : topic #33 (0.020): 0.028*"good" + 0.021*"tuna" + 0.020*"like" + 0.019*"love" + 0.019*"eat" + 0.014*"buy" + 0.013*"taste" + 0.012*"great" + 0.012*"popcorn" + 0.012*"bag"
2021-08-23 22:46:33,904 : INFO : topic diff=0.634059, rho=0.192854
2021-08-23 22:46:34,941 : INFO : -7.164 per-word bound, 143.4 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:46:34,942 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-23 22:46:34,951 : INFO : PROGRESS: pass 3, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-23 22:46:34,952 : INFO : PROGRESS: pass 3, dispatched chunk #2 = documents up to #6000/47774, outstanding queue size

2021-08-23 22:46:43,483 : INFO : PROGRESS: pass 4, dispatched chunk #4 = documents up to #10000/47774, outstanding queue size 5
2021-08-23 22:46:43,484 : INFO : PROGRESS: pass 4, dispatched chunk #5 = documents up to #12000/47774, outstanding queue size 6
2021-08-23 22:46:43,485 : INFO : PROGRESS: pass 4, dispatched chunk #6 = documents up to #14000/47774, outstanding queue size 7
2021-08-23 22:46:43,496 : INFO : PROGRESS: pass 4, dispatched chunk #7 = documents up to #16000/47774, outstanding queue size 8
2021-08-23 22:46:43,499 : INFO : PROGRESS: pass 4, dispatched chunk #8 = documents up to #18000/47774, outstanding queue size 9
2021-08-23 22:46:43,500 : INFO : PROGRESS: pass 4, dispatched chunk #9 = documents up to #20000/47774, outstanding queue size 10
2021-08-23 22:46:43,501 : INFO : PROGRESS: pass 4, dispatched chunk #10 = documents up to #22000/47774, outstanding queue size 11
2021-08-23 22:46:43,524 : INFO : PROGRESS: pass 4, dispatched chunk #11 = documents up to #24000/4777

2021-08-23 22:46:52,021 : INFO : PROGRESS: pass 5, dispatched chunk #13 = documents up to #28000/47774, outstanding queue size 14
2021-08-23 22:46:52,036 : INFO : PROGRESS: pass 5, dispatched chunk #14 = documents up to #30000/47774, outstanding queue size 15
2021-08-23 22:46:52,038 : INFO : PROGRESS: pass 5, dispatched chunk #15 = documents up to #32000/47774, outstanding queue size 16
2021-08-23 22:46:52,039 : INFO : PROGRESS: pass 5, dispatched chunk #16 = documents up to #34000/47774, outstanding queue size 17
2021-08-23 22:46:52,042 : INFO : PROGRESS: pass 5, dispatched chunk #17 = documents up to #36000/47774, outstanding queue size 18
2021-08-23 22:46:52,062 : INFO : PROGRESS: pass 5, dispatched chunk #18 = documents up to #38000/47774, outstanding queue size 19
2021-08-23 22:46:52,066 : INFO : PROGRESS: pass 5, dispatched chunk #19 = documents up to #40000/47774, outstanding queue size 20
2021-08-23 22:46:52,071 : INFO : PROGRESS: pass 5, dispatched chunk #20 = documents up to 

2021-08-23 22:47:01,051 : INFO : PROGRESS: pass 6, dispatched chunk #21 = documents up to #44000/47774, outstanding queue size 22
2021-08-23 22:47:01,094 : INFO : PROGRESS: pass 6, dispatched chunk #22 = documents up to #46000/47774, outstanding queue size 23
2021-08-23 22:47:01,110 : INFO : PROGRESS: pass 6, dispatched chunk #23 = documents up to #47774/47774, outstanding queue size 24
2021-08-23 22:47:03,562 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-23 22:47:03,584 : INFO : topic #3 (0.020): 0.060*"organic" + 0.036*"spice" + 0.018*"taste" + 0.017*"like" + 0.015*"ring" + 0.015*"curry" + 0.014*"good" + 0.014*"flavor" + 0.013*"great" + 0.008*"food"
2021-08-23 22:47:03,586 : INFO : topic #18 (0.020): 0.052*"bag" + 0.018*"use" + 0.014*"bread" + 0.013*"good" + 0.013*"box" + 0.012*"bake" + 0.011*"mix" + 0.010*"time" + 0.010*"crust" + 0.009*"easy"
2021-08-23 22:47:03,587 : INFO : topic #2 (0.020): 0.057*"pasta" + 0.050*"oil" + 0.033*"olive" + 0.021

2021-08-23 22:47:12,363 : INFO : topic #33 (0.020): 0.037*"good" + 0.032*"tuna" + 0.032*"eat" + 0.023*"love" + 0.022*"like" + 0.020*"buy" + 0.018*"great" + 0.015*"taste" + 0.014*"product" + 0.012*"kid"
2021-08-23 22:47:12,364 : INFO : topic #47 (0.020): 0.056*"use" + 0.045*"flour" + 0.028*"recipe" + 0.028*"mix" + 0.015*"bake" + 0.015*"pancake" + 0.014*"product" + 0.013*"add" + 0.012*"gluten" + 0.012*"bob"
2021-08-23 22:47:12,366 : INFO : topic diff=0.719308, rho=0.177090
2021-08-23 22:47:14,984 : INFO : -7.013 per-word bound, 129.2 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:47:15,051 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-23 22:47:15,083 : INFO : topic #2 (0.020): 0.065*"pasta" + 0.052*"oil" + 0.032*"olive" + 0.019*"use" + 0.018*"spaghetti" + 0.018*"barilla" + 0.017*"good" + 0.017*"taste" + 0.015*"like" + 0.015*"cook"
2021-08-23 22:47:15,087 : INFO : topic #16 (0.020): 0.062*"price" + 0.

2021-08-23 22:47:25,469 : INFO : topic #26 (0.020): 0.099*"free" + 0.079*"gluten" + 0.024*"good" + 0.020*"mix" + 0.020*"taste" + 0.019*"product" + 0.019*"like" + 0.015*"great" + 0.013*"try" + 0.012*"eat"
2021-08-23 22:47:25,474 : INFO : topic #41 (0.020): 0.052*"like" + 0.038*"spicy" + 0.029*"flavor" + 0.025*"good" + 0.020*"try" + 0.019*"truffle" + 0.019*"taste" + 0.018*"biscuit" + 0.012*"think" + 0.011*"bit"
2021-08-23 22:47:25,480 : INFO : topic #27 (0.020): 0.045*"jerky" + 0.034*"beef" + 0.025*"fish" + 0.015*"salmon" + 0.014*"taste" + 0.012*"flavor" + 0.012*"good" + 0.011*"product" + 0.011*"eat" + 0.011*"like"
2021-08-23 22:47:25,486 : INFO : topic diff=0.648490, rho=0.174376
2021-08-23 22:47:28,084 : INFO : -7.002 per-word bound, 128.1 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:47:28,134 : INFO : merging changes from 15774 documents into a model of 47774 documents
2021-08-23 22:47:28,151 : INFO : topic #39 (0.020): 0.052*"candy" 

2021-08-23 22:47:38,759 : INFO : topic #0 (0.020): 0.039*"like" + 0.029*"piece" + 0.022*"sweet" + 0.022*"gum" + 0.022*"flavor" + 0.020*"taste" + 0.019*"eat" + 0.018*"cinnamon" + 0.017*"chew" + 0.015*"hard"
2021-08-23 22:47:38,760 : INFO : topic #40 (0.020): 0.055*"gift" + 0.031*"basket" + 0.026*"item" + 0.024*"low" + 0.024*"size" + 0.019*"calorie" + 0.018*"good" + 0.015*"carb" + 0.015*"nice" + 0.013*"small"
2021-08-23 22:47:38,761 : INFO : topic #20 (0.020): 0.073*"sauce" + 0.021*"use" + 0.019*"flavor" + 0.017*"hot" + 0.017*"like" + 0.015*"taste" + 0.014*"add" + 0.012*"good" + 0.012*"chicken" + 0.012*"heat"
2021-08-23 22:47:38,762 : INFO : topic #49 (0.020): 0.126*"coffee" + 0.026*"flavor" + 0.024*"like" + 0.023*"cup" + 0.023*"taste" + 0.016*"good" + 0.014*"roast" + 0.014*"strong" + 0.013*"vanilla" + 0.012*"blend"
2021-08-23 22:47:38,764 : INFO : topic diff=0.581152, rho=0.171784
2021-08-23 22:47:40,093 : INFO : -6.962 per-word bound, 124.6 perplexity estimate based on a held-out corpu

2021-08-23 22:47:50,099 : INFO : PROGRESS: pass 11, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-23 22:47:50,111 : INFO : PROGRESS: pass 11, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-23 22:47:50,112 : INFO : PROGRESS: pass 11, dispatched chunk #2 = documents up to #6000/47774, outstanding queue size 3
2021-08-23 22:47:50,114 : INFO : PROGRESS: pass 11, dispatched chunk #3 = documents up to #8000/47774, outstanding queue size 4
2021-08-23 22:47:50,115 : INFO : PROGRESS: pass 11, dispatched chunk #4 = documents up to #10000/47774, outstanding queue size 5
2021-08-23 22:47:50,129 : INFO : PROGRESS: pass 11, dispatched chunk #5 = documents up to #12000/47774, outstanding queue size 6
2021-08-23 22:47:50,131 : INFO : PROGRESS: pass 11, dispatched chunk #6 = documents up to #14000/47774, outstanding queue size 7
2021-08-23 22:47:50,133 : INFO : PROGRESS: pass 11, dispatched chunk #7 = documents up to #16000/4777

2021-08-23 22:48:00,021 : INFO : PROGRESS: pass 12, dispatched chunk #8 = documents up to #18000/47774, outstanding queue size 9
2021-08-23 22:48:00,022 : INFO : PROGRESS: pass 12, dispatched chunk #9 = documents up to #20000/47774, outstanding queue size 10
2021-08-23 22:48:00,023 : INFO : PROGRESS: pass 12, dispatched chunk #10 = documents up to #22000/47774, outstanding queue size 11
2021-08-23 22:48:00,038 : INFO : PROGRESS: pass 12, dispatched chunk #11 = documents up to #24000/47774, outstanding queue size 12
2021-08-23 22:48:00,041 : INFO : PROGRESS: pass 12, dispatched chunk #12 = documents up to #26000/47774, outstanding queue size 13
2021-08-23 22:48:00,043 : INFO : PROGRESS: pass 12, dispatched chunk #13 = documents up to #28000/47774, outstanding queue size 14
2021-08-23 22:48:00,057 : INFO : PROGRESS: pass 12, dispatched chunk #14 = documents up to #30000/47774, outstanding queue size 15
2021-08-23 22:48:00,061 : INFO : PROGRESS: pass 12, dispatched chunk #15 = documents u

2021-08-23 22:48:09,120 : INFO : PROGRESS: pass 13, dispatched chunk #16 = documents up to #34000/47774, outstanding queue size 17
2021-08-23 22:48:09,131 : INFO : PROGRESS: pass 13, dispatched chunk #17 = documents up to #36000/47774, outstanding queue size 18
2021-08-23 22:48:09,153 : INFO : PROGRESS: pass 13, dispatched chunk #18 = documents up to #38000/47774, outstanding queue size 19
2021-08-23 22:48:09,172 : INFO : PROGRESS: pass 13, dispatched chunk #19 = documents up to #40000/47774, outstanding queue size 20
2021-08-23 22:48:09,176 : INFO : PROGRESS: pass 13, dispatched chunk #20 = documents up to #42000/47774, outstanding queue size 21
2021-08-23 22:48:09,178 : INFO : PROGRESS: pass 13, dispatched chunk #21 = documents up to #44000/47774, outstanding queue size 22
2021-08-23 22:48:09,207 : INFO : PROGRESS: pass 13, dispatched chunk #22 = documents up to #46000/47774, outstanding queue size 23
2021-08-23 22:48:09,226 : INFO : PROGRESS: pass 13, dispatched chunk #23 = document

2021-08-23 22:48:20,768 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-23 22:48:20,790 : INFO : topic #21 (0.020): 0.034*"peppermint" + 0.018*"mocha" + 0.018*"hot" + 0.018*"like" + 0.015*"flavor" + 0.014*"latte" + 0.014*"good" + 0.013*"house" + 0.011*"powder" + 0.010*"drink"
2021-08-23 22:48:20,792 : INFO : topic #43 (0.020): 0.102*"snack" + 0.036*"great" + 0.027*"healthy" + 0.023*"good" + 0.021*"eat" + 0.017*"lunch" + 0.016*"tasty" + 0.014*"pack" + 0.014*"like" + 0.014*"little"
2021-08-23 22:48:20,794 : INFO : topic #17 (0.020): 0.057*"caramel" + 0.024*"product" + 0.021*"original" + 0.017*"taste" + 0.013*"werther" + 0.012*"candy" + 0.010*"think" + 0.010*"like" + 0.009*"try" + 0.009*"hard"
2021-08-23 22:48:20,795 : INFO : topic #37 (0.020): 0.099*"product" + 0.028*"company" + 0.022*"review" + 0.022*"amazon" + 0.019*"receive" + 0.019*"order" + 0.012*"quality" + 0.011*"good" + 0.010*"send" + 0.009*"vine"
2021-08-23 22:48:20,797 : INFO : topic #35 (0

2021-08-23 22:48:28,986 : INFO : topic diff=0.290955, rho=0.158338
2021-08-23 22:48:31,911 : INFO : -6.938 per-word bound, 122.6 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:48:31,988 : INFO : merging changes from 16000 documents into a model of 47774 documents
2021-08-23 22:48:32,028 : INFO : topic #10 (0.020): 0.121*"peanut" + 0.114*"butter" + 0.058*"popcorn" + 0.021*"taste" + 0.021*"like" + 0.019*"nut" + 0.014*"pop" + 0.012*"good" + 0.012*"flavor" + 0.012*"jar"
2021-08-23 22:48:32,031 : INFO : topic #33 (0.020): 0.044*"eat" + 0.040*"tuna" + 0.040*"good" + 0.024*"like" + 0.020*"love" + 0.020*"buy" + 0.016*"taste" + 0.016*"great" + 0.013*"product" + 0.012*"thanks"
2021-08-23 22:48:32,057 : INFO : topic #14 (0.020): 0.044*"taste" + 0.029*"like" + 0.020*"review" + 0.018*"know" + 0.017*"bad" + 0.014*"product" + 0.014*"aloe" + 0.014*"think" + 0.012*"star" + 0.011*"good"
2021-08-23 22:48:32,072 : INFO : topic #1 (0.020): 0.081*"drink" + 0.

2021-08-23 22:48:40,331 : INFO : topic #12 (0.020): 0.084*"ingredient" + 0.064*"corn" + 0.036*"syrup" + 0.034*"oat" + 0.034*"list" + 0.024*"oil" + 0.023*"sugar" + 0.023*"natural" + 0.020*"fructose" + 0.018*"contain"
2021-08-23 22:48:40,335 : INFO : topic diff=0.245067, rho=0.156389
2021-08-23 22:48:42,337 : INFO : -6.941 per-word bound, 122.8 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:48:42,384 : INFO : merging changes from 15774 documents into a model of 47774 documents
2021-08-23 22:48:42,404 : INFO : topic #30 (0.020): 0.109*"chip" + 0.050*"package" + 0.032*"small" + 0.024*"bag" + 0.022*"size" + 0.017*"ahoy" + 0.017*"gooey" + 0.015*"like" + 0.014*"good" + 0.014*"open"
2021-08-23 22:48:42,405 : INFO : topic #35 (0.020): 0.139*"love" + 0.066*"flavor" + 0.066*"try" + 0.046*"like" + 0.045*"great" + 0.044*"taste" + 0.039*"buy" + 0.024*"husband" + 0.016*"good" + 0.016*"brand"
2021-08-23 22:48:42,406 : INFO : topic #11 (0.020): 0.101*"wa

2021-08-23 22:48:51,418 : INFO : topic #4 (0.020): 0.085*"fruit" + 0.057*"apple" + 0.029*"flavor" + 0.027*"juice" + 0.022*"natural" + 0.021*"ingredient" + 0.019*"blueberry" + 0.018*"dry" + 0.018*"dried" + 0.016*"like"
2021-08-23 22:48:51,419 : INFO : topic #29 (0.020): 0.146*"honey" + 0.048*"raw" + 0.031*"jar" + 0.018*"use" + 0.011*"lid" + 0.010*"process" + 0.008*"organic" + 0.008*"yeast" + 0.008*"leak" + 0.007*"taste"
2021-08-23 22:48:51,420 : INFO : topic diff=0.222899, rho=0.154511
2021-08-23 22:48:52,621 : INFO : -6.919 per-word bound, 121.0 perplexity estimate based on a held-out corpus of 1774 documents with 78463 words
2021-08-23 22:48:52,622 : INFO : PROGRESS: pass 18, dispatched chunk #0 = documents up to #2000/47774, outstanding queue size 1
2021-08-23 22:48:52,633 : INFO : PROGRESS: pass 18, dispatched chunk #1 = documents up to #4000/47774, outstanding queue size 2
2021-08-23 22:48:52,635 : INFO : PROGRESS: pass 18, dispatched chunk #2 = documents up to #6000/47774, outstan

2021-08-23 22:49:00,771 : INFO : PROGRESS: pass 19, dispatched chunk #4 = documents up to #10000/47774, outstanding queue size 5
2021-08-23 22:49:00,781 : INFO : PROGRESS: pass 19, dispatched chunk #5 = documents up to #12000/47774, outstanding queue size 6
2021-08-23 22:49:00,783 : INFO : PROGRESS: pass 19, dispatched chunk #6 = documents up to #14000/47774, outstanding queue size 7
2021-08-23 22:49:00,784 : INFO : PROGRESS: pass 19, dispatched chunk #7 = documents up to #16000/47774, outstanding queue size 8
2021-08-23 22:49:00,795 : INFO : PROGRESS: pass 19, dispatched chunk #8 = documents up to #18000/47774, outstanding queue size 9
2021-08-23 22:49:00,797 : INFO : PROGRESS: pass 19, dispatched chunk #9 = documents up to #20000/47774, outstanding queue size 10
2021-08-23 22:49:00,798 : INFO : PROGRESS: pass 19, dispatched chunk #10 = documents up to #22000/47774, outstanding queue size 11
2021-08-23 22:49:00,799 : INFO : PROGRESS: pass 19, dispatched chunk #11 = documents up to #24

In [7]:
# Compute Coherence Score
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

2021-08-23 22:49:08,786 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2021-08-23 22:49:08,805 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2021-08-23 22:49:08,823 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2021-08-23 22:49:08,848 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2021-08-23 22:49:08,872 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2021-08-23 22:49:08,897 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2021-08-23 22:49:08,927 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2021-08-23 22:49:08,955 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2021-08-23 22:49:08,981 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2021-08-23 22:49:09,003 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2021-08-23 22:49:09,026 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2021-08-23 22:49:09


Coherence Score:  -2.621079025395095


# Retrieving User/Item Topic Vectors

In [8]:
user_reviews = train.groupby(["reviewerID"])['processedReviewText'].apply(lambda x: ' '.join(x))
item_reviews = train.groupby(["asin"])["processedReviewText"].apply(lambda x: ' '.join(x))

# get unique users and items
unique_users = user_reviews.index.tolist()
unique_items = item_reviews.index.tolist()

# tokenize reviews
user_reviews_list = user_reviews.apply(lambda x: x.split()).tolist()
item_reviews_list = item_reviews.apply(lambda x: x.split()).tolist()

In [9]:
print(f"Random user:\n{user_reviews[np.random.randint(0, user_reviews.shape[0])]}")
print(f"\nRandom item:\n{item_reviews[np.random.randint(0, item_reviews.shape[0])]}")

Random user:
like dark chocolate bar sweet creamy taste come score square break piece rip curtain window chocolate day purchase prefer organic hate love cooky like cracker crunchy slightly sweet sweetness come fig smell wonderfully honey great tea

Random item:
love chocolate hazelnut crispy wafer heaven careful togobble time little appetizing world gluten free product happy order frequent treat good product great individualized packaging crisp good chocolate flavor gluten free adult love gluten free kid like look gluten free snack fatten candy wonderful light snack satisfy sweet tooth


In [10]:
user_corpus = [dictionary.doc2bow(doc) for doc in user_reviews_list]
item_corpus = [dictionary.doc2bow(doc) for doc in item_reviews_list]

In [11]:
def get_topic_vectors(model, corpus, n_topics=50):
    """
    """
    topic_vecs = []
    for i in tqdm(range(len(corpus))):
        top_topics = lda_model.get_document_topics(corpus[i], minimum_probability=0.0)
        topic_vecs.append([top_topics[i][1] for i in range(n_topics)])
        
    return topic_vecs

In [12]:
user_vecs = get_topic_vectors(lda_model, user_corpus)
item_vecs = get_topic_vectors(lda_model, item_corpus)

100%|███████████████████████████████████████████████████████| 13397/13397 [00:10<00:00, 1274.06it/s]
100%|██████████████████████████████████████████████████████████| 4729/4729 [00:05<00:00, 906.25it/s]


In [13]:
# checking topic vector
pprint(item_vecs[:1])

[[4.4983804e-05,
  4.4983804e-05,
  0.022746114,
  0.2064711,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  0.0055703362,
  0.07597777,
  4.4983804e-05,
  0.02293062,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  0.021429373,
  4.4983804e-05,
  4.4983804e-05,
  0.04324505,
  4.4983804e-05,
  0.13243595,
  4.4983804e-05,
  0.055566017,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  0.118809834,
  4.4983804e-05,
  0.006932182,
  0.032745745,
  4.4983804e-05,
  0.0055222968,
  4.4983804e-05,
  0.011608914,
  4.4983804e-05,
  0.014234855,
  0.079787165,
  4.4983804e-05,
  4.4983804e-05,
  4.4983804e-05,
  0.073859334,
  0.032581452,
  0.036151372,
  4.4983804e-05]]


# Create Topic Vector Mappings

In [14]:
user_idx_map = {k: unique_users[k] for k in range(len(unique_users))}
item_idx_map = {k: unique_items[k] for k in range(len(unique_items))}

user_vec_map = {k: v for k, v in zip(unique_users, user_vecs)}
item_vec_map = {k: v for k, v in zip(unique_items, item_vecs)}

In [15]:
# loading user topic vectors into DF
user_vecs = pd.DataFrame.from_dict(user_vec_map, orient='index')
user_vecs.index.name = 'reviewerID'
# loading item topic vectors into DF
item_vecs = pd.DataFrame.from_dict(item_vec_map, orient='index')
item_vecs.index.name = 'asin'

In [16]:
user_vecs

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00177463W0XWB16A9O05,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,...,0.001429,0.001429,0.001429,0.001429,0.138876,0.001429,0.001429,0.001429,0.001429,0.434725
A022899328A0QROR32DCT,0.051083,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.027636,...,0.000247,0.000247,0.024822,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247
A068255029AHTHDXZURNU,0.000572,0.000572,0.000572,0.000572,0.000572,0.181789,0.000572,0.000572,0.000572,0.000572,...,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572,0.000572
A06944662TFWOKKV4GJKX,0.000800,0.000800,0.000800,0.000800,0.000800,0.181364,0.000800,0.000800,0.000800,0.000800,...,0.000800,0.000800,0.000800,0.000800,0.000800,0.000800,0.126235,0.000800,0.050043,0.000800
A1004703RC79J9,0.001429,0.055492,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,...,0.001429,0.001429,0.721624,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.104630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZWRZZAMX90VT,0.052026,0.000083,0.000083,0.023446,0.032928,0.020232,0.000083,0.083292,0.000083,0.016636,...,0.000083,0.024583,0.000083,0.000083,0.000083,0.000083,0.409704,0.000083,0.005510,0.000083
AZXKAH2DE6C8A,0.000150,0.000150,0.000150,0.000150,0.041372,0.000150,0.086734,0.108827,0.000150,0.000150,...,0.000150,0.000150,0.210080,0.089864,0.000150,0.000150,0.000150,0.000150,0.000150,0.101732
AZXON596A1VXC,0.000179,0.029245,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,...,0.000179,0.000179,0.021816,0.000179,0.035655,0.000179,0.244129,0.040758,0.028318,0.000179
AZYXC63SS008M,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,...,0.001667,0.132769,0.001667,0.161001,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667


In [17]:
item_vecs

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9742356831,0.000045,0.000045,0.022746,0.206471,0.000045,0.000045,0.000045,0.000045,0.005570,0.075978,...,0.000045,0.014235,0.079787,0.000045,0.000045,0.000045,0.073859,0.032581,0.036151,0.000045
B00004S1C5,0.000057,0.000057,0.000057,0.000057,0.083453,0.000057,0.000057,0.000057,0.019323,0.277205,...,0.016198,0.000057,0.000057,0.000057,0.000057,0.000057,0.000057,0.245093,0.000057,0.000057
B00005344V,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081,...,0.000081,0.230676,0.000081,0.000081,0.000081,0.000081,0.000081,0.026756,0.116510,0.000081
B0000CDEPD,0.000128,0.000128,0.000128,0.076923,0.000128,0.000128,0.102506,0.000128,0.000128,0.000128,...,0.030720,0.000128,0.061988,0.000128,0.000128,0.000128,0.000128,0.080238,0.000128,0.000128
B0000CFPI2,0.000083,0.000083,0.000083,0.000083,0.000083,0.000083,0.000083,0.000083,0.028132,0.000083,...,0.035172,0.000083,0.083337,0.000083,0.000083,0.000083,0.000083,0.091257,0.000083,0.021162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B00I08JNWU,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,...,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.000455,0.138002,0.840174
B00I33696K,0.000308,0.000308,0.000308,0.000308,0.000308,0.063299,0.000308,0.000308,0.000308,0.000308,...,0.168045,0.000308,0.053929,0.000308,0.000308,0.017794,0.000308,0.000308,0.038334,0.000308
B00ID9VSOM,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,...,0.094652,0.000371,0.000371,0.000371,0.000371,0.000371,0.000371,0.119456,0.000371,0.000371
B00IRL93SY,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,...,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.683272


In [18]:
# converting factors into numpy obj
user_factors = user_vecs.to_numpy()
item_factors = item_vecs.to_numpy()

In [19]:
user_factors[0,:]

array([0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 ,
       0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 ,
       0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 ,
       0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 ,
       0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 ,
       0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 ,
       0.0014288 , 0.12296482, 0.0014288 , 0.0014288 , 0.0014288 ,
       0.0014288 , 0.0014288 , 0.11177909, 0.12735918, 0.0014288 ,
       0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.1388757 ,
       0.0014288 , 0.0014288 , 0.0014288 , 0.0014288 , 0.43472537],
      dtype=float32)

In [20]:
item_factors[0,:]

array([4.49838044e-05, 4.49838044e-05, 2.27461141e-02, 2.06471100e-01,
       4.49838044e-05, 4.49838044e-05, 4.49838044e-05, 4.49838044e-05,
       5.57033625e-03, 7.59777725e-02, 4.49838044e-05, 2.29306202e-02,
       4.49838044e-05, 4.49838044e-05, 4.49838044e-05, 2.14293730e-02,
       4.49838044e-05, 4.49838044e-05, 4.32450511e-02, 4.49838044e-05,
       1.32435948e-01, 4.49838044e-05, 5.55660166e-02, 4.49838044e-05,
       4.49838044e-05, 4.49838044e-05, 4.49838044e-05, 4.49838044e-05,
       4.49838044e-05, 4.49838044e-05, 4.49838044e-05, 4.49838044e-05,
       1.18809834e-01, 4.49838044e-05, 6.93218177e-03, 3.27457450e-02,
       4.49838044e-05, 5.52229676e-03, 4.49838044e-05, 1.16089135e-02,
       4.49838044e-05, 1.42348548e-02, 7.97871649e-02, 4.49838044e-05,
       4.49838044e-05, 4.49838044e-05, 7.38593340e-02, 3.25814523e-02,
       3.61513719e-02, 4.49838044e-05], dtype=float32)

# Train `EmbeddedMF` Class w/ Initialized Topic Vectors

In [37]:
from src.models import emf

In [38]:
emf = emf.EmbeddedMF(user_map=user_idx_map,
                     item_map=item_idx_map,
                     user_factor=user_factors,
                     item_factor=item_factors,
                     learning_rate=.005,
                     beta=.02,
                     num_epochs=5,
                     num_factors=50)

In [39]:
# generate data required for surprise

# create reader
reader = Reader(rating_scale=(1,5))
# generate data required for surprise
data = Dataset.load_from_df(train[['reviewerID', 'asin', 'overall']], reader)
# generating training set
trainset = data.build_full_trainset()

In [40]:
%%time
# fitting algo to training set
emf.fit(trainset)

100%|█████████████████████████████████████████████████████████████████| 5/5 [02:12<00:00, 26.45s/it]

CPU times: user 2min 8s, sys: 1.26 s, total: 2min 10s
Wall time: 2min 12s





In [41]:
%%time
# generate candidate items for user to predict rating
testset = trainset.build_anti_testset()

CPU times: user 37.1 s, sys: 10.5 s, total: 47.6 s
Wall time: 55.4 s


In [42]:
%%time
# predict ratings for all pairs (u, i) that are NOT in the training set
predictions = emf.test(testset, verbose=False)

CPU times: user 8min 35s, sys: 6min 6s, total: 14min 42s
Wall time: 18min 39s


# Evaluate Top-N Recommendations

### Defining Evaluation Metrics

In [43]:
def precision_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating precision@K - relevant / total recommended
    precision_at_k = num_relevant / k
    
    return precision_at_k

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

In [44]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in tqdm(predictions):
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in tqdm(top_n.items()):
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [45]:
# loading test dataset
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

# generating test rating history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [46]:
top_ns = get_top_n(predictions)

100%|███████████████████████████████████████████████| 63307346/63307346 [01:45<00:00, 599445.62it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:50<00:00, 264.03it/s]


In [47]:
# generating a random user
random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
print(f"For user: {random_user}:")
print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

# find the recommendations
# NOTE: This is not sorted in ranking
print(f"\nRecommending:\n")
print(f"{train[train['asin'].isin([i[0] for i in top_ns[random_user]])][['asin', 'title']].drop_duplicates(subset='asin')}")
print(f"\n{top_ns[random_user]}")

For user: A2VIX3WXF4HG9T:
Purchase History:
             asin                                              title
33340  B0041CIR62  Peacock Brown Rice Vermicelli, 7-Ounce Package...
40399  B005DVVB9K              Chocolate Sampler Gift Basket by ig4U

Recommending:

             asin                                              title
354    B00014JNI0  YS Organic Bee Farms CERTIFIED ORGANIC RAW HON...
1115   B0001M0Z6Q  Spicy World Peppercorn (Whole)-Black Tellicher...
4768   B000EDG4V2       Bob's Red Mill Guar Gum, 8 Ounce (Case of 8)
4819   B000EDBPO8  Bob's Red Mill White Rice Flour, Organic, 24-O...
6712   B000F4D5GC  Let's Do Organic Shredded, Unsweetened Coconut...
8390   B000G82L62  Lundberg Family Farms Wild Blend Rice, 16 Ounc...
9469   B000HDJXH6  Enjoy Life Chewy Bars, Soy free, Nut free, Glu...
13711  B000Z93FQC               Y.S. Eco Bee Farms Raw Honey - 22 oz
25518  B001PEWJWC  Garbanzo Beans aka Chickpeas or Ceci Beans | N...
47552  B00DS842HS  Viva Naturals Organic Ex

### N=10

In [48]:
top_ns = get_top_n(predictions, n=10)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 10
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [02:00<00:00, 525528.39it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:43<00:00, 306.64it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46043.05it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 47577.53it/s]

The MEM-ECF has a average precision@10: 0.00233, average recall@10: 0.01200.





### N=25

In [49]:
top_ns = get_top_n(predictions, n=25)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 25
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:31<00:00, 693803.14it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [01:06<00:00, 202.34it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 43452.64it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 46870.10it/s]

The MEM-ECF has a average precision@25: 0.00155, average recall@25: 0.01998.





### N=30

In [50]:
top_ns = get_top_n(predictions, n=30)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 30
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:38<00:00, 642650.49it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [00:48<00:00, 276.60it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 41152.35it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 39409.04it/s]

The MEM-ECF has a average precision@30: 0.00143, average recall@30: 0.02194.





### N=45

In [51]:
top_ns = get_top_n(predictions, n=45)

test_recommendations = pd.DataFrame(top_ns.items(), columns=['reviewerID', 'pred_asin'])
test_recommendations['pred_asin'] = test_recommendations['pred_asin'].apply(lambda x: [i[0] for i in x])

# combined test dataset and recommendations
test_merged = pd.merge(test_user_history, test_recommendations, on='reviewerID', how='inner')

k = 45
test_merged['precision@k'] = test_merged.progress_apply(lambda x: precision_at_k(x.asin, x.pred_asin, k=k), axis=1)
test_merged['recall@k'] = test_merged.progress_apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)

average_precision_at_k = test_merged["precision@k"].mean()
average_recall_at_k = test_merged["recall@k"].mean()

print(f"The MEM-ECF has a average precision@{k}: {average_precision_at_k:.5f}, average recall@{k}: {average_recall_at_k:.5f}.")

100%|███████████████████████████████████████████████| 63307346/63307346 [01:34<00:00, 670253.03it/s]
100%|████████████████████████████████████████████████████████| 13397/13397 [01:04<00:00, 208.24it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 45297.16it/s]
100%|██████████████████████████████████████████████████████| 13279/13279 [00:00<00:00, 47126.37it/s]

The MEM-ECF has a average precision@45: 0.00118, average recall@45: 0.02697.





In [52]:
# looking at how many get correct
test_merged[test_merged['recall@k'] > 0]

Unnamed: 0,reviewerID,asin,pred_asin,precision@k,recall@k
16,A101RRYMZM4KYV,"[B000HDI5O8, B00C1LXBFC]","[B00014JNI0, B000EDG4V2, B00D1G7LZM, B000Z93FQ...",0.022222,0.500000
21,A1047EDJ84IMAS,"[B00014JNI0, B00014JNI0, B004CWO9Y0, B004I5KO9...","[B00014JNI0, B000Z93FQC, B0001M0Z6Q, B000EDG4V...",0.022222,0.166667
44,A10BWUA2MGA9BK,[B000S8593W],"[B001MSZK04, B000Z93FQC, B00014JNI0, B000EDG4V...",0.022222,1.000000
45,A10C4O0Q0TWXOL,"[B001E5E1L4, B001P74NXM, B00DS842HS]","[B00014JNI0, B004VLVFFI, B00DS842HS, B000Z93FQ...",0.022222,0.333333
99,A10Y058K7B96C6,"[B000HDK0DC, B004U49QU2]","[B00014JNI0, B00DS842HS, B000EDG4V2, B000Z93FQ...",0.022222,0.500000
...,...,...,...,...,...
13238,AZGV51M0UUJ8B,[B00DS842HS],"[B00014JNI0, B000Z93FQC, B000EDG4V2, B001PEWJW...",0.022222,1.000000
13251,AZNS7TH82KH9K,[B00DS842HS],"[B00014JNI0, B00DS842HS, B0001M0Z6Q, B000EDBPO...",0.022222,1.000000
13260,AZQGJ5CEAJGXB,"[B005A1LINC, B00DS842HS]","[B00014JNI0, B00DS842HS, B000EDG4V2, B000Z93FQ...",0.022222,0.500000
13270,AZVJHW8TARWV9,[B001PEWJWC],"[B00014JNI0, B000EDG4V2, B00DS842HS, B000Z93FQ...",0.022222,1.000000
