In [54]:
import fastparquet
import pickle
import numpy as np
import pandas as pd
import nltk
import re
import heapq

In [55]:
prodDF = pd.read_parquet("prod1587Final.parquet", engine="fastparquet")
userDF = pd.read_parquet("userFinal.parquet", engine="fastparquet")
prodReview = pd.read_parquet("product_reviews.parquet", engine="fastparquet")
with open("aspect_freq/final_prods_aspects.pickle", "rb") as f:
    DictProd = pickle.load(f)
with open("aspect_freq/final_users_aspects.pickle", "rb") as f:
    DictUser = pickle.load(f)
with open("utils/revDictProd.pickle", "rb") as f:
    revDictProd = pickle.load(f)
with open("utils/revDictProd.pickle", "rb") as f:
    revDictProd = pickle.load(f)

### Summarizing reviews of a product 

In [None]:

#! https://stackabuse.com/text-summarization-with-nltk-in-python/

def get_summary(review):
    sentence_list = nltk.sent_tokenize(review)
    review = re.sub(r'\[[0-9]*\]', ' ', review)
    review = re.sub(r'\s+', ' ', review)
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = re.sub(r'\s+', ' ', review)


    stopwords = nltk.corpus.stopwords.words('english')

    word_frequencies = {}
    for word in nltk.word_tokenize(review):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)

    summary = ' '.join(summary_sentences)
    return summary

### Recommending beers for a given user based on his profile (history of consumption)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def get_recommendations(usr_id, topk=10, needed_aspects = None):
    usr_df = userDF[
        (userDF["user/Id"] == usr_id) & (userDF["review/overall"] > 4.0)
    ].sort_values(by=["review/overall"], ascending=False)
    print(f"Found {len(usr_df)} liked items.\n")
    usr_df = usr_df.head(n=10)
    
    if len(usr_df) == 0:
        return None
    indexes = usr_df["beer/id"]
    if needed_aspects is None:
        candidates = prodDF.iloc[:, 10:]
    else:
        #! Explicit Candidate Generation Code
        indices = [] 
        for _asp in needed_aspects:
            if (x:= revDictProd.get(_asp.lower(), None)) is not None:
                indices.extend(x)
        indices = list(set(indices))
        candidates = prodDF.iloc[indices, 10:]
        print(f"Found {len(indices)} items matching criteria\n")
    sim = np.mean(
        cosine_similarity(
            prodDF.iloc[indexes, 10:],
            candidates
        ),
        axis=0,
    )
    sorted_indices = np.argsort(sim)[::-1]
    return sorted_indices[:topk], np.sort(sim)[::-1][:topk], list(indexes)


### using overlap coefficient (instead of Jaccard) to measure similarity b/w two products based on their sentiment-aspect pairs

In [None]:

def overlap_coefficient(l1: list, l2: list):
    #! https://developer.nvidia.com/blog/similarity-in-graphs-jaccard-versus-the-overlap-coefficient/
    s1 = set(l1)
    s2 = set(l2)
    return len(s1.intersection(s2)) / min(len(s1), len(s2))

In [None]:
from random import choice
usr_id = choice(range(19058))
# usr_id = 7225
needed_aspects=["dark chocolate","light finish", "creamy head"]
print(f"Recommending for user: {usr_id}\n")
# rcs = get_recommendations(usr_id=usr_id, topk=10, needed_aspects=needed_aspects)
rcs = get_recommendations(usr_id=usr_id, topk=10)
if rcs is not None:
    usr_most_recc, cs_sim, usr_most_liked = rcs
else:
    print("No recommendations can be made.")
ml_len = len(usr_most_liked)
print(f"Similarity of top-k recommendations: {cs_sim}")


Recommending for user: 5980

Found 5 liked items.

Similarity of top-k recommendations: [0.65072328 0.62932256 0.62334357 0.62209874 0.61857345 0.6153405
 0.60984928 0.60812632 0.59566811 0.59100558]


In [43]:
print(f"Index of top user preferences: {usr_most_liked}")

Index of top user preferences: [5330, 20550, 22631, 50795, 15463, 15460]


In [44]:
print(f"Index of top recommended: {usr_most_recc}")

Index of top recommended: [22631 22641 47991 21230 50795 45655 22645 13716 31372 18665]


### Review Summarization of recommended products

In [45]:
for beerid in usr_most_recc:
    review = prodReview.iloc[beerid]["review/text"]
    name = prodDF.iloc[beerid]["beer/name"]
    print(f"Product recommended: {name}\n")
    print(f"Review:\t{get_summary(review)}")
    print("---"*20)

Product recommended: Guinness Draught

Review:	on tap. i love this beer. bottle. thick creamy head. dark brown, practically black hue, w/a creamy head, real smooth beer, really light coffee flavor....use to love it, now tastes like cold watered down coffee w/a hint of alcohol. great beer. black with a creamy tan head that lasts until the beer is gone.
------------------------------------------------------------
Product recommended: Guinness Extra Stout (Original)

Review:	the beer pours a deep dark brown/black appropriate for the style with a thick and creamy beige-colored head. pours a rich dark brown (pretty much black) color with a nice, big, creamy head with good retention and lacing (poured from a bottle into an imperial pint glass). guiness extra stout pours a very dark brown virtually black color. great beer...personal favorite poured into a red wine glass, this beer had a nice one-finger foamy light coffee colored head with nice retention. taste is nice and full bodied as oppos

### Evaluation


|-|  Recommended|Not Recommended|
|:----------:|:----:|:-------:|
|Used (Liked)|TP|FN|
|Not Used (Not Liked)|FP|TN|


$$Precision = \frac{TP}{TP+FP}$$
$$Recall = \frac{TP}{TP+FN}$$
$$F = \frac{2 \cdot precision \cdot recall}{precision+recall}$$

In [46]:
for index in [1,3,5,10]: 
    usr_recc_ = usr_most_recc[:index]
    print(f"\nMeasures @ n = {index}")
    tp = set(usr_most_liked).intersection(set(usr_recc_))
    print(f"  True Positive: {tp}")
    recall = len(tp)/len(usr_most_liked)
    precision = len(tp)/len(usr_recc_)
    print(f"  Recall   : {round(recall,4)}")
    print(f"  Precision: {round(precision,4)}")
    #! Adding 1e-10 to avoid ZeroDivisionError when precision = recall = 0
    print(f"  F-measure: {round((2*precision*recall)/(precision+recall+1e-010),4)}")


Measures @ n = 1
  True Positive: {22631}
  Recall   : 0.1667
  Precision: 1.0
  F-measure: 0.2857

Measures @ n = 3
  True Positive: {22631}
  Recall   : 0.1667
  Precision: 0.3333
  F-measure: 0.2222

Measures @ n = 5
  True Positive: {50795, 22631}
  Recall   : 0.3333
  Precision: 0.4
  F-measure: 0.3636

Measures @ n = 10
  True Positive: {50795, 22631}
  Recall   : 0.3333
  Precision: 0.2
  F-measure: 0.25


### Intra-List Diversity

In [47]:
_recc_vec = prodDF.iloc[usr_most_recc,10:]
_den = 2*len(usr_most_recc)*(len(usr_most_recc)-1)
recc_recc_dist = pd.DataFrame(np.round(1-cosine_similarity(_recc_vec,_recc_vec),4)) 
ild = recc_recc_dist.sum().sum()/_den
print(f"ILD among {len(usr_most_recc)} recommendations: {round(ild,5)}")

ILD among 10 recommendations: 0.09674


### Unexpectedness

In [48]:
recc_like_dist = pd.DataFrame(1-cosine_similarity(prodDF.iloc[usr_most_recc,10:],prodDF.iloc[usr_most_liked,10:])) 
unexp = round(np.mean(np.round(np.min(recc_like_dist,axis=1),5)),5)
print(f"Measure of unexpectedness for user {usr_id}: {unexp}")

Measure of unexpectedness for user 15078: 0.08008


### Overlap Coefficient of (sentiment, aspect) pairs between recommended & liked items

In [49]:
pref_prod_asp = []
for pid in usr_most_liked:
        pref_prod_asp.append(list(DictProd[pid].keys()))
recc_prod_asp = []
for i in prodDF.iloc[usr_most_recc]['beer/id']:
        recc_prod_asp.append(list(DictProd[i].keys()))


sim_matrix = [[] for _ in  recc_prod_asp]
for index_i, i in enumerate(recc_prod_asp):
    sim_matrix[index_i] = []
    for index_j, j in enumerate(pref_prod_asp):
        sim_matrix[index_i].append(overlap_coefficient(i,j))

overlap_coeff_matrix = pd.DataFrame(sim_matrix, index=usr_most_recc, columns=usr_most_liked)
overlap_coeff_matrix

#! Row Index are the recommended beer id
#! Column Index are the user-preferred beer id

Unnamed: 0,5330,20550,22631,50795,15463,15460
22631,0.427545,0.435544,1.0,0.347599,0.614583,0.616034
22641,0.439101,0.417605,0.426716,0.36845,0.59375,0.675105
47991,0.448854,0.328327,0.459117,0.417722,0.583333,0.624473
21230,0.527968,0.324772,0.530251,0.47774,0.510417,0.540084
50795,0.397876,0.319983,0.347599,1.0,0.6875,0.594937
45655,0.469246,0.318315,0.488975,0.422437,0.541667,0.603376
22645,0.459596,0.261249,0.472452,0.432966,0.5,0.565401
13716,0.480526,0.249179,0.444862,0.453308,0.489583,0.607595
31372,0.471236,0.26377,0.435129,0.486536,0.458333,0.506329
18665,0.468457,0.336254,0.36008,0.417833,0.604167,0.649789


In [50]:

# ! Mean overlap of each recommended beer with user-preferred beer 
np.mean(overlap_coeff_matrix, axis=1)

22631    0.573551
22641    0.486788
47991    0.476971
21230    0.485205
50795    0.557983
45655    0.474002
22645    0.448611
13716    0.454175
31372    0.436889
18665    0.472763
dtype: float64

### DataFrame of recommended Products

In [51]:
prodDF.iloc[usr_most_recc,:10]

Unnamed: 0,beer/id,beer/name,beer/style,beer/ABV,review/count,review/appearance,review/aroma,review/palate,review/taste,review/overall
22631,22631,Guinness Draught,Dry Stout,4.1,3586,4.455661,3.141662,3.505298,3.326408,3.514501
22641,22641,Guinness Extra Stout (Original),Irish Dry Stout,6.0,1220,4.108607,3.608607,3.715164,3.74918,3.739344
47991,47991,Sierra Nevada Stout,Stout,5.8,1513,3.925975,3.395902,3.538004,3.506279,3.646067
21230,21230,Goose Island Oatmeal Stout,Sweet Stout,5.1,753,3.709163,3.405046,3.424967,3.449535,3.588977
50795,50795,Stone Sublimely Self Righteous Ale,Black IPA,8.7,1449,4.149758,3.910973,3.913043,3.942719,4.068323
45655,45655,Samuel Adams Cream Stout,Sweet Stout,4.9,1407,3.836532,3.375622,3.491116,3.455224,3.557214
22645,22645,Guinness Foreign Extra Stout,Foreign Stout,7.5,750,4.085333,3.47,3.641333,3.556,3.663667
13716,13716,Dark Horse Reserve Special Black Bier,Porter,7.5,765,4.056209,3.760131,3.775163,3.788235,3.902288
31372,31372,Left Hand SmokeJumper,Smoked,9.8,485,4.059794,3.853608,3.771134,3.853608,3.969588
18665,18665,Flying Dog Gonzo Imperial Porter,Imperial/Strong Porter,9.2,1869,4.153023,3.728464,3.848047,3.814874,3.94275


### DataFrame of products liked by user

In [52]:
prodDF.iloc[usr_most_liked,:10]

Unnamed: 0,beer/id,beer/name,beer/style,beer/ABV,review/count,review/appearance,review/aroma,review/palate,review/taste,review/overall
5330,5330,Bells Kalamazoo Stout,Stout,6.0,1668,4.253597,3.858813,4.020983,3.958933,4.096373
20550,20550,George Killians Irish Red,American Dark Lager,4.9,1675,3.406567,2.408358,2.821493,2.684179,2.73194
22631,22631,Guinness Draught,Dry Stout,4.1,3586,4.455661,3.141662,3.505298,3.326408,3.514501
50795,50795,Stone Sublimely Self Righteous Ale,Black IPA,8.7,1449,4.149758,3.910973,3.913043,3.942719,4.068323
15463,15463,Dragonmead Juggernaut Double Red,American Strong Ale,9.0,17,3.764706,3.352941,3.588235,3.705882,3.720588
15460,15460,Dragonmead Imperial Stout,Imperial Stout,7.5,51,3.941176,3.343137,3.235294,3.303922,3.45098
