In [38]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import string
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.corpus import stopwords
from nltk import word_tokenize, Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv('beer_reviews.csv')
data = data.iloc[:, 1:]
data.shape

(6227, 3)

In [3]:
data.head()

Unnamed: 0,product_name,product_review,user_rating
0,Kentucky Brunch Brand Stout,"Long time waiting to tick this one, and I have...",4.56
1,Kentucky Brunch Brand Stout,This review is for the 2019 batch. It was bott...,5.0
2,Kentucky Brunch Brand Stout,Supreme maple OD! Soooo easy drinking & well-t...,5.0
3,Kentucky Brunch Brand Stout,I have now had 4 different years of KBBS and c...,5.0
4,Kentucky Brunch Brand Stout,2020 Bottle. Absolutely bonkers Maple Syrup o...,5.0


In [4]:
data.isna().sum()

product_name      0
product_review    1
user_rating       0
dtype: int64

In [5]:
data = data.dropna()
data.reset_index(drop=True, inplace=True)
data.shape

(6226, 3)

In [6]:
# function to lemmatize all words in captions
def lemmatization(text):
    text = nlp(text)
    text_lemma = [word.lemma_ for word in text]
    return " ".join(text_lemma)

In [7]:
# stripping the review
data['product_review'] = data['product_review'].astype(str).str.strip()

# lemmatizing the words
data['product_review'] = data['product_review'].map(lemmatization)

# Task B

### Cleaning the data

In [8]:
# creating the text from comments 
text = ' '.join(data['product_review'])
text = text.lower()

# getting stopword list from nltk
stopwords_list = stopwords.words('english')
digit_list = list(string.digits)
punctuation_list = list(string.punctuation)

# tokenizing words from latest 5000 comments for ziffs law
text_token = word_tokenize(text)
text_token_counter = Counter(text_token)

# creating dataframe for frequency table
text_token_df = pd.DataFrame(data=None, columns=['words', 'frequency'])
text_token_df['words'] = text_token_counter.keys()
text_token_df['frequency'] = text_token_counter.values()

# sorting the text token df and getting ranks
text_token_df.sort_values('frequency', inplace=True, ascending=False)
text_token_df.reset_index(drop=True, inplace=True)
text_token_df['rank'] = text_token_df['frequency'].rank(method='min', ascending=False).astype(int)

# removing stop words/punctuations/limit from word lists
mask = ~(text_token_df['words'].isin(stopwords_list) | 
          text_token_df['words'].isin(digit_list) |
          text_token_df['words'].isin(punctuation_list))
token_cleaned_df = text_token_df[mask]

token_cleaned_df

Unnamed: 0,words,frequency,rank
14,beer,5652,15
21,taste,3871,22
22,head,3837,23
23,pour,3714,24
28,chocolate,2916,29
...,...,...,...
13919,coloured,1,7412
13920,kilbarrack,1,7412
13921,mchugh,1,7412
13922,unintentionally,1,7412


In [9]:
# token_cleaned_df.head(100).to_csv('beer_review_tokens.csv', index=False)

### Creating words to attributes mapping

In [10]:
# attribute list from the problem statement
attributes_to_word_dict = {
    'aggressive': ['boldly', 'assertive', 'aroma', 'taste'],
    'balanced': ['malt', 'hops', 'malt', 'sweetness', 'hop', 'bitterness', 'balance'],
    'complex': ['multidimensional', 'flavors', 'sensations', 'palate'],
    'crisp':  ['carbonated', 'effervescent'],
    'fruity': ['flavors', 'fruits'],
    'hoppy': ['herbal', 'earthy', 'spicy', 'citric', 'citrus', 'aromas', 'flavors', 'hop'],
    'malty': ['grainy', 'caramel', 'sweet', 'dry'],
    'robust': ['rich', 'bodied']}

# adding more attributes from term frequency analysis
attributes_to_word_dict['aggressive'].append('sour')
attributes_to_word_dict['balanced'].append('bitter')
attributes_to_word_dict['fruity'].append('grapefruit')
attributes_to_word_dict['fruity'].append('pineapple')
attributes_to_word_dict['fruity'].append('mango')
attributes_to_word_dict['fruity'].append('coconut')
attributes_to_word_dict['fruity'].append('tropical')
attributes_to_word_dict['hoppy'].append('maple')
attributes_to_word_dict['malty'].append('bourbon') 
attributes_to_word_dict['malty'].append('vanilla') 
attributes_to_word_dict['malty'].append('oak') 

# lemmatizing keys and values
for k, v in attributes_to_word_dict.items():
    attributes_to_word_dict[k] = [lemmatization(w) for w in v] + [lemmatization(k)]
attributes_to_word_dict

{'aggressive': ['boldly', 'assertive', 'aroma', 'taste', 'sour', 'aggressive'],
 'balanced': ['malt',
  'hop',
  'malt',
  'sweetness',
  'hop',
  'bitterness',
  'balance',
  'bitter',
  'balanced'],
 'complex': ['multidimensional', 'flavor', 'sensation', 'palate', 'complex'],
 'crisp': ['carbonate', 'effervescent', 'crisp'],
 'fruity': ['flavor',
  'fruit',
  'grapefruit',
  'pineapple',
  'mango',
  'coconut',
  'tropical',
  'fruity'],
 'hoppy': ['herbal',
  'earthy',
  'spicy',
  'citric',
  'citrus',
  'aromas',
  'flavor',
  'hop',
  'maple',
  'hoppy'],
 'malty': ['grainy',
  'caramel',
  'sweet',
  'dry',
  'bourbon',
  'vanilla',
  'oak',
  'malty'],
 'robust': ['rich', 'body', 'robust']}

# Task C

Three customer attributes - 
* Aggresive
* Balanced
* Hoppy

In [11]:
input_dict = {
    'aggressive': 1,
    'balanced': 1,
    'complex': 0,
    'crisp':  0,
    'fruity': 0,
    'hoppy': 1,
    'malty': 0,
    'robust': 0}

input_attributes = pd.DataFrame(input_dict, index=[0])
input_attributes


Unnamed: 0,aggressive,balanced,complex,crisp,fruity,hoppy,malty,robust
0,1,1,0,0,0,1,0,0


### Attribute identification from reviews - TF matrix

In [12]:
# creating attribute wise columns to get occurence into a dataframe
attribute_occurence_df = pd.DataFrame(np.zeros((data.shape[0], len(attributes_to_word_dict.keys()))))
attribute_occurence_df.columns = attributes_to_word_dict.keys()

# appending product_review df with brand occurence df
attribute_occurence_df = pd.concat([data, attribute_occurence_df], axis=1)

# geting occurence columns populated
for c in attribute_occurence_df.iloc[:, 3:]:
    model_list = list(attributes_to_word_dict[c])
    print('calculating tf in product_review for attributes - ', c, model_list)
    
    attribute_occurence_df[c] = attribute_occurence_df['product_review'].str.\
        findall('( ' + '|'.join(model_list) + ' )').map(lambda lst: len(lst))

calculating tf in product_review for attributes -  aggressive ['boldly', 'assertive', 'aroma', 'taste', 'sour', 'aggressive']
calculating tf in product_review for attributes -  balanced ['malt', 'hop', 'malt', 'sweetness', 'hop', 'bitterness', 'balance', 'bitter', 'balanced']
calculating tf in product_review for attributes -  complex ['multidimensional', 'flavor', 'sensation', 'palate', 'complex']
calculating tf in product_review for attributes -  crisp ['carbonate', 'effervescent', 'crisp']
calculating tf in product_review for attributes -  fruity ['flavor', 'fruit', 'grapefruit', 'pineapple', 'mango', 'coconut', 'tropical', 'fruity']
calculating tf in product_review for attributes -  hoppy ['herbal', 'earthy', 'spicy', 'citric', 'citrus', 'aromas', 'flavor', 'hop', 'maple', 'hoppy']
calculating tf in product_review for attributes -  malty ['grainy', 'caramel', 'sweet', 'dry', 'bourbon', 'vanilla', 'oak', 'malty']
calculating tf in product_review for attributes -  robust ['rich', 'bod

In [15]:
attribute_occurence_df = attribute_occurence_df[
    (attribute_occurence_df.iloc[:, 3:].sum(axis = 1) != 0)]
attribute_occurence_df.head()

Unnamed: 0,product_name,product_review,user_rating,aggressive,balanced,complex,crisp,fruity,hoppy,malty,robust
0,Kentucky Brunch Brand Stout,"long time wait to tick this one , and I have t...",4.56,0,0,0,0,0,1,0,0
1,Kentucky Brunch Brand Stout,this review be for the 2019 batch . it be bott...,5.0,1,0,1,0,1,2,1,0
2,Kentucky Brunch Brand Stout,Supreme maple OD ! Soooo easy drinking & well ...,5.0,0,0,0,0,0,1,0,0
3,Kentucky Brunch Brand Stout,I have now have 4 different year of KBBS and c...,5.0,1,0,0,0,0,1,1,0
4,Kentucky Brunch Brand Stout,2020 Bottle . absolutely bonker Maple Syrup ...,5.0,0,0,0,0,0,0,1,0


### Getting cosine similarity

In [16]:
def cosineSimilarity(review_att, cust_input):
    return np.dot(review_att, cust_input) / (norm(review_att) * norm(cust_input))

attribute_occurence_df['similarity_score'] = attribute_occurence_df.iloc[:, 3:].apply(
    lambda x : cosineSimilarity(x.values, input_attributes.values[0]), axis = 1).\
sort_values(ascending=False)
attribute_occurence_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attribute_occurence_df['similarity_score'] = attribute_occurence_df.iloc[:, 3:].apply(


Unnamed: 0,product_name,product_review,user_rating,aggressive,balanced,complex,crisp,fruity,hoppy,malty,robust,similarity_score
0,Kentucky Brunch Brand Stout,"long time wait to tick this one , and I have t...",4.56,0,0,0,0,0,1,0,0,0.57735
1,Kentucky Brunch Brand Stout,this review be for the 2019 batch . it be bott...,5.0,1,0,1,0,1,2,1,0,0.612372
2,Kentucky Brunch Brand Stout,Supreme maple OD ! Soooo easy drinking & well ...,5.0,0,0,0,0,0,1,0,0,0.57735
3,Kentucky Brunch Brand Stout,I have now have 4 different year of KBBS and c...,5.0,1,0,0,0,0,1,1,0,0.666667
4,Kentucky Brunch Brand Stout,2020 Bottle . absolutely bonker Maple Syrup ...,5.0,0,0,0,0,0,0,1,0,0.0


In [17]:
attribute_occurence_df.sort_values('similarity_score', ascending=False)

Unnamed: 0,product_name,product_review,user_rating,aggressive,balanced,complex,crisp,fruity,hoppy,malty,robust,similarity_score
460,Julius,pour a deep flat orange color with a bit of se...,4.76,1,1,0,0,0,1,0,0,1.0
961,Ann,2014 vintage from the taproom . finally get to...,4.49,1,1,0,0,0,1,0,0,1.0
5794,Emerald Grouper,"beer look okay , but it taste kind of like old...",2.71,1,1,0,0,0,1,0,0,1.0
5781,Emerald Grouper,"pour a cleanse , golden orange with a thin , w...",4.80,1,1,0,0,0,1,0,0,1.0
1863,Sip Of Sunshine,"dark gold appearance , clear not hazy smell of...",3.58,1,1,0,0,0,1,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2534,Plead The 5th - Bourbon Barrel-Aged,"02/2016 - vintage 2015 - a lot of brownie , fu...",4.46,0,0,1,0,0,0,3,2,0.0
3784,Society & Solitude #6,this beer out - tree house Tree House . it be ...,4.71,0,0,0,0,1,0,0,0,0.0
5578,Foggier Window,opaque yellow with white foam . sweet cut be...,4.65,0,0,1,1,0,0,1,0,0.0
3763,Society & Solitude #6,"can Hazy bronze ; big pineapple , kind bud , l...",4.45,0,0,0,0,4,0,0,0,0.0


In [18]:
output_df = attribute_occurence_df[['product_name', 'product_review', 'similarity_score']]
output_df.to_csv('customer_review_similarity.csv', index=False)

In [19]:
avg_review_similarity = output_df.groupby('product_name')['similarity_score'].mean().reset_index()
avg_review_similarity.sort_values('similarity_score', ascending=False)

Unnamed: 0,product_name,similarity_score
128,Hop JuJu,0.746869
89,Double Sunshine,0.743651
148,Keene Idea,0.722941
193,Pseudo Sue - Double Dry-Hopped,0.722805
248,Zombie Dust,0.718228
...,...,...
106,Fundamental Forces,0.235745
195,Resolute - Coconut,0.233555
34,Black Tuesday,0.222012
35,Black Tuesday - Reserve,0.218091


# Task D

In [20]:
senti_analyzer = SentimentIntensityAnalyzer()

def review_sentiment(review):
    score = senti_analyzer.polarity_scores(review)
    return score['compound']

attribute_occurence_df['sentiment_score'] = attribute_occurence_df['product_review'].map(review_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attribute_occurence_df['sentiment_score'] = attribute_occurence_df['product_review'].map(review_sentiment)


In [22]:
attribute_occurence_df.head()

Unnamed: 0,product_name,product_review,user_rating,aggressive,balanced,complex,crisp,fruity,hoppy,malty,robust,similarity_score,sentiment_score
0,Kentucky Brunch Brand Stout,"long time wait to tick this one , and I have t...",4.56,0,0,0,0,0,1,0,0,0.57735,0.6369
1,Kentucky Brunch Brand Stout,this review be for the 2019 batch . it be bott...,5.0,1,0,1,0,1,2,1,0,0.612372,0.8194
2,Kentucky Brunch Brand Stout,Supreme maple OD ! Soooo easy drinking & well ...,5.0,0,0,0,0,0,1,0,0,0.57735,0.9018
3,Kentucky Brunch Brand Stout,I have now have 4 different year of KBBS and c...,5.0,1,0,0,0,0,1,1,0,0.666667,0.8689
4,Kentucky Brunch Brand Stout,2020 Bottle . absolutely bonker Maple Syrup ...,5.0,0,0,0,0,0,0,1,0,0.0,-0.5487


# Task E

In [67]:
evaluation_df = attribute_occurence_df.groupby('product_name')[['similarity_score', 'sentiment_score']].mean().\
reset_index()
evaluation_df['evaluation_score'] = evaluation_df['similarity_score'] + evaluation_df['sentiment_score']
evaluation_df.sort_values('evaluation_score', ascending=False, inplace=True)
recommendation = evaluation_df.head(3)['product_name'].values
recommendation

array(['Double Stack', 'Pliny The Younger', 'Keene Idea'], dtype=object)

# Task F

In [68]:
cust_pref = 'aggressive balanced hoppy'

def spacy_similarity(review, cust_pref):
    review_doc = nlp(review)
    cust_pref_doc = nlp(cust_pref)
    return review_doc.similarity(cust_pref_doc)

attribute_occurence_df['spacy_similarity_score'] = attribute_occurence_df['product_review'].map(
    lambda x : spacy_similarity(x, cust_pref))

  return review_doc.similarity(cust_pref_doc)


KeyboardInterrupt: 

In [70]:
evaluation_df_w2v = attribute_occurence_df.groupby('product_name')[['spacy_similarity_score', 'sentiment_score']].mean().\
reset_index()
evaluation_df_w2v['evaluation_score_w2v'] = evaluation_df_w2v['spacy_similarity_score'] + evaluation_df_w2v['sentiment_score']
evaluation_df_w2v.sort_values('evaluation_score_w2v', ascending=False, inplace=True)
recommendation_w2v = evaluation_df_w2v.head(3)['product_name'].values
recommendation_w2v

array(['Mexican Cake - Maple Bourbon Barrel-Aged', 'Flora Plum',
       'Blåbær Lambik'], dtype=object)

**Comparision between the two methods using % of reviews with the attribute**

In [109]:
cust_attributes = list(input_attributes.T[(input_attributes.T == 1)].dropna().index)

mask1 = attribute_occurence_df['product_name'].isin(recommendation)
bow_reco_check = attribute_occurence_df.loc[mask1]
bow_reco_check[cust_attributes] = bow_reco_check[cust_attributes].astype(bool).astype(int)
bow_reco_check.groupby('product_name')[cust_attributes].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0_level_0,aggressive,balanced,hoppy
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Double Stack,21,16,24
Keene Idea,11,21,21
Pliny The Younger,15,20,18


In [110]:
mask2 = attribute_occurence_df['product_name'].isin(recommendation_w2v)

spacy_reco_check = attribute_occurence_df.loc[mask2]
spacy_reco_check[cust_attributes] = spacy_reco_check[cust_attributes].astype(bool).astype(int)
spacy_reco_check.groupby('product_name')[cust_attributes].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0_level_0,aggressive,balanced,hoppy
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Blåbær Lambik,19,7,10
Flora Plum,16,9,13
Mexican Cake - Maple Bourbon Barrel-Aged,9,12,19


**TODO - add commentary**

# Task G

In [115]:
evaluation_df_rating = attribute_occurence_df.groupby('product_name')['user_rating'].mean().reset_index()
evaluation_df_rating.sort_values('user_rating', ascending=False, inplace=True)
recommendation_rating = evaluation_df_rating['product_name'].head(3).values
recommendation_rating

array(['SR-71', 'Chemtrailmix', 'Blessed'], dtype=object)

In [116]:
mask3 = attribute_occurence_df['product_name'].isin(recommendation_rating)

rating_reco_check = attribute_occurence_df.loc[mask3]
rating_reco_check[cust_attributes] = rating_reco_check[cust_attributes].astype(bool).astype(int)
rating_reco_check.groupby('product_name')[cust_attributes].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0_level_0,aggressive,balanced,hoppy
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Blessed,13,11,9
Chemtrailmix,13,11,8
SR-71,10,8,11
