In [21]:
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

In [5]:
df_reviews = pd.read_csv("Comments.csv")
df_reviews

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...
...,...,...,...,...
7207,7207,Sculpin,4.08,smells great with an exquisite finish. with a ...
7208,7208,Bar Fly,4.57,yesterdays meandering around through the bottl...
7209,7209,Bar Fly,4.29,the smoky aroma is stronger than it follows on...
7210,7210,Bar Fly,4.16,midnight black body topped with a dense creamy...


In [8]:
# Download NLTK stopwords (you only need to do this once)
#nltk.download('stopwords')

# Extract the comment column
comments = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))]['product_review']

# Get the NLTK English stop words
stop_words = set(stopwords.words('english'))

# Tokenize and preprocess the comments (remove punctuation, convert to lowercase, and remove stop words)
def preprocesstext(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = re.findall(r'\b\w+\b', text.lower())  # Tokenize and convert to lowercase
    nswords = [word for word in words if word not in stop_words]
    return list(set(nswords))

# Tokenize and preprocess the comments, removing stop words
words_nostop = []
for comment in comments:
    words = preprocesstext(comment)
    words_nostop.extend(words)

# Calculate word frequencies
words_nostop_freq = pd.Series(words_nostop).value_counts()

wnf_df = pd.DataFrame(words_nostop_freq)
wnf_df = wnf_df.reset_index()
wnf_df = wnf_df.rename(columns={'index': 'words', 0: 'frequency'})

wnf_df.to_csv('review_words.csv')

In [4]:
import pandas as pd
file=pd.read_csv("beer attributes.csv")

In [5]:
file.head()

Unnamed: 0,attribute,frequency
0,light,2029
1,carbonation,1835
2,sweet,1669
3,malt,1654
4,white,1469


In [6]:
attributes=file[file["frequency"]>=700]["attribute"]

In [7]:
print("Available Attributes:")

for idx, attr in enumerate(attributes):
    print(f"{idx}.{attr}")

selected_attributes=[]

for i in range(3):
    while True:
         try:
              selection = int(input(f"Enter the number for attribute {i+1}:"))
              if selection>=1 and selection<=len(attributes):
                   selected_attributes.append(attributes[selection-1])
                   break
              else:
                   print("Invalid input.Please enter a valid number.")
         except ValueError:
              print("Invalid input. Please enter a number.") 

Available Attributes:
0.light
1.carbonation
2.sweet
3.malt
4.white
5.medium
6.dark
7.smooth
8.bitterness
9.clear
10.brown
11.chocolate
12.dry
13.sweetness
14.bitter
15.caramel
16.golden
17.black
18.fruit
Enter the number for attribute 1:6
Enter the number for attribute 2:2
Enter the number for attribute 3:7


In [10]:
selected_attributes

['medium', 'carbonation', 'dark']

In [11]:
#cosine similarity analysis
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

clean_df = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))].copy(deep=True)

clean_reviews = [" ".join(preprocesstext(review)) for review in clean_df["product_review"]]

clean_df.loc[:, "product_review"] = clean_reviews

In [12]:
def calc_similarity(reviews,attributes):
    vectorizer=CountVectorizer(vocabulary=attributes,binary=True) 
    attr_vec=vectorizer.transform([" ".join(attributes)])
    reviews_vec=vectorizer.fit_transform(reviews)
    similarity_scores=cosine_similarity(reviews_vec,attr_vec)
    return similarity_scores

similarity_scores=(calc_similarity(clean_reviews,selected_attributes))

In [13]:
results_df=clean_df.copy()
results_df.drop(["Unnamed: 0","user_rating"],axis=1,inplace=True)
results_df["similarity_score"]=similarity_scores
results_df.head()

Unnamed: 0,product_name,product_review,similarity_score
0,Carlton Cold,well years smells abv place set rate never uri...,0.0
1,Carlton Cold,september 2008 abv colour aroma much home clea...,0.0
2,Carlton Cold,sake australia artificial complexity yellow lo...,0.816497
3,Carlton Cold,bitterness creating drinking filtered dry subt...,0.0
4,Carlton Cold,sake makes pops yellow colour fairly got macro...,0.57735


In [14]:
results_df.groupby("product_name")["similarity_score"].mean().sort_values(ascending=False)[:3] #top3 recommendations in terms of cosine similarity

product_name
Barrel-Aged Malevolence Chocolate Caliente    1.000000
Wide Awake It's Morning                       1.000000
Society & Solitude #2                         0.908248
Name: similarity_score, dtype: float64

In [16]:
#Sentiment Analysis

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#initialize the sentiment analyzer
analyser = SentimentIntensityAnalyzer()

In [17]:
#function for calculating sentiment score
def sentiment_scores(review):
    score = analyser.polarity_scores(review)['compound']
    return score

In [18]:
#performed on original data reviews
df_senti = df_reviews[df_reviews['product_review'].apply(lambda x: isinstance(x, str))].copy(deep=True)
df_senti["sentiment_score"] = df_senti["product_review"].apply(sentiment_scores)
df_senti.head()

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review,sentiment_score
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...,0.501
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...,0.8658
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...,-0.996
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...,0.7845
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...,0.7351


In [19]:
results_df["sentiment_score"] = df_senti["sentiment_score"]
results_df.describe()

Unnamed: 0,similarity_score,sentiment_score
count,7199.0,7199.0
mean,0.289641,0.609814
std,0.3525,0.465096
min,0.0,-0.996
25%,0.0,0.4404
50%,0.0,0.8199
75%,0.57735,0.942
max,1.0,0.9993


In [20]:
from sklearn.preprocessing import MinMaxScaler

results_df["sentiment_score_normalized"] = MinMaxScaler().fit_transform(np.array(results_df["sentiment_score"]).reshape(-1,1))
results_df.describe()

Unnamed: 0,similarity_score,sentiment_score,sentiment_score_normalized
count,7199.0,7199.0,7199.0
mean,0.289641,0.609814,0.804799
std,0.3525,0.465096,0.233096
min,0.0,-0.996,0.0
25%,0.0,0.4404,0.719892
50%,0.0,0.8199,0.910089
75%,0.57735,0.942,0.971283
max,1.0,0.9993,1.0


In [22]:
results_df["eval_score"] = results_df["similarity_score"]*results_df["sentiment_score_normalized"]
results_df.head()

Unnamed: 0,product_name,product_review,similarity_score,sentiment_score,sentiment_score_normalized,eval_score
0,Carlton Cold,well years smells abv place set rate never uri...,0.0,0.501,0.750263,0.0
1,Carlton Cold,september 2008 abv colour aroma much home clea...,0.0,0.8658,0.933093,0.0
2,Carlton Cold,sake australia artificial complexity yellow lo...,0.816497,-0.996,0.0,0.0
3,Carlton Cold,bitterness creating drinking filtered dry subt...,0.0,0.7845,0.892347,0.0
4,Carlton Cold,sake makes pops yellow colour fairly got macro...,0.57735,0.7351,0.867589,0.500903


In [23]:
results_df_top3 = results_df.sort_values(by='eval_score', ascending=False)
results_df_top3.head(3)

Unnamed: 0,product_name,product_review,similarity_score,sentiment_score,sentiment_score_normalized,eval_score
4665,Zonker Stout,brief backbone aromatic heart carafaroasted ca...,1.0,0.9991,0.9999,0.9999
5256,Terrible,tons long tripels colour much depth muted tiny...,1.0,0.998,0.999348,0.999348
2207,Ayinger Weizenbock,tap darker cloves cloudy second gum 22023 mild...,1.0,0.9969,0.998797,0.998797


### Association (Lift analysis)

Top 4 attributes from Task B are: light, carbonation, sweet and malt

In [6]:
df1=df_reviews.copy()

In [10]:
# removing strop words
df1["cleaned_review_wo_stopwords"] = df1["product_review"].astype(str).apply(lambda review: preprocesstext(review))

In [12]:
# Top 4 attributes
top_4_attributes=["light","carbonation","sweet","malt"]
top_4_attributes

['light', 'carbonation', 'sweet', 'malt']

In [16]:
# Top 10 beers with highest comments
top_10_beer = df_reviews.groupby('product_name')['product_name'].count().sort_values(ascending=False)[:10]
top_10_beer = top_10_beer.index.to_list()
top_10_beer

['Oktoberfest',
 'IPA',
 'Porter',
 'Tripel',
 'Pale Ale',
 'Péché Mortel',
 'Boatswain Double IPA (Twin Screw Steamer)',
 'Four O Street Legal Malt Liquor',
 'Camo Black Extra',
 'Cisk XS Extra Lager']

In [29]:
lift_db = df1.copy()
lift_db = lift_db[['product_name','product_review','cleaned_review_wo_stopwords']].apply(pd.Series.explode).set_index(['product_name','product_review']).reset_index().drop_duplicates().copy()      
lift_db

Unnamed: 0,product_name,product_review,cleaned_review_wo_stopwords
0,Carlton Cold,looks like beer smells and tastes like urine w...,ba
1,Carlton Cold,looks like beer smells and tastes like urine w...,tried
2,Carlton Cold,looks like beer smells and tastes like urine w...,members
3,Carlton Cold,looks like beer smells and tastes like urine w...,tastes
4,Carlton Cold,looks like beer smells and tastes like urine w...,rated
...,...,...,...
316799,Bar Fly,bottle at 2018 ris share. dark black pour smal...,ash
316800,Bar Fly,bottle at 2018 ris share. dark black pour smal...,ris
316801,Bar Fly,bottle at 2018 ris share. dark black pour smal...,2018
316802,Bar Fly,bottle at 2018 ris share. dark black pour smal...,aroma


In [30]:
def lift(n, a, b, ab):
    l = ((n*ab)/(a*b))
    return (l)

In [33]:
lift_values = pd.DataFrame(columns=['word_1','word_2','lift_val'])

for beer in top_10_beer:
    for attr in top_4_attributes:
        #Initialise lift to 0
        lift_db['beer'] = 0
        lift_db['attr'] = 0
        
        lift_db['beer'][lift_db['product_name'] == beer] = 1
        lift_db['attr'][lift_db['cleaned_review_wo_stopwords'] == attr] = 1
        
        c = lift_db.groupby(['product_name','product_review'])[['beer','attr']].sum().reset_index()   
        
        a = lift_db[lift_db['product_name']==beer]['product_review'].drop_duplicates().count()
        b = lift_db['attr'].sum()
        ab = c[(c['beer']>0) & (c['attr']==1)]['attr'].count()
        n = df1['product_review'].count()
        
        lift_val = lift(n, a, b, ab)
        
        lift_dict = {}
        lift_dict['word_1'] = beer
        lift_dict['word_2'] = attr
        lift_dict['lift_val'] = lift_val
        
        lift_values = lift_values.append(lift_dict, ignore_index=True)

In [34]:
lift_values

Unnamed: 0,word_1,word_2,lift_val
0,Oktoberfest,light,1.004166
1,Oktoberfest,carbonation,0.888263
2,Oktoberfest,sweet,1.790452
3,Oktoberfest,malt,1.561267
4,IPA,light,0.887013
5,IPA,carbonation,1.144255
6,IPA,sweet,0.629032
7,IPA,malt,1.451704
8,Porter,light,1.223467
9,Porter,carbonation,1.217533


In [35]:
similarity = pd.crosstab(lift_values['word_1'], lift_values['word_2'], lift_values['lift_val'], aggfunc=np.mean,rownames=['Beer'], colnames=['Attribute'])
similarity

Attribute,carbonation,light,malt,sweet
Beer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Boatswain Double IPA (Twin Screw Steamer),0.245198,0.443507,1.360972,0.808755
Camo Black Extra,0.490395,0.66526,1.088778,1.07834
Cisk XS Extra Lager,1.716383,1.33052,1.905361,2.695851
Four O Street Legal Malt Liquor,0.98079,1.108767,2.994139,2.695851
IPA,1.144255,0.887013,1.451704,0.629032
Oktoberfest,0.888263,1.004166,1.561267,1.790452
Pale Ale,0.923097,0.417418,1.537098,0.507454
Porter,1.217533,1.223467,1.20141,1.041156
Péché Mortel,1.384645,1.669672,1.537098,1.268636
Tripel,1.445375,0.746959,0.916866,1.135095
