In [5]:
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Read the review data from Comments.csv (path is relative to the working
# directory) and display the frame as the cell output.
# NOTE(review): the displayed output has 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, which look like index columns written into the CSV by earlier
# saves — confirm, and consider dropping them at load time.
df_reviews = pd.read_csv("Comments.csv")
df_reviews

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...
...,...,...,...,...
7207,7207,Sculpin,4.08,smells great with an exquisite finish. with a ...
7208,7208,Bar Fly,4.57,yesterdays meandering around through the bottl...
7209,7209,Bar Fly,4.29,the smoky aroma is stronger than it follows on...
7210,7210,Bar Fly,4.16,midnight black body topped with a dense creamy...


In [7]:
# Build the set of NLTK English stop words used by the text-cleaning step.
# The corpus is fetched on demand, so this cell also runs on a fresh
# environment where nltk.download('stopwords') has never been executed.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # NLTK raises LookupError when the 'stopwords' corpus is not installed.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def preprocesstext(text):
    """Reduce one review to a space-separated string of non-stop-word tokens.

    Every character that is neither a word character nor whitespace is
    deleted, the remainder is lower-cased and split into word tokens, and
    tokens present in the module-level ``stop_words`` set are discarded.

    Returns:
        The surviving tokens joined by single spaces, or an empty string
        when ``text`` is not a ``str``.
    """
    # Guard clause: anything that is not a string yields an empty review.
    if not isinstance(text, str):
        return ''

    without_punctuation = re.sub(r'[^\w\s]', '', text)
    tokens = re.findall(r'\b\w+\b', without_punctuation.lower())
    return ' '.join(filter(lambda token: token not in stop_words, tokens))

# Clean every review element-wise and store the result next to the raw text,
# then display the frame as the cell output.
df_reviews['review_processed'] = df_reviews['product_review'].map(preprocesstext)
df_reviews

Unnamed: 0.1,Unnamed: 0,product_name,user_rating,product_review,review_processed
0,0,Carlton Cold,2.01,looks like beer smells and tastes like urine w...,looks like beer smells tastes like urine metal...
1,1,Carlton Cold,2.23,september 2008: 375 ml clear bottle courtesy o...,september 2008 375 ml clear bottle courtesy ca...
2,2,Carlton Cold,1.02,i recently reviewed victoria bitter which i dr...,recently reviewed victoria bitter drank old ti...
3,3,Carlton Cold,2.95,carlton cold is filtered below zero degrees ce...,carlton cold filtered zero degrees celsius cre...
4,4,Carlton Cold,1.62,format: a standard clear 355ml bottle with tha...,format standard clear 355ml bottle old carlton...
...,...,...,...,...,...
7207,7207,Sculpin,4.08,smells great with an exquisite finish. with a ...,smells great exquisite finish citrus like smel...
7208,7208,Bar Fly,4.57,yesterdays meandering around through the bottl...,yesterdays meandering around bottle backlog to...
7209,7209,Bar Fly,4.29,the smoky aroma is stronger than it follows on...,smoky aroma stronger follows tongue damn good ...
7210,7210,Bar Fly,4.16,midnight black body topped with a dense creamy...,midnight black body topped dense creamy light ...


In [8]:
# Keep only the rows whose cleaned review is not the empty string.
df_reviews = df_reviews.loc[df_reviews['review_processed'].ne('')]

In [9]:
# Attribute terms that each cleaned review is compared against when the
# per-product similarity scores are computed.
selected_attributes = ['medium', 'carbonation', 'dark']

In [10]:
# Load the spaCy "en_core_web_md" pipeline; calculate_similarity uses it to
# build the docs whose similarity is measured.
nlp = spacy.load("en_core_web_md")

In [11]:
def calculate_similarity(attribute, review):
    """Return the spaCy similarity between an attribute term and a review.

    Both strings are passed through the module-level ``nlp`` pipeline
    (attribute first, then review) and the attribute doc's ``similarity``
    to the review doc is returned.
    """
    return nlp(attribute).similarity(nlp(review))

In [24]:
# For every product, compute the mean similarity between each selected
# attribute and that product's cleaned reviews.
#
# Attribute docs are built once up front, and each review is parsed once per
# product, instead of re-running the pipeline for every (attribute, review)
# pair. The scores are still summed in review order and divided by the
# review count, exactly as before.
attribute_docs = {attribute: nlp(attribute) for attribute in selected_attributes}

product_scores = {}
for product_name, group in df_reviews.groupby("product_name"):
    # nlp.pipe yields one Doc per input text, in input order.
    review_docs = list(nlp.pipe(group["review_processed"]))
    product_scores[product_name] = {
        attribute: sum(attribute_doc.similarity(review_doc) for review_doc in review_docs)
        / len(review_docs)
        for attribute, attribute_doc in attribute_docs.items()
    }


In [25]:
# Store the mean of each product's per-attribute scores under the 'avg' key.
# The 'avg' entry itself is excluded from the mean, so re-running this cell
# never folds a previously stored average back into the new one.
for product, attributes in product_scores.items():
    attribute_scores = [score for name, score in attributes.items() if name != 'avg']
    product_scores[product]['avg'] = sum(attribute_scores) / len(attribute_scores)

In [29]:
# Turn the nested {product: {score_name: value}} dict into a frame with one
# row per product, then display it ordered from highest to lowest 'avg'.
# The sort result is only displayed; df_word_vec itself keeps its original
# row order.
df_word_vec = pd.DataFrame.from_dict(product_scores, orient='index')
df_word_vec.sort_values(by='avg',ascending=False)

Unnamed: 0,medium,carbonation,dark,avg
Barrel-Aged Malevolence Chocolate Caliente,0.603454,0.515385,0.464895,0.527912
Yellow Bus,0.548023,0.439478,0.576942,0.521481
It Was All A Dream,0.562260,0.428910,0.571852,0.521007
Foggy Window,0.556993,0.485255,0.518662,0.520303
Stargazer,0.595000,0.434786,0.527469,0.519085
...,...,...,...,...
Dorothy (Wine Barrel Aged),0.203657,0.070065,0.303654,0.192459
Black Tuesday - Rum Barrel-Aged,0.235911,0.295057,-0.026037,0.168310
Gallo Draft,0.119539,0.063242,0.240805,0.141195
Speedway Stout - Vietnamese Coffee,0.100112,-0.030924,0.127651,0.065613
