In [0]:
## Installing required packages and drivers
!pip install -U -q spacy
!pip install -U -q spacy-lookups-data
!python -m spacy download en_core_web_sm
!python -m spacy download en

## General libraries
import sys
import spacy
nlp = spacy.load('en')
import pandas as pd
import numpy as np
import re
import string
import itertools
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

## Required for scraping
!pip install selenium
!apt install -yq chromium-chromedriver

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[K     |████████████████████████████████| 10.4MB 2.5MB/s 
[K     |████████████████████████████████| 122kB 51.1MB/s 
[K     |████████████████████████████████| 3.7MB 32.8MB/s 
[K     |████████████████████████████████| 2.2MB 29.4MB/s 
[K     |████████████████████████████████| 29.2MB 2.0MB/s 
[?25h  Building wheel for spacy-lookups-data (setup.py) ... [?25l[?25hdone
Collecting en_core_web_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0MB)
[K     |████████████████████████████████| 12.0MB 648kB/s 
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.5-cp36-none-any.whl size=12011741 sha256=9c0f88593745011c921213325b8fd6f7a55a00a821e2cb9023ad2e5855b681ab
  Stored in directory: /tmp/pip-ephem-wheel-cache-naz5_n38/wheels/6a/47/fb/6b5a0b8906d8e8779246c67d4658f

True

## Data Scraping

In [0]:
class Scraper():
  """Collect a numbered run of page elements into a DataFrame.

  Every value in ``xpaths`` is a template XPath. The single character at
  position ``split_index`` of each template is replaced by a running counter
  (starting at ``start``), so successive counters address successive
  rows/posts on the page. Scraping stops after the first counter value for
  which at least one element cannot be found.

  Parameters
  ----------
  driver : Selenium WebDriver (or any object offering ``get(url)`` and
      ``find_element(by, value)``).
  base_url : page to open before elements are looked up.
  xpaths : dict mapping output column name -> template XPath.
  split_index : position of the counter character inside every template.
  start : first counter value to try.
  attributes : optional dict mapping column name -> HTML attribute to read
      instead of the element text (e.g. ``{'link': 'href'}``).
  """

  def __init__(self, driver, base_url, xpaths, split_index, start = 0, attributes = None):
    ## Initializing class variables
    self.base_url = base_url
    self.driver = driver
    self.xpaths = xpaths
    self.split_index = split_index
    self.content = pd.DataFrame(columns=list(xpaths.keys()))
    self.flag = True
    self.start = start
    ## A fresh dict per instance: a shared mutable default would leak between scrapers
    self.attributes = {} if attributes is None else attributes

  ## Extracting relevant content from the page
  def get_content(self):
    """Open ``base_url`` and fill ``self.content`` row by row.

    Returns nothing; results accumulate in ``self.content``. ``self.flag`` is
    False once the loop has ended.
    """
    self.flag = True
    post = self.start
    index = 0
    self.driver.get(self.base_url)
    while self.flag:
      ## Use this instance's templates, not whatever global `xpaths` happens to hold
      for attr, template in self.xpaths.items():
        path = template[:self.split_index] + str(post) + template[self.split_index+1:]
        if attr in self.attributes:
          content = self.get_attr_if_exists(path, self.attributes[attr])
        else:
          content = self.get_content_if_exists(path)
        if content != '':
          self.content.at[index, attr] = content
      post += 1
      index += 1

  ## Shared lookup used by both public getters
  def _find_element(self, path):
    """Return the element at ``path``, or None (and clear ``self.flag``) if absent."""
    try:
      ## find_element(by, value) is available in Selenium 3 and 4 alike;
      ## "xpath" is the string value of By.XPATH.
      return self.driver.find_element("xpath", path)
    except NoSuchElementException:
      self.flag = False
      return None

  ## Check if the element is present and extract its text
  def get_content_if_exists(self, path):
    """Return the element's text, or '' when the element does not exist."""
    element = self._find_element(path)
    return '' if element is None else element.text

  ## Check if the element is present and extract one of its attributes
  def get_attr_if_exists(self, path, attribute):
    """Return the element's ``attribute`` value, or '' when the element does not exist."""
    element = self._find_element(path)
    return '' if element is None else element.get_attribute(attribute)

In [0]:
## Initial parameters setting
base_url_beer = "https://www.beeradvocate.com/beer/top-rated/"
driver = webdriver.Chrome('chromedriver', options=chrome_options)
xpaths = {'name':'/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div/div[3]/div/div/div[2]/table/tbody/tr[2]/td[2]/a/b',
          'description':'/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div/div[3]/div/div/div[2]/table/tbody/tr[2]/td[2]/span',
          'numberofratings':'/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div/div[3]/div/div/div[2]/table/tbody/tr[2]/td[3]/b',
          'rating':'/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div/div[3]/div/div/div[2]/table/tbody/tr[2]/td[4]/b',
          'link':'/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div/div[3]/div/div/div[2]/table/tbody/tr[2]/td[2]/a'}
attributes = {'link':'href'}

## One browser session serves the ranking page and every review page;
## try/finally guarantees it is shut down even if scraping fails part-way.
try:
  ## Creating scraper class object and scraping the ranking table.
  ## 88 is the position of the row-counter digit inside each template ("tr[2]"); rows start at 2.
  scraper = Scraper(driver,base_url_beer,xpaths,88,2,attributes)
  scraper.get_content()

  ## Storing scraped data; reset_index() exposes the row number as an 'index' column used as join key below
  beers = scraper.content
  beers = beers.reset_index()

  ## Getting reviews data: same template mechanism, counter digit is the review block number ("div[1]")
  xpaths = {'content':'/html/body/div[2]/div/div[2]/div[2]/div[2]/div/div/div[3]/div/div/div[2]/div[8]/div/div[1]/div[2]'}
  attributes = {}
  review_frames = []
  for beer in tqdm(range(beers.shape[0])):
    ## .loc replaces the removed .ix indexer; beers has a 0..n-1 index after reset_index()
    scraper = Scraper(driver,beers.loc[beer,'link'],xpaths,88,1,attributes)
    scraper.get_content()

    ## Storing scraped data, tagged with the beer it belongs to
    data = scraper.content
    data['index'] = beers.loc[beer,'index']
    review_frames.append(data)
finally:
  driver.quit()

## Concatenate once instead of growing a frame inside the loop
if review_frames:
  reviews = pd.concat(review_frames,axis=0)
else:
  reviews = pd.DataFrame(columns=['content','index'])

## The user's score is the number that immediately precedes the first '/'
reviews['user_rating'] = reviews['content'].apply(lambda x: re.findall(r'[0-9\.]*(?=\/)',x)[0])
## The comment is what remains after dropping the first blank-line-separated block and the last two
reviews['comment'] = reviews['content'].apply(lambda x: x.split('\n\n',1)[1].rsplit('\n\n',2)[0])

reviews = reviews[['index','comment','user_rating']]

data = pd.merge(reviews, beers, on = 'index', how = 'left')
data = data[['name','comment','user_rating']]
data.columns = ['product_name','product_review','user_rating']

## Delete punctuation, but turn newlines into spaces: deleting them glued the last word of
## one line to the first word of the next (e.g. "vintagepours", "headaroma").
clean_table = str.maketrans('\n', ' ', string.punctuation)
data['clean_review'] = data['product_review'].apply(lambda x: x.translate(clean_table).lower())

## Data Cleaning

In [0]:
## NOTE(review): no visible cell writes clean_data.csv — presumably it is a saved copy of the
## scraped `data` frame built in the scraping section; confirm before relying on a fresh run.
data = pd.read_csv("clean_data.csv")

In [0]:
class DataCleaning():
  """Token-level clean-up of one text column of a DataFrame.

  The column is tokenized once on construction; ``get_clean_data`` then runs
  the tokens through the normalisation steps and stores the re-joined text
  in a new ``new_review`` column of the same frame.
  """

  def __init__(self, data, message_column, stopwords):
    ## Keep references to the inputs and tokenize the text column up front
    self.data = data
    self.stopwords = stopwords
    self.clean_data = self.data[message_column].apply(word_tokenize)
    self.lemmatizer = WordNetLemmatizer()

  def get_clean_data(self):
    """Apply every cleaning step in order and write ``self.data['new_review']``."""
    ## Order matters: lowercase -> alphabetic only -> stop words -> short words -> lemmas
    pipeline = (
      self.convert_to_lower,
      self.remove_non_alphanumeric,
      self.remove_stopwords,
      self.remove_short_words,
      self.lemmatize_words,
    )
    tokens = self.clean_data
    for step in pipeline:
      tokens = tokens.apply(step)
    ## converting each token list back to a single space-separated string
    self.data['new_review'] = tokens.apply(lambda words: ' '.join(words))

  def convert_to_lower(self, row):
    """Lowercase every token and trim surrounding whitespace."""
    return [token.lower().strip() for token in row]

  def remove_non_alphanumeric(self, row):
    """Keep only tokens made up entirely of letters (``str.isalpha``)."""
    return [token for token in row if token.isalpha()]

  def remove_stopwords(self, row):
    """Drop tokens contained in the configured stop-word collection."""
    return [token for token in row if token not in self.stopwords]

  def remove_short_words(self, row):
    """Drop tokens of one or two characters."""
    return [token for token in row if len(token) > 2]

  def lemmatize_words(self, row):
    """Replace each token by the lemma returned by the lemmatizer."""
    lemmatize = self.lemmatizer.lemmatize
    return [lemmatize(token) for token in row]

In [0]:
## English stop words as a set for fast membership checks
stop_words = set(stopwords.words('english'))

## Tokenize and normalise the 'clean_review' column; the result lands in 'new_review'
cleaning = DataCleaning(data,'clean_review',stop_words)
cleaning.get_clean_data()

## Keep only the columns used downstream; the cleaned text takes over the 'product_review' name
data = (
    cleaning.data[['product_name','user_rating','new_review']]
    .rename(columns={'new_review':'product_review'})
)

data.head()

Unnamed: 0,product_name,user_rating,product_review
0,Kentucky Brunch Brand Stout,4.8,silver wax aroma whiskey maple toffee umami ta...
1,Kentucky Brunch Brand Stout,4.74,beer pours pitch black frothy tan head bottle ...
2,Kentucky Brunch Brand Stout,4.68,probably smoothest beer ever smelled better ta...
3,Kentucky Brunch Brand Stout,5.0,dark black thick little bit tan head smell ama...
4,Kentucky Brunch Brand Stout,4.97,poured black ink thin ruby edge degree fast fi...


## Beer Attributes

In [0]:
## Flavour descriptors the user can choose from; the entry at position i is selected by typing i+1 at the prompt
all_attributes = ['aggressive', 'balanced', 'complex', 'diacetyl', 'estery', 'floral', 'fruity', 'hoppy', 'malty', 'roasty', 'robust']

In [0]:
## Build the numbered menu from all_attributes so the numbering cannot drift from the list
menu = '\n'.join('{}. {}'.format(number, name.capitalize()) for number, name in enumerate(all_attributes, 1))
print("Select three things you want your beer to be from the list below:\n" + menu + "\n\nIf you want your beer to be Aggressive, Balanced, and Roasty, type in 1 2 10.\n")

## Accept numbers separated by any amount of whitespace and/or commas
choices = input().replace(',', ' ').split()
if not choices:
  raise ValueError("Please enter at least one number from the list.")

## attr holds zero-based positions into all_attributes; reject anything outside 1..len
## (previously 0 silently selected the last attribute via index -1)
attr = []
for choice in choices:
  if not choice.isdigit() or not 1 <= int(choice) <= len(all_attributes):
    raise ValueError("'{}' is not a number between 1 and {}.".format(choice, len(all_attributes)))
  attr.append(int(choice) - 1)

attributes = [all_attributes[i] for i in attr]
print("\nThanks for your input.\n\nYou chose {}.".format(','.join(attributes)))

Select three things you want your beer to be from the list below:
1 Agressive
2. Balanced
3. Complex
4. Diacetyl
5. Estery
6. Floral
7. Fruity
8. Hoppy
9. Malty
10. Roasty
11. Robust

If you want your beer to be Aggressive, Balanced, and Roasty, type in 1 2 10.

6 7 9

Thanks for your input.

You chose floral,fruity,malty.


## Similarity and Sentiment of Reviews

In [0]:
class Similarity():
  """Score each review against a list of attribute words and by sentiment.

  Parameters
  ----------
  data : DataFrame holding the reviews; result columns are written into it.
  attributes : list of attribute words the reviews are compared against.
  column : name of the text column in ``data``.

  Relies on the module-level ``nlp`` pipeline, ``tqdm`` and
  ``SentimentIntensityAnalyzer`` being available in the notebook namespace.
  """

  def __init__(self, data, attributes, column):
    self.data = data
    self.attributes = attributes
    self.column = column

  def get_similarity(self):
    """Write ``self.data['similarity']``: mean similarity of each review to the attributes.

    Raises
    ------
    ValueError
        If no attributes were supplied (the mean would be undefined).
    """
    if not self.attributes:
      raise ValueError("At least one attribute is required to compute similarity.")
    ## The attribute documents do not depend on the review: parse them once, not once per row
    attribute_docs = [nlp(attr) for attr in self.attributes]
    scores = []
    ## Iterate over the values so the frame's index labels do not have to be 0..n-1
    for text in tqdm(self.data[self.column]):
      review_doc = nlp(text)
      total = 0
      for attribute_doc in attribute_docs:
        total += attribute_doc.similarity(review_doc)
      scores.append(total/len(attribute_docs))
    self.data['similarity'] = scores

  def get_sentiment(self):
    """Write ``self.data['sentiment']``: the analyzer's 'compound' score per review."""
    sentiment_analyzer = SentimentIntensityAnalyzer()
    scores = []
    for text in tqdm(self.data[self.column]):
      sentiment_all = sentiment_analyzer.polarity_scores(text)
      scores.append(float(sentiment_all['compound']))
    self.data['sentiment'] = scores

In [0]:
## Score every review against the chosen attributes, then add a VADER sentiment score
similarity = Similarity(data, attributes, 'product_review')
similarity.get_similarity()
similarity.get_sentiment()

## Both methods write their columns into the frame that was passed in, so this is the same object as `data`
data_similarity = similarity.data

100%|██████████| 6210/6210 [09:09<00:00, 11.30it/s]
100%|██████████| 6210/6210 [00:04<00:00, 1295.27it/s]


In [0]:
data_similarity.head()

Unnamed: 0,product_name,user_rating,product_review,similarity,sentiment
0,Kentucky Brunch Brand Stout,4.8,silver wax aroma whiskey maple toffee umami ta...,0.48211,-0.4215
1,Kentucky Brunch Brand Stout,4.74,beer pours pitch black frothy tan head bottle ...,0.47995,0.2263
2,Kentucky Brunch Brand Stout,4.68,probably smoothest beer ever smelled better ta...,0.430278,0.836
3,Kentucky Brunch Brand Stout,5.0,dark black thick little bit tan head smell ama...,0.456915,0.7972
4,Kentucky Brunch Brand Stout,4.97,poured black ink thin ruby edge degree fast fi...,0.459284,0.9781


## Top 300 Reviews

In [0]:
## The 300 reviews most similar to the chosen attributes.
## .copy() detaches the slice returned by head() so that later column assignments on
## top_300 act on an independent frame instead of a possible view.
top_300 = data_similarity.sort_values('similarity',ascending=False).head(300).copy()

top_300.head()

Unnamed: 0,product_name,user_rating,product_review,similarity,sentiment
1635,Fourth Dementia - Bourbon Barrel-Aged,4.75,smell taste like sweet raisin overall awesome ...,0.551946,0.9559
249,King JJJuliusss,5.0,old rating june amazing mouthfeel juice explos...,0.524934,0.8402
1696,Nectarine Premiere,5.0,draft site gardel orange like hazy style ipa m...,0.524792,0.9475
1339,Lou Pepe - Framboise,4.68,vintagepours deep red little headaroma huge ra...,0.523578,0.4445
2173,Alter Ego,4.49,trillium tree house sharp fresh would try pref...,0.522427,0.5859


## Sentiments of Top 300 Reviews

In [0]:
## Cast both score columns to float in one step, then show the most positive reviews
top_300 = top_300.astype({'user_rating':'float','sentiment':'float'})

top_300.sort_values(by='sentiment',ascending=False).head()

Unnamed: 0,product_name,user_rating,product_review,similarity,sentiment
1250,Focal Banger,4.73,poured leo ursus glassa really pretty transluc...,0.487926,0.9928
4568,Sosus,4.5,great dipa topplong goliath one great hope gre...,0.509049,0.9926
1556,Beatification,4.72,poured beatification new year got love cork po...,0.501652,0.9904
4695,Matt,4.7,long time want thanks barrythebear vintage bot...,0.508008,0.9884
1653,Doppelganger,4.4,first attempted shiftdaced beer revieew renove...,0.488293,0.9858


## Recommendations on the basis of similarity and sentiment

In [0]:
## Per-beer averages of rating, similarity and sentiment over the top-300 reviews
score_columns = ['user_rating','similarity','sentiment']
data_recommend = top_300.groupby('product_name')[score_columns].mean()

In [0]:
## Rank beers by mean sentiment; similarity only breaks ties between equal sentiment values.
## NOTE(review): the averages are taken over the top-300 reviews only, so a beer can be ranked
## on very few reviews — consider also reporting the review count per beer.
data_recommend.sort_values(['sentiment','similarity'],ascending=False).head(3)

Unnamed: 0_level_0,user_rating,similarity,sentiment
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Focal Banger,4.73,0.487926,0.9928
Double Dry Hopped Fort Point Pale Ale,4.34,0.503068,0.9846
JJJuliusss,4.77,0.493087,0.9826


## Recommendations on the basis of ratings

In [0]:
## Make sure both score columns are float on the full review frame (updated in place)
for score_column in ('user_rating', 'sentiment'):
    data_similarity[score_column] = data_similarity[score_column].astype('float')

## Per-beer averages over all reviews, ignoring the user's attribute preferences
data_recommend_without = (
    data_similarity
    .groupby('product_name')[['user_rating','similarity','sentiment']]
    .mean()
)

## Three beers with the highest mean user rating
data_recommend_without_3 = data_recommend_without.sort_values(by=['user_rating'],ascending=False).head(3)
data_recommend_without_3

Unnamed: 0_level_0,user_rating,similarity,sentiment
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kentucky Brunch Brand Stout,4.812,0.447277,0.800576
Chemtrailmix,4.811176,0.447881,0.849235
Barrel-Aged Abraxas,4.7964,0.443101,0.816876


## Task E and Task F Comparison

#### Beers recommended solely on the basis of user ratings are not ideal recommendations for everyone. Some people might like to drink the most popular and highly rated beer. However, when it comes to recommending a product like beer, people tend to have different tastes and preferences based on different attributes of the beer. People who have a taste for hoppy beers might like Witbier, while people who want aggressive beers might like IPA.

#### This difference can be seen in the general recommendations of Kentucky Brunch Brand Stout, Chemtrailmix, and Barrel-Aged Abraxas versus the personalized recommendations of Focal Banger, Double Dry Hopped Fort Point Pale Ale, and JJJuliusss. The general recommendations would, most of the time, not meet the specific preferences of a person. A person who wants their beer to be floral, fruity, and malty in taste and texture would likely not be satisfied by the general recommendations, which are far from having these properties.