<h3>Scrapping ratings from beeradvocate.com<h3>

In [1]:
#import necessary modules
import urllib2
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import pickle
import bs4
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

This code covers steps I took in creating Beer Maven.  Beer Maven is a beer recommendation system that allows users to search for beers and add them to a custom beer list.  This list is then sorted in the order a user will enjoy the beers.

In [2]:
#First go to the top 250 users webpage on beer advocate and parse it into a bs4 object
res = requests.get('https://www.beeradvocate.com/members/?sort=beers')
Soup = bs4.BeautifulSoup(res.text,'lxml')

In [3]:
#get the url's for each individual user
urls = [link.get('href') for link in Soup.find_all(class_='username')]
urls[0:10]

['/community/members/stonedtrippin.601042/',
 '/community/members/stonedtrippin.601042/',
 '/community/members/uclabrewn84.439438/',
 '/community/members/uclabrewn84.439438/',
 '/community/members/sammy.3853/',
 '/community/members/sammy.3853/',
 '/community/members/biboergosum.168458/',
 '/community/members/biboergosum.168458/',
 '/community/members/beerchitect.14442/',
 '/community/members/beerchitect.14442/']

In [4]:
#get number of reviews for each user and remove extra junk
numb = []
for n in Soup.find_all(class_='username'):
    numb.extend(n.find_next().find_next().contents)

numb = [int(n.replace(',','')) for n in numb[:250]]
numb

[10173,
 8767,
 7737,
 7193,
 7146,
 6852,
 6581,
 6450,
 6281,
 5810,
 5690,
 5620,
 5315,
 5279,
 5159,
 5158,
 5058,
 4912,
 4866,
 4796,
 4789,
 4662,
 4622,
 4290,
 4256,
 4174,
 4021,
 3947,
 3931,
 3886,
 3852,
 3846,
 3830,
 3819,
 3774,
 3772,
 3722,
 3718,
 3715,
 3712,
 3647,
 3606,
 3594,
 3590,
 3543,
 3517,
 3398,
 3388,
 3313,
 3297,
 3293,
 3284,
 3212,
 3181,
 3155,
 3153,
 3145,
 3117,
 3116,
 3102,
 3099,
 3038,
 2983,
 2885,
 2863,
 2852,
 2807,
 2750,
 2703,
 2666,
 2638,
 2635,
 2635,
 2629,
 2629,
 2591,
 2586,
 2559,
 2521,
 2502,
 2500,
 2494,
 2492,
 2486,
 2465,
 2465,
 2463,
 2455,
 2454,
 2416,
 2369,
 2340,
 2330,
 2330,
 2324,
 2289,
 2266,
 2235,
 2232,
 2200,
 2197,
 2179,
 2164,
 2131,
 2094,
 2092,
 2090,
 2078,
 2063,
 2053,
 2051,
 2049,
 2048,
 2044,
 2039,
 2039,
 2032,
 2000,
 1996,
 1996,
 1994,
 1993,
 1955,
 1953,
 1947,
 1946,
 1945,
 1939,
 1927,
 1906,
 1900,
 1887,
 1864,
 1845,
 1842,
 1839,
 1830,
 1820,
 1819,
 1819,
 1818,
 1815,
 1808

In [5]:
#remove duplicates and urls grab erroneously
urls = [n for n in urls if n[0]=='/']
urls = urls[::2]
urls[0:10]

['/community/members/stonedtrippin.601042/',
 '/community/members/uclabrewn84.439438/',
 '/community/members/sammy.3853/',
 '/community/members/biboergosum.168458/',
 '/community/members/beerchitect.14442/',
 '/community/members/metter98.95017/',
 '/community/members/brentk56.6284/',
 '/community/members/phyl21ca.2335/',
 '/community/members/superspak.456300/',
 '/community/members/nerofiddled.526/']

In [6]:
#extract usernames from urls
users = [r.replace('/community/members/','') for r in urls]
users = [r.rsplit('.',1)[0] for r in users]
users[0:10]

['stonedtrippin',
 'uclabrewn84',
 'sammy',
 'biboergosum',
 'beerchitect',
 'metter98',
 'brentk56',
 'phyl21ca',
 'superspak',
 'nerofiddled']

In [7]:
#create beer review url with username to find all beer rated by this user
user_links = ['https://www.beeradvocate.com/user/beers/?ba='+ k for k in users]
user_links[0:10]

['https://www.beeradvocate.com/user/beers/?ba=stonedtrippin',
 'https://www.beeradvocate.com/user/beers/?ba=uclabrewn84',
 'https://www.beeradvocate.com/user/beers/?ba=sammy',
 'https://www.beeradvocate.com/user/beers/?ba=biboergosum',
 'https://www.beeradvocate.com/user/beers/?ba=beerchitect',
 'https://www.beeradvocate.com/user/beers/?ba=metter98',
 'https://www.beeradvocate.com/user/beers/?ba=brentk56',
 'https://www.beeradvocate.com/user/beers/?ba=phyl21ca',
 'https://www.beeradvocate.com/user/beers/?ba=superspak',
 'https://www.beeradvocate.com/user/beers/?ba=nerofiddled']

In [8]:
#add starting page number to access each page of reviews.  calculate that using total number of review/50 (50 per page)
page_links = []
for n in range(len(users)):
    for k in range(0,numb[n],50):
        page_links.append(user_links[n]+'&&start='+str(k))
        
page_links[0:10]

['https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=0',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=50',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=100',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=150',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=200',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=250',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=300',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=350',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=400',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=450']

In [9]:
#initialize empty dataframe
df = pd.DataFrame(columns = ['username','beer','rating','rDev'])
ind = -1
for link in range(len(page_links)):
    
    #This part counts when page url's restart at 0 aka when we've moved on to the next user
    if page_links[link][-2:] == '=0':
        ind = ind + 1
    try:
        #Parse each webpage into bs4
        rest = requests.get(page_links[link])
        souper = bs4.BeautifulSoup(rest.text,'lxml')
        
        #Get rating and rDev and add in username
        stuff = souper.find_all('b')[5:]
        beer_names = np.array([k.contents[0] for k in stuff[:-1:2]])
        ratings = np.array([p.contents[0] for p in stuff[1::2]])
        rDev = np.array([r.find_next().text for r in stuff[1::2]])
        col1 = np.repeat(users[ind],len(ratings))
        
        #Get beer and brewery names
        arc = souper.find_all('a')
        duck = np.array([arc[n].contents for n in range(len(arc)) if str(arc[n].get('href'))[0:14]=='/beer/profile/'][1::2])
        
        #Add to the empty dataframe
        mat = pd.DataFrame(np.column_stack([col1,beer_names,duck,ratings,rDev]),columns = ['username','beer','brewery','rating','rDev'])
        df = pd.concat([df,mat],ignore_index=True)
    except:
        pass

In [10]:
df.head()

Unnamed: 0,username,beer,rating,rDev


In [11]:
#basic data cleaning and saving to pkl file
import sys
sys.setrecursionlimit(10000)
df.loc[:,'rating'] = pd.to_numeric(df.rating)
df.loc[:,'rDev'] = df.loc[:,'rDev'].str.replace('%','').str.replace('+','')
df.loc[:,'rDev'] = pd.to_numeric(df.rDev)
df.drop(104081, inplace=True)
df.drop(103930, inplace=True)
df = pickle.dump(df,open("Beer_Rating_Dataset.pkl","wb"))

ValueError: labels [104081] not contained in axis

In [None]:
#Assign each unique beer, brewery combination an id number
df = df.assign(beer_id=(df['brewery'] + '_' + df['beer']).astype('category').cat.codes)

<h3>Scraping Beer Types<h3>

In [None]:
#Go to beer styles page
res = requests.get('https://www.beeradvocate.com/beer/style/')
Soup = bs4.BeautifulSoup(res.text,'lxml')

In [None]:
#get urls
urls = [link.get('href') for link in Soup.find_all('a')]
urls[0:10]

In [None]:
#throw away bad urls
good = [x for x in urls if type(x)==str]
good[0:10]

In [None]:
#filter other bad urls
better = [x for x in good if x[6:11]=='style']
better[0:10]

In [None]:
#get ride of bad links at the start and end of list
bet = better[2:-2]
bet[0:10]

In [None]:
#Get number of reviews.  The number is listed several ways and so the loop needs a way to catch all of them.
numbs = []
for r in range(len(bet)):
    try:
        res = requests.get(bet[r])
        ba = bs4.BeautifulSoup(res.text,'lxml')
        if ba.find_all('b')[3].contents[0][0:5]=='Style':
            numb = ba.find_all('b')[3].contents[0].split('of ')[1].split(') -')[0]
            numbs.append(int(numb))
        else:
            numb = ba.find_all('b')[2].contents[0].split('of ')[1].split(') -')[0]
            numbs.append(int(numb))
    except:
        if ba.find_all('b')[4].contents[0][0:5]=='Style':
                numb = ba.find_all('b')[4].contents[0].split('of ')[1].split(') -')[0]
                numbs.append(int(numb))

In [None]:
#add page number to end of urls
page_links = []
for n in range(len(bet)):
    for k in range(0,numbs[n],50):
        page_links.append(bet[n]+str(k))
        
page_links[0:10]

In [None]:
#get beer styles and names
dg = pd.DataFrame(columns=['brewery','beer','type'])
for link in range(len(page_links)):
        try:
            ttt = []
            grr = []
            print (float(link)/float(4000))*100
            rest = requests.get(page_links[link])
            souper = bs4.BeautifulSoup(rest.text,'lxml')
            category = souper.find_all('h1')[0].contents[0]
            beers = souper.find_all('b')[5:-1:3]
            beers = [x.contents[0] for x in beers]
            dab = [ba.find_all('a')[x].contents for x in range(len(ba.find_all('a')))]
            for x in dab:
                try:
                    if type(x[0]) == bs4.element.NavigableString:
                        grr.append(x[0])
                        ttt = grr[124:-13]
                except:
                    pass
            place = np.column_stack([ttt, beers])
            dh = pd.DataFrame(place,columns = ['brewery', 'beer'])
            dh.loc[:,'type'] = category
            dg = pd.concat([dg,dh],ignore_index=True)
        except:
            print page_links[link]

<h3>Exploratory Data Analysis<h3>

In [None]:
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv('Beer_Dataset')
df.head()

In [None]:
df.rating.hist()
plt.show()
print 'The mean of ratings is {} and the standard deviation is {}.'.format(df.rating.mean(),df.rating.std())

Ratings between 2.5-4.5 are very frequent.  The 5 point rating scale is compressed in this way, such that atrocious beers like Budweiser and Coors get about 2.5 average ratings.

In [None]:
# The best beers with a minimum of 10 ratings.  Beer with 1/2 reviews sometimes get perfect 5's but this is unreliable
multi = mult[mult>=10].index
df[df.beer_id.isin(multi)].groupby(['beer_id','brewery','beer']).mean().sort_values('rating',ascending=False).head(10)

In [None]:
# The worst beers with a minimum of 10 ratings
df[df.beer_id.isin(multi)].groupby(['beer_id','brewery','beer']).mean().sort_values('rating').head(10)

In [None]:
#average ratings for beer group
df.groupby('group').mean().rating

In [None]:
df.groupby('group').rating.mean().plot(kind='bar')
plt.show()

In [None]:
#standard deviation of groups.  Higher standard deviation means a more controversial beer
df.groupby('group').rating.std().plot(kind='bar')
plt.show()

In [None]:
#rDev is rating deviation.  It represents how far of the website wide average rating a user was.  Here it represents how
#top 250 reviewers rated the beer in comparison to every user.
df.groupby('group').rDev.mean().plot(kind='bar')
plt.show()

In [None]:
#need more stuff here

<h3>Assigning Beer Groups<h3>

In [None]:
import surprise
#load rating datafram
df = pickle.load(open('Beer_Rating_Dataset.pkl','rb'))
df.head()

In [None]:
#load beer styles
dg = pickle.load(open('beer_types_final.pkl','rb'))
dg.head()

In [None]:
#find beers with more than one style listed
recur = dg.beer.value_counts().values > 1
tulips = dg.beer.value_counts()[recur].index

In [None]:
#Assign the most common beer style listed to beer with more than one style.  If there is a tie the first alphabetically is picked
for n in tulips:
    foam = dg[dg.beer==n].type
    dg.loc[n,'type'] = foam.value_counts().index[0]

In [None]:
#merge dataframes
dz = df.merge(dg,on='beer')

For lack of a better option, groups were assigned by me for the purpose learning user preference.  In the future I would love to have beer experts create their own groups and average them.  I tried to do this in a post on beeradvocate.com but got banned.

In [None]:
IPA = ['American IPA','American Pale Ale (APA)','American Double / Imperial IPA','English India Pale Ale (IPA)',
       'Belgian IPA','English Pale Ale','Belgian Pale Ale','American Black Ale']

In [None]:
DarkAles = ['American Amber / Red Ale', 'American Brown Ale','Belgian Dark Ale','English Brown Ale',
            'Irish Red Ale','Flanders Red Ale','Altbier',u'Bière de Garde','Scottish Ale','Dubbel',
            'Quadrupel (Quad)','Winter Warmer']

In [None]:
PorterStouts = ['American Porter','American Stout','American Double / Imperial Stout','English Porter','English Stout',
                'Milk / Sweet Stout','Irish Dry Stout','Foreign / Export Stout','Oatmeal Stout','Russian Imperial Stout',
                'Baltic Porter']

In [None]:
Wheat = ['American Dark Wheat Ale','American Pale Wheat Ale','American Blonde Ale','Witbier','English Pale Mild Ale',
         'English Dark Mild Ale','Berliner Weissbier','Dunkelweizen','Gose',u'Kölsch','Kölsch','Hefeweizen','Kristalweizen',
         'Weizenbock']

In [None]:
Strong = ['American Barleywine','Wheatwine','English Strong Ale','English Barleywine','Braggot','Belgian Strong Dark Ale',
          'Belgian Strong Pale Ale','American Strong Ale','Euro Strong Lager','American Malt Liquor','Scotch Ale / Wee Heavy',
         u'Bière de Champagne / Bière Bru','Tripel',u'Bière de Champagne / Bière Brut','Old Ale']

In [None]:
LightLager = ['American Adjunct Lager','American Double / Imperial Pilsner','Light Lager','Czech Pilsener',
             'Euro Pale Lager','Dortmunder / Export Lager','German Pilsener','Kellerbier / Zwickelbier','Munich Helles Lager',
             'Happoshu','Japanese Rice Lager']

In [None]:
DarkLager = ['Bock','Doppelbock','Eisbock','California Common / Steam Beer','American Amber / Red Lager','American Pale Lager',
            'Euro Dark Lager','Maibock / Helles Bock', u'Märzen / Oktoberfest','Munich Dunkel Lager','Rauchbier','Schwarzbier',
             'Vienna Lager']

In [None]:
Fruity = ['Fruit / Vegetable Beer','Herbed / Spiced Beer','Smoked Beer','American Wild Ale','Lambic - Fruit',
        'Saison / Farmhouse Ale','Cream Ale','Pumpkin Ale','Chile Beer','English Bitter','Extra Special / Strong Bitter (ESB)',
          'Kvass','Scottish Gruit / Ancient Herbed Ale','Faro','Flanders Oud Bruin','Gueuze','Lambic - Unblended','Black & Tan',
         'Roggenbier','Sahti']

In [None]:
#assign groups
for x in IPA:
    dz.loc[dz.type==x,'group'] = 'IPA/PA'
for x in DarkAles:
    dz.loc[dz.type==x,'group'] = 'Dark Ale'
for x in PorterStouts:
    dz.loc[dz.type==x,'group'] = 'Porter/Stout'
for x in Wheat:
    dz.loc[dz.type==x,'group'] = 'Wheat'
for x in Strong:
    dz.loc[dz.type==x,'group'] = 'High ABV'
for x in DarkLager:
    dz.loc[dz.type==x,'group'] = 'Dark Lager'
for x in LightLager:
    dz.loc[dz.type==x,'group'] = 'Pale Lager'
for x in Fruity:
    dz.loc[dz.type==x,'group'] = 'Fruity/Flavored'
dz.loc[dz.type=='Low Alcohol Beer','group'] = 'Non-Alcoholic'

In [None]:
dz.group.value_counts().plot(kind='bar')
plt.show()

In [None]:
#fix weird cases
dz.loc[dz.beer.str.contains('IPA')&dz.group.isnull(),'group'] = 'IPA/PA'
dz.loc[dz.beer.str.contains('Pale')&dz.group.isnull(),'group'] = 'IPA/PA'
dz.loc[dz.beer == 'Ryeday The 13th','group'] = 'High ABV'
dz.loc[(dz.type == 'Rye Beer')&dz.group.isnull(),'group'] = 'Fruity/Flavored'

<h3>Model Selection<h3>

Now to fit a model to the ratings data.  I used a recommendation module called Surprise.  I tested every algorith Surprise offers but will only include code for SVD and KNNBaseline here for brevity.

In [None]:
from surprise import SVD
from surprise import KNNBaseline as KNNB
import surprise
from surprise import Dataset
from surprise import evaluate
from surprise import Reader
from surprise import GridSearch
from surprise import Reader

In [None]:
dx = pd.read_csv('Beer_Dataset',encoding='utf-8')

In [None]:
reader = Reader(rating_scale=(1, 5))
jaja = Dataset.load_from_df(dx[['username','beer_id','rating']], reader)
jaja.split()
algo = SVD()
algo2 = KNNB()

In [None]:
#call the evaluate method to see how well the untuned SVD algorithm performs
perf = evaluate(algo, jaja, measures=['RMSE', 'MAE'])

In [None]:
#and the same for the KNNBaseline algorithm
perf2 = evaluate(algo2, jaja, measures=['RMSE','MAE'])

In [None]:
#KNNBaseline performs the best so now to tune the model with GridSearch
params = {'k':[10,20,30,40,50]}
Gs = GridSearch(KNNB,params)
Gs.evaluate(jaja)

<h3>Adding User Preferences<h3>

The cold start problem is a major issue for beer Maven.  Initially my goal was for users to rate a 10 or so common beers so that their rating could be added to the dataframe and predicted on. Here is a function I created to do this

In [None]:
def StartProfile():
    ratings = []
    beers = ['Blue Moon Belgian White','Negra Modelo','Corona Extra','Guinness Draught','Brown Shugga\'','Fat Tire Amber Ale','Newcastle Brown Ale','Budweiser','Arrogant Bastard','Sculpin IPA']
    brewery = ['Coors Brewing Company','Grupo Modelo S.A. de C.V.','Grupo Modelo S.A. de C.V.','Guinnness Ltd.','Lagunitas Brewing Company','New Belgium Brewing','Heineken Nederland B.V.','Anheuser_Busch','Arrogant Brewing','Ballast Point Brewing Company']
    beer_id = [33753,52675,52662,52740,64504,76970,55151,5746,6533,8579]
    beer_id = [int(x) for x in beer_id]
    user = raw_input('Please enter your username:')
    print 'Please rate the following beers (1-5).  If you have not tried a beer, respond with 0'
    ratings.append(float(raw_input('Blue Moon')))
    ratings.append(float(raw_input('Negra Modelo')))
    ratings.append(float(raw_input('Corona Extra')))
    ratings.append(float(raw_input('Guinness Draught')))
    ratings.append(float(raw_input('Lagunitas Brown Shugga\'')))
    ratings.append(float(raw_input('Fat Tire Amber Ale')))
    ratings.append(float(raw_input('Newcastle Brown Ale')))
    ratings.append(float(raw_input('Budweiser')))
    ratings.append(float(raw_input('Arrogant Bastard Ale')))
    ratings.append(float(raw_input('Sculpin IPA')))
    
    length = len(df)
    for n in range(len(ratings)):
        avg = df[df.beer_id==beer_id[n]].mean()[1]
        length += 1
        # Users are assigned the average rating if they have not tried a beer.
        if ratings[n] == 0:
            df.loc[length,'username'] = user
            df.loc[length,'brewery'] = brewery[n]
            df.loc[length,'beer'] = beers[n]
            df.loc[length,'rating'] = avg
            df.loc[length,'beer_id'] = beer_id[n]
            df.loc[length,'rDev'] = 0
        else:
            df.loc[length,'username'] = user
            df.loc[length,'brewery'] = brewery[n]
            df.loc[length,'beer'] = beers[n]
            df.loc[length,'rating'] = ratings[n]
            df.loc[length,'beer_id'] = beer_id[n]
            df.loc[length,'rDev'] = ((ratings[n]-avg)/avg)*100

I toyed with the number of beer and which beers to use, but I found that no matter what, the recommendation system was too insensitive to these ratings.  Recommendations were being made according to the mean rating of beers of the dataset and did not reflect user preferences at all.

Typically beer preferences are communicated by the varieties of beer a person likes.  For example, many people love hoppy IPA's and Pale Ales.  Others like more drinkable wheat beers or dislike the heavy taste of a Porter or Stout. This motivated me to try and solve my cold start problem by asking users to rate types of beer.  This is the reason I scrapped data on beer styles and subsequently binned these styles into 9 larger groups.  Users ratings of these beer groups are used to modify the dataframe, and then the modified dataframe is predicted on by the recommendation system.

In [None]:
#This function is an early mock-up.  This task is handled by the plotly dash app instead
def Create_Pref():
    '''Run to create a preference vector to be used with Add_User_Preference function'''
    groups = ['IPAs and Pale Ales','Porters and Stouts','Fruity and Flavored Beers','Dark Ales','Wheat Beers',
              'High Alcohol Content','Pale Lager','Dark Lager','Non-Alcoholic']
    prefs = []
    
    print 'Rate the following beers groups according the following scale:'
    print '1: Strongly Dislike   2: Somewhat Dislike   3: Neutral   4: Somewhat Like  5: Love'
    for x in groups:
        prefs.append(int(raw_input(x)))
        
    return prefs

In [None]:
#This function is used in the plotly dash app to modify the dataframe
def Add_User_Preference(pref_vect, df):
    '''Takes a preference vector and the beer rating dataframe and return a modified dataframe to be predicted on'''
    data = df.copy()
    
    #get group names
    grouped = data.group.value_counts().index.values
    
    for x in range(len(grouped)-1):
        
        #get the original scores for all beers of given group
        score = data[data.group==grouped[x]].rating
        
        #If group has a rating of 1 subtract .5 rating from every beer in the group
        if pref_vect[x] == 1:
            data.loc[data.group==grouped[x],'rating'] = score - .5
        
        #If group has rating of 2 subtract .25
        if pref_vect[x] == 2:
            data.loc[data.group==grouped[x],'rating'] = score - .25
        
        #If group has rating of 4 add .25
        if pref_vect[x] == 4:
            data.loc[data.group==grouped[x],'rating'] = score + .25
        
        #If group has rating of 5 add .5
        if pref_vect[x] == 5:
            data.loc[data.group==grouped[x],'rating'] = score + .5
        data.loc[data.rating > 5,'rating'] = 5
    
    return data

I've tinkered with the + and - values for ratings of 1,2,4, and 5 and found adjustments of .25 and .5 to work best.  However, this needs to be tested further and optimized.

In [None]:
#now train the model on the modified dataframe
dr = Add_User_Preference()
data = Dataset.load_from_df(dr[['username','beer_id','rating']], reader)
#data.split()
trainset = data.build_full_trainset()
algo2.train(trainset)
#testset = trainset.build_anti_testet()
#predictions = algo.test(testset)

There is an issue with the build_anti_testset method in the Surprise module.  The method runs for 30 minutes before exceeding the my RAM and crashing.  This makes little sense as the build_full_testset method works totally fine and runs in under a minute.  Fortunately the evaluate method used earlier serves the same purpose and the scores demonstrate that overfitting is not an issue.  Since I cannot build a testset I decide to train the model using the full dataframe.

In [None]:
#This function is included in the plotly dash app and modified so that a username isn't needed and users add to the list by 
#searching for beer name and brewery

def Sort_Beers(user, beer_list):
    '''Take a user name and a list of beer_ids.  Returns the list sorted in order of score predicted by the recommendation system'''
    ok = [[x,algo.predict(user, x)[3]] for x in beer_list]
    okee = pd.DataFrame(ok,columns=['beer_id','rating'])
    do = okee.groupby('beer_id').mean().sort_values('rating',ascending=False).index
    return sorted([[dr[dr.beer_id == x].rating.values[0], dr[dr.beer_id==x].beer.values[0]] for x in do])[-1::-1]

<h3>Search Engine<h3>

In order to use the app, user must be able to quickly search for beers by beer, brewery, or a combination of the two.  I started by trying to create my own search in the form of a spell checker using nltk and spaCy.

In [None]:
#Here is one such attempt lusing edit distance
import re
Words = list(set(dx.brewery.values))

def Super_Lookup(entry):
    '''Takes a search query and returns the 3 closest results from the set of brewery names'''
    #remove stopwords
    stopwords = [' Brewing', ' Brewery',' Company',' Co.',' Tasting Room',' Pub',' &',' Craft',' Brauerei','Berkeley','Works','Co']
    brewers = Words
    for r in stopwords:
        entry = entry.replace(r,'')
        brewers = [x.replace(r,'') for x in brewers]
    
    #tokenize entry and brewers
    entry_len =  len(re.findall(r'\b\w+\b', entry))
    brewer_len = {brewers[x]:(len(re.findall(r'\b\w+\b', brewers[x])),x) for x in range(len(brewers))}
    poss = [(x, brewer_len[x][1]) for x in brewer_len.keys() if brewer_len[x][0]>=entry_len]
    
    #return closest match with same number of words
    f = [[nltk.edit_distance(entry,poss[x][0]),Words[poss[x][1]]] for x in range(len(poss))]
    return [sorted(f)[0][1], sorted(f)[1][1], sorted(f)[2][1]]

In [None]:
Super_Lookup('Firestone-Walker')

Ultimately I realized I would be better off using a open source python library called Whoosh.

In [None]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

#Declare the schema
schema = Schema(beer_id=STORED(),
                brewery=TEXT(stored=True),
                beer=TEXT(stored=True),
                group=KEYWORD)

In [None]:
df = pd.read_csv('Beer_Dataset',encoding='utf-8')

In [None]:
#create the index used to search
import os.path
from whoosh.index import create_in

if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)

In [None]:
from whoosh.index import open_dir

ix = open_dir("index")

In [None]:
#open writer in order to add documents to the index
writer = ix.writer()

In [None]:
docs = df[['brewery','beer_id','beer','group']].groupby(['beer_id','brewery','beer','group']).count().reset_index()

In [None]:
for t in range(len(docs)):
    writer.add_document(brewery = docs.brewery[t], beer = docs.beer[t], beer_id = docs.beer_id[t], group = docs.group[t])

In [None]:
#commit changes to index
writer.commit()

In [None]:
#Create a multifield parser that allows user to search beer, brewery, or both fields
from whoosh.qparser import QueryParser, MultifieldParser
qp = MultifieldParser(["brewery", "beer"], schema=ix.schema)

In [None]:
#an example of searching the index for the string 'fat tire amber'

#create search parse
q = qp.parse(u'fat tire amber')

#search index
with ix.searcher() as s:
    results = s.search(q)
    
    #record results and format
    liner = []
    ids = []
    for x in range(len(results)):
        try:
            be = results[x]['beer']
            br = results[x]['brewery']
            bi = results[x]['beer_id']
            new_line = str(be + ', ' + br)
            ids.append(bi)
            liner.append(new_line)
        except:
            print results[x]

    lines = []
    
    #format for use with dropdown list in plotly dash
    for d in range(len(resulte)):
        ent = {'label':liner[d],'value':ids[d]}
        lines.append(ent)
    print lines
    
#In the app user select the beer name from a list and press a button which adds the corresponding value, the beer_id, to
#beer list so users don't need to manually look up the ids.

In order to use the app please go to: https://github.com/thewho14/Beer_Maven and follow the installation instructions in the readme.  Please be patient as the app runs very slowly at the moment.

<h3>Text Analsysis<h3>

Before I tried a collaborative recommendation system based on ratings, I tried building a content based recommendation system
using text reviews for the top 250 beer most rated beers on beeradvocate.com.  I'll omit the code used to scrape as it is very similar to scraping code above.  The dataset I created is a dictionary where each key is a beer name and the values are lists of strings storing reviews for that beer.

In [None]:
#I first tried a bag of words approach to find top used words for each beer.
import pickle
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
import scipy as sp
from collections import defaultdict

In [None]:
revs = pickle.load(open('Beer_Reviews.pkl'))

#flatten lists inside of the data
for name in revs.keys():
    reviews = []
    for n in range(len(revs[name])):
        if type(revs[name][n]) == unicode:
            reviews.append(revs[name][n])
        elif type(revs[name][n])==list:
            reviews.extend(revs[name][n])
    revs[name] = reviews

In [None]:
def GetTopWords(key):
    '''takes beer name as a string and returns the top 250 most used words from reviews for that beer'''
    #initialize count vectorizer, fit to reviews, and transform them into bag of words format
    count_vect = CountVectorizer(stop_words = 'english',analyzer='word')
    Mega = count_vect.fit_transform(revs[key])
    
    #Initialize tfid transformer and transform the bag of words
    Tfid = TfidfTransformer()
    Kila = Tfid.fit_transform(Mega)
    
    #Get index of top words
    ind = Kila.toarray().sum(axis=0).argsort()
    brew = ind[::-1][0:250]
    
    #get top words
    b = [count_vect.get_feature_names()[n] for n in brew]
    return b

In [None]:
GetTopWords('Budweiser')

In [None]:
#store the top 250 words for each beer in a new dictionary
top250 = defaultdict()

for k in revs.keys():
    top250[k] = GetTopWords(k)

In [None]:
#a first try at recommendation.  Beers are listed in order of how many words they share from their top 250 list.
def FindSimilarBeers(Beer):
    '''Takes a beer name and returns beer in order of how many top words they share'''
    counts = []
    #pickle.load(open(top100,'wb'))
    words = top250[Beer]
    for k in top250.keys():
        if k != Beer:
            counts.append(len(words.intersection(top250[k])))
    closest = np.argmax(counts)
    return top250.keys()[closest]

In [None]:
#This approach did not work very well
FindSimilarBeers('Sculpin IPA')

Many top words are not useful on their own.  They need to be included in phrases in order them to be meaningful.  For example, words like "taste", "head", and "flavor" need an adjective preceding them to make them meaningful ie "bad taste", "tan head", and "malty flavor".  As a result I moved on to looking for ways to find descriptive phrases.

In [None]:
#I tried used Bigram and Trigram collocations in nltk to look for common pairings, but these still weren't useful phrases
import nltk
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
[revs['Bud Light']
finder = BigramCollocationFinder.from_words(revs['Bud Light'])
finder.nbest(bigram_measures.pmi, 3)

In [None]:
#I also tried using noun chunks in spaCy
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')

In [None]:
#parse reviews in spaCy doc objects
doc = nlp.pipe(revs['Bud Light'])

In [None]:
#print out noun chunks.  As you can see this also does not work very well
for doc in nlp.pipe(revs['Bud Light'][0:10]):
    for k in doc.noun_chunks:
            print k, len(k)

When documents are parsed in spaCy, they are assigned a part-of-speech tags based spaCy's build in deep learning model.  I decided to manually look for common patterns in these parts-of-speech tags in order to capture phrases.

In [None]:
#sample review string
string = (u'Pours a clear copper, with 2 ½ big foamy fingers of shiny off-white head. This shrinks down pretty quickly, with a frothy ½ finger that leaves back a cascade of lacing which hangs in sticky clumps. The aroma is floral and leafy, with a faint metallic tinge to the proceedings. The rest of the nose is filled out with citrus, caramel, toast, and a light touch of diacetyl. Off-notes hurt this a bit.' )

#remove punctuation
string = string.replace(u',','').replace(u'\'','').replace(u'.','')

#parse
text = nlp(string)

In [None]:
from spacy.matcher import Matcher

#initialize spaCy matcher
matcher = Matcher(nlp.vocab)

#define patterns such as "adjective noun" and "noun noun conjunctive noun"
pattern1 = [{'TAG':'JJ'},{'TAG':'NN'}]
pattern2 = [{'TAG':'JJ'},{'TAG':'JJ'}]
pattern3 = [{'TAG':'NN'},{'TAG':'NN'}]
pattern4 = [{'TAG':'NN'},{'TAG':'JJ'}]
pattern5 = [{'TAG':'NN'},{},{'TAG':'NN'},{'TAG':u',','OP':'!'}]
pattern6 = [{'TAG':'NN'},{'TAG':'NN'},{'TAG':u',','OP':'*'},{'TAG':'CC'},{'TAG':'NN'}]

#match patterns
matcher.add('type1', None, pattern1,pattern2,pattern3,pattern4,pattern5, pattern6)
matches = matcher(text)

r = [[matches[n][1],matches[n][2]] for n in range(len(matches))]
[text[r[n][0]:r[n][1]] for n in range(len(r))]