In this notebook I scrape data from the website beeradvocate.com.

In [40]:
import urllib2
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import pickle
import bs4
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [41]:
#First go to the top 250 users webpage on beer advocate and parse it into a bs4 object
res = requests.get('https://www.beeradvocate.com/members/?sort=beers')
#Stop here on a 4xx/5xx answer; otherwise an error page would be parsed as if
#it were the member list and every later cell would work on empty data.
res.raise_for_status()
Soup = bs4.BeautifulSoup(res.text,'lxml')

In [42]:
#Collect the href of every element carrying the 'username' class
urls = []
for member_tag in Soup.find_all(class_='username'):
    urls.append(member_tag.get('href'))

In [43]:
#Peek at the first ten collected hrefs
urls[:10]

['/community/members/stonedtrippin.601042/',
 '/community/members/stonedtrippin.601042/',
 '/community/members/uclabrewn84.439438/',
 '/community/members/uclabrewn84.439438/',
 '/community/members/sammy.3853/',
 '/community/members/sammy.3853/',
 '/community/members/biboergosum.168458/',
 '/community/members/biboergosum.168458/',
 '/community/members/beerchitect.14442/',
 '/community/members/beerchitect.14442/']

In [44]:
#Read the text two elements after each 'username' tag; that is where the
#review count is taken from.
raw_counts = []
for member_tag in Soup.find_all(class_='username'):
    raw_counts += member_tag.find_next().find_next().contents

#Keep the first 250 entries and turn '12,345' style text into integers
numb = [int(count_text.replace(',','')) for count_text in raw_counts[:250]]

In [45]:
#remove duplicates and urls grab erroneously
#Only site-relative links (starting with '/') are kept; missing or empty hrefs
#are skipped instead of raising on n[0].
#Duplicates are removed by value while preserving order, so the cell gives the
#same answer when re-run (slicing with [::2] halved the list on every run).
seen_urls = set()
unique_urls = []
for href in urls:
    if not href or not href.startswith('/') or href in seen_urls:
        continue
    seen_urls.add(href)
    unique_urls.append(href)
urls = unique_urls

In [46]:
#Turn '/community/members/<name>.<id>/' into '<name>':
#strip the path prefix, then cut everything after the last '.'
member_prefix = '/community/members/'
users = [member_url.replace(member_prefix,'').rsplit('.',1)[0] for member_url in urls]
users[:10]

['stonedtrippin',
 'uclabrewn84',
 'sammy',
 'biboergosum',
 'beerchitect',
 'metter98',
 'brentk56',
 'phyl21ca',
 'superspak',
 'nerofiddled']

In [47]:
#Build, for every username, the url of the page listing the beers that user rated
beers_by_user_url = 'https://www.beeradvocate.com/user/beers/?ba='
user_links = []
for username in users:
    user_links.append(beers_by_user_url + username)
user_links[:10]

['https://www.beeradvocate.com/user/beers/?ba=stonedtrippin',
 'https://www.beeradvocate.com/user/beers/?ba=uclabrewn84',
 'https://www.beeradvocate.com/user/beers/?ba=sammy',
 'https://www.beeradvocate.com/user/beers/?ba=biboergosum',
 'https://www.beeradvocate.com/user/beers/?ba=beerchitect',
 'https://www.beeradvocate.com/user/beers/?ba=metter98',
 'https://www.beeradvocate.com/user/beers/?ba=brentk56',
 'https://www.beeradvocate.com/user/beers/?ba=phyl21ca',
 'https://www.beeradvocate.com/user/beers/?ba=superspak',
 'https://www.beeradvocate.com/user/beers/?ba=nerofiddled']

In [48]:
#One url per page of reviews: the start offset advances in steps of 50
#(50 reviews per page) up to that user's total review count.
page_links = []
for n, user_link in enumerate(user_links):
    offsets = range(0,numb[n],50)
    page_links.extend(user_link+'&&start='+str(offset) for offset in offsets)

In [None]:
#Each entry is a url for one page of ratings made by one of the top raters.
page_links[:10]

['https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=0',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=50',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=100',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=150',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=200',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=250',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=300',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=350',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=400',
 'https://www.beeradvocate.com/user/beers/?ba=stonedtrippin&&start=450']

In [None]:
#Scrape every ratings page into one dataframe with the columns below.
#Per-page frames are collected in a list and concatenated once after the loop;
#concatenating inside the loop re-copies all earlier rows for every page.
review_columns = ['username','beer','brewery','rating','rDev']
page_frames = []
failed_links = []   #(url, error) pairs for pages that could not be fetched or parsed
for page_url in page_links:
    try:
        #The username is the 'ba' query parameter of the page url, so it is read
        #from the url itself instead of being tracked with a running counter.
        username = page_url.split('?ba=')[1].split('&')[0]

        #Parse each webpage into bs4
        rest = requests.get(page_url)
        rest.raise_for_status()
        souper = bs4.BeautifulSoup(rest.text,'lxml')

        #Get rating and rDev and add in username
        #The <b> tags after the first five alternate: beer name, rating, beer name, ...
        stuff = souper.find_all('b')[5:]
        beer_names = np.array([k.contents[0] for k in stuff[:-1:2]])
        ratings = np.array([p.contents[0] for p in stuff[1::2]])
        rDev = np.array([r.find_next().text for r in stuff[1::2]])
        col1 = np.repeat(username,len(ratings))
        #Get beer and brewery names
        #Every second '/beer/profile/' link is kept; these are used as the brewery column
        profile_links = [a.contents for a in souper.find_all('a')
                         if (a.get('href') or '').startswith('/beer/profile/')]
        duck = np.array(profile_links[1::2])
        #column_stack raises if the pieces disagree in length, which sends the
        #page to failed_links rather than producing misaligned rows
        mat = pd.DataFrame(np.column_stack([col1,beer_names,duck,ratings,rDev]),columns = review_columns)
        page_frames.append(mat)
    except Exception as exc:
        #Best effort: remember the page and carry on, but do not hide the failure
        failed_links.append((page_url, repr(exc)))

if page_frames:
    df = pd.concat(page_frames,ignore_index=True)
else:
    df = pd.DataFrame(columns = review_columns)

if failed_links:
    print('%d of %d pages could not be scraped; see failed_links' % (len(failed_links), len(page_links)))

In [None]:
#Display the scraped ratings frame (one row per username/beer rating)
df

In [None]:
#basic data cleaning and saving to pkl file
import sys
#NOTE(review): presumably raised so that pickling does not hit the default
#recursion limit — confirm it is still needed.
sys.setrecursionlimit(10000)
#Plain column assignment is used so the numeric dtype actually replaces the
#scraped text (assigning through .loc[:, col] can keep the old object dtype).
df['rating'] = pd.to_numeric(df['rating'])
#df.loc[:,'rDev'] = df.loc[:,'rDev'].str.replace('%','').str.replace('+','')
#NOTE(review): to_numeric raises if rDev still contains '%' — confirm whether
#the stripping line above has to be re-enabled.
df['rDev'] = pd.to_numeric(df['rDev'])
#df.drop(104081, inplace=True)
#Assign each beer brewery combination a unique id number
df = df.assign(beer_id=(df['brewery'] + '_' + df['beer']).astype('category').cat.codes)
#pickle.dump returns None, so its result must not be assigned back to df;
#the with-block also closes the file once the dump is written.
with open("Beer_Rating_Dataset.pkl","wb") as pkl_file:
    pickle.dump(df,pkl_file)

In [None]:
#Remove one row by its index label.
#NOTE(review): 103930 is tied to one particular scrape — confirm which row it
#is meant to remove before reusing this on fresh data.
#errors='ignore' keeps the cell re-runnable once the row is already gone.
df = df.drop(103930, errors='ignore')

In [None]:
#Show the ratings ordered by beer name (df itself is not modified)
df.sort_values(by='beer')

In [None]:
#Number of entries in the beer column
len(df['beer'])

In [None]:
#Average rating and rDev over every beer whose name contains 'Corona'.
#Only the two numeric measures are averaged: a mean over the text columns
#raises on recent pandas, and a mean of the categorical beer_id is meaningless.
#na=False treats a missing beer name as "no match" instead of breaking the mask.
corona_rows = df[df['beer'].str.contains('Corona', na=False)]
corona_rows[['rating','rDev']].mean()

In [None]:
#Average rating and rDev per user, highest average rating first.
#The numeric columns are selected explicitly so the group mean is not attempted
#on the text columns (which raises on recent pandas).
df.groupby('username')[['rating','rDev']].mean().sort_values('rating',ascending=False)

In [None]:
#Histogram of all individual ratings
rating_axes = df['rating'].hist()
rating_axes.set_xlabel('Ratings')
rating_axes.set_ylabel('Number of Ratings')
plt.show()

In [None]:
#Count rated beers per brewery and list the breweries with the most first
beers_per_brewery = df[['brewery','beer']].groupby('brewery').count()
beers_per_brewery.sort_values('beer',ascending=False)

In [None]:
#Histogram of how many beers each user has rated
beers_per_user = df[['username','beer']].groupby('username').count()
beers_per_user.sort_values('beer',ascending=False).hist()
plt.xlabel('Number of Beers Reviewed')
plt.ylabel('Number of People')
plt.show()

In [None]:
#Give every brewery/beer combination an integer id: join the two names into
#one key, make it categorical and use the category codes.
brewery_beer_key = df['brewery'] + '_' + df['beer']
beer_codes = brewery_beer_key.astype('category').cat.codes
df = df.assign(beer_id=beer_codes)
#pickle.dump(df, open('Beer_User_df.pkl','wb'))

Now it's time to get the beer style of each beer.

In [None]:
#Go to beer styles page
res = requests.get('https://www.beeradvocate.com/beer/style/')
#Fail on a 4xx/5xx answer instead of parsing an error page as the style index
res.raise_for_status()
Soup = bs4.BeautifulSoup(res.text,'lxml')

In [None]:
#Collect the href of every anchor on the page (None where an anchor has no href)
urls = []
for anchor in Soup.find_all('a'):
    urls.append(anchor.get('href'))

In [None]:
#throw away bad urls
#Anchors without an href give None; keep only real strings.
#isinstance against the text base type is used because an exact type(x)==str
#comparison rejects unicode strings under Python 2.
try:
    text_types = basestring   #Python 2: covers both str and unicode
except NameError:
    text_types = str          #Python 3
good = [x for x in urls if isinstance(x, text_types)]
good[0:10]

In [None]:
#Keep only hrefs whose characters 6..10 spell 'style' (as in '/beer/style/...')
better = []
for href in good:
    if href[6:11]=='style':
        better.append(href)

In [None]:
#Get rid of the unwanted links at both ends: skip the first two and the last two
inner_links = slice(2,-2)
bet = better[inner_links]
bet[:10]

In [None]:
#create actual urls
#Relative style paths get the site prefix plus the listing query string.
#Entries that are already absolute are left alone, so running this cell a
#second time does not add the prefix and suffix twice.
site_root = 'https://www.beeradvocate.com'
bet = [x if x.startswith(site_root) else site_root + x + '?sort=revsD&start=0' for x in bet]
bet

In [None]:
#Fetch and parse the first style listing to explore its structure
res = requests.get(bet[0])
#Fail on a 4xx/5xx answer instead of exploring an error page
res.raise_for_status()
ba = bs4.BeautifulSoup(res.text,'lxml')

In [None]:
#numb = ba.find_all('b')[4].contents[0].split('of ')[1].split(') -')[0]
#int(numb)
#first = [ba.find_all('a')[x].contents[0] for x in range(len(ba.find_all('a'))) if ba.find_all('a')[x].contents[0]>0]
#second = [x.get('href') for x in first if x.get('href')[0:13]=='/beer/profile']
#Contents of every anchor on the page. The page is searched once and the
#result iterated directly, rather than searching again for each index.
dab = [anchor.contents for anchor in ba.find_all('a')]
#[dab[x+1] for x in range(len(dab)-1)

In [None]:
#Keep the first child of every anchor when it is plain text.
#Anchors with no children are skipped by the explicit emptiness check, so no
#catch-all except is needed. The exact type comparison is deliberate:
#subclasses of NavigableString are still excluded.
grr = [x[0] for x in dab
       if len(x) > 0 and type(x[0]) is bs4.element.NavigableString]

In [None]:
#Cut off the first 124 and the last 13 link texts.
#NOTE(review): these look like the counts of non-listing links before and
#after the table on this page — confirm, they are layout dependent.
skip_leading, skip_trailing = 124, 13
ttt = grr[skip_leading:-skip_trailing]
ttt

In [None]:
#Get number of reviews.  The number is listed several ways and so the loop needs a way to catch all of them.
def _style_total(page):
    """Return the total parsed from a '... of N) - ...' <b> header, or None.

    Candidate <b> tags are tried in turn: index 3 and index 4 are used only
    when their text starts with 'Style', index 2 is used as-is. The first
    candidate that parses to an integer wins.
    """
    bold_tags = page.find_all('b')
    for position, needs_style_prefix in ((3, True), (2, False), (4, True)):
        try:
            text = bold_tags[position].contents[0]
            if needs_style_prefix and text[0:5] != 'Style':
                continue
            return int(text.split('of ')[1].split(') -')[0])
        except (IndexError, KeyError, TypeError, ValueError, AttributeError):
            #Missing tag, non-text child or unexpected wording: try the next candidate
            continue
    return None

numbs = []
unparsed_styles = []   #style urls whose total could not be determined
for style_url in bet:
    total = None
    try:
        res = requests.get(style_url)
        res.raise_for_status()
        ba = bs4.BeautifulSoup(res.text,'lxml')
        total = _style_total(ba)
    except requests.exceptions.RequestException as exc:
        print('%s failed: %r' % (style_url, exc))
    if total is None:
        #numbs is indexed in parallel with bet, so every style must get an entry;
        #an unknown total becomes 0 (no pages) instead of a missing element.
        unparsed_styles.append(style_url)
        total = 0
    numbs.append(total)

In [None]:
#One url per page of every style listing; the offset advances in steps of 50
page_links = []
for n, style_url in enumerate(bet):
    #The style urls already end in 'start=0'. Drop that trailing offset so the
    #new one replaces it instead of being glued on ('start=050', 'start=0100').
    stem = style_url[:-1] if style_url.endswith('start=0') else style_url
    for k in range(0,numbs[n],50):
        page_links.append(stem+str(k))

page_links[0:10]

In [None]:
#get beer styles and names
#Per-page frames are collected in a list and concatenated once after the loop.
style_columns = ['brewery','beer','type']
style_frames = []
failed_pages = []   #urls that could not be fetched or parsed
total_pages = len(page_links)
for position, page_url in enumerate(page_links):
    try:
        #Progress report every 100 pages, relative to the real number of pages
        if position % 100 == 0:
            print('%.1f%% of %d pages done' % (100.0*position/total_pages, total_pages))
        rest = requests.get(page_url)
        rest.raise_for_status()
        souper = bs4.BeautifulSoup(rest.text,'lxml')
        category = souper.find_all('h1')[0].contents[0]
        #Every third <b> tag from index 5 (excluding the last) is used as a beer name
        beers = [x.contents[0] for x in souper.find_all('b')[5:-1:3]]
        #Link texts must come from the page just fetched (souper), not from the
        #`ba` page left over from an earlier cell.
        grr = [a.contents[0] for a in souper.find_all('a')
               if len(a.contents) > 0 and type(a.contents[0]) is bs4.element.NavigableString]
        #NOTE(review): 124 leading / 13 trailing link texts are dropped — these
        #counts depend on the page layout; confirm they hold for every style page.
        ttt = grr[124:-13]
        #column_stack raises when ttt and beers differ in length, which sends
        #the page to failed_pages rather than producing misaligned rows
        place = np.column_stack([ttt, beers])
        dh = pd.DataFrame(place,columns = ['brewery', 'beer'])
        dh['type'] = category
        style_frames.append(dh)
    except Exception as exc:
        failed_pages.append(page_url)
        print('%s failed: %r' % (page_url, exc))

if style_frames:
    dg = pd.concat(style_frames,ignore_index=True)
else:
    dg = pd.DataFrame(columns=style_columns)

In [None]:
#Save the style table; the with-block closes the file once the dump is written
with open('beer_types_final.pkl','wb') as pkl_file:
    pickle.dump(dg,pkl_file)

In [None]:
#Remove the brewery column from dg in place
del dg['brewery']

In [None]:
#Display the beer style table
dg

Beer Groups
1. IPA/PA: American IPA, American Double/Imperial IPA, American Pale Ale (APA), Belgian IPA, Belgian Pale Ale, Belgian Strong Pale Ale, English India Pale Ale (IPA)
2. Dark Ales (Amber/Red/Brown)
3. Stouts/Porters
4. Wheat Beer
5. Dark Lager
6. Light Lager
7. Other (spiced, saison)