In [1]:
import pandas as pd
import numpy as np

# Loading Data

In [2]:
# Brewery-level table produced by the scraper; show the first rows as a sanity check.
brew = pd.read_csv(filepath_or_buffer='../data/breweryinfo.csv')
brew.head(n=5)

Unnamed: 0,beer_ratings,beer_reviews,beer_score,beers,brewery_name,brewery_number,brewery_pdev,brewery_ratings,brewery_review,brewery_score,city,country,province
0,8,3,3.39,6,'A Magara,40282,,,,,Nocera Terinese,Italy,
1,2969,517,3.84,48,(512) Brewing Company,17863,0%,2.0,0.0,4.02/5,Austin,United States,Texas
2,176,74,3.65,19,'t Hofbrouwerijke,13160,,,,,Beerzel,Belgium,
3,1,1,3.3,1,"0,5 Pub",53883,0%,0.0,0.0,0/5,Cahul,Moldova,
4,1,0,4.72,5,0 Mile Brewing Company,42171,,,,,Hummelstown,United States,Pennsylvania


#### Let's remove the '%' and '/5' from brewery_pdev and brewery_score

In [3]:
# Drop the percent sign first, then parse what is left as a number.
pdev_without_sign = brew['brewery_pdev'].str.replace('%', '')
brew['brewery_pdev'] = pd.to_numeric(pdev_without_sign)

In [4]:
# Scores are scraped as "x/5"; remove the "/5" part and parse the remainder as a number.
score_without_scale = brew['brewery_score'].str.replace('/5', '')
brew['brewery_score'] = pd.to_numeric(score_without_scale)

In [5]:
# Confirm that brewery_pdev and brewery_score are now plain numbers.
brew.head(n=5)

Unnamed: 0,beer_ratings,beer_reviews,beer_score,beers,brewery_name,brewery_number,brewery_pdev,brewery_ratings,brewery_review,brewery_score,city,country,province
0,8,3,3.39,6,'A Magara,40282,,,,,Nocera Terinese,Italy,
1,2969,517,3.84,48,(512) Brewing Company,17863,0.0,2.0,0.0,4.02,Austin,United States,Texas
2,176,74,3.65,19,'t Hofbrouwerijke,13160,,,,,Beerzel,Belgium,
3,1,1,3.3,1,"0,5 Pub",53883,0.0,0.0,0.0,0.0,Cahul,Moldova,
4,1,0,4.72,5,0 Mile Brewing Company,42171,,,,,Hummelstown,United States,Pennsylvania


In [6]:
# Beer-level table produced by the scraper.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The truncated warning in this cell's saved output
# looks like the mixed-dtype warning that chunked inference emits; parsing in one
# pass avoids it and gives every column a single, consistent value type.
# The badly scraped rows are still present and are removed in a later cell.
beer = pd.read_csv('../data/beerinfo.csv', low_memory=False)
beer.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,abv,availability,ba_score,beer_name,beer_number,brewery_name,brewery_number,notes,pdev,ranking,ratings,reviews,style
0,,,0.0,Trilla,172737,'A Magara,40282,,0%,-,0,0,German Hefeweizen
1,,,4.14,Trupija,172735,'A Magara,40282,,0%,-,1,0,Belgian Saison
2,,,2.8,Riulì,273357,'A Magara,40282,,0%,-,2,1,American Pale Ale (APA)
3,,,3.57,Mericana,318408,'A Magara,40282,,0%,-,1,0,American IPA
4,,,3.41,Magarìa,249754,'A Magara,40282,,0%,-,2,1,American Porter


Reading the beer csv file prompted an error on two beers that were not parsed by the spider properly. The first one is [Weird Beard / Farmageddon Suspect Device](https://www.beeradvocate.com/beer/profile/31624/228703/) and the second one is [Interboro / Pipeworks - Mad Fat! Unicorn](https://www.beeradvocate.com/beer/profile/44293/272459/). I have no idea why the scraper failed on those two, but we have to move on with our lives. We have to remove those entries.

In [7]:
# Keep only the rows whose brewery_number parses as a number (this drops the
# badly scraped entries), then store the column as a numeric dtype.
has_numeric_brewery = pd.to_numeric(beer['brewery_number'], errors='coerce').notna()
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is unambiguous and does not raise SettingWithCopyWarning.
beer = beer.loc[has_numeric_brewery].copy()
# Re-parse the surviving values (instead of reusing the coerced series) so an
# all-integer column ends up as an integer dtype rather than float.
beer['brewery_number'] = pd.to_numeric(beer['brewery_number'])
beer.shape

(274223, 13)

In [8]:
# Review/comment table produced by the scraper.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The truncated warning in this cell's saved output
# looks like the mixed-dtype warning that chunked inference emits; parsing in one
# pass avoids it at the cost of a higher peak memory use while reading.
comments = pd.read_csv('../data/comment.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# First rows of the review table, to see which columns are populated.
comments.head(n=5)

Unnamed: 0,ba_score,beer_number,comment,date,feel,look,overall,rdev,smell,taste,username
0,4.14,172735,,"Dec 24, 2017",,,,,,,Nattesferd
1,2.83,273357,"Dark amber, very hazy, with some small pieces ...","Mar 22, 2017",,,,,,,stcules
2,3.57,318408,,"Dec 24, 2017",,,,,,,Nattesferd
3,3.02,249754,"Black, light brown foam.<br>\nIn the smell coc...","Sep 21, 2016",,,,,,,stcules
4,3.55,380750,,"Nov 12, 2018",,,,,,,desint


In [10]:
# (rows, columns) of the review table at this point in the notebook,
# i.e. before the cell below drops rows with a non-numeric beer_number.
comments.shape

(2404130, 11)

Similarly, we have to remove the comment entries for those two beers.

In [10]:
# Keep only the reviews whose beer_number parses as a number, then store the
# column as a numeric dtype.
has_numeric_beer = pd.to_numeric(comments['beer_number'], errors='coerce').notna()
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is unambiguous and does not raise SettingWithCopyWarning.
comments = comments.loc[has_numeric_beer].copy()
# Re-parse the surviving values (instead of reusing the coerced series) so an
# all-integer column ends up as an integer dtype rather than float.
comments['beer_number'] = pd.to_numeric(comments['beer_number'])

In [11]:
# (rows, columns) of the review table after dropping rows with a non-numeric beer_number.
# NOTE(review): the saved outputs show MORE rows here than in the earlier
# comments.shape cell, which a row filter cannot produce. The cells were
# probably run out of order or against different data; re-run the notebook
# top to bottom and confirm the counts.
comments.shape

(2422451, 11)

In [12]:
# Keep only the beers whose beer_number parses as a number, then store the
# column as a numeric dtype.
has_numeric_beer_id = pd.to_numeric(beer['beer_number'], errors='coerce').notna()
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is unambiguous and does not raise SettingWithCopyWarning.
beer = beer.loc[has_numeric_beer_id].copy()
# Re-parse the surviving values (instead of reusing the coerced series) so an
# all-integer column ends up as an integer dtype rather than float.
beer['beer_number'] = pd.to_numeric(beer['beer_number'])


## Taking a look at comment data

In [13]:
# Look at one raw review text (fourth row by position) to see what needs cleaning.
comments['comment'].iat[3]

'Black, light brown foam.<br>\nIn the smell cocoa, toasted, chocolate. Dusty.<br>\nSame notes in the taste.<br>\nQuite round body, not excessive.<br>\nOk, nohing to remember, but an honest, drinkable, right to the styel robust porter.<br>\nThe 8ABV are well perceptible.<br><br><i class="fas fa-align-left"></i>\xa0<span class="muted">244 characters</span><br><br>'

#### There are evidently still some HTML tags left in the comments. We shall remove them.

In [14]:
# Clean the scraped review text in three passes, each replacing a match with a space.
# All three patterns are regular expressions, so regex=True is passed explicitly:
# since pandas 2.0 Series.str.replace defaults to regex=False, which would treat
# the patterns as literal text and silently leave the comments unchanged.
comments['comment'] = (
    comments['comment']
    .str.replace(r'<[^>]*>', ' ', regex=True)             # HTML tags such as <br> and <span ...>
    .str.replace(r'\n', ' ', regex=True)                  # newline characters
    .str.replace(r'(\xa0.*characters)', ' ', regex=True)  # trailing "\xa0... NNN characters" counter
)

In [15]:
# Same review as before (fourth row by position), now after the markup clean-up.
comments['comment'].iat[3]

'Black, light brown foam.  In the smell cocoa, toasted, chocolate. Dusty.  Same notes in the taste.  Quite round body, not excessive.  Ok, nohing to remember, but an honest, drinkable, right to the styel robust porter.  The 8ABV are well perceptible.        '

### Saving data as one SQL database

In [32]:
from sqlalchemy import create_engine
# SQLAlchemy engine for the SQLite database file beeradvocate.db; the cell
# below writes all three tables through it.
engine = create_engine('sqlite:///beeradvocate.db')

In [33]:
# Write the three cleaned tables into the SQLite database.
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# running it again raises ValueError because the tables already exist.
brew.to_sql('breweries', con=engine, if_exists='replace')
beer.to_sql('beers', con=engine, if_exists='replace')
comments.to_sql('reviews', con=engine, if_exists='replace')

## Subsetting data to Canadian only

In [16]:
# Breweries whose country is exactly 'Canada'.
is_canadian = brew['country'].eq('Canada')
can_brew = brew.loc[is_canadian]
can_brew.head(n=5)

Unnamed: 0,beer_ratings,beer_reviews,beer_score,beers,brewery_name,brewery_number,brewery_pdev,brewery_ratings,brewery_review,brewery_score,city,country,province
32,0,0,0.0,3,13 Barrels Brewing,56604,0.0,0.0,0.0,0.0,Bathurst,Canada,New Brunswick
70,81,25,3.98,31,2 Crows Brewing Co.,48315,0.0,1.0,0.0,4.04,Halifax,Canada,Nova Scotia
126,271,75,3.86,99,5 Paddles Brewing Co.,32116,9.51,12.0,2.0,4.31,Whitby,Canada,Ontario
135,15,14,3.53,12,4th Meridian Brewing Co.,47607,,,,,Lloydminster,Canada,Alberta
176,6,0,4.31,10,9 Mile Legacy Brewing,52744,0.0,1.0,1.0,4.75,Saskatoon,Canada,Saskatchewan


In [17]:
# Beers brewed by one of the Canadian breweries, matched on brewery_number.
canadian_brewery_ids = can_brew['brewery_number']
from_canadian_brewery = beer['brewery_number'].isin(canadian_brewery_ids)
can_beer = beer.loc[from_canadian_brewery]
can_beer.head(n=5)

Unnamed: 0,abv,availability,ba_score,beer_name,beer_number,brewery_name,brewery_number,notes,pdev,ranking,ratings,reviews,style
41,,,0.0,Tall Tales,425476,13 Barrels Brewing,56604,,0%,-,0,0,American IPA
42,,,0.0,Rendez-Vous,425481,13 Barrels Brewing,56604,,0%,-,0,0,American Amber / Red Ale
44,,,0.0,Pabineau,425475,13 Barrels Brewing,56604,,0%,-,0,0,American Pale Ale (APA)
407,,,4.3,STAND UP BROWN,344989,9 Mile Legacy Brewing,52744,,0%,-,1,0,American Brown Ale
408,,,4.24,SINGLE HOP SERIES ENIGMA,347006,9 Mile Legacy Brewing,52744,,0%,-,1,0,English Pale Ale


In [18]:
# Reviews written about one of the Canadian beers, matched on beer_number.
canadian_beer_ids = can_beer['beer_number']
about_canadian_beer = comments['beer_number'].isin(canadian_beer_ids)
can_reviews = comments.loc[about_canadian_beer]
can_reviews.head(n=5)

Unnamed: 0,ba_score,beer_number,comment,date,feel,look,overall,rdev,smell,taste,username
787,4.3,344989,,"May 13, 2018",,,,,,,LocalBeerGuy
788,4.24,347006,,"May 22, 2018",,,,,,,LocalBeerGuy
789,4.24,345724,,"May 11, 2019",,,,,,,LocalBeerGuy
790,4.43,371424,,"Sep 21, 2018",,,,,,,LocalBeerGuy
791,4.49,371425,,"Sep 21, 2018",,,,,,,LocalBeerGuy


### Save these files as CSV for further analysis

In [19]:
# Write each Canadian subset to its own CSV file next to the source data.
canadian_exports = {
    "../data/CanadianBreweries.csv": can_brew,
    "../data/CanadianBeers.csv": can_beer,
    "../data/CanadianReviews.csv": can_reviews,
}
for csv_path, canadian_frame in canadian_exports.items():
    canadian_frame.to_csv(csv_path)