In [1]:
import pandas as pd
import numpy as np

# Loading Data

In [2]:
brew = pd.read_csv('./data/breweryinfo.csv')
brew.head()

Unnamed: 0,beer_ratings,beer_reviews,beer_score,beers,brewery_name,brewery_number,brewery_pdev,brewery_ratings,brewery_review,brewery_score,city,country,province
0,8,3,3.39,6,'A Magara,40282,,,,,Nocera Terinese,Italy,
1,38,23,3.98,7,1 Dampfbierbrauerei Zwiesel GmbH & Co.KG,6006,,,,,Zwiesel,Germany,
2,1,1,3.3,1,"0,5 Pub",53883,0%,0.0,0.0,0/5,Cahul,Moldova,
3,2952,514,3.85,45,(512) Brewing Company,17863,0%,2.0,0.0,4.02/5,Austin,United States,Texas
4,0,0,0.0,4,0 Mile Brewing Company,42171,,,,,Hummelstown,United States,Pennsylvania


#### Let's remove the '%' and '/5' from brewery_pdev and brewery_score

In [3]:
brew['brewery_pdev'] = pd.to_numeric(brew['brewery_pdev'].str.replace('%', ''))

In [4]:
brew['brewery_score'] = pd.to_numeric(brew['brewery_score'].str.replace('/5', ''))

In [5]:
brew.head()

Unnamed: 0,beer_ratings,beer_reviews,beer_score,beers,brewery_name,brewery_number,brewery_pdev,brewery_ratings,brewery_review,brewery_score,city,country,province
0,8,3,3.39,6,'A Magara,40282,,,,,Nocera Terinese,Italy,
1,38,23,3.98,7,1 Dampfbierbrauerei Zwiesel GmbH & Co.KG,6006,,,,,Zwiesel,Germany,
2,1,1,3.3,1,"0,5 Pub",53883,0.0,0.0,0.0,0.0,Cahul,Moldova,
3,2952,514,3.85,45,(512) Brewing Company,17863,0.0,2.0,0.0,4.02,Austin,United States,Texas
4,0,0,0.0,4,0 Mile Brewing Company,42171,,,,,Hummelstown,United States,Pennsylvania


In [6]:
beer = pd.read_csv('./data/beerinfo.csv')
beer.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,abv,availability,ba_score,beer_name,beer_number,brewery_name,brewery_number,notes,pdev,ranking,ratings,reviews,style
0,5.6,Year-round,4.14,Trupija,172735,'A Magara,40282,Year-round,0%,-,1,0,Belgian Saison
1,5.5,Year-round,0.0,Trilla,172737,'A Magara,40282,Year-round,0%,-,0,0,German Hefeweizen
2,6.0,Year-round,2.8,Riulì,273357,'A Magara,40282,Year-round,0%,-,2,1,American Pale Ale (APA)
3,8.0,Year-round,3.41,Magarìa,249754,'A Magara,40282,Year-round,0%,-,2,1,American Porter
4,7.0,Year-round,3.57,Mericana,318408,'A Magara,40282,Year-round,0%,-,1,0,American IPA


Reading the beer csv file prompted an error on two beers that were not parsed by the spider properly. The first one is [Weird Beard / Farmageddon Suspect Device](https://www.beeradvocate.com/beer/profile/31624/228703/) and the second one is [Interboro / Pipeworks - Mad Fat! Unicorn](https://www.beeradvocate.com/beer/profile/44293/272459/). I have no idea the scraper failed on those two, but we have to move on with our lives. We have to remove those entries. 

In [7]:
beer = beer[pd.to_numeric(beer['beer_number'], errors='coerce').notnull()]
beer['beer_number'] = pd.to_numeric(beer['beer_number'])

beer = beer[pd.to_numeric(beer['brewery_number'], errors='coerce').notnull()]
beer['brewery_number'] = pd.to_numeric(beer['brewery_number'])
beer.shape

(267434, 13)

In [8]:
comments = pd.read_csv('./data/comment.csv')

FileNotFoundError: [Errno 2] File b'./data/comment.csv' does not exist: b'./data/comment.csv'

In [None]:
comments.head()

In [None]:
comments.shape

Similarly, we'd have to remove comment entries for those two beers

In [None]:
comments = comments[pd.to_numeric(comments['beer_number'], errors='coerce').notnull()]
comments['beer_number'] = pd.to_numeric(comments['beer_number'])

In [None]:
comments.shape

### There are some breweries with multiple locations

In [None]:
brew['brewery_name'].value_counts().head()

In [None]:
brew[brew['brewery_name'] == 'Granite City Food & Brewery'].head()

These appear to be Breweries that also own resturants. Luckily, it seems that their beer ratings are the same. We will ignore them for now, and will need additional work when we do analysis on them.

## Taking a look at comment data

In [None]:
comments['comment'].iloc[3]

#### There are evidently still some html tags left in the comments. We shell remove that

In [None]:
comments['comment'] = comments['comment'].str.replace('<[^>]*>',' ')
comments['comment'] = comments['comment'].str.replace('\\n',' ')
comments['comment'] = comments['comment'].str.replace('(\\xa0.*characters)',' ')

In [None]:
comments['comment'].iloc[3]

### Saving data as one SQL database

In [None]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///./data/beeradvocate.db')

In [None]:
brew.to_sql('breweries', con = engine)
beer.to_sql('beers', con = engine)
comments.to_sql('reviews', con = engine)

### Saving data as CSV files

In [None]:
brew.to_csv("./data/cleaned_brewerys.csv")
beer.to_csv("./data/cleaned_beers.csv")
comments.to_csv("./data/cleaned_reviews.csv")

## Subsetting data to Canadian only

In [None]:
can_brew = brew[brew['country'] == 'Canada']
can_brew.head()

In [None]:
can_beer = beer[beer['brewery_number'].isin(can_brew['brewery_number'])]
can_beer.head()

In [None]:
can_reviews = comments[comments['beer_number'].isin(can_beer['beer_number'])]
can_reviews.head()

### Save these files as csv for furthur analysis

In [None]:
can_brew.to_csv("./data/CanadianBreweries.csv")
can_beer.to_csv("./data/CanadianBeers.csv")
can_reviews.to_csv("./data/CanadianReviews.csv")