In [59]:
# Import necessary libraries
import requests
import pandas as pd
import time
import datetime
from bs4 import BeautifulSoup

# Establish base url
base_url = 'http://www.brewtoad.com'
scan_url = 'https://www.brewtoad.com/recipes?&sort=rank&recipe_types%5B%5D=3'

# Crawl configuration.
# The loop previously ran `range(1, 10)`, i.e. 9 listing pages, while reporting
# progress against a hard-coded 2500. Both now derive from TOTAL_PAGES; raise it
# (e.g. to 2500) for the full crawl.
TOTAL_PAGES = 9
CHECKPOINT_EVERY = 100   # write a CSV after this many listing pages
REQUEST_TIMEOUT = 30     # seconds before a stalled HTTP request is abandoned


def _first_two_tokens(text):
    """Return the first two whitespace-separated tokens of `text`, e.g. '9.0 lb' -> ('9.0', 'lb').

    Raises IndexError when fewer than two tokens are present.
    """
    tokens = text.split()
    return tokens[0], tokens[1]


def _parse_fermentable(row):
    """Convert one <tr> of the `fermentables` table into a dict of typed fields."""
    cells = row.find_all('td')
    amount, amount_unit = _first_two_tokens(cells[0].text)
    color, color_unit = _first_two_tokens(cells[5].text)
    return {
        'amount': float(amount),
        'amount_unit': amount_unit,
        'name': cells[1].text.strip(),
        'maltster': cells[2].text.strip(),
        'use': cells[3].text.strip(),
        'PPG': int(cells[4].text.strip()),
        'color': int(color),
        'color_unit': color_unit,
    }


def _parse_hop(row):
    """Convert one <tr> of the `hops` table into a dict of typed fields."""
    cells = row.find_all('td')
    amount, amount_unit = _first_two_tokens(cells[0].text)
    return {
        'amount': float(amount),
        'amount_unit': amount_unit,
        'name': cells[1].text.strip(),
        'time': int(cells[2].text.strip().split()[0]),
        'use': cells[3].text.replace('\n', ''),
        'form': cells[4].text.replace('\n', ''),
        # Percentages are stored as fractions (e.g. '5.5%' -> 0.055)
        'alpha': float(cells[5].text.strip().replace('%', '')) / 100,
    }


def _table_rows(page_soup, table_id):
    """Return every <tr> inside the <tbody> of the table with the given id."""
    return page_soup.find('table', attrs={'id': table_id}).find('tbody').find_all('tr')


def _parse_recipe(page_soup):
    """Extract a single recipe dict from a parsed recipe page.

    Raises AttributeError, IndexError or ValueError when an expected element is
    missing or a value cannot be converted; the caller decides how to react.
    """
    recipe = {}

    # Name and style
    recipe['name'] = page_soup.find('h1').text.strip()
    recipe['style'] = page_soup.find('div', attrs={'class': 'header-content'}).find('a').text.strip()

    # Headline stats, read positionally from the `value` divs
    base_stats = page_soup.find_all('div', attrs={'class': 'value'})
    recipe['OG'] = float(base_stats[0].text.strip())
    recipe['FG'] = float(base_stats[1].text.strip())
    recipe['IBU'] = int(base_stats[2].text.strip())
    recipe['SRM'] = int(base_stats[3].text.strip())
    recipe['ABV'] = float(base_stats[4].text.replace('%', '').strip()) / 100

    # Ingredient tables are kept as {row_index: {...}} mappings
    recipe['fermentables'] = {
        idx: _parse_fermentable(row)
        for idx, row in enumerate(_table_rows(page_soup, 'fermentables'))
    }
    recipe['hops'] = {
        idx: _parse_hop(row)
        for idx, row in enumerate(_table_rows(page_soup, 'hops'))
    }

    # Yeast: only the first row of the table is recorded
    yeast_cells = page_soup.find('table', attrs={'id': 'yeasts'}).find('tbody').find('tr').find_all('td')
    recipe['yeast_name'] = yeast_cells[0].text.strip()
    recipe['yeast_lab'] = yeast_cells[1].text.strip().replace('\n', ' ')
    recipe['yeast_attenuation'] = float(yeast_cells[2].text.strip().replace('%', '')) / 100

    # Batch size and boil time
    final_stats = page_soup.find('ul', attrs={'class': 'stat-group-thirds'}).find_all('li')
    volume, volume_units = _first_two_tokens(final_stats[0].text.replace('Batch Size', ''))
    boil_time, boil_time_units = _first_two_tokens(final_stats[1].text.replace('Boil Time', ''))
    recipe['volume'] = float(volume)
    recipe['volume_units'] = volume_units
    recipe['boil_time'] = int(boil_time)
    recipe['boil_time_units'] = boil_time_units

    # Flag (1/0) whether the page has an `extras` table
    recipe['extras'] = 1 if page_soup.find('table', attrs={'id': 'extras'}) else 0

    return recipe


def _save_checkpoint(records, page):
    """Write `records` to ./brewtoad_recipes_<page>.csv and return the DataFrame."""
    frame = pd.DataFrame(records)
    frame.to_csv('./brewtoad_recipes_' + str(page) + '.csv')
    return frame


# Create an empty list of recipes
recipes = []

# Set Start Time
start = time.time()

try:
    for page in range(1, TOTAL_PAGES + 1):
        # Fetch and parse one listing page; an HTTP error here stops the crawl
        # loudly (whatever was collected so far is still saved in `finally`).
        res = requests.get(url=scan_url, params={'page': page}, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')

        # Find all recipe links on the page
        recipe_links = soup.find_all('a', attrs={'class': 'recipe-link'})

        for link in recipe_links:
            new_url = base_url + link.attrs['href']
            try:
                new_res = requests.get(new_url, timeout=REQUEST_TIMEOUT)
                new_res.raise_for_status()
                recipes.append(_parse_recipe(BeautifulSoup(new_res.content, 'lxml')))
            except (requests.RequestException, AttributeError, IndexError, ValueError) as err:
                # One unreachable or malformed recipe should not abort the
                # whole crawl; report it and move on.
                print('Skipping', new_url, '-', repr(err))

        # Progress report
        elapsed = int(time.time() - start)
        minutes, seconds = divmod(elapsed, 60)
        hours, minutes = divmod(minutes, 60)
        print(page, 'Pages Scanned', str(round(100 * page / TOTAL_PAGES, 2)) + '% Complete', 'TIME ELAPSED: Hours:', hours, 'Minutes:', minutes, 'Seconds:', seconds)

        # Every CHECKPOINT_EVERY pages, create dataframe and send to csv
        if page % CHECKPOINT_EVERY == 0:
            if recipes:
                df = _save_checkpoint(recipes, page)
                recipes = []
            time.sleep(3)
finally:
    # Flush whatever has been collected since the last checkpoint, so a short
    # run, a trailing partial batch, or an interrupted crawl still leaves a
    # CSV (and `df`) behind.
    if recipes:
        df = _save_checkpoint(recipes, page)
        recipes = []


1 Pages Scanned 0.04% Complete TIME ELAPSED: Hours: 0.0 Minutes: 0.0 Seconds: 19


KeyboardInterrupt: 

In [45]:
# Preview the last rows of `df` (DataFrame.tail defaults to five rows)
df.tail()

Unnamed: 0,ABV,FG,IBU,OG,SRM,boil_time,boil_time_units,extras,fermentables,hops,name,style,volume,volume_units,yeast_attenuation,yeast_lab,yeast_name
54,0.061,1.014,42,1.061,38,60,min,1,"{0: {'amount': 18.0, 'amount_unit': 'lb', 'nam...","{0: {'amount': 2.0, 'amount_unit': 'oz', 'name...",Milk Chocolate Peanut Butter Stout,American Stout,10.0,gal,0.765,White Labs WLP001,California Ale Yeast
55,0.06,1.014,25,1.06,15,60,min,0,"{0: {'amount': 8.0, 'amount_unit': 'lb', 'name...","{0: {'amount': 1.0, 'amount_unit': 'oz', 'name...",Raging Red Irish Red Ale,Irish Red Ale,5.0,gal,0.765,White Labs WLP001,California Ale Yeast
56,0.053,1.014,32,1.054,6,60,min,1,"{0: {'amount': 10.1, 'amount_unit': 'lb', 'nam...","{0: {'amount': 0.7, 'amount_unit': 'oz', 'name...",Sierra Nevada Pale Ale Clone (All Grain),American Pale Ale,5.5,gal,0.75,Wyeast 1056,American Ale
57,0.051,1.014,25,1.052,15,90,min,0,"{0: {'amount': 22.0, 'amount_unit': 'lb', 'nam...","{0: {'amount': 4.0, 'amount_unit': 'oz', 'name...",Jamil's BYO Irish Red Ale,Irish Red Ale,12.0,gal,0.74,White Labs WLP004,Irish Ale Yeast
58,0.034,1.009,21,1.035,23,60,min,0,"{0: {'amount': 5.5, 'amount_unit': 'lb', 'name...","{0: {'amount': 0.5, 'amount_unit': 'oz', 'name...",Reaper's Mild sub willamette,Mild,5.5,gal,0.75,White Labs WLP039,Nottingham Ale Yeast


In [54]:
# Scratch check of floor division: 73 // 60 evaluates to 1.
# NOTE(review): presumably a sanity check for the `// 60` minutes arithmetic in the scraper's progress report — confirm, or delete this cell.
73//60

1

In [41]:
# Inspect the nested fermentables mapping ({row_index: {field: value}}) stored in the row labelled 12
df.loc[12, 'fermentables']

{0: {'amount': 9.0,
  'amount_unit': 'lb',
  'name': 'Maris Otter Pale Ale Malt',
  'maltster': 'Thomas Fawcett & Sons',
  'use': 'Mash',
  'PPG': 36,
  'color': 2,
  'color_unit': '°L'},
 1: {'amount': 1.5,
  'amount_unit': 'lb',
  'name': 'Munich Malt 10L',
  'maltster': 'Briess',
  'use': 'Mash',
  'PPG': 35,
  'color': 10,
  'color_unit': '°L'},
 2: {'amount': 0.75,
  'amount_unit': 'lb',
  'name': 'Caramel Malt 40L',
  'maltster': 'Briess',
  'use': 'Mash',
  'PPG': 34,
  'color': 40,
  'color_unit': '°L'},
 3: {'amount': 0.5,
  'amount_unit': 'lb',
  'name': 'Chocolate Wheat Malt',
  'maltster': 'Any',
  'use': 'Mash',
  'PPG': 33,
  'color': 400,
  'color_unit': '°L'},
 4: {'amount': 0.5,
  'amount_unit': 'lb',
  'name': 'Black Malt',
  'maltster': 'Briess',
  'use': 'Mash',
  'PPG': 32,
  'color': 500,
  'color_unit': '°L'}}

In [37]:
def fermentable_length(row):
    """Return the number of fermentables recorded for one recipe row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must support ``row['fermentables']``, whose value must support ``len``.

    Returns
    -------
    int
        ``len(row['fermentables'])``.

    Notes
    -----
    The earlier version compared the length against a ``max_len`` that was
    reset to 0 on every call, so it could only ever return the length itself.
    That no-op accumulator is removed; results are unchanged.
    """
    return len(row['fermentables'])

In [40]:
# Apply fermentable_length to every row (axis=1), then tally how many rows share each fermentable count
df.apply(fermentable_length, axis=1).value_counts()

4    20
5    13
3    10
2    10
6     3
7     2
1     1
dtype: int64