In [16]:
# Import necessary libraries
import requests
import pandas as pd
import time
import datetime
from bs4 import BeautifulSoup

# Establish base url
base_url = 'http://www.brewtoad.com'
scan_url = 'https://www.brewtoad.com/recipes?&sort=rank&recipe_types%5B%5D=3'

# Scrape configuration (raise N_PAGES for a full crawl)
N_PAGES = 2              # listing pages to scan: pages 1..N_PAGES
PROGRESS_EVERY = 3       # print a progress line every this many pages
CHECKPOINT_EVERY = 100   # write the accumulated recipes to CSV every this many pages
REQUEST_TIMEOUT = 30     # seconds; requests waits indefinitely when no timeout is given


def _fetch_soup(url, params=None):
    """Download `url` and return its body parsed with the 'lxml' parser.

    Raises requests.RequestException on connection errors, timeouts and
    non-success HTTP status codes.
    """
    response = requests.get(url=url, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')


def _table_rows(soup, table_id):
    """Return the <tr> elements inside the <tbody> of the table with id `table_id`."""
    return soup.find('table', attrs={'id': table_id}).find('tbody').find_all('tr')


def _amount_and_unit(cell):
    """Split a cell's text on whitespace into (float amount, unit string)."""
    tokens = cell.text.strip().split()
    return float(tokens[0]), tokens[1]


def _name_and_generic(cell):
    """Return (name, generic) for an ingredient name cell.

    When the cell contains an <a class="js-modal"> link, the name is the
    'data-title' attribute of the cell's first link and generic is True;
    otherwise the name is the cell's stripped text and generic is False.
    """
    if cell.find('a', attrs={'class': 'js-modal'}):
        return cell.find('a')['data-title'], True
    return cell.text.strip(), False


def _parse_fermentables(soup):
    """Parse the 'fermentables' table into {row_index: fermentable dict}."""
    fermentables = {}
    for j, tr in enumerate(_table_rows(soup, 'fermentables')):
        row = tr.find_all('td')
        amount, amount_unit = _amount_and_unit(row[0])
        name, generic = _name_and_generic(row[1])
        color_tokens = row[5].text.strip().split()
        fermentables[j] = {
            'amount': amount,
            'amount_unit': amount_unit,
            'name': name,
            'maltster': row[2].text.strip(),
            'use': row[3].text.strip(),
            'PPG': int(row[4].text.strip()),
            'color': int(color_tokens[0]),
            'color_unit': color_tokens[1],
            'generic': generic,
        }
    return fermentables


def _parse_hops(soup):
    """Parse the 'hops' table into {row_index: hop dict}."""
    hops = {}
    for j, tr in enumerate(_table_rows(soup, 'hops')):
        row = tr.find_all('td')
        amount, amount_unit = _amount_and_unit(row[0])
        # The name chosen here is kept as-is; it used to be overwritten by the
        # raw cell text right after the lookup, which discarded 'data-title'.
        name, generic = _name_and_generic(row[1])
        hops[j] = {
            'amount': amount,
            'amount_unit': amount_unit,
            'name': name,
            'time': int(row[2].text.strip().split()[0]),
            'use': row[3].text.replace('\n', ''),
            'form': row[4].text.replace('\n', ''),
            'alpha': float(row[5].text.strip().replace('%', ''))/100,
            'generic': generic,
        }
    return hops


def _parse_yeasts(soup):
    """Parse the 'yeasts' table into {row_index: yeast dict}."""
    yeasts = {}
    for j, tr in enumerate(_table_rows(soup, 'yeasts')):
        yeast_row = tr.find_all('td')
        # Read the fallback name from the yeast row itself (previously it was
        # read from the last row of the hops table by mistake).
        name, generic = _name_and_generic(yeast_row[0])
        yeasts[j] = {
            'name': name,
            'generic': generic,
            'yeast_lab': yeast_row[1].text.strip().replace('\n', ' '),
            'yeast_attenuation': float(yeast_row[2].text.strip().replace('%', ''))/100,
        }
    return yeasts


def _parse_extras(soup):
    """Parse the 'extras' table into {row_index: extra dict}, or 0 when the page has no such table."""
    if not soup.find('table', attrs={'id': 'extras'}):
        return 0
    extras = {}
    for j, tr in enumerate(_table_rows(soup, 'extras')):
        extra_rows = tr.find_all('td')
        amount, amount_unit = _amount_and_unit(extra_rows[0])
        name, generic = _name_and_generic(extra_rows[1])
        extras[j] = {
            'amount': amount,
            'amount_unit': amount_unit,
            'name': name,
            'time': extra_rows[2].text.strip(),
            'use': extra_rows[3].text.strip(),
            'generic': generic,
        }
    return extras


def _parse_recipe(soup, url):
    """Build the recipe dictionary for one parsed recipe page.

    Raises whatever the underlying lookups raise (AttributeError, IndexError,
    ValueError, KeyError, ...) when the page lacks an expected element or a
    value cannot be converted; the caller decides how to handle that.
    """
    recipe = {}

    # Find Name and Style
    recipe['name'] = soup.find('h1').text.strip()
    recipe['style'] = soup.find('div', attrs={'class': 'header-content'}).find('a').text.strip()

    # Include link to recipe
    recipe['link'] = url

    # 1 when the page shows the 'icon icon-check' element, else 0
    recipe['meets_guidelines'] = 1 if soup.find('i', attrs={'class': 'icon icon-check'}) else 0

    # Base stats are read positionally from the 'value' divs
    base_stats = soup.find_all('div', attrs={'class': 'value'})
    recipe['OG'] = float(base_stats[0].text.strip())
    recipe['FG'] = float(base_stats[1].text.strip())
    recipe['IBU'] = int(base_stats[2].text.strip())
    recipe['SRM'] = int(base_stats[3].text.strip())
    recipe['ABV'] = float(base_stats[4].text.replace('%', '').strip())/100

    recipe['fermentables'] = _parse_fermentables(soup)
    recipe['hops'] = _parse_hops(soup)
    recipe['yeast'] = _parse_yeasts(soup)

    # Find boil time and batch size
    final_stats = soup.find('ul', attrs={'class': 'stat-group-thirds'}).find_all('li')
    volume_tokens = final_stats[0].text.replace('Batch Size', '').strip().split()
    recipe['volume'] = float(volume_tokens[0])
    recipe['volume_units'] = volume_tokens[1]
    boil_tokens = final_stats[1].text.replace('Boil Time', '').strip().split()
    recipe['boil_time'] = int(boil_tokens[0])
    recipe['boil_time_units'] = boil_tokens[1]

    recipe['extras'] = _parse_extras(soup)
    return recipe


# Create an empty list of recipes
recipes = []

# Set Start Time
start = time.time()

# Pull N_PAGES listing pages of recipes
for i in range(1, N_PAGES + 1):
    try:
        soup = _fetch_soup(scan_url, params={'page': i})
        # Find all recipe links on the page
        recipe_links = soup.find_all('a', attrs={'class': 'recipe-link'})
    except requests.RequestException as exc:
        # Best effort: a failed listing page is reported and skipped
        print('Skipping listing page', i, '-', repr(exc))
        recipe_links = []

    for r in recipe_links:
        href = r.attrs.get('href')
        if not href:
            continue
        new_url = base_url + href
        try:
            recipes.append(_parse_recipe(_fetch_soup(new_url), new_url))
        except Exception as exc:
            # Best effort: report and skip recipes that fail to download or
            # parse. Catching Exception (not a bare except) lets
            # KeyboardInterrupt stop the crawl.
            print('Skipping', new_url, '-', repr(exc))

    if i % PROGRESS_EVERY == 0:
        elapsed = time.time()-start
        hours = int(elapsed//3600)
        minutes = int((elapsed-(hours*3600))//60)
        seconds = int((elapsed-(hours*3600)-(minutes*60)))
        percent = round(100 * i / N_PAGES, 1)
        print(i, 'Pages Scanned', '\t', str(percent) + '% Complete \t TIME ELAPSED: {}:{}:{}'.format(hours, minutes, seconds))

    # Every CHECKPOINT_EVERY pages, create dataframe and send to csv
    if i % CHECKPOINT_EVERY == 0:

        # Convert list of dictionaries into dataframe
        df = pd.DataFrame(recipes)
        df.to_csv('./brewtoad_recipes_' + str(i) + '.csv')
        recipes = []
        time.sleep(3)

# NOTE: recipes collected after the last checkpoint stay in `recipes` and are
# not written to disk by this cell.


In [14]:
# Turn the scraped recipe records into a table, then show the yeast mapping
# stored for the recipe at index label 10.
df = pd.DataFrame(recipes)
df.loc[10, 'yeast']

{0: {'name': 'American Ale II',
  'generic': True,
  'yeast_lab': 'Wyeast 1272',
  'yeast_attenuation': 0.74}}

In [17]:
# Print a recipe's index label once for every yeast entry whose 'generic'
# flag is False.
for label, yeast_map in df['yeast'].items():
    for entry in yeast_map.values():
        if entry['generic'] == False:
            print(label)

In [54]:
# Scratch check of floor division: 73 // 60 evaluates to 1
73//60

1

In [41]:
# Inspect the fermentables mapping stored for the recipe at index label 12
df.loc[12, 'fermentables']

{0: {'amount': 9.0,
  'amount_unit': 'lb',
  'name': 'Maris Otter Pale Ale Malt',
  'maltster': 'Thomas Fawcett & Sons',
  'use': 'Mash',
  'PPG': 36,
  'color': 2,
  'color_unit': '°L'},
 1: {'amount': 1.5,
  'amount_unit': 'lb',
  'name': 'Munich Malt 10L',
  'maltster': 'Briess',
  'use': 'Mash',
  'PPG': 35,
  'color': 10,
  'color_unit': '°L'},
 2: {'amount': 0.75,
  'amount_unit': 'lb',
  'name': 'Caramel Malt 40L',
  'maltster': 'Briess',
  'use': 'Mash',
  'PPG': 34,
  'color': 40,
  'color_unit': '°L'},
 3: {'amount': 0.5,
  'amount_unit': 'lb',
  'name': 'Chocolate Wheat Malt',
  'maltster': 'Any',
  'use': 'Mash',
  'PPG': 33,
  'color': 400,
  'color_unit': '°L'},
 4: {'amount': 0.5,
  'amount_unit': 'lb',
  'name': 'Black Malt',
  'maltster': 'Briess',
  'use': 'Mash',
  'PPG': 32,
  'color': 500,
  'color_unit': '°L'}}

In [37]:
def fermentable_length(row):
    """Return how many fermentables are recorded for one recipe row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'fermentables' whose value supports len(),
        e.g. a DataFrame row passed in by ``df.apply(..., axis=1)``.

    Returns
    -------
    int
        ``len(row['fermentables'])``.
    """
    # The earlier version compared against a local "max_len" that was reset
    # to 0 on every call, so it never tracked a maximum across rows and always
    # returned this row's own length; that dead bookkeeping is removed.
    return len(row['fermentables'])

In [40]:
# Distribution of the per-recipe fermentable count: apply fermentable_length
# to every row, then tally how often each count occurs.
df.apply(fermentable_length, axis=1).value_counts()

4    20
5    13
3    10
2    10
6     3
7     2
1     1
dtype: int64

In [14]:
import requests
import pandas as pd
import time
import datetime
from bs4 import BeautifulSoup

# Fetch a single recipe page to experiment with the parsing logic.
u = 'https://www.brewtoad.com/recipes/firework-cream-ale'
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
res = requests.get(u, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
bs = BeautifulSoup(res.content, 'lxml')

In [15]:
# Parse the fermentables table of the fetched page into {row_index: details}.
grains = bs.find('table', attrs={'id': 'fermentables'}).find('tbody').find_all('tr')
fermentables = {}
for position, table_row in enumerate(grains):
    cells = table_row.find_all('td')

    # First cell holds "<amount> <unit>"
    amount_tokens = cells[0].text.strip().split()
    amount_value = float(amount_tokens[0])
    amount_unit = amount_tokens[1]

    # The flag is True when the name cell contains a 'js-modal' link; the name
    # then comes from the 'data-title' attribute of the cell's first link.
    has_modal_link = bool(cells[1].find('a', attrs={'class': 'js-modal'}))
    if has_modal_link:
        ingredient_name = cells[1].find('a')['data-title']
    else:
        ingredient_name = cells[1].text.strip()

    maltster = cells[2].text.strip()
    usage = cells[3].text.strip()
    ppg = int(cells[4].text.strip())

    # Sixth cell holds "<color> <unit>"
    color_tokens = cells[5].text.strip().split()

    fermentables[position] = {
        'amount': amount_value,
        'amount_unit': amount_unit,
        'name': ingredient_name,
        'maltster': maltster,
        'use': usage,
        'PPG': ppg,
        'color': int(color_tokens[0]),
        'color_unit': color_tokens[1],
        'generic': has_modal_link,
    }
fermentables


{0: {'amount': 8.0,
  'amount_unit': 'lb',
  'name': '2-Row (US)',
  'maltster': 'Rahr',
  'use': 'Mash',
  'PPG': 38,
  'color': 1,
  'color_unit': '°L',
  'generic': True},
 1: {'amount': 2.0,
  'amount_unit': 'lb',
  'name': 'Flaked Corn',
  'maltster': 'Any',
  'use': 'Mash',
  'PPG': 37,
  'color': 1,
  'color_unit': '°L',
  'generic': True},
 2: {'amount': 1.0,
  'amount_unit': 'lb',
  'name': 'Crystal 15L (CA)',
  'maltster': 'Great Western',
  'use': 'Mash',
  'PPG': 10,
  'color': 15,
  'color_unit': '°L',
  'generic': True},
 3: {'amount': 1.0,
  'amount_unit': 'lb',
  'name': 'Carapils (Dextrine Malt) (US)',
  'maltster': 'Briess',
  'use': 'Mash',
  'PPG': 34,
  'color': 1,
  'color_unit': '°L',
  'generic': True}}