In [2]:
import requests
from bs4 import BeautifulSoup as bs
import json

In [7]:
def recipe_link_to_dict(link):
    """ Take in the link to a NYTimes Cooking recipe 
        and extract all of the necessary info in dict form
        This includes:
         * name
         * description
         * prep time
         * yield
         * a url for the photo
         * all of the tags (formerly categories)
         * all of the ingredients, broken down into name, quantity, unit, and comment
         * all of the steps: number and instruction text
         NOTE: We do not pull user comments and names because it doesn't come with the original html, it
         s jscript loaded when the user clicks 'show comments'
    """
    soup = bs(requests.get(link).content, 'html.parser')
    recipe = {}
    recipe['name'] = soup.find('h1', class_='recipe-title').text
    recipe['description'] = soup.find('div', class_='topnote').p.text
    recipe['preparation_time'] = soup.find('ul', class_="recipe-time-yield").li.text.replace('Time','').strip()
    recipe['yield'] = soup.find('span', itemprop="recipeYield").text.strip()
    recipe['photo_url'] = soup.find('img', itemprop='image')['src']

    # get all the tags that work
    recipe['tags'] = []
    for a in soup.find('p', class_="special-diets tag-block").contents:
        try:
            recipe['tags'].append({'name': a.text})
        except:
            pass
        
    # get ingredient info
    recipe['ingredients'] = []
    for li in soup.find_all('li', itemprop="recipeIngredient"):
        ingredient = {'quantity': li.find('span', class_='quantity').text}
        name = li.find('span', itemprop='name').text
        ingredient['name'] = name
        # unit is section of phrase BEFORE name, comment is section AFTER
        split = li.find('span', class_='ingredient-name').text.split(name,1) #maxsplit=1
        ingredient['unit'] = split[0]
        ingredient['comment'] = split[1]
        recipe['ingredients'].append(ingredient)
        
    # get the step info
    recipe['steps'] = []
    print recipe['name']
    for i, li in enumerate(soup.find('ol', itemprop='recipeInstructions').contents):
        
        print li
#         try:
#             recipe['steps'].append({'number':i, 'instructions':li.text})
#         except: # some elements aren't li and break on .text
#             pass
    
    return recipe

In [5]:
def write_json(data, fname='recipe_data.json'):
    with open(fname, 'w') as fp:
        json.dump(data, fp)
    

In [8]:
root = 'http://cooking.nytimes.com'
num_searches = 10
search_links = [ root+'/search?q=&page='+str(n+1) for n in range(num_searches)]

print "Getting recipe links"
recipe_links = []
for i, link in enumerate(search_links):
    soup = bs(requests.get(link).content, 'html.parser')
    new_recipe_links = [root+link['href'] for link in soup.find_all('a', class_="card-recipe-info")]
    recipe_links += new_recipe_links
print 'Number of recipes links grabbed: %i' % len(recipe_links)

# load each recipe link and convert it to a dict
recipes = []
for i, link in enumerate(recipe_links):
    print "Downloading recipe #%i" % (i+1)
    # it would appear a lot of these pages aren't as well-formed as I thought
    # or (more likely) my code is buggy, but there's plenty of ones that work
    # so just skip them gracefully
    try:
        recipes.append(recipe_link_to_dict(link))
    except (NameError, TypeError, ValueError, AttributeError) as e:
        print "SKIPPED", e
    
print "Finished, %i total recipes successfully downloaded. Now writing out to file" % len(recipes)
write_json(recipes)
print "All Done!"

Getting recipe links
Number of recipes links grabbed: 480
Downloading recipe #1

      Fiery Sweet Potatoes
    


<li>Heat oven to 375 degrees. Bake potatoes on a baking sheet until very soft, about 1 hour 15 minutes. When cool enough to handle, peel and mash.</li>


<li>In a small saucepan, heat coconut milk with curry paste over low heat. Mix coconut milk mixture, half the sugar, half the butter and salt into potatoes. Keep warm until ready to serve, or cover and refrigerate up to two days.</li>


<li>At least 30 minutes before serving, heat oven to 425 degrees. Put potatoes in a baking dish, cover with foil and bake for 20 minutes. Uncover potatoes, dot with remaining butter and sugar and broil until brown and crusty on top, checking often to prevent scorching.</li>


Downloading recipe #2

      Caramelized Corn With Fresh Mint
    


<li>If using frozen corn, drain between layers of paper towels until thawed, about 30 minutes.</li>


<li>In a wide skillet, melt half the butter ov

ConnectionError: ('Connection aborted.', gaierror(8, 'nodename nor servname provided, or not known'))

In [18]:
import json
from datetime import datetime
from hashlib import sha256
def format_date(date):
    TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
    return datetime.strftime(date, TIMESTAMP_FORMAT)

In [7]:
with open('recipe_data.json' ,'r') as f:
    data = json.loads(f.read())
        

In [45]:
import urllib
def download_photo(url):
    try:
        photo_hash = sha256(url).hexdigest()
        localname = photo_hash+".jpg"
        urllib.urlretrieve(url, localname)
        return localname
    except:
        return "default.jpg"

for d in data[:10]:
    download_photo(d['photo_url'])

In [63]:
print data[0].keys()
ingredients = [[(i['name'], i['unit']) for i in d['ingredients']] for d in data]
for i in ingredients:
    print i
    print

[u'description', u'ingredients', u'preparation_time', u'tags', u'yield', u'steps', u'photo_url', u'name']
[(u'soy sauce', u'tablespoon '), (u'oyster sauce', u'tablespoons '), (u'sugar', u'Pinch of '), (u'rice vinegar', u'tablespoons '), (u'neutral oil', u'tablespoon '), (u'garlic', u'tablespoon finely minced '), (u'baby bok choy', u'bunches of ')]

[(u'neutral oil', u'tablespoon '), (u'ginger', u'1 1/2-inch piece fresh '), (u'jalape\xf1o pepper', u''), (u'orange zest', u'tablespoons '), (u'garlic', u''), (u'brown sugar', u'cup light '), (u'rice vinegar', u'cup '), (u'soy sauce', u'cup '), (u'fish sauce', u'tablespoon '), (u'egg white', u'large '), (u'cornstarch', u'tablespoon '), (u'kosher salt', u'pinch '), (u'rib-eye steak', u'boneless '), (u'neutral oil', u'cup '), (u'scallions', u''), (u'red chiles', u'dried ')]

[(u'extra-virgin olive oil', u'tablespoons '), (u'garlic', u''), (u'red pepper', u'Pinch of crushed '), (u'mussels', u'pounds '), (u'white wine', u'cup '), (u'baguette', u

In [11]:
import sqlite3
def db_connect():
    return sqlite3.connect('test.db')

In [12]:
conn = db_connect()

In [20]:
# create the main user
email = "chef@goodfood.com"
first_name= "Anthony"
last_name= "Bourdain"
hashed_password=sha256("ILoveCooking").hexdigest()
icon_code=1
created_at = format_date(datetime.now())
last_login_at = format_date(datetime.now())
user = (email, first_name, last_name, hashed_password, icon_code, created_at, last_login_at)

64


In [None]:
conn.execute("""INSERT INTO users (email, first_name, last_name, hashed_password, icon_code, created_at, last_login_at)
                VALUES email=%s, first_name=%s, last_name=%s, hashed_password=%s, icon_code=%i, 
                created_at=%s, last_login_at=%s""", user)

In [29]:
import psycopg2
psycopg2.paramstyle

'pyformat'

In [36]:
unique_ingredients = list(set([i['name'] for d in data for i in d['ingredients']]))
ingredients = [{'name':i} for i in unique_ingredients]
print ingredients

[{'name': u'iodized salt'}, {'name': u'rosemary leaves'}, {'name': u'duck breasts'}, {'name': u'Worcestershire sauce'}, {'name': u'za\u2019atar'}, {'name': u'Sugar'}, {'name': u'plain kefir'}, {'name': u'Bouquet garni'}, {'name': u'porcini'}, {'name': u'sriracha sauce'}, {'name': u'milk'}, {'name': u'vanilla bean'}, {'name': u'blueberries'}, {'name': u'Thai basil leaves'}, {'name': u'pineapple juice'}, {'name': u'sesame paste'}, {'name': u'bunch scallions'}, {'name': u'dark brown sugar'}, {'name': u'pork shoulder'}, {'name': u'chopped fresh dill'}, {'name': u'cayenne pepper'}, {'name': u'Mint leaf'}, {'name': u'Boiling water'}, {'name': u'unsweetened coconut'}, {'name': u'couscous'}, {'name': u'heavy cream'}, {'name': u'white bread'}, {'name': u'Primitivo Quiles Vermouth Rojo'}, {'name': u'blanched almonds'}, {'name': u'caraway seeds'}, {'name': u'turnips'}, {'name': u'vanilla extract'}, {'name': u'wasabi paste'}, {'name': u'navel oranges'}, {'name': u'Kaffir lime leaves'}, {'name': u'

In [40]:
categories = [ t for d in data for t in d['tags']]
print len(categories), categories[:10]
unique_categories = list(set([ t['name'] for d in data for t in d['tags']]))
categories = [{'name':i} for i in unique_ingredients]
print len(categories)

979 [{u'name': u'Bok Choy'}, {u'name': u'Oyster Sauce'}, {u'name': u'Beef'}, {u'name': u'Orange'}, {u'name': u'Scallion'}, {u'name': u'Chinese'}, {u'name': u'Asian'}, {u'name': u'Baguette'}, {u'name': u'Mussel'}, {u'name': u'French'}]
841


In [57]:
import numpy as np
def get_random_nutritional_info():
    options = ["Eh it's not too good.",
               "This thing is a super food!",
               "Chances of quadruple bypass after eating this are 50/50...",
               "Pretty average I guess.",
               "Freakin awesome!"
              ]
    choice = int(np.floor(np.random.rand()*len(options)))
    return options[choice] + " (NOTE: This was randomly generated)"
for _ in range(10):
    print get_random_nutritional_info()

Chances of quadruple bypass after eating this are 50/50... (NOTE: This was randomly generated)
Pretty average I guess. (NOTE: This was randomly generated)
This thing is a super food! (NOTE: This was randomly generated)
Freakin awesome! (NOTE: This was randomly generated)
Chances of quadruple bypass after eating this are 50/50... (NOTE: This was randomly generated)
This thing is a super food! (NOTE: This was randomly generated)
Freakin awesome! (NOTE: This was randomly generated)
Chances of quadruple bypass after eating this are 50/50... (NOTE: This was randomly generated)
This thing is a super food! (NOTE: This was randomly generated)
Freakin awesome! (NOTE: This was randomly generated)
