# get urls

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
import pandas as pd

In [None]:
# Paginated listing sources. Each entry is a (url_template, page_count)
# tuple that gets unpacked into template_to_urls(*entry); "{}" in the
# template is the placeholder for the page number.
# NOTE(review): the count is written as N+1 here and template_to_urls adds
# another +1 to its range bound - confirm the extra page is intended.
base_url_alton = (
    "https://www.foodnetwork.com/profiles/talent/alton-brown/recipes" + "/recentlyaired-/p/{}",
    54 + 1,
)
base_url_show = (
    "https://www.foodnetwork.com/shows/good-eats/recipes" + "/recentlyaired-/p/{}",
    46 + 1,
)

In [None]:
def template_to_urls(template, max):
    """Expand a page-number template into the list of concrete page URLs.

    ``template`` is formatted once for every page number from 1 up to and
    including ``max``; the results are returned in ascending page order.
    """
    urls = []
    for page in range(1, max + 1):
        urls.append(template.format(page))
    return urls
    
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup (lxml).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout ``requests.get`` may block indefinitely
            on an unresponsive host.

    Returns:
        The parsed page as a ``BeautifulSoup`` object.

    Raises:
        ValueError: If the response status is not OK.
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        raise ValueError("{} could not be retrieved.".format(url))
    return BeautifulSoup(response.text, "lxml")

def soup_to_reviews(soup):
    """Collect the rating texts for every recipe entry on a listing page.

    Returns a dict keyed by ``"https:" + href`` of each entry's ``.a`` link.
    Every value is a ``(stars_title, ratings_sum_text)`` tuple; an element
    is ``None`` when the corresponding rating element is not found in the
    entry.
    """
    listing = soup.find(attrs={'class': "l-List"})
    recipe_reviews = {}
    for entry in listing.find_all(attrs={'class': "m-MediaBlock__m-TextWrap"}):
        recipe_url = "https:" + entry.a.get("href")
        # Look each rating element up once and reuse the result.
        stars_tag = entry.find(attrs={'class': "gig-rating-stars"})
        count_tag = entry.find(attrs={'class': "gig-rating-ratingsum"})
        recipe_reviews[recipe_url] = (
            stars_tag.get('title') if stars_tag else None,
            count_tag.text if count_tag else None,
        )
    return recipe_reviews

def soup_to_recipes(soup):
    """Return the recipe URLs found on a listing page, in document order.

    Each URL is ``"https:"`` prepended to the ``href`` of the entry's ``.a``
    link inside the ``l-List`` container.
    """
    listing = soup.find(attrs={'class': "l-List"})
    recipe_urls = []
    for entry in listing.find_all(attrs={'class': "m-MediaBlock__m-TextWrap"}):
        recipe_urls.append("https:" + entry.a.get("href"))
    return recipe_urls

In [None]:
# Crawl every listing page of the Alton Brown profile and merge the
# per-page {recipe_url: (stars, ratings)} dicts into recipe_urls1.
recipe_urls1 = {}
for url in tqdm(template_to_urls(*base_url_alton)):
    page_soup = get_soup(url)
    new_urls = soup_to_reviews(page_soup)
    # Report pages that do not yield exactly 15 entries.
    if len(new_urls) != 15:
        print(len(new_urls), url)
    recipe_urls1.update(new_urls)

print(len(recipe_urls1), '>', len(set(recipe_urls1)))

In [None]:
# Crawl every listing page of the Good Eats show and merge the per-page
# {recipe_url: (stars, ratings)} dicts into recipe_urls2.
recipe_urls2 = {}
for url in tqdm(template_to_urls(*base_url_show)):
    page_soup = get_soup(url)
    new_urls = soup_to_reviews(page_soup)
    # Report pages that do not yield exactly 15 entries.
    if len(new_urls) != 15:
        print(len(new_urls), url)
    recipe_urls2.update(new_urls)

print(len(recipe_urls2), '>', len(set(recipe_urls2)))

In [None]:
def _parse_review(rating):
    """Convert a ``(stars_title, ratings_sum_text)`` pair to ``(stars, reviews)``.

    Each text is cut at its first space and the leading token is converted
    (stars -> float, reviews -> int). A missing or empty text yields None.
    """
    stars_text, reviews_text = rating[0], rating[1]
    stars = float(stars_text.split(' ')[0]) if stars_text else None
    reviews = int(reviews_text.split(' ')[0]) if reviews_text else None
    return stars, reviews


# Collect one record per recipe name and build the frame in a single step:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
review_rows = {}
for url, v in recipe_urls1.items():
    name = url.split('/')[-1]
    stars, reviews = _parse_review(v)
    # Names (last URL segment) are expected to be unique within one listing.
    assert name not in review_rows
    review_rows[name] = {'url': url, 'stars': stars, 'reviews': reviews}
for url, v in recipe_urls2.items():
    name = url.split('/')[-1]
    stars, reviews = _parse_review(v)
    if name in review_rows:
        # Recipe already seen in the first listing: both sources must agree.
        known = review_rows[name]
        assert known['url'] == url
        if stars: assert known['stars'] == stars
        if reviews: assert known['reviews'] == reviews
        continue
    review_rows[name] = {'url': url, 'stars': stars, 'reviews': reviews}
df_reviews = pd.DataFrame(
    list(review_rows.values()),
    index=list(review_rows),
    columns=['url', 'stars', 'reviews'],
)
df_reviews.sort_values(['stars', 'reviews'], ascending=False).head(10)

In [None]:
# Every recipe URL seen in either listing (dict keys only).
recipe_urls = set(recipe_urls1) | set(recipe_urls2)
print(len(recipe_urls))

In [None]:
# Persist the URLs one per line. They are sorted so the file content is
# reproducible between runs, and the encoding is stated explicitly instead
# of relying on the platform default.
url_file = Path('recipe_urls.txt')
url_file.write_text('\n'.join(sorted(recipe_urls)), encoding='utf8')
# Round-trip check; splitlines() also handles the empty-set case, where
# split('\n') would produce a spurious '' entry.
assert recipe_urls == set(url_file.read_text(encoding='utf8').splitlines())

In [None]:
# Compare the saved URL set with the URLs held in df_reviews:
# sizes of each set, of both differences and of the union.
a = recipe_urls
b = set(df_reviews['url'].unique())
len(a), len(b), len(a - b), len(b - a), len(a | b)

In [None]:
# URLs present in df_reviews but missing from recipe_urls.
set(df_reviews['url'].unique()).difference(recipe_urls)

In [None]:
# URLs present in recipe_urls but missing from df_reviews.
recipe_urls.difference(set(df_reviews['url'].unique()))

# get recipes

In [None]:
import requests
from tqdm import tqdm
from pathlib import Path

In [None]:
# Load the saved URL list, one URL per line. Blank lines (for example from
# a trailing newline) are dropped: an empty string would otherwise reach
# download_urls and fail on url.split('//')[1].
recipe_urls = {
    line.strip()
    for line in Path('recipe_urls.txt').read_text(encoding='utf8').splitlines()
    if line.strip()
}

In [None]:
def download_urls(urls, timeout=30):
    """Fetch every URL in ``urls`` and store the body as a local ``.html`` file.

    The target path is built from the URL segments after the host, with
    ``.html`` appended, relative to the current working directory. A URL
    whose file already exists with non-blank content is skipped, so the
    function can be re-run to resume an interrupted download.

    Args:
        urls: Iterable of absolute URLs (each must contain ``//``).
        timeout: Seconds to wait for a response before the request is
            counted as an error.

    Returns:
        None. A one-line summary of skipped / failed / downloaded counts is
        printed.
    """
    skipped = []
    error = []
    downloaded = []
    for url in tqdm(urls):
        # Drop scheme and host, keep the remaining path as the file location.
        filename = Path('/'.join(url.split('//')[1].split('/')[1:]) + '.html')
        if filename.exists() and filename.read_text(encoding='utf8').strip():
            skipped.append(url)
            continue

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # One unreachable page or timeout must not abort the whole batch.
            error.append(url)
            continue
        if not response.ok:
            error.append(url)
            continue

        filename.parent.mkdir(parents=True, exist_ok=True)
        filename.write_text(response.text, encoding='utf8')
        downloaded.append(url)
    print(f'skipped: {len(skipped)}  error: {len(error)}  downloaded: {len(downloaded)}  TOTAL: {len(skipped + error + downloaded)}')


# Download every recipe page that is not already stored on disk.
download_urls(recipe_urls)

# process recipes

In [None]:
from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
import json

In [None]:
# All downloaded recipe pages anywhere below ./recipes, in sorted path order.
recipe_files = sorted(Path('recipes').rglob('*.html'))
recipe_files[:5]

In [None]:
def file_to_soup(filename):
    """Read ``filename`` as UTF-8 text and parse it with BeautifulSoup (lxml)."""
    html = Path(filename).read_text(encoding='utf8')
    return BeautifulSoup(html, 'lxml')


def soup_to_data(soup):
    """Extract the recipe fields of a parsed recipe page into a flat dict.

    Returns ``None`` when the page has no ``o-Recipe`` element. Otherwise the
    dict holds, in insertion order: ``Title``, ``Author``, ``Rating_stars``,
    ``Rating_num``, one entry per label found in the recipe info lists, one
    entry per source label in the footer (``Episodes`` is stored as
    ``Episode``), ``Categories`` (``;``-joined tag texts, if a tag list
    exists), ``Ingredients`` plus one ``Ingredients.<n>.<section>`` list per
    ``h6`` heading (if an ingredient body exists), and ``Directions``.

    Raises:
        AssertionError: If a label or ingredient section would overwrite a
            key that is already present.
    """
    recipe = soup.find(attrs={'class': "o-Recipe"})
    if not recipe:
        return None

    def label_and_value(tag):
        # First <span> is the label (surrounding ':' and blanks removed);
        # the remaining spans are joined line by line as the value.
        label_span, *value_spans = tag.find_all('span')
        label = label_span.text.strip(':').strip()
        value = '\n'.join(span.text.strip() for span in value_spans)
        return label, value

    ret = {}

    # --- summary ---
    summary = recipe.find(attrs={'class': "m-RecipeSummary"})
    ret['Title'] = summary.find(attrs={'class': "o-AssetTitle__a-HeadlineText"}).text
    ret['Author'] = summary.find(attrs={'class': "o-Attribution__m-TextWrap"}).a.text
    # NOTE(review): these two class strings end with a space, unlike the ones
    # used in soup_to_reviews - confirm they match with the installed bs4.
    stars_tag = summary.find(attrs={'class': "gig-rating-stars "})
    ret['Rating_stars'] = stars_tag and stars_tag.get('title')
    count_tag = summary.find(attrs={'class': "gig-rating-ratingsum "})
    ret['Rating_num'] = count_tag and count_tag.text

    info = summary.find(attrs={'class': "o-RecipeInfo"})
    for info_list in info.find_all('ul'):
        for entry in info_list.find_all('li'):
            k, v = label_and_value(entry)
            assert k not in ret
            ret[k] = v

    # --- footer ---
    footer = recipe.find(attrs={'class': "recipe-body-footer"})
    sources = footer.find(attrs={'class': "o-VideoPromo"})
    if sources:
        for source in sources.find_all(attrs={'class': "m-MediaBlock__a-Source"}):
            k, v = label_and_value(source)
            if k == "Episodes":
                k = k[:-1]  # store under the singular key
            assert k not in ret
            ret[k] = v
    tag_list = footer.find(attrs={'class': "o-Capsule__m-TagList m-TagList"})
    if tag_list:
        ret['Categories'] = ';'.join(link.text for link in tag_list.find_all('a'))

    # --- body / ingredients ---
    ingredients = recipe.find(attrs={'class': "o-Ingredients__m-Body"})
    if ingredients:
        current_key = "Ingredients"
        section_count = 0
        ret[current_key] = []
        for node in ingredients.find_all(['p', 'h6']):
            if node.name != 'p':
                # An h6 heading opens a new, numbered ingredient section.
                section = node.text.strip().strip(':')
                section_count += 1
                current_key = f"Ingredients.{section_count}.{section}"
                assert current_key not in ret
                ret[current_key] = []
                continue
            ret[current_key].append(node.text)

    # --- body / method ---
    method = recipe.find(attrs={'class': "o-Method__m-Body"})
    ret['Directions'] = [step.text.strip() for step in method.find_all('li')]
    return ret

# soup = file_to_soup(recipe_files[22])
# soup_to_data(soup)

In [None]:
# Parse every downloaded page. The key is the text after the last '-' in
# the file name (up to the first '.'); '_' is appended until it is unique.
data = {}
for filename in tqdm(recipe_files):
    k = filename.name.partition('.')[0].rpartition('-')[2]
    while k in data:
        k += '_'
    parsed = soup_to_data(file_to_soup(filename))
    data[k] = parsed
    if parsed is None:
        print(f"{k} couldn't parse as recipe")

In [None]:
# IPython magic: opens the post-mortem debugger on the most recent exception.
# NOTE(review): presumably left over from debugging the parsing loop - confirm it can be removed.
%debug

In [None]:
# Save the parsed recipes as JSON, then reload the file to confirm that the
# data survives the round trip unchanged.
json_path = Path('recipe_data.json')
json_path.write_text(json.dumps(data))
assert data == json.loads(json_path.read_text())

# data

In [None]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import json

In [None]:
# Load the parsed recipes saved by the processing step.
data = json.loads(Path('recipe_data.json').read_text())

In [None]:
# Flatten each parsed recipe into one row: the ingredient and direction
# lists are replaced by their lengths so the remaining values are scalars.
# Rows are collected first and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 (and copied the frame per call).
rows = []
row_ids = []
d2 = None  # flattened dict of the most recent parseable recipe
for name, d in tqdm(data.items(), leave=False):
    row_ids.append(name.split('-')[-1])
    if not d:
        # Unparseable page: keep an all-missing row so the id is not lost.
        rows.append({})
        continue
    d2 = d.copy()
    if "Ingredients" in d2:
        # Count the main list and every "Ingredients.<n>.<section>" list.
        ingredient_keys = [k for k in d2 if k.startswith("Ingredients")]
        d2["n_Ingredients"] = sum(len(d2.pop(k)) for k in ingredient_keys)
    if "Directions" in d2:
        d2["n_Directions"] = len(d2.pop("Directions"))
    rows.append(d2)
df = pd.DataFrame(rows, index=row_ids)

# Column order: keys of the last parseable recipe first, then all others.
if d2 is not None:
    new_col_order = [k for k in d2 if k in df.columns] + [c for c in df.columns if c not in d2]
    df = df[new_col_order]

# pd.np was removed in pandas 2.0; a plain float NaN needs no extra import.
NAN = float('nan')

df.index.name = "foodnetwork_id"
df['Author'] = df['Author'].str.replace("Recipe courtesy of ", "", regex=False).astype('category')
df['Rating_stars'] = df['Rating_stars'].replace('pending rating', NAN).astype(float)
df['Rating_num'] = df['Rating_num'].astype(float)  # Int
df['Level'] = df['Level'].astype('category')
df['Show'] = df['Show'].astype('category')
df['n_Ingredients'] = df['n_Ingredients'].astype(float)  # Int
df['n_Directions'] = df['n_Directions'].astype(float)  # Int
df['Nutrition Info'] = df['Nutrition Info'].str.strip().replace('', NAN)
# "Total" may carry a note on a second line; split it into its own column.
total = df.Total.str.split('\n', expand=True)
df['Total'] = total[0]
# Column 1 only exists when at least one "Total" value had a second line.
df['note_Total'] = total[1].dropna() if 1 in total.columns else NAN
for c in 'Total Cook Inactive Prep Active'.split():
    df[f't_{c}'] = pd.to_timedelta(df.pop(c))
# Drop columns that ended up without any value.
df = df.dropna(axis=1, how='all')

df.head()

In [None]:
# Inspect the dtype of every column.
df.dtypes

In [None]:
# Persist the frame and verify that reloading it gives an equal frame.
pickle_path = 'recipe_df.pickle'
df.to_pickle(pickle_path)
assert pd.read_pickle(pickle_path).equals(df)

# use DF

In [None]:
import pandas as pd
import pint
from pprint import pprint

In [None]:
# Load the recipe frame from the local pickle file and preview it.
df = pd.read_pickle('recipe_df.pickle')
df.head()

In [None]:
# Flatten the ';'-separated category strings into one Series with an entry
# per category occurrence, then show the ten most frequent categories.
# A nested comprehension replaces sum(lists, []), which re-copies the
# accumulated list for every recipe.
category_lists = df.Categories.str.split(';').dropna()
df_categories = pd.Series([cat for cats in category_lists if cats for cat in cats])
df_categories.value_counts().head(10)

In [None]:
# Horizontal bar chart of the 25 most frequent categories; the order is
# reversed before plotting.
top_categories = pd.Series(df_categories).value_counts().head(25)
top_categories.iloc[::-1].plot.barh(figsize=(6, 6))

In [None]:
# First ten recipes whose category string contains "Crowd"; rows without
# categories count as non-matching.
crowd_mask = df.Categories.str.contains("Crowd", na=False)
df.loc[crowd_mask, ['Title', 'Episode', 'Yield', 't_Total']].head(10)

In [None]:
# Unit conversion table: destination unit -> source unit names that are
# searched for in an ingredient line.
c_dict = {
    'g': ['ounce', 'oz', 'pound'],
}
_UNIT_REGISTRY = None


def _get_unit_registry():
    """Return the shared pint registry, creating it on first use."""
    global _UNIT_REGISTRY
    if _UNIT_REGISTRY is None:
        _UNIT_REGISTRY = pint.UnitRegistry()
        _UNIT_REGISTRY.default_format = '.1f'
    return _UNIT_REGISTRY


def convert_unit(ingredient, convert_dict):
    """Convert the leading quantity of an ingredient line to another unit.

    For every ``destination -> [source units]`` entry of ``convert_dict`` the
    line is searched for a source unit name. The text up to the end of the
    word containing the match is handed to pint, converted to the
    destination unit and re-joined with the rest of the line.

    Args:
        ingredient: Free-form ingredient text, e.g. ``"8 ounces butter"``.
        convert_dict: Mapping of destination unit to an iterable of source
            unit names.

    Returns:
        The line with its quantity converted, or the unchanged line when no
        unit name occurs in it or no candidate could be converted.
    """
    s = ingredient
    for dst_unit, src_units in convert_dict.items():
        for unit in src_units:
            start = s.find(unit)
            if start == -1:
                continue
            # The quantity text ends at the first blank after the unit; when
            # the unit is the last word there is none, so take the whole line
            # (a raw -1 here would cut off the final character instead).
            end = s.find(' ', start)
            if end == -1:
                end = len(s)
            before, after = s[:end], s[end:]
            # Building a registry is expensive, so one instance is reused. It
            # is fetched outside the try so a broken pint setup still surfaces.
            ureg = _get_unit_registry()
            # NOTE(review): mixed fractions such as "1 1/2 ounces" are passed
            # to pint verbatim - confirm pint reads them as intended.
            try:
                return str(ureg.Quantity(before).to(dst_unit)) + after
            except Exception:
                # Best effort on free-form text: pint raises several unrelated
                # error types for input it cannot parse or convert (substring
                # hits like "oz" in "frozen", incompatible dimensions, ...).
                # Try the next candidate; the line is returned unchanged below.
                continue
    return s

def convert_units(ingredients, convert_dict):
    """Apply ``convert_unit`` to every ingredient line and return a new list.

    The result has one entry per input line, in the same order.
    """
    converted = []
    for line in ingredients:
        converted.append(convert_unit(line, convert_dict))
    return converted

# Preview the unit conversion on the first ten entries of `data`.
# An entry is None when its page could not be parsed, and the "Ingredients"
# key is only present when an ingredient body was found, so both cases are
# skipped instead of raising.
for k in list(data)[:10]:
    recipe = data[k]
    if not recipe or 'Ingredients' not in recipe:
        continue
    pprint(convert_units(recipe['Ingredients'], c_dict))

In [None]:
def print_recipe(recipe):
    """Pretty-print one parsed recipe dict to stdout.

    Entries whose value is ``None`` are skipped. ``Categories`` is split on
    ``;`` into a list. Every ingredient list - the plain ``Ingredients`` key
    as well as the sectioned ``Ingredients.<n>.<section>`` keys - is passed
    through ``convert_units`` with ``c_dict``. List values are printed under
    an underlined heading; all other values are printed as ``key: value``
    with the key padded to 16 characters.
    """
    for k, v in recipe.items():
        if v is None:
            continue
        if k == "Categories":
            v = v.split(';')
        elif k.startswith("Ingredients") and isinstance(v, list):
            # Sectioned ingredient lists need the same conversion as the
            # main list; only the exact "Ingredients" key was handled before.
            v = convert_units(v, c_dict)
        if isinstance(v, list):
            print()
            print(k)
            print("=" * len(k))
            pprint(v)
        else:
            print(f'{k+":":16} {v}')
# Print one stored recipe, looked up by its key in `data`.
print_recipe(data['1939636'])