In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [2]:
# Build the HTTP headers once; the scraping requests in later cells pass this
# same dict, so one User-Agent string is reused for the whole session.
ua = UserAgent()
# Presumably a randomly chosen browser User-Agent string (fake_useragent);
# it is read a single time here, not once per request.
random_agent = ua.random
headers = {'User-Agent': random_agent}

In [3]:
# Starting point of the crawl: the "123" page of the A-Z recipe index. Its
# pagination links are scraped to obtain the per-letter index URLs.
url = 'https://www.foodnetwork.com/recipes/recipes-a-z/123'

In [4]:
# Fetch the A-Z index landing page and collect the URL of every per-letter
# index page from its pagination bar.
r = requests.get(url=url, headers=headers, timeout=30)
# Fail loudly here instead of silently producing an empty URL list, which
# would make the crawl that consumes it do nothing.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

# The hrefs are protocol-relative ("//www.foodnetwork.com/..."), hence the
# 'https:' prefix. The start `url` is deliberately not reassigned, so this
# cell fetches the same page every time it is re-run.
food_alphabet_urls = [
    'https:' + link['href']
    for link in soup.find_all('a', class_='o-IndexPagination__a-Button', href=True)
]

In [5]:
# Inspect the collected per-letter index URLs.
food_alphabet_urls

['https://www.foodnetwork.com/recipes/recipes-a-z/123',
 'https://www.foodnetwork.com/recipes/recipes-a-z/a',
 'https://www.foodnetwork.com/recipes/recipes-a-z/b',
 'https://www.foodnetwork.com/recipes/recipes-a-z/c',
 'https://www.foodnetwork.com/recipes/recipes-a-z/d',
 'https://www.foodnetwork.com/recipes/recipes-a-z/e',
 'https://www.foodnetwork.com/recipes/recipes-a-z/f',
 'https://www.foodnetwork.com/recipes/recipes-a-z/g',
 'https://www.foodnetwork.com/recipes/recipes-a-z/h',
 'https://www.foodnetwork.com/recipes/recipes-a-z/i',
 'https://www.foodnetwork.com/recipes/recipes-a-z/j',
 'https://www.foodnetwork.com/recipes/recipes-a-z/k',
 'https://www.foodnetwork.com/recipes/recipes-a-z/l',
 'https://www.foodnetwork.com/recipes/recipes-a-z/m',
 'https://www.foodnetwork.com/recipes/recipes-a-z/n',
 'https://www.foodnetwork.com/recipes/recipes-a-z/o',
 'https://www.foodnetwork.com/recipes/recipes-a-z/p',
 'https://www.foodnetwork.com/recipes/recipes-a-z/q',
 'https://www.foodnetwork.

In [6]:
# Column-oriented accumulator for the scrape: every key owns a separate list
# that grows by one entry per scraped recipe.
data = {column: [] for column in ('food', 'ingredients', 'steps', 'images')}

In [7]:
def get_recipe_df(recipe_url, timeout=30):
    """Scrape one recipe page and append its fields to the global ``data`` dict.

    Despite the name, no DataFrame is built or returned. On success exactly
    one entry is appended to each of ``data['food']``, ``data['ingredients']``,
    ``data['steps']`` and ``data['images']``; a field that is not found on the
    page is recorded as an empty string.

    Args:
        recipe_url: URL of the recipe page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        None. If the request fails, a message is printed and ``data`` is left
        untouched.
    """
    try:
        r = requests.get(url=recipe_url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Only request-level failures (bad URL, connection error, timeout)
        # mean "skip this recipe"; anything else, including
        # KeyboardInterrupt, propagates to the caller.
        print('invalida url')
        return

    soup = BeautifulSoup(r.content, 'html.parser')

    food = soup.find('span', class_='o-AssetTitle__a-HeadlineText')
    food_name = food.get_text(strip=True) if food else ''

    ingredients = []
    for ing in soup.find_all('p', class_='o-Ingredients__a-Ingredient'):
        ingredient = ing.find('span', class_='o-Ingredients__a-Ingredient--CheckboxLabel')
        if ingredient:
            ingredients.append(ingredient.get_text(strip=True))
    # The first label is dropped; presumably it is a select/deselect-all
    # checkbox rather than a real ingredient. TODO confirm against the markup.
    ingredients_text = '\n'.join(ingredients[1:])

    # Steps are numbered continuously across every <ol> on the page.
    steps = []
    for ol_tag in soup.find_all('ol'):
        for li_tag in ol_tag.find_all('li', class_='o-Method__m-Step'):
            steps.append(f"Step {len(steps) + 1}: {li_tag.get_text(strip=True)}")
    steps_text = '\n'.join(steps)

    # Try the regular image class first and fall back to the poster class.
    image_url = ''
    for img_class in ('m-MediaBlock__a-Image', 'kdp-poster__image'):
        img_tag = soup.find('img', class_=img_class)
        if img_tag and 'src' in img_tag.attrs:
            src = img_tag['src']
            # Protocol-relative sources get an explicit scheme; sources that
            # are already absolute are kept as they are instead of being
            # turned into 'https:https://...'.
            if src.startswith(('http://', 'https://')):
                image_url = src
            else:
                image_url = 'https:' + src
            break

    # Append all four fields together, after parsing is finished, so the
    # lists cannot end up with different lengths if parsing raises part-way.
    data['food'].append(food_name)
    data['ingredients'].append(ingredients_text)
    data['steps'].append(steps_text)
    data['images'].append(image_url)

In [9]:
# Walk every per-letter index, following its "next" pagination link until the
# last page, and scrape each recipe linked from those pages.
recipes = []
for letter_url in food_alphabet_urls:
    page_url = letter_url
    # Pages already processed for this letter. If pagination ever links back
    # to one of them the walk stops instead of looping forever.
    seen_pages = set()
    while page_url not in seen_pages:
        seen_pages.add(page_url)
        try:
            r = requests.get(url=page_url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # One unreachable index page should not abort the whole crawl;
            # report it and move on to the next letter.
            print(f'Skipping the rest of {letter_url}: {exc}')
            break
        soup = BeautifulSoup(r.content, 'html.parser')

        # get all recipes on current page
        link_elements = soup.find_all('ul', class_='m-PromoList o-Capsule__m-PromoList')
        for link in link_elements:
            # href=True skips anchors without a target, for which the string
            # concatenation below would raise TypeError.
            for a in link.find_all('a', href=True):
                recipe_url = 'https:' + a['href']
                recipes.append(recipe_url)
                get_recipe_df(recipe_url)

        # iterate to next page
        next_page_link = soup.find('a', class_='o-Pagination__a-NextButton')
        if next_page_link is None or 'is-Disabled' in next_page_link.get('class', []):
            break
        next_href = next_page_link.get('href')
        if not next_href:
            # A "next" button without a target: nothing left to follow.
            break
        page_url = 'https:' + next_href

In [None]:
# Sanity check: show how many entries each column has collected; the four
# numbers are expected to match.
tuple(len(data[column]) for column in ('food', 'ingredients', 'steps', 'images'))

In [None]:
# Turn the accumulated lists into a table: one column per key of `data`,
# one row per list position.
df = pd.DataFrame(data)

In [None]:
# Display the table as scraped, before any cleaning.
df

In [None]:
# Flag the rows in which at least one column holds an empty string, then
# count them. DataFrame.eq compares the whole frame in one vectorised call
# and, unlike DataFrame.map, is not limited to pandas >= 2.1.
rows_empty_string = df.eq('').any(axis=1)
n_rows_empty_string = rows_empty_string.sum()
n_rows_empty_string

In [None]:
# Keep only the rows that have no empty-string field. `df` is rebound here,
# so the unfiltered frame is no longer reachable under that name.
df = df[~rows_empty_string]
df.shape

In [None]:
# keep=False removes every row whose 'food' value occurs more than once,
# including the first occurrence, so no copy of a repeated name survives.
df = df.drop_duplicates(subset='food', keep=False)

In [None]:
# Same rule for the image column: with keep=False, rows that share an
# 'images' value are all removed, not reduced to one.
df = df.drop_duplicates(subset='images', keep=False)

In [None]:
# Display the table after the empty-field and duplicate filters.
df

In [None]:
# The export is left disabled; `df` is instead replaced by the contents of a
# previously saved workbook, discarding the frame built in the cells above.
# NOTE(review): the disabled export targets 'foodnetwork_data.xlsx' in the
# working directory, while the load reads './data/foodnetwork_data.xlsx'.
# Confirm which location is intended.
# df.to_excel('foodnetwork_data.xlsx', index=False) 
df = pd.read_excel('./data/foodnetwork_data.xlsx')

In [None]:
# Directory that the downloaded recipe images are written to.
output_folder = "./data/images/foodnetwork_images"

In [None]:
# Download the image of every recipe in `df` into `output_folder`.
# exist_ok=True makes the directory creation safe to re-run.
os.makedirs(output_folder, exist_ok=True)

for index, row in df.iterrows():
    image_field = row['images']
    # A cell that is empty in the workbook is read back as NaN (a float),
    # which has no .split(); skip such rows instead of crashing the loop.
    if not isinstance(image_field, str):
        print(f'No image URL for row {index}')
        continue
    # If several URLs are stored newline-separated, only the first is used.
    url = image_field.split('\n')[0]
    try:
        # No stream=True: the body is read in full anyway, and a streamed
        # response that is never read (the non-200 branch) would keep its
        # connection checked out of the pool.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            # Fall back to an index-based name when the URL ends in '/'.
            image_name = url.split('/')[-1] or f'image_{index}.jpg'

            with open(os.path.join(output_folder, image_name), 'wb') as image_file:
                image_file.write(response.content)
            print(f'Successfully downloaded {image_name}')
        else:
            print(f'Failed to download image from {url}')
    except (requests.RequestException, OSError):
        # Network and file-system failures are reported and skipped.
        # KeyboardInterrupt is not caught, so the loop can still be stopped.
        print(f'An error occurred while downloading image from {url}')