In [1]:
import random
import time
from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:
# Pool of browser User-Agent strings; fetch_with_retries draws from it at random.
USER_AGENTS = [
    # Chrome 91 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    # Chrome 91 on macOS 10.15
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    # Firefox 89 on Ubuntu
    (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) "
        "Gecko/20100101 Firefox/89.0"
    ),
]

def fetch_with_retries(url, max_retries=3):
    """Fetch the raw body of ``url``, retrying failed requests with backoff.

    Every attempt issues a GET with a 10-second timeout and a User-Agent
    drawn at random from ``USER_AGENTS``. A request counts as failed when
    ``requests`` raises a ``RequestException`` (connection problems,
    timeouts, or a 4xx/5xx status via ``raise_for_status``).

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response body as bytes, or ``None`` when every attempt failed
        (or ``max_retries`` is not positive).
    """
    for attempt in range(1, max_retries + 1):
        # Draw a fresh User-Agent per attempt so a retry does not replay the
        # exact headers of the request that just failed.
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses
            return response.content
        except requests.RequestException as e:
            print(f"Retry {attempt} for {url}: {e}")
            # Back off only when another attempt will follow; sleeping after
            # the final failure would just delay returning None.
            if attempt < max_retries:
                time.sleep(2 ** attempt)  # Exponential backoff: 2s, 4s, ...
    return None

def _text_or_none(element):
    """Return the whitespace-stripped text of a parsed element, or None if it is missing."""
    return element.get_text(strip=True) if element is not None else None


def scrape_recipe(url, country):
    """Scrape one recipe page into a dictionary of its details.

    The page is downloaded with ``fetch_with_retries`` and parsed with
    BeautifulSoup. Each section is located by CSS class; a section that is
    absent from the page yields ``None`` (title, rating) or an empty
    container (everything else) rather than an error.

    Args:
        url: Address of the recipe page.
        country: Label stored unchanged under the ``'country'`` key.

    Returns:
        ``None`` if the page could not be fetched (or came back empty),
        otherwise a dict with the keys ``'url'``, ``'country'``, ``'title'``,
        ``'details'`` (label -> value), ``'ingredients'`` (list of str),
        ``'steps'`` (list of str), ``'rating'``, ``'comments'`` (list of
        ``{'comment', 'chips'}`` dicts) and ``'nutrition'`` (label -> value).
    """
    html_content = fetch_with_retries(url)
    if not html_content:
        return None  # Skip if failed to fetch

    soup = BeautifulSoup(html_content, 'html.parser')

    # Recipe title
    title = _text_or_none(soup.find('h1', class_='article-heading'))

    # Times and servings
    time_and_servings = {}
    details_div = soup.find('div', class_='mm-recipes-details__content')
    if details_div:
        for item in details_div.find_all('div', class_='mm-recipes-details__item'):
            label = _text_or_none(item.find('div', class_='mm-recipes-details__label'))
            value = _text_or_none(item.find('div', class_='mm-recipes-details__value'))
            # An item lacking its label or value cell is skipped; calling
            # get_text on the missing element would abort the whole scrape.
            if label is None or value is None:
                continue
            time_and_servings[label] = value

    # Ingredients
    ingredients_list = []
    ingredients_ul = soup.find('ul', class_='mm-recipes-structured-ingredients__list')
    if ingredients_ul:
        ingredients_items = ingredients_ul.find_all(
            'li', class_='mm-recipes-structured-ingredients__list-item'
        )
        # Join the text fragments with spaces so adjacent pieces of one
        # ingredient line do not run together.
        ingredients_list = [' '.join(item.stripped_strings) for item in ingredients_items]

    # Directions / steps
    steps_list = []
    steps_ol = soup.find('ol', class_='mntl-sc-block-group--OL')
    if steps_ol:
        steps_items = steps_ol.find_all('li', class_='mntl-sc-block-group--LI')
        steps_list = [step.get_text(strip=True) for step in steps_items]

    # Overall rating
    rating = _text_or_none(soup.find('div', class_='mm-recipes-review-bar__rating'))

    # Comments and their feedback chips
    feedback_list = []
    for feedback in soup.find_all('div', class_='feedback review'):
        chip_elements = feedback.find_all('span', class_='feedback-chips__text')
        feedback_list.append({
            'comment': _text_or_none(feedback.find('div', class_='feedback__text')),
            'chips': [chip.get_text(strip=True) for chip in chip_elements],
        })

    # Nutrition facts
    nutrition_facts = {}
    nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
    if nutrition_table:
        rows = nutrition_table.find_all('tr', class_='mm-recipes-nutrition-facts-summary__table-row')
        for row in rows:
            value = _text_or_none(row.find(
                'td',
                class_='mm-recipes-nutrition-facts-summary__table-cell text-body-100-prominent',
            ))
            label = _text_or_none(row.find(
                'td',
                class_='mm-recipes-nutrition-facts-summary__table-cell text-body-100',
            ))
            # Same guard as for the details section: tolerate incomplete rows.
            if label is None or value is None:
                continue
            nutrition_facts[label] = value

    # Full recipe record
    return {
        'url': url,
        'country': country,
        'title': title,
        'details': time_and_servings,
        'ingredients': ingredients_list,
        'steps': steps_list,
        'rating': rating,
        'comments': feedback_list,
        'nutrition': nutrition_facts,
    }

def get_recipe_links(index_url):
    """Extract all unique recipe links from an index page.

    The page is downloaded with ``fetch_with_retries``; every anchor whose
    ``href`` contains ``'/recipe/'`` is kept.

    Args:
        index_url: Address of the index/search page to scan.

    Returns:
        A list of recipe URLs, de-duplicated and in the order they first
        appear on the page, with relative hrefs resolved against
        ``index_url``. Empty if the page could not be fetched.
    """
    html_content = fetch_with_retries(index_url)
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    # Dict keys keep insertion order, so this removes duplicates without the
    # run-to-run reordering that a set would introduce.
    recipe_links = {}

    # Find all anchor tags leading to recipes
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/recipe/' in href:  # Adjust this pattern to match valid recipe URLs
            # Resolve relative hrefs so every returned link can be requested
            # directly; absolute hrefs pass through urljoin unchanged.
            recipe_links[urljoin(index_url, href)] = None

    return list(recipe_links)

def get_dish_type_from_url(url):
    """Guess a recipe's dish type from category keywords in its URL.

    The URL is checked against a fixed, ordered list of category slugs. The
    first slug found anywhere in the URL is returned in readable form
    (hyphens replaced by spaces, first letter upper-cased); if none is
    found the result is "Unknown".
    """
    known_slugs = (
        "main-dishes", "appetizers", "desserts", "soups-and-stews",
        "side-dishes", "salads", "breads", "beverages", "breakfast",
        "snacks", "sauces", "marinades",
    )
    # Earlier slugs take precedence when several occur in the same URL.
    matched = next((slug for slug in known_slugs if slug in url), None)
    if matched is None:
        return "Unknown"
    return matched.replace('-', ' ').capitalize()

In [10]:
# Search terms for the scrape (mostly national demonyms); the list is passed
# to scrape_country_recipes, which issues one search per entry.
# NOTE(review): "Eswatini" and "Swazi" both appear, and a few entries look
# like country names rather than demonyms (e.g. "Lesotho", "Luxembourg") -
# confirm these are intentional.
countries = [
    "Afghan", "Albanian", "Algerian", "Andorran", "Angolan", "Antiguan", "Argentinian", 
    "Armenian", "Australian", "Austrian", "Azerbaijani", "Bahamian", "Bahraini", "Bangladeshi", 
    "Barbadian", "Belarusian", "Belgian", "Belizean", "Beninese", "Bhutanese", "Bolivian", 
    "Bosnian", "Botswanan", "Brazilian", "British", "Bruneian", "Bulgarian", "Burkinabe", 
    "Burmese", "Burundian", "Cabo Verdean", "Cambodian", "Cameroonian", "Canadian", "Central African", 
    "Chadian", "Chilean", "Chinese", "Colombian", "Comoran", "Congolese", "Costa Rican", 
    "Croatian", "Cuban", "Cypriot", "Czech", "Danish", "Djiboutian", "Dominican", "Ecuadorian", 
    "Egyptian", "Emirati", "Equatoguinean", "Eritrean", "Estonian", "Eswatini", "Ethiopian", 
    "Fijian", "Finnish", "French", "Gabonese", "Gambian", "Georgian", "German", "Ghanaian", 
    "Greek", "Grenadian", "Guatemalan", "Guinean", "Guinea-Bissauan", "Guyanese", "Haitian", 
    "Honduran", "Hungarian", "Icelandic", "Indian", "Indonesian", "Iranian", "Iraqi", "Irish", 
    "Israeli", "Italian", "Ivorian", "Jamaican", "Japanese", "Jordanian", "Kazakhstani", 
    "Kenyan", "Kiribati", "Kittitian", "Kosovar", "Kuwaiti", "Kyrgyz", "Lao", "Latvian", 
    "Lebanese", "Lesotho", "Liberian", "Libyan", "Liechtenstein", "Lithuanian", "Luxembourg", 
    "Madagascan", "Malawian", "Malaysian", "Maldivian", "Malian", "Maltese", "Marshallese", 
    "Mauritanian", "Mauritian", "Mexican", "Micronesian", "Moldovan", "Monacan", "Mongolian", 
    "Montenegrin", "Moroccan", "Mozambican", "Namibian", "Nauruan", "Nepalese", "Dutch", 
    "New Zealander", "Nicaraguan", "Nigerian", "Nigerien", "North Korean", "North Macedonian", 
    "Norwegian", "Omani", "Pakistani", "Palauan", "Palestinian", "Panamanian", "Papua New Guinean", 
    "Paraguayan", "Peruvian", "Filipino", "Polish", "Portuguese", "Qatari", "Romanian", "Russian", 
    "Rwandan", "Saint Lucian", "Salvadoran", "Samoan", "San Marinese", "Sao Tomean", "Saudi", 
    "Senegalese", "Serbian", "Seychellois", "Sierra Leonean", "Singaporean", "Slovak", 
    "Slovenian", "Solomon Islander", "Somali", "South African", "South Korean", "South Sudanese", 
    "Spanish", "Sri Lankan", "Sudanese", "Surinamese", "Swazi", "Swedish", "Swiss", "Syrian", 
    "Tajik", "Tanzanian", "Thai", "Timorese", "Togolese", "Tongan", "Trinidadian", "Tunisian", 
    "Turkish", "Turkmen", "Tuvaluan", "Ugandan", "Ukrainian", "Uruguayan", "Uzbek", "Vanuatuan", 
    "Venezuelan", "Vietnamese", "Yemeni", "Zambian", "Zimbabwean"
]


In [11]:
def scrape_country_recipes(base_url, countries):
    """Scrape recipes for each country, including dish types.

    For every entry in ``countries`` a search page is requested, the recipe
    links on it are collected with ``get_recipe_links``, and each link is
    scraped with ``scrape_recipe``. Random pauses are inserted between
    recipes (1-3 s) and between countries (3-6 s).

    Args:
        base_url: Site root without a trailing slash; ``/search?q=<term>``
            is appended to it.
        countries: Iterable of search terms, one search per entry.

    Returns:
        A list of recipe dicts as produced by ``scrape_recipe``, each with an
        added ``'dish_type'`` key. Recipes that failed to download are
        omitted.
    """
    all_recipes = []

    for country in countries:
        # Percent-encode the term so spaces and reserved characters such as
        # '&' or '#' cannot break or truncate the query string.
        query = quote(country, safe="")
        search_url = f"{base_url}/search?q={query}"  # Adjust URL pattern for the search query
        print(f"Fetching recipes for country: {country}")

        recipe_links = get_recipe_links(search_url)

        for recipe_url in recipe_links:
            # Extract dish type from URL
            dish_type = get_dish_type_from_url(recipe_url)

            # Scrape recipe details
            recipe_data = scrape_recipe(recipe_url, country)
            if recipe_data:
                recipe_data['dish_type'] = dish_type  # Include dish type in recipe data
                all_recipes.append(recipe_data)

            time.sleep(random.uniform(1, 3))  # Random delay between requests

        time.sleep(random.uniform(3, 6))  # Delay between countries to avoid being flagged

    return all_recipes

In [12]:
# Site to scrape; scrape_country_recipes appends "/search?q=<term>" to it.
base_url = "https://www.allrecipes.com"  # Replace with the correct base URL of your target site
# Long-running: one search request per entry in `countries` plus one request
# per recipe link found, with random pauses in between.
all_recipes = scrape_country_recipes(base_url, countries)

Fetching recipes for country: Afghan
Fetching recipes for country: Albanian
Fetching recipes for country: Algerian
Fetching recipes for country: Andorran
Fetching recipes for country: Angolan
Fetching recipes for country: Antiguan
Fetching recipes for country: Argentinian
Fetching recipes for country: Armenian
Fetching recipes for country: Australian
Fetching recipes for country: Austrian
Fetching recipes for country: Azerbaijani
Fetching recipes for country: Bahamian
Fetching recipes for country: Bahraini
Fetching recipes for country: Bangladeshi
Fetching recipes for country: Barbadian
Fetching recipes for country: Belarusian
Fetching recipes for country: Belgian
Fetching recipes for country: Belizean
Fetching recipes for country: Beninese
Fetching recipes for country: Bhutanese
Fetching recipes for country: Bolivian
Fetching recipes for country: Bosnian
Fetching recipes for country: Botswanan
Fetching recipes for country: Brazilian
Fetching recipes for country: British
Fetching recip

In [14]:
# Number of recipe records collected by the scrape.
len(all_recipes)

1279

In [15]:
# One row per scraped recipe; columns come from the keys of the recipe dicts.
country_recipes_supp = pd.DataFrame(all_recipes)

In [None]:
# Save the scraped table to disk; index=False leaves the row index out of the file.
# NOTE(review): columns such as 'details', 'ingredients' and 'comments' hold
# dicts/lists, which CSV presumably stores in string form - confirm how they
# will be parsed back when the file is reloaded.
country_recipes_supp.to_csv('country_recipes_supp.csv', index=False)