In [1]:
import requests
from bs4 import BeautifulSoup
import random
import time

In [None]:
# Rotate User-Agents
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

def fetch_with_retries(url, max_retries=3):
    """Fetch URL content with retries and random User-Agent."""
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses
            return response.content
        except requests.RequestException as e:
            retries += 1
            print(f"Retry {retries} for {url}: {e}")
            time.sleep(2 ** retries)  # Exponential backoff
    return None

def scrape_recipe(url, country):
    """Scrape recipe details including title, rating, comments, nutrition, etc."""
    html_content = fetch_with_retries(url)
    if not html_content:
        return None  # Skip if failed to fetch

    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract Recipe Title
    title_element = soup.find('h1', class_='article-heading')
    title = title_element.get_text(strip=True) if title_element else None

    # Extract Times and Servings
    details_div = soup.find('div', class_='mm-recipes-details__content')
    time_and_servings = {}
    if details_div:
        details_items = details_div.find_all('div', class_='mm-recipes-details__item')
        for item in details_items:
            label = item.find('div', class_='mm-recipes-details__label').get_text(strip=True)
            value = item.find('div', class_='mm-recipes-details__value').get_text(strip=True)
            time_and_servings[label] = value

    # Extract Ingredients
    #ingredients_list = []
    #ingredients_ul = soup.find('ul', class_='mm-recipes-structured-ingredients__list')
    #if ingredients_ul:
    #    ingredients_items = ingredients_ul.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
    #    for item in ingredients_items:
    #        ingredient_text = item.get_text(strip=True)
    #        ingredients_list.append(ingredient_text)
    ingredients_list = []
    ingredients_ul = soup.find('ul', class_='mm-recipes-structured-ingredients__list')
    if ingredients_ul:
       ingredients_items = ingredients_ul.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
       for item in ingredients_items:
           ingredient_text = ' '.join(item.stripped_strings)  # Join all text with spaces
           ingredients_list.append(ingredient_text)

    # Extract Directions/Steps
    steps_list = []
    steps_ol = soup.find('ol', class_='mntl-sc-block-group--OL')
    if steps_ol:
        steps_items = steps_ol.find_all('li', class_='mntl-sc-block-group--LI')
        for step in steps_items:
            step_text = step.get_text(strip=True)
            steps_list.append(step_text)

    # Extract Overall Rating
    rating_element = soup.find('div', class_='mm-recipes-review-bar__rating')
    rating = rating_element.get_text(strip=True) if rating_element else None

    # Extract Comments and Feedback Chips
    feedback_list = []
    feedback_items = soup.find_all('div', class_='feedback review')
    for feedback in feedback_items:
        comment_element = feedback.find('div', class_='feedback__text')
        comment = comment_element.get_text(strip=True) if comment_element else None

        chips = []
        chip_elements = feedback.find_all('span', class_='feedback-chips__text')
        for chip in chip_elements:
            chips.append(chip.get_text(strip=True))

        feedback_list.append({
            'comment': comment,
            'chips': chips
        })

    # Extract Nutrition Facts
    nutrition_facts = {}
    nutrition_table = soup.find('table', class_='mm-recipes-nutrition-facts-summary__table')
    if nutrition_table:
        rows = nutrition_table.find_all('tr', class_='mm-recipes-nutrition-facts-summary__table-row')
        for row in rows:
            value = row.find('td', class_='mm-recipes-nutrition-facts-summary__table-cell text-body-100-prominent').get_text(strip=True)
            label = row.find('td', class_='mm-recipes-nutrition-facts-summary__table-cell text-body-100').get_text(strip=True)
            nutrition_facts[label] = value

    # Return Full Recipe Details
    return {
        'url': url,
        'country': country,
        'title': title,
        'details': time_and_servings,
        'ingredients': ingredients_list,
        'steps': steps_list,
        'rating': rating,
        'comments': feedback_list,
        'nutrition': nutrition_facts,
    }

def get_recipe_links(index_url):
    """Extract all unique recipe links from an index page."""
    html_content = fetch_with_retries(index_url)
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recipe_links = []

    # Find all anchor tags leading to recipes
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/recipe/' in href:  # Adjust this pattern to match valid recipe URLs
            recipe_links.append(href)

    return list(set(recipe_links))  # Remove duplicates

# Example Usage


# Track country for each recipe
# Save recipes to a dataframe or JSON as needed

In [None]:


#Africa
east_af_url = ["https://www.allrecipes.com/recipes/17845/world-cuisine/african/east-african/"]
SAF_url = ["https://www.allrecipes.com/recipes/15035/world-cuisine/african/south-african/"]
NAF_url = ["https://www.allrecipes.com/recipes/17582/world-cuisine/african/north-african/",
           "https://www.allrecipes.com/recipes/1827/world-cuisine/african/north-african/moroccan/",
           "https://www.allrecipes.com/recipes/15039/world-cuisine/african/north-african/egyptian/"]





#europe
italian_url = [
    "https://www.allrecipes.com/recipes/16767/world-cuisine/european/italian/main-dishes/",
    "https://www.allrecipes.com/recipes/1789/world-cuisine/european/italian/authentic/",
    "https://www.allrecipes.com/recipes/1790/world-cuisine/european/italian/soups-and-stews/",
    "https://www.allrecipes.com/recipes/1791/world-cuisine/european/italian/desserts/"]
french_url = ["https://www.allrecipes.com/recipes/17138/world-cuisine/european/french/main-dishes/",
              "https://www.allrecipes.com/recipes/1857/world-cuisine/european/french/main-dishes/pork/",
              "https://www.allrecipes.com/recipes/1858/world-cuisine/european/french/main-dishes/chicken/",
              "https://www.allrecipes.com/recipes/1828/world-cuisine/european/french/desserts/",
              "https://www.allrecipes.com/recipes/1828/world-cuisine/european/french/desserts/",
              "https://www.allrecipes.com/recipes/16126/world-cuisine/european/french/french-bread/"]
ger_url = ["https://www.allrecipes.com/recipes/722/world-cuisine/european/german/",
           "https://www.allrecipes.com/recipes/16220/bread/yeast-bread/pretzels/",
           "https://www.allrecipes.com/recipes/2444/holidays-and-events/events-and-gatherings/oktoberfest/",
           "https://www.allrecipes.com/recipes/16147/salad/potato-salad/german-potato-salad/"]
grk_url = ["https://www.allrecipes.com/recipes/731/world-cuisine/european/greek/",
           "https://www.allrecipes.com/recipes/17152/world-cuisine/european/greek/main-dishes/",
           "https://www.allrecipes.com/recipes/1885/world-cuisine/european/greek/appetizers/",
           "https://www.allrecipes.com/recipes/1886/world-cuisine/european/greek/side-dishes/",
           "https://www.allrecipes.com/recipes/1887/world-cuisine/european/greek/salads/",
           "https://www.allrecipes.com/recipes/1888/world-cuisine/european/greek/desserts/"]
dutch_url = ["https://www.allrecipes.com/recipes/720/world-cuisine/european/dutch/"]
uk_ir_url = ["https://www.allrecipes.com/recipes/704/world-cuisine/european/uk-and-ireland/",
             "https://www.allrecipes.com/recipes/14980/world-cuisine/european/uk-and-ireland/occasions/",
             "https://www.allrecipes.com/recipes/705/world-cuisine/european/uk-and-ireland/english/",
             "https://www.allrecipes.com/recipes/706/world-cuisine/european/uk-and-ireland/irish/",
             "https://www.allrecipes.com/recipes/707/world-cuisine/european/uk-and-ireland/scottish/"]
sp_url = ["https://www.allrecipes.com/recipes/726/world-cuisine/european/spanish/",
          "https://www.allrecipes.com/recipes/17846/world-cuisine/european/spanish/main-dishes/",
          "https://www.allrecipes.com/recipes/17847/world-cuisine/european/spanish/appetizers/",
          "https://www.allrecipes.com/recipes/17848/world-cuisine/european/spanish/soups-and-stews/"]
portugal_url = ["https://www.allrecipes.com/recipes/724/world-cuisine/european/portuguese/"]
swiss_url = ["https://www.allrecipes.com/recipes/727/world-cuisine/european/swiss/"]
east_eu = ["https://www.allrecipes.com/recipes/713/world-cuisine/european/eastern-european/czech/",
           "https://www.allrecipes.com/recipes/714/world-cuisine/european/eastern-european/hungarian/",
           "https://www.allrecipes.com/recipes/715/world-cuisine/european/eastern-european/polish/",
           "https://www.allrecipes.com/recipes/716/world-cuisine/european/eastern-european/russian/"]
Scandinavian_url = ["https://www.allrecipes.com/recipes/1890/world-cuisine/european/scandinavian/swedish/",
                    "https://www.allrecipes.com/recipes/1891/world-cuisine/european/scandinavian/norwegian/",
                    "https://www.allrecipes.com/recipes/1892/world-cuisine/european/scandinavian/danish/",
                    "https://www.allrecipes.com/recipes/1893/world-cuisine/european/scandinavian/finnish/"]

#middle east
pers_url = ["https://www.allrecipes.com/recipes/15937/world-cuisine/middle-eastern/persian/"]
turk_url = ["https://www.allrecipes.com/recipes/1825/world-cuisine/middle-eastern/turkish/"]
leb_url = ["https://www.allrecipes.com/recipes/1824/world-cuisine/middle-eastern/lebanese/"]


#australia and new zeland (oceania)
aus_nz_url = ["https://www.allrecipes.com/recipes/228/world-cuisine/australian-and-new-zealander/",
              "https://www.allrecipes.com/recipes/15040/world-cuisine/australian-and-new-zealander/occasions/"]
#carrabean
cuba_url = ["https://www.allrecipes.com/recipes/709/world-cuisine/latin-american/caribbean/cuban/"]
jamaica_url = ["https://www.allrecipes.com/recipes/710/world-cuisine/latin-american/caribbean/jamaican/"]
puerto_rica_url = ["https://www.allrecipes.com/recipes/711/world-cuisine/latin-american/caribbean/puerto-rican/"]

#South america
mx_url = ["https://www.allrecipes.com/recipes/728/world-cuisine/latin-american/mexican/",
          "https://www.allrecipes.com/recipes/1215/world-cuisine/latin-american/mexican/soups-and-stews/",
          "https://www.allrecipes.com/recipes/1214/world-cuisine/latin-american/mexican/appetizers/",
          "https://www.allrecipes.com/recipes/17504/world-cuisine/latin-american/mexican/main-dishes/",
          "https://www.allrecipes.com/recipes/1216/world-cuisine/latin-american/mexican/main-dishes/burritos/"]
chili_url = ["https://www.allrecipes.com/recipes/1277/world-cuisine/latin-american/south-american/chilean/"]
colombia_url = ["https://www.allrecipes.com/recipes/14759/world-cuisine/latin-american/south-american/colombian/"]
argentina_url = ["https://www.allrecipes.com/recipes/2432/world-cuisine/latin-american/south-american/argentinian/"]
peru_url = ["https://www.allrecipes.com/recipes/2433/world-cuisine/latin-american/south-american/peruvian/"]


#Asia
jp_url = ["https://www.allrecipes.com/recipes/699/world-cuisine/asian/japanese/",
          "https://www.allrecipes.com/recipes/17490/world-cuisine/asian/japanese/appetizers/",
          "https://www.allrecipes.com/recipes/17491/world-cuisine/asian/japanese/main-dishes/",
          "https://www.allrecipes.com/recipes/17492/world-cuisine/asian/japanese/soups-and-stews/"]
chinese_url = ["https://www.allrecipes.com/recipes/17135/world-cuisine/asian/chinese/main-dishes/",
               "https://www.allrecipes.com/recipes/22838/world-cuisine/asian/chinese/main-dishes/beef/",
               "https://www.allrecipes.com/recipes/1902/world-cuisine/asian/chinese/main-dishes/chicken/",
               "https://www.allrecipes.com/recipes/1903/world-cuisine/asian/chinese/main-dishes/seafood/",
               "https://www.allrecipes.com/recipes/1901/world-cuisine/asian/chinese/main-dishes/pork/",
               "https://www.allrecipes.com/recipes/1900/world-cuisine/asian/chinese/soups-and-stews/"]
kr_url = ["https://www.allrecipes.com/recipes/700/world-cuisine/asian/korean/",
          "https://www.allrecipes.com/recipes/17832/world-cuisine/asian/korean/soups-and-stews/",
          "https://www.allrecipes.com/recipes/17833/world-cuisine/asian/korean/main-dishes/"]
ind_url = ["https://www.allrecipes.com/recipes/17136/world-cuisine/asian/indian/main-dishes/",
           "https://www.allrecipes.com/recipes/15973/world-cuisine/asian/indian/main-dishes/rice/",
           "https://www.allrecipes.com/recipes/1873/world-cuisine/asian/indian/main-dishes/curry/",
           "https://www.allrecipes.com/recipes/1875/world-cuisine/asian/indian/main-dishes/vegetarian/",
           "https://www.allrecipes.com/recipes/1879/world-cuisine/asian/indian/desserts/"]
pakistan_url = ["https://www.allrecipes.com/recipes/15974/world-cuisine/asian/pakistani/"]
bangladesh = ["https://www.allrecipes.com/recipes/16100/world-cuisine/asian/bangladeshi/"]
indonesia_url = ["https://www.allrecipes.com/recipes/698/world-cuisine/asian/indonesian/"]
malaysia_url = ["https://www.allrecipes.com/recipes/701/world-cuisine/asian/malaysian/"]
thailand_url = ["https://www.allrecipes.com/recipes/702/world-cuisine/asian/thai/"]
vietnam_url = ["https://www.allrecipes.com/recipes/703/world-cuisine/asian/vietnamese/"]

#North america
America_url = ["https://www.allrecipes.com/recipes/15876/us-recipes/southern/",
           "https://www.allrecipes.com/recipes/15965/us-recipes/jewish/",
           "https://www.allrecipes.com/recipes/16091/us-recipes/soul-food/",
           "https://www.allrecipes.com/recipes/17497/us-recipes/new-england/",
           "https://www.allrecipes.com/recipes/17425/us-recipes/us-recipes-by-state/"]
canada_url = ["https://www.allrecipes.com/recipes/15041/world-cuisine/canadian/occasions/",
              "https://www.allrecipes.com/recipes/16075/world-cuisine/canadian/vancouver/",
              "https://www.allrecipes.com/recipes/16104/world-cuisine/canadian/quebec/",
              "https://www.allrecipes.com/recipes/17719/world-cuisine/canadian/allrecipes-allstars/"]




In [None]:
import re  # For extracting dish type from URLs
import pandas as pd

countries_urls = {
    "Italian": italian_url,
    "French": french_url,
    "German": ger_url,
    "East_Africa": east_af_url,
    "South_Africa": SAF_url,
    "North_Africa": NAF_url,
    "Greek": grk_url,
    "Dutch": dutch_url,
    "UK_Ireland": uk_ir_url,
    "Spain": sp_url,
    "Portugal": portugal_url,
    "Switzerland": swiss_url,
    "Eastern_Europe": east_eu,
    "Scandinavia": Scandinavian_url,
    "Persian": pers_url,
    "Turkish": turk_url,
    "Lebanese": leb_url,
    "Australia_NZ": aus_nz_url,
    "Cuban": cuba_url,
    "Jamaican": jamaica_url,
    "Puerto_Rican": puerto_rica_url,
    "Mexican": mx_url,
    "Chilean": chili_url,
    "Colombian": colombia_url,
    "Argentinian": argentina_url,
    "Peruvian": peru_url,
    "Japanese": jp_url,
    "Chinese": chinese_url,
    "Korean": kr_url,
    "Indian": ind_url,
    "Pakistani": pakistan_url,
    "Bangladeshi": bangladesh,
    "Indonesian": indonesia_url,
    "Malaysian": malaysia_url,
    "Thai": thailand_url,
    "Vietnamese": vietnam_url,
    "American": America_url,
    "Canadian": canada_url
}

# Function to extract dish type from URL
def get_dish_type_from_url(url):
    # Common dish type keywords in URLs
    dish_keywords = [
        "main-dishes", "appetizers", "desserts", "soups-and-stews",
        "side-dishes", "salads", "breads", "beverages", "breakfast",
        "snacks", "sauces", "marinades"
    ]
    for keyword in dish_keywords:
        if keyword in url:
            # Return the dish type in readable format (replace '-' with ' ')
            return keyword.replace('-', ' ').capitalize()
    return "Unknown"  # Default if no keyword is found

# Updated scraping loop to include dish type
all_recipes = []

for country, urls in countries_urls.items():
    print(f"Processing recipes for {country}...")

    country_recipes = []
    for url in urls:
        recipe_urls = get_recipe_links(url)
        print(f"Found {len(recipe_urls)} recipe links for {country}.")
        for recipe_url in recipe_urls:
            recipe_data = scrape_recipe(recipe_url, country=country)
            if recipe_data:
                # Add dish type based on URL
                recipe_data["dish_type"] = get_dish_type_from_url(url)
                country_recipes.append(recipe_data)
            time.sleep(random.uniform(1, 3))  # Random delay between requests

    # Save the country's recipes to a CSV
    if country_recipes:
        df = pd.DataFrame(country_recipes)
        filename = f"{country}_recipes.csv"
        df.to_csv(filename, index=False)
        print(f"Saved {len(country_recipes)} recipes for {country} to {filename}.")

    # Add to the overall recipes list
    all_recipes.extend(country_recipes)

# Save all recipes into one file
if all_recipes:
    df_all = pd.DataFrame(all_recipes)
    df_all.to_csv("All_Recipes.csv", index=False)
    print(f"Saved all recipes to All_Recipes.csv.")


Processing recipes for Italian...
Found 34 recipe links for Italian.
Found 55 recipe links for Italian.
Found 59 recipe links for Italian.
Found 54 recipe links for Italian.
Saved 202 recipes for Italian to Italian_recipes.csv.
Processing recipes for French...
Found 64 recipe links for French.
Found 13 recipe links for French.
Found 26 recipe links for French.
Found 42 recipe links for French.
Found 42 recipe links for French.
Found 25 recipe links for French.
Saved 212 recipes for French to French_recipes.csv.
Processing recipes for German...
Found 61 recipe links for German.
Found 19 recipe links for German.
Found 69 recipe links for German.
Found 14 recipe links for German.
Saved 163 recipes for German to German_recipes.csv.
Processing recipes for East_Africa...
Found 24 recipe links for East_Africa.
Saved 24 recipes for East_Africa to East_Africa_recipes.csv.
Processing recipes for South_Africa...
Found 19 recipe links for South_Africa.
Saved 19 recipes for South_Africa to South_Af

In [None]:
import requests
import xml.etree.ElementTree as ET

def parse_sitemap(sitemap_url):
    """Fetch and parse a single sitemap, returning recipe URLs."""
    print(f"Parsing sitemap: {sitemap_url}")
    response = requests.get(sitemap_url)
    root = ET.fromstring(response.content)
    recipe_urls = []

    # Extract all <loc> elements with recipe URLs
    for url in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
        loc = url.text
        if "https://www.allrecipes.com/recipe/" in loc:
            recipe_urls.append(loc)
    return recipe_urls

def main():
    # Main sitemap index URL
    sitemap_index_url = "https://www.allrecipes.com/sitemap.xml"
    response = requests.get(sitemap_index_url)
    root = ET.fromstring(response.content)

    all_recipe_urls = []

    # Extract individual sitemap URLs from sitemap index
    for sitemap in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap"):
        sitemap_url = sitemap.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
        recipe_urls = parse_sitemap(sitemap_url)
        all_recipe_urls.extend(recipe_urls)

    # Print the total number of URLs collected
    print(f"Total recipe URLs collected: {len(all_recipe_urls)}")

    # Optional: Save to a file
    with open("recipe_urls.txt", "w") as f:
        f.write("\n".join(all_recipe_urls))
    print("Saved recipe URLs to recipe_urls.txt")

    return all_recipe_urls

if __name__ == "__main__":
    all_urls = main()


Parsing sitemap: https://www.allrecipes.com/sitemap_1.xml
Parsing sitemap: https://www.allrecipes.com/sitemap_2.xml
Parsing sitemap: https://www.allrecipes.com/sitemap_3.xml
Parsing sitemap: https://www.allrecipes.com/sitemap_4.xml
Total recipe URLs collected: 48667
Saved recipe URLs to recipe_urls.txt


In [None]:
# Function to extract dish type from URL
def get_dish_type_from_url(url):
    # Common dish type keywords in URLs
    dish_keywords = [
        "main-dishes", "appetizers", "desserts", "soups-and-stews",
        "side-dishes", "salads", "breads", "beverages", "breakfast",
        "snacks", "sauces", "marinades"
    ]
    for keyword in dish_keywords:
        if keyword in url:
            # Return the dish type in readable format (replace '-' with ' ')
            return keyword.replace('-', ' ').capitalize()
    return "Unknown"  # Default if no keyword is found

import pandas as pd

batch_size = 1000
all_sitemap_recipes = []
for i in range(20000, len(all_urls), batch_size):
    batch_urls = all_urls[i:i+batch_size]
    for url in batch_urls:
        recipe_data = scrape_recipe(url, country="N/A")
        if recipe_data:
            recipe_data["dish_type"] = get_dish_type_from_url(url)
            all_sitemap_recipes.append(recipe_data)
        time.sleep(random.uniform(0.5, 1.5))

    # Sauvegarde après chaque batch
    df = pd.DataFrame(all_sitemap_recipes)
    df.to_csv(f'sitemap_recipes_batch_{i//batch_size + 1}.csv', index=False)
