In [2]:
import pandas as pd
from rapidfuzz import process, fuzz
import os
import requests
from bs4 import BeautifulSoup

In [3]:
categories = {
 'Aardappels, groente, fruit': 'https://www.aldi.nl/producten/aardappels-groente-fruit.html',
 'Brood, bakkerij': 'https://www.aldi.nl/producten/brood-bakkerij.html',
 'Ontbijtgranen, broodbeleg, tussendoortjes': 'https://www.aldi.nl/producten/ontbijtgranen-broodbeleg-tussendoortjes.html',
 'Zuivel, eieren, boter': 'https://www.aldi.nl/producten/zuivel-eieren-boter.html',
 'Kaas, vleeswaren, tapas': 'https://www.aldi.nl/producten/kaas-vleeswaren-tapas.html',
 'Wijn': 'https://www.aldi.nl/producten/wijn.html',
 'Vlees, vis, vega': 'https://www.aldi.nl/producten/vlees-vis-vega.html',
 'Maaltijden, salades': 'https://www.aldi.nl/producten/maaltijden-salades.html',
 'Pasta, rijst, bakken, internationale keuken': 'https://www.aldi.nl/producten/pasta-rijst-bakken-internationale-keuken.html',
 'Soepen, sauzen, smaakmakers, conserven': 'https://www.aldi.nl/producten/soepen-sauzen-smaakmakers-conserven.html',
 'Snoep, koeken': 'https://www.aldi.nl/producten/snoep-koeken.html',
 'Chips, noten': 'https://www.aldi.nl/producten/chips-noten.html',
 'Diepvries': 'https://www.aldi.nl/producten/diepvries.html',
 'Bier en likeuren': 'https://www.aldi.nl/producten/bier-en-likeuren.html',
 'Sappen, frisdrank': 'https://www.aldi.nl/producten/sappen-frisdrank.html',
 'Thee, koffie': 'https://www.aldi.nl/producten/thee-koffie.html',
 'Huishouden': 'https://www.aldi.nl/producten/huishouden.html',
 'Baby, persoonlijke verzorging': 'https://www.aldi.nl/producten/baby-persoonlijke-verzorging.html',
 'Huisdieren': 'https://www.aldi.nl/producten/huisdieren.html',
 'Prijsverlagingen': 'https://www.aldi.nl/producten/Prijsverlagingen.html',
 'Zomerassortiment': 'https://www.aldi.nl/producten/zomerassortiment.html',
 'Cadeaukaarten': 'https://www.aldi.nl/producten/cadeaukaarten.html'
}

In [4]:
def scrape_sub_categories(url):
    response = requests.get(url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    sub_categories = {}
    # Find all the sub-categories by their div class
    for div in soup.find_all('div', class_="mod-content-tile__content"):
        h4_tag = div.find('h4')
        link_tag = div.find('a', class_="link link--primary")
        
        if h4_tag and link_tag:
            sub_category_name = h4_tag.text.strip()
            sub_category_link = link_tag['href']
            sub_categories[sub_category_name] = "https://www.aldi.nl" + sub_category_link

    return sub_categories

# Initialize an empty dictionary to hold all subcategory data
all_sub_categories = {}

# Iterate over the main categories and collect sub-categories into a flat structure
for category, url in categories.items():
    sub_categories = scrape_sub_categories(url)
    all_sub_categories.update(sub_categories)

In [5]:
all_sub_categories

{'Aardappelen': 'https://www.aldi.nl/producten/aardappels-groente-fruit/aardappelen.html',
 'Groenten': 'https://www.aldi.nl/producten/aardappels-groente-fruit/groenten.html',
 'Vers gesneden groenten, fruit en sla': 'https://www.aldi.nl/producten/aardappels-groente-fruit/vers-gesneden-groenten-fruit-en-sla.html',
 'Fruit': 'https://www.aldi.nl/producten/aardappels-groente-fruit/fruit.html',
 'Dagvers brood': 'https://www.aldi.nl/producten/brood-bakkerij/dagvers-brood.html',
 'Vers afgebakken snacks': 'https://www.aldi.nl/producten/brood-bakkerij/vers-afgebakken-snacks.html',
 'Vers afgebakken brood': 'https://www.aldi.nl/producten/brood-bakkerij/vers-afgebakken-brood.html',
 'Thuis afbakbrood': 'https://www.aldi.nl/producten/brood-bakkerij/thuis-afbakbrood.html',
 'Beschuit, toast, crackers': 'https://www.aldi.nl/producten/brood-bakkerij/beschuit-toast-crackers.html',
 'Zelf bakken': 'https://www.aldi.nl/producten/brood-bakkerij/zelf-bakken.html',
 'Ontbijtgranen, muesli': 'https://ww

In [6]:
# Initialize an empty DataFrame to store the data for all categories
df_aldi = pd.DataFrame(columns=['Category', 'Product Link', 'Product Name', 'Price', 'Quantity'])

# Iterate over each category and its link in all_sub_categories
for category_name, category_link in all_sub_categories.items():
    print(f"Scraping category: {category_name} from URL: {category_link}")
    
    try:
        # Fetch the category page content
        response = requests.get(category_link)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all product tiles on the category page
        tiles = soup.find_all('div', class_="mod mod-article-tile-placeholder mod-article-tile-placeholder--small")
        
        # Extract the 'data-tile-url' attribute for each product
        tile_urls = ["https://www.aldi.nl/" + tile['data-tile-url'] for tile in tiles if 'data-tile-url' in tile.attrs]
        
        # Create a temporary DataFrame for the current category
        df = pd.DataFrame({
            'Category': [category_name] * len(tile_urls),
            'Product Link': tile_urls,
            'Product Name': [""] * len(tile_urls),
            'Price': [""] * len(tile_urls),
            'Quantity': [""] * len(tile_urls)
        })
        
        # Iterate over each row in the DataFrame and scrape the data
        for index, row in df.iterrows():
            product_link = row['Product Link']
            print(f"Scraping product URL: {product_link}")
            
            try:
                # Fetch the product page content
                response = requests.get(product_link)
                response.raise_for_status()  # Ensure the request was successful
                
                # Parse the HTML content
                product_soup = BeautifulSoup(response.text, 'html.parser')
                
                # Scrape product data: price, quantity, and product name
                price_tag = product_soup.find('span', class_="price__wrapper")
                price = price_tag.text.strip() if price_tag else 'N/A'
                
                quantity_tag = product_soup.find('span', class_='price__unit')
                quantity = quantity_tag.text.strip() if quantity_tag else 'N/A'
                
                product_name_tag = product_soup.find('span', class_='mod-article-tile__title')
                product_name = product_name_tag.text.strip() if product_name_tag else 'N/A'
                
                # Update the DataFrame with the scraped data
                df.at[index, 'Product Name'] = product_name
                df.at[index, 'Price'] = price
                df.at[index, 'Quantity'] = quantity

            except Exception as e:
                print(f"Error scraping {product_link}: {e}")

            print(f"Product Name: {product_name}, Price: {price}, Quantity: {quantity}")
            print("------------------------------------------------------")
        
        # Append the data for the current category to the main DataFrame
        df_aldi = pd.concat([df_aldi, df], ignore_index=True)

    except Exception as e:
        print(f"Error scraping category {category_name}: {e}")

# Display the final DataFrame
print(df_aldi)

Scraping category: Aardappelen from URL: https://www.aldi.nl/producten/aardappels-groente-fruit/aardappelen.html
Scraping product URL: https://www.aldi.nl//content/aldi/netherlands/nl/web-consumer/producten/aardappels-groente-fruit/aardappelen/jcr:content/par/tiles/par/articletile12_copy_1.html
Product Name: Culinaire smaak aardappels, Price: 1.49, Quantity: Zak 800 g.
------------------------------------------------------
Scraping product URL: https://www.aldi.nl//content/aldi/netherlands/nl/web-consumer/producten/aardappels-groente-fruit/aardappelen/jcr:content/par/tiles/par/articletile12_copy_1_1744342783.html
Product Name: Culinaire zoete aardappelen, Price: 1.99, Quantity: 750 g
------------------------------------------------------
Scraping product URL: https://www.aldi.nl//content/aldi/netherlands/nl/web-consumer/producten/aardappels-groente-fruit/aardappelen/jcr:content/par/tiles/par/articletile_1802744775.html
Product Name: Vastkokende aardappelen, Price: N/A, Quantity: 1-kg
-

In [73]:
df_aldi.to_csv('./Aldi/aldi_products.csv', index=False)

In [None]:
# -------------------------------WIP from here on - need to change---------------------------------

In [11]:
my_grocery_list_df = pd.read_csv('my_grocery_list.csv')

In [16]:
# Step 2: Define a function for fuzzy matching with a confidence threshold
def fuzzy_match(item_name, choices, scorer=fuzz.WRatio, threshold=90):
    match = process.extractOne(item_name, choices, scorer=scorer)
    if match and match[1] >= threshold:
        return match[0]
    return None

# Step 3: Create a list to hold the matches
matches = []

# Step 4: Perform fuzzy matching for each item in my_grocery_list_df with a high confidence threshold
for item in my_grocery_list_df['Item Name']:
    match = fuzzy_match(item, df_jumbo['Product Name'], threshold=90)
    matches.append(match)

# Step 5: Add the matches to the my_grocery_list_df
my_grocery_list_df['Matched Product'] = matches

# Step 6: Merge the DataFrames on the matched product name to include 'Price' and 'Image URL'
merged_df = pd.merge(
    my_grocery_list_df, 
    df_jumbo[['Product Name', 'Price', 'Image URL', 'Price per Unit']], 
    how='left', 
    left_on='Matched Product', 
    right_on='Product Name'
)

# Drop the duplicate 'Product Name' column
merged_df.drop(columns=['Product Name'], inplace=True)

# Display the merged DataFrame
merged_df.head()

Unnamed: 0,Item Name,Amount,Quantity,Matched Product,Price,Image URL,Price per Unit
0,Noten Mix,200g,1 pack,,,,
1,Wasa Volkoren,275g,1 pack,,,,
2,Mager rundergehakt,500g,1 pack,,,,
3,Bananen,-,6 pieces,Dole Bananen Tros,1.89 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 0,38 per stuk0,38/stuk"
4,Maza Hoemoes,250g,1 tub,Maza Hoemoes Zongedroogde Tomaat 200g,2.49 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 12,45 per kilo12,45/kilo"


In [17]:
merged_df

Unnamed: 0,Item Name,Amount,Quantity,Matched Product,Price,Image URL,Price per Unit
0,Noten Mix,200g,1 pack,,,,
1,Wasa Volkoren,275g,1 pack,,,,
2,Mager rundergehakt,500g,1 pack,,,,
3,Bananen,-,6 pieces,Dole Bananen Tros,1.89 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 0,38 per stuk0,38/stuk"
4,Maza Hoemoes,250g,1 tub,Maza Hoemoes Zongedroogde Tomaat 200g,2.49 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 12,45 per kilo12,45/kilo"
5,Magere kwark,500g,2 tubs,Optimel Magere kwark vanille 1 x 500g,2.49 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 4,98 per kilo4,98/kilo"
6,Scharrel kipfiletblokjes,400g,1 pack,,,,
7,Volkorenbrood,800g,1 loaf,Fijn Volkorenbrood,0.99 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 0,99 per stuk0,99/stuk"
8,Aardbeien,250g,1 pack,Jumbo Aardbeien Hollands 400g,3.69 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 9,23 per kilo9,23/kilo"
9,Volkoren pasta,500g,1 pack,,,,


In [19]:

# -------ALDI Scraped Data-------

In [19]:
# import requests
# from bs4 import BeautifulSoup
# import json

# # URL of the page to scrape
# base_url = "https://www.aldi.nl"
# page_url = f"{base_url}/producten.html"

# # Send a GET request to the page
# response = requests.get(page_url)

# # Parse the HTML content
# soup = BeautifulSoup(response.content, 'html.parser')

# # Find all content tiles containing product information
# tiles = soup.find_all('div', class_='mod-content-tile')

# # List to store the product categories and links
# product_categories = []

# # Loop through each tile to extract the title and link
# for tile in tiles:
#     title = tile.find('h4', class_='mod-content-tile__title').get_text(strip=True)
#     link = tile.find('a', class_='mod-content-tile__action')['href']
#     full_link = f"{base_url}{link}"
#     product_categories.append({"title": title, "link": full_link})

# # Convert the list to JSON format
# json_output = json.dumps(product_categories, indent=4)

# # Print the JSON output
# print(json_output)

In [51]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL of the Aldi website
base_url = "https://www.aldi.nl"

# URL of the main product page to scrape
page_url = f"{base_url}/producten.html"

def scrape_categories(page_url):
    """
    Scrape all category names and links from the main product page.
    
    Args:
    page_url (str): The URL of the main product page.
    
    Returns:
    dict: A dictionary containing category names as keys and their corresponding URLs as values.
    """
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    categories = {}

    # Find all content tiles containing category information
    tiles = soup.find_all('div', class_='mod-content-tile')

    # Loop through each tile to extract the category title and link
    for tile in tiles:
        try:
            category_name = tile.find('h4', class_='mod-content-tile__title').get_text(strip=True)
            category_link = tile.find('a', class_='mod-content-tile__action')['href']
            full_category_link = f"{base_url}{category_link}"
            categories[category_name] = full_category_link
        except AttributeError:
            continue
    
    return categories

def scrape_subcategories(category_url):
    """
    Scrape all subcategory names and links from a category page.
    
    Args:
    category_url (str): The URL of the category page.
    
    Returns:
    dict: A dictionary containing subcategory names as keys and their corresponding URLs as values.
    """
    response = requests.get(category_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    subcategories = {}

    # Find all content tiles containing subcategory information
    tiles = soup.find_all('div', class_='mod-content-tile')

    # Loop through each tile to extract the subcategory title and link
    for tile in tiles:
        try:
            subcategory_name = tile.find('h4', class_='mod-content-tile__title').get_text(strip=True)
            subcategory_link = tile.find('a', class_='mod-content-tile__action')['href']
            full_subcategory_link = f"{base_url}{subcategory_link}"
            subcategories[subcategory_name] = full_subcategory_link
        except AttributeError:
            continue
    
    return subcategories

def scrape_tiles_from_links(subcategory_links):
    """
    Go through each subcategory link, scrape 'tiles-grid' divs, and print contents of 'mod-content-tile__meta'.
    
    Args:
    subcategory_links (list): A list of subcategory URLs.
    """
    for link in subcategory_links:
        print(f"\nScraping tiles from: {link}")
        response = requests.get(link)
        
        # Check for a valid response
        if response.status_code != 200:
            print(f"Failed to retrieve {link}: Status code {response.status_code}")
            continue
        
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all <div> elements with the class 'tiles-grid'
        tiles = soup.find_all('div', class_='tiles-grid')

        # Loop through each 'tiles-grid' element to find 'mod-content-tile__meta'
        for tile in tiles:
            meta_elements = tile.find_all(class_='mod-content-tile__meta')
            for meta in meta_elements:
                print(meta.get_text(strip=True))  # Print the text of each 'mod-content-tile__meta' element

def main():
    # Scrape the categories from the main product page
    categories = scrape_categories(page_url)

    # Convert the dictionary to JSON format for categories
    json_categories = json.dumps(categories, indent=4, ensure_ascii=False)
    print("Categories JSON:")
    print(json_categories)

    # Initialize a dictionary to store all subcategories for each category
    all_subcategories = {}

    # Scrape subcategories for each category
    for category_name, category_url in categories.items():
        subcategories = scrape_subcategories(category_url)
        all_subcategories[category_name] = subcategories

    # Convert the dictionary to JSON format for subcategories
    json_subcategories = json.dumps(all_subcategories, indent=4, ensure_ascii=False)
    print("\nSubcategories JSON:")
    print(json_subcategories)

    # Extract all subcategory links into a list
    subcategory_links = []
    for subcategory_dict in all_subcategories.values():
        subcategory_links.extend(subcategory_dict.values())

    print("\nSubcategory Links List:")
    print(subcategory_links)

    # Scrape tiles from the subcategory links
    scrape_tiles_from_links(subcategory_links)

if __name__ == "__main__":
    main()


Categories JSON:
{
    "Winnaars": "https://www.aldi.nl/producten/winnaars.html",
    "Aardappels, groente, fruit": "https://www.aldi.nl/producten/aardappels-groente-fruit.html",
    "Brood, bakkerij": "https://www.aldi.nl/producten/brood-bakkerij.html",
    "Ontbijtgranen, broodbeleg, tussendoortjes": "https://www.aldi.nl/producten/ontbijtgranen-broodbeleg-tussendoortjes.html",
    "Zuivel, eieren, boter": "https://www.aldi.nl/producten/zuivel-eieren-boter.html",
    "Kaas, vleeswaren, tapas": "https://www.aldi.nl/producten/kaas-vleeswaren-tapas.html",
    "Wijn": "https://www.aldi.nl/producten/wijn.html",
    "Vlees, vis, vega": "https://www.aldi.nl/producten/vlees-vis-vega.html",
    "Maaltijden, salades": "https://www.aldi.nl/producten/maaltijden-salades.html",
    "Pasta, rijst, bakken, internationale keuken": "https://www.aldi.nl/producten/pasta-rijst-bakken-internationale-keuken.html",
    "Soepen, sauzen, smaakmakers, conserven": "https://www.aldi.nl/producten/soepen-sauzen-sma

: 