In [1]:
import pandas as pd
from rapidfuzz import process, fuzz
import os

In [1]:
# Jumbo product categories. Each key is a category slug (the loading cell
# further down builds the CSV filename as `jumbo_<slug>.csv` from it); each
# value is the URL of that category's product listing page on jumbo.com.
categories = {
    "aardappelen_groente_fruit": "https://www.jumbo.com/producten/aardappelen,-groente-en-fruit/",
    "verse_maaltijden_gemak": "https://www.jumbo.com/producten/verse-maaltijden-en-gemak/",
    "vlees_vis_en_vega": "https://www.jumbo.com/producten/vlees,-vis-en-vega/",
    "brood_en_gebak": "https://www.jumbo.com/producten/brood-en-gebak/",
    "vleeswaren_kaas_tapas": "https://www.jumbo.com/producten/vleeswaren,-kaas-en-tapas/",
    "zuivel_eieren_boter": "https://www.jumbo.com/producten/zuivel,-eieren,-boter/",
    "conserven_soepen_sauzen_olien": "https://www.jumbo.com/producten/conserven,-soepen,-sauzen,-olien/",
    "wereldkeukens_kruiden_pasta_rijst": "https://www.jumbo.com/producten/wereldkeukens,-kruiden,-pasta-en-rijst/",
    "ontbijt_broodbeleg_bakproducten": "https://www.jumbo.com/producten/ontbijt,-broodbeleg-en-bakproducten/",
    "koek_snoep_chocolade_chips": "https://www.jumbo.com/producten/koek,-snoep,-chocolade-en-chips/",
    "koffie_thee": "https://www.jumbo.com/producten/koffie-en-thee/",
    "frisdrank_sappen": "https://www.jumbo.com/producten/frisdrank-en-sappen/",
    "bier_wijn": "https://www.jumbo.com/producten/bier-en-wijn/",
    "diepvries": "https://www.jumbo.com/producten/diepvries/",
    "drogisterij": "https://www.jumbo.com/producten/drogisterij/",
    "baby_peuter": "https://www.jumbo.com/producten/baby,-peuter/",
    "huishouden_dieren_servicebalie": "https://www.jumbo.com/producten/huishouden,-dieren,-servicebalie/"
}

In [12]:
# Collect one DataFrame per category CSV and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every
# previously loaded row on each iteration.
category_frames = []

# Loop through each category and attempt to read the CSV file
for category in categories:
    csv_filename = f"jumbo_{category}.csv"

    # A category whose CSV is missing is reported and skipped rather than
    # aborting the whole load.
    if os.path.exists(csv_filename):
        category_frames.append(pd.read_csv(csv_filename))
    else:
        print(f"File {csv_filename} not found. Skipping...")

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when no CSV file was found at all.
df_jumbo = (
    pd.concat(category_frames, ignore_index=True)
    if category_frames
    else pd.DataFrame()
)

# Display the combined DataFrame
df_jumbo.head()

File jumbo_conserven_soepen_sauzen_olien.csv not found. Skipping...
File jumbo_wereldkeukens_kruiden_pasta_rijst.csv not found. Skipping...
File jumbo_ontbijt_broodbeleg_bakproducten.csv not found. Skipping...
File jumbo_koek_snoep_chocolade_chips.csv not found. Skipping...
File jumbo_koffie_thee.csv not found. Skipping...
File jumbo_frisdrank_sappen.csv not found. Skipping...
File jumbo_bier_wijn.csv not found. Skipping...
File jumbo_diepvries.csv not found. Skipping...
File jumbo_drogisterij.csv not found. Skipping...
File jumbo_baby_peuter.csv not found. Skipping...
File jumbo_huishouden_dieren_servicebalie.csv not found. Skipping...


Unnamed: 0,Product Name,Image URL,Price,Price per Unit
0,Dole Bananen Tros,https://jumbo.com/dam-images/fit-in/360x360/Pr...,1.89 €,"€ 0,38 per stuk0,38/stuk"
1,Jumbo Aardbeien Hollands 400g,https://jumbo.com/dam-images/fit-in/360x360/Pr...,3.69 €,"€ 9,23 per kilo9,23/kilo"
2,Hak Kikkererwten 190g,https://jumbo.com/dam-images/fit-in/360x360/Pr...,0.97 €,"€ 8,43 per kilo8,43/kilo"
3,Jumbo Mandarijnen 1kg,https://jumbo.com/dam-images/fit-in/360x360/Pr...,2.75 €,"€ 2,75 per kilo2,75/kilo"
4,Jumbo Druiven Wit Pitloos 500g,https://jumbo.com/dam-images/fit-in/360x360/Pr...,2.39 €,"€ 4,78 per kilo4,78/kilo"


In [11]:
# Load the shopping list to be priced; the matching cell below reads its
# 'Item Name' column. The path is relative to the notebook's working directory.
my_grocery_list_df = pd.read_csv('my_grocery_list.csv')

In [16]:
# Step 2: Fuzzy-matching helper that only accepts sufficiently confident hits
def fuzzy_match(item_name, choices, scorer=fuzz.WRatio, threshold=90):
    """Return the best fuzzy match for ``item_name`` among ``choices``.

    Args:
        item_name: The text to look up.
        choices: The candidates handed to ``process.extractOne``.
        scorer: Scoring function forwarded to ``process.extractOne``.
        threshold: Minimum score the best candidate must reach to be accepted.

    Returns:
        The best-scoring candidate when its score is at least ``threshold``;
        otherwise ``None`` (also when ``extractOne`` yields no result).
    """
    best = process.extractOne(item_name, choices, scorer=scorer)
    if not best:
        return None

    # extractOne's result starts with (candidate, score, ...)
    candidate, score = best[0], best[1]
    return candidate if score >= threshold else None

# Step 3: Build the lookup catalogue once, outside the matching loop.
#   - Rows without a product name are dropped: they cannot be matched, and
#     pandas' merge pairs null keys with null keys, which would attach them
#     to every grocery item that found no match.
#   - Duplicate product names are collapsed to their first occurrence so the
#     left merge below yields exactly one row per grocery item.
price_columns = ['Product Name', 'Price', 'Image URL', 'Price per Unit']
jumbo_catalog = (
    df_jumbo[price_columns]
    .dropna(subset=['Product Name'])
    .drop_duplicates(subset='Product Name', keep='first')
)
product_names = jumbo_catalog['Product Name'].tolist()

# Step 4: Perform fuzzy matching for each item in my_grocery_list_df with a high confidence threshold
matches = [
    fuzzy_match(item, product_names, threshold=90)
    for item in my_grocery_list_df['Item Name']
]

# Step 5: Add the matches to the my_grocery_list_df (None where nothing cleared the threshold)
my_grocery_list_df['Matched Product'] = matches

# Step 6: Merge on the matched product name to include 'Price', 'Image URL'
# and 'Price per Unit'; the right-hand key column duplicates 'Matched Product'
# and is dropped straight away.
merged_df = pd.merge(
    my_grocery_list_df,
    jumbo_catalog,
    how='left',
    left_on='Matched Product',
    right_on='Product Name',
).drop(columns=['Product Name'])

# Display the merged DataFrame
merged_df.head()

Unnamed: 0,Item Name,Amount,Quantity,Matched Product,Price,Image URL,Price per Unit
0,Noten Mix,200g,1 pack,,,,
1,Wasa Volkoren,275g,1 pack,,,,
2,Mager rundergehakt,500g,1 pack,,,,
3,Bananen,-,6 pieces,Dole Bananen Tros,1.89 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 0,38 per stuk0,38/stuk"
4,Maza Hoemoes,250g,1 tub,Maza Hoemoes Zongedroogde Tomaat 200g,2.49 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 12,45 per kilo12,45/kilo"


In [17]:
# Show every row of the merge result, including grocery items for which no
# product cleared the match threshold (their price columns stay empty).
merged_df

Unnamed: 0,Item Name,Amount,Quantity,Matched Product,Price,Image URL,Price per Unit
0,Noten Mix,200g,1 pack,,,,
1,Wasa Volkoren,275g,1 pack,,,,
2,Mager rundergehakt,500g,1 pack,,,,
3,Bananen,-,6 pieces,Dole Bananen Tros,1.89 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 0,38 per stuk0,38/stuk"
4,Maza Hoemoes,250g,1 tub,Maza Hoemoes Zongedroogde Tomaat 200g,2.49 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 12,45 per kilo12,45/kilo"
5,Magere kwark,500g,2 tubs,Optimel Magere kwark vanille 1 x 500g,2.49 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 4,98 per kilo4,98/kilo"
6,Scharrel kipfiletblokjes,400g,1 pack,,,,
7,Volkorenbrood,800g,1 loaf,Fijn Volkorenbrood,0.99 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 0,99 per stuk0,99/stuk"
8,Aardbeien,250g,1 pack,Jumbo Aardbeien Hollands 400g,3.69 €,https://jumbo.com/dam-images/fit-in/360x360/Pr...,"€ 9,23 per kilo9,23/kilo"
9,Volkoren pasta,500g,1 pack,,,,


In [19]:

# -------ALDI Scraped Data-------

In [19]:
# import requests
# from bs4 import BeautifulSoup
# import json

# # URL of the page to scrape
# base_url = "https://www.aldi.nl"
# page_url = f"{base_url}/producten.html"

# # Send a GET request to the page
# response = requests.get(page_url)

# # Parse the HTML content
# soup = BeautifulSoup(response.content, 'html.parser')

# # Find all content tiles containing product information
# tiles = soup.find_all('div', class_='mod-content-tile')

# # List to store the product categories and links
# product_categories = []

# # Loop through each tile to extract the title and link
# for tile in tiles:
#     title = tile.find('h4', class_='mod-content-tile__title').get_text(strip=True)
#     link = tile.find('a', class_='mod-content-tile__action')['href']
#     full_link = f"{base_url}{link}"
#     product_categories.append({"title": title, "link": full_link})

# # Convert the list to JSON format
# json_output = json.dumps(product_categories, indent=4)

# # Print the JSON output
# print(json_output)

In [51]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL of the Aldi website; the scraping functions below read this global
# to turn the site-relative hrefs found in the page tiles into absolute links.
base_url = "https://www.aldi.nl"

# URL of the main product page to scrape; main() passes it to scrape_categories().
page_url = f"{base_url}/producten.html"

def _scrape_tile_links(url, timeout=10):
    """Fetch ``url`` and map each content tile's title to its absolute link.

    Shared implementation behind scrape_categories() and
    scrape_subcategories(), which read the same tile markup.

    Args:
        url (str): The page to download and parse.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: Tile titles as keys and absolute URLs as values. Empty when the
        page cannot be retrieved or contains no complete tile.
    """
    # Local import so this block does not depend on the notebook's import cell.
    from urllib.parse import urljoin

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}: {exc}")
        return {}

    # Do not parse error pages as if they were product listings.
    if response.status_code != 200:
        print(f"Failed to retrieve {url}: Status code {response.status_code}")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')
    links = {}

    # Each 'mod-content-tile' div is expected to hold a title and an action link.
    for tile in soup.find_all('div', class_='mod-content-tile'):
        title_tag = tile.find('h4', class_='mod-content-tile__title')
        action_tag = tile.find('a', class_='mod-content-tile__action')

        # Skip incomplete tiles explicitly: a missing <a> would raise
        # TypeError on subscripting None, and a missing href would raise
        # KeyError, neither of which an AttributeError handler catches.
        if title_tag is None or action_tag is None:
            continue
        href = action_tag.get('href')
        if not href:
            continue

        # urljoin leaves already-absolute hrefs alone and resolves relative
        # ones against the site root.
        links[title_tag.get_text(strip=True)] = urljoin(base_url, href)

    return links


def scrape_categories(page_url, timeout=10):
    """
    Scrape all category names and links from the main product page.

    Args:
    page_url (str): The URL of the main product page.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    dict: A dictionary containing category names as keys and their corresponding URLs as values.
    Empty if the page could not be retrieved.
    """
    return _scrape_tile_links(page_url, timeout=timeout)


def scrape_subcategories(category_url, timeout=10):
    """
    Scrape all subcategory names and links from a category page.

    Args:
    category_url (str): The URL of the category page.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    dict: A dictionary containing subcategory names as keys and their corresponding URLs as values.
    Empty if the page could not be retrieved.
    """
    return _scrape_tile_links(category_url, timeout=timeout)

def scrape_tiles_from_links(subcategory_links, timeout=10):
    """
    Go through each subcategory link, scrape 'tiles-grid' divs, and print contents of 'mod-content-tile__meta'.

    A link that cannot be fetched (network error, timeout, or non-200 status)
    is reported and skipped so the remaining links are still processed.

    Args:
    subcategory_links (list): A list of subcategory URLs.
    timeout (float): Seconds to wait for each response before giving up.
    """
    for link in subcategory_links:
        print(f"\nScraping tiles from: {link}")

        try:
            # Without a timeout a single stalled server would hang the crawl.
            response = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            # Same best-effort policy as the status check below.
            print(f"Failed to retrieve {link}: {exc}")
            continue

        # Check for a valid response
        if response.status_code != 200:
            print(f"Failed to retrieve {link}: Status code {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Within every 'tiles-grid' div, print the text of each element
        # carrying the 'mod-content-tile__meta' class.
        for grid in soup.find_all('div', class_='tiles-grid'):
            for meta in grid.find_all(class_='mod-content-tile__meta'):
                print(meta.get_text(strip=True))

def main():
    """Crawl categories, then their subcategories, then print every tile's meta text.

    Prints the category mapping and the per-category subcategory mapping as
    JSON, prints the flat list of subcategory URLs, and finally hands that
    list to scrape_tiles_from_links().
    """
    # Top level: category name -> category page URL
    top_level = scrape_categories(page_url)
    categories_json = json.dumps(top_level, indent=4, ensure_ascii=False)
    print("Categories JSON:")
    print(categories_json)

    # Second level: category name -> {subcategory name -> URL}
    nested = {
        name: scrape_subcategories(url)
        for name, url in top_level.items()
    }
    nested_json = json.dumps(nested, indent=4, ensure_ascii=False)
    print("\nSubcategories JSON:")
    print(nested_json)

    # Flatten every subcategory URL into one list, keeping crawl order
    links = [url for group in nested.values() for url in group.values()]
    print("\nSubcategory Links List:")
    print(links)

    # Finally visit each subcategory page and print its tiles
    scrape_tiles_from_links(links)

if __name__ == "__main__":
    main()


Categories JSON:
{
    "Winnaars": "https://www.aldi.nl/producten/winnaars.html",
    "Aardappels, groente, fruit": "https://www.aldi.nl/producten/aardappels-groente-fruit.html",
    "Brood, bakkerij": "https://www.aldi.nl/producten/brood-bakkerij.html",
    "Ontbijtgranen, broodbeleg, tussendoortjes": "https://www.aldi.nl/producten/ontbijtgranen-broodbeleg-tussendoortjes.html",
    "Zuivel, eieren, boter": "https://www.aldi.nl/producten/zuivel-eieren-boter.html",
    "Kaas, vleeswaren, tapas": "https://www.aldi.nl/producten/kaas-vleeswaren-tapas.html",
    "Wijn": "https://www.aldi.nl/producten/wijn.html",
    "Vlees, vis, vega": "https://www.aldi.nl/producten/vlees-vis-vega.html",
    "Maaltijden, salades": "https://www.aldi.nl/producten/maaltijden-salades.html",
    "Pasta, rijst, bakken, internationale keuken": "https://www.aldi.nl/producten/pasta-rijst-bakken-internationale-keuken.html",
    "Soepen, sauzen, smaakmakers, conserven": "https://www.aldi.nl/producten/soepen-sauzen-sma

: 