In [1]:
import os
import subprocess
import pandas as pd
from dotenv import load_dotenv

import albert_heijn.ah_scraper as ah_scraper
import aldi.aldi_scraper as aldi_scraper
import jumbo.jumbo_scraper as jumbo_scraper

# Load variables from a local .env file via python-dotenv (the cell output shows it returned True).
# NOTE(review): nothing in the visible cells reads os.environ afterwards —
# presumably the scraper modules consume these variables; confirm.
load_dotenv()

True

In [2]:
def create_directory(path):
    """Create ``path`` (and any missing parents) unless it already exists.

    Args:
        path: Directory path to create.

    Returns:
        None. A confirmation line is printed only when the directory is
        actually created; nothing happens if ``path`` already exists.
    """
    if os.path.exists(path):
        # Already present: nothing to create and nothing to report.
        return
    os.makedirs(path)
    print(f"Created directory: {path}")

In [3]:
# Working directory for each supermarket.
path_ah = 'albert_heijn/'
path_aldi = 'aldi/'
path_jumbo = 'jumbo/'
path_dirk = 'dirk/'
path_lidl = 'lidl/'

# CSV file inside each store directory that holds its product list.
path_products_ah = f'{path_ah}ah_products.csv'
path_products_aldi = f'{path_aldi}aldi_products.csv'
path_products_jumbo = f'{path_jumbo}jumbo_products.csv'
path_products_dirk = f'{path_dirk}dirk_products.csv'
path_products_lidl = f'{path_lidl}lidl_products.csv'

# Make sure every store directory exists before anything is written to it.
for store_dir in (path_ah, path_aldi, path_jumbo, path_dirk, path_lidl):
    create_directory(store_dir)

In [4]:
def fetch_products(name, product_path, scraper_module):
    """Return the product table for one store, using a CSV file as a cache.

    If ``product_path`` exists it is read with ``pd.read_csv``; otherwise
    ``scraper_module.scrape_products()`` is called and its result is written
    to ``product_path`` (without the index) before any columns are added.

    Args:
        name: Display name of the store, e.g. ``"Albert Heijn"``.
        product_path: Path of the CSV file used as the cache.
        scraper_module: Object exposing ``scrape_products()`` that returns a
            DataFrame.

    Returns:
        The DataFrame with three columns added in place:
        ``store_name`` (``name`` lower-cased, spaces replaced by ``_``),
        ``row`` (1-based row position) and ``product_id``
        (``"<store_name>_<row>"``).
    """
    print(f"Fetching {name} Products...")

    if not os.path.exists(product_path):
        print(f"{name} CSV not found. Running scraper function...")
        df = scraper_module.scrape_products()
        # Persist the raw scrape so later runs can skip the scraper.
        df.to_csv(product_path, index=False)
        print(f"Scraped and saved {len(df)} products to {product_path}")
    else:
        df = pd.read_csv(product_path)
        print(f"Loaded {len(df)} products from {product_path}")

    store_slug = name.lower().replace(' ', '_')
    df['store_name'] = store_slug

    # product_id is derived from the row position, so it depends on row order.
    df['row'] = range(1, len(df) + 1)
    row_labels = df['row'].astype(str)
    df['product_id'] = df['store_name'] + "_" + row_labels

    return df

# Fetch products for each supermarket.
# Each store is fetched by its own statement, so frames assigned before a
# failure or a manual interrupt stay available to the cells that follow.
df_ah = fetch_products("Albert Heijn", path_products_ah, ah_scraper)
df_aldi = fetch_products("Aldi", path_products_aldi, aldi_scraper)
df_jumbo = fetch_products("Jumbo", path_products_jumbo, jumbo_scraper)
# Dirk and Lidl are currently disabled.
# NOTE(review): path_scraper_dirk / path_scraper_lidl are not defined in the
# visible cells and no Dirk/Lidl scraper module is imported; these calls need a
# scraper module argument before they can be re-enabled.
# df_dirk = fetch_products("Dirk", path_products_dirk, path_scraper_dirk)
# df_lidl = fetch_products("Lidl", path_products_lidl, path_scraper_lidl)

Fetching Albert Heijn Products...
Loaded 4138 products from albert_heijn/ah_products.csv
Fetching Aldi Products...
Loaded 1880 products from aldi/aldi_products.csv
Fetching Jumbo Products...
Jumbo CSV not found. Running scraper function...
Scraping category: aardappelen_groente_fruit
Scraping page 1 with offSet 0...
Scraping page 2 with offSet 24...
Scraping page 3 with offSet 48...
Scraping page 4 with offSet 72...
Scraping page 5 with offSet 96...
Scraping page 6 with offSet 120...
Scraping page 7 with offSet 144...
Scraping page 8 with offSet 168...
Scraping page 9 with offSet 192...
Scraping page 10 with offSet 216...
Scraping page 11 with offSet 240...
Scraping page 12 with offSet 264...
Scraping page 13 with offSet 288...
Scraping page 14 with offSet 312...
Scraping page 15 with offSet 336...
Scraping page 16 with offSet 360...
Scraping page 17 with offSet 384...
Scraping page 18 with offSet 408...
Scraping page 19 with offSet 432...
Scraping page 20 with offSet 456...
Scraping p

KeyboardInterrupt: 

In [6]:
# df_jumbo

In [11]:
# Remove products whose price is not listed, then project each store onto the
# same column layout (this also drops the helper 'row' column).
product_columns = [
    'product_id',
    'product_name',
    'product_category',
    'product_link',
    'product_price',
    'product_quantity',
    'store_name',
]

df_ah = df_ah.dropna(subset=['product_price']).reindex(columns=product_columns)
df_aldi = df_aldi.dropna(subset=['product_price']).reindex(columns=product_columns)

In [12]:
# Row counts per store after products without a price were removed.
print("Number of ALDI products : ", len(df_aldi))
print("Number of AH products : ", len(df_ah))

Number of ALDI products :  1880
Number of AH products :  4138


In [16]:
# Stack the cleaned Albert Heijn and Aldi frames into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame keeps its own row labels, so labels repeat across stores and
# label-based lookups (e.g. .loc[0]) return one row per store.
df_all_products = pd.concat([df_ah, df_aldi], ignore_index=True)

In [18]:
# Export the combined table. Both the header row and the DataFrame index are
# written, so the file's first column is the (unnamed) index.
df_all_products.to_csv("df_all_products.csv", index=True, header=True)