This script automates the process of downloading receipt images from various North American stores using Bing Image Search. It uses Selenium to scroll through search results, BeautifulSoup to parse HTML, and Requests to download images. Images are grouped into one folder per store category (food, transportation, clothing, home/appliances, other), with the store name used as the filename prefix, and the script saves at most 100 images per store.

In [3]:
import os
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.common.exceptions import InvalidSessionIdException, WebDriverException
# Dictionary of categorized North American stores.
# Each category's store names are kept in their own list and assembled into
# `store_categories` at the end; the key order below is the processing order.
_FOOD_STORES = [
    "Albertsons", "Aldi", "Applebee's", "Arby's", "BJ’s Wholesale Club",
    "Buffalo Wild Wings", "Burger King", "Carrabba’s Italian Grill",
    "Cheesecake Factory", "Chick-fil-A", "Chili's", "Chipotle", "Costco",
    "Cracker Barrel", "Dairy Queen", "Denny's", "Domino's Pizza",
    "Dunkin' Donuts", "Five Guys", "Food Lion", "H-E-B", "Hy-Vee", "IHOP",
    "In-N-Out Burger", "Jack in the Box", "Jimmy John's", "KFC", "Kroger",
    "Little Caesars", "LongHorn Steakhouse", "McDonald's", "Meijer",
    "Olive Garden", "Outback Steakhouse", "P.F. Chang’s", "Panera Bread",
    "Pizza Hut", "Popeyes", "Publix", "Red Lobster", "Safeway", "Sam's Club",
    "Shake Shack", "Sonic Drive-In", "Sprouts Farmers Market", "Starbucks",
    "Stop & Shop", "Subway", "Taco Bell", "Target", "Texas Roadhouse",
    "Tim Hortons", "Trader Joe's", "Walmart", "Wegmans", "Wendy's",
    "Whole Foods", "WinCo Foods",
]

_TRANSPORTATION_STORES = [
    "Uber", "Lyft", "Curb", "Via",
    "TTC", "MTA", "TransLink", "GO Transit", "STM", "BART",
    "OC Transpo", "WMATA", "SEPTA",
    "Amtrak", "VIA Rail", "Greyhound", "Megabus", "FlixBus",
    "UP Express", "Coach Canada", "Red Arrow", "AirTrain", "PRESTO",
]

_CLOTHING_STORES = [
    "Guess", "Adidas", "American Eagle", "Aritzia", "Banana Republic",
    "CANADA GOOSE", "BOTTEGA VENETA", "Forever 21", "Gap", "Garage", "H&M",
    "Hollister", "Joe Fresh", "Levi's", "Nike", "Old Navy", "Puma",
    "CALVIN KLEIN", "Under Armour", "Uniqlo", "Victoria’s Secret", "Zara",
    "GUCCI", "MOOSE KNUCKLES", "POLO RALPH LAUREN",
]

_HOME_APPLIANCE_STORES = [
    "Ace Hardware", "Apple Store", "Ashley Furniture", "Bed Bath & Beyond",
    "Best Buy", "Bosch", "Canadian Tire", "Crate & Barrel",
    "Fry's Electronics", "Home Depot", "IKEA", "LG Brand Store", "Leon’s",
    "Lowe's", "Lowe’s Canada", "Menards", "Micro Center", "Microsoft Store",
    "Newegg", "Rona", "Rooms To Go", "Samsung Store", "Sleep Country",
    "The Brick", "Wayfair",
]

_OTHER_STORES = [
    "AliExpress", "Amazon", "B&H Photo", "Best Buy Canada", "Big Lots",
    "Bloomingdale's", "Burlington", "CVS Pharmacy", "Canada Post",
    "Cost Plus World Market", "Dillard's", "Dollar Tree", "Dollarama",
    "Family Dollar", "FedEx", "GameStop", "Giant Tiger",
    "Good Neighbor Pharmacy", "Harbor Freight Tools", "Hudson's Bay",
    "Indigo", "JCPenney", "Kohl's", "Loblaws", "London Drugs", "Marshalls",
    "Neiman Marcus", "No Frills", "Nordstrom", "Office Depot",
    "Real Canadian Superstore", "Rexall", "Rite Aid", "Ross Dress for Less",
    "Saks Fifth Avenue", "Shoppers Drug Mart", "Staples", "Staples Canada",
    "T&T Supermarket", "TJ Maxx", "Target Pharmacy", "The Source",
    "UPS Store", "USPS", "Walgreens", "Walmart Pharmacy", "Zellers", "eBay",
]

store_categories = {
    "food": _FOOD_STORES,
    "transportation": _TRANSPORTATION_STORES,
    "clothing": _CLOTHING_STORES,
    "home/appliances": _HOME_APPLIANCE_STORES,
    "other": _OTHER_STORES,
}

# Output root directory (change this to your local path)
# NOTE: machine-specific absolute Windows path; the download loop below
# creates one sub-folder per store category beneath it.
output_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data"
# exist_ok=True keeps re-runs of this cell from failing when the folder exists.
os.makedirs(output_folder, exist_ok=True)

# Function to create headless Edge driver
def create_driver():
    """Build and return a headless Microsoft Edge WebDriver.

    The Edge driver binary is resolved through webdriver_manager's
    ``EdgeChromiumDriverManager().install()`` and handed to Selenium via an
    ``EdgeService``.

    Returns:
        A ``webdriver.Edge`` instance started with the ``--headless`` and
        ``--disable-gpu`` arguments.
    """
    edge_options = Options()
    for flag in ("--headless", "--disable-gpu"):
        edge_options.add_argument(flag)

    driver_binary = EdgeChromiumDriverManager().install()
    edge_service = EdgeService(driver_binary)
    return webdriver.Edge(service=edge_service, options=edge_options)

# Local imports keep this section self-contained within the cell.
import json
from urllib.parse import quote_plus

# Upper bound on the number of images saved for a single store.
MAX_IMAGES_PER_STORE = 100
# Number of scroll steps performed per results page before parsing it.
SCROLL_STEPS = 5


def _extract_image_url(metadata):
    """Return the full-size image URL held in a Bing result tag's ``m`` attribute.

    Args:
        metadata: Value of the ``m`` attribute of an ``a.iusc`` tag, or None.
            Presumably a JSON object with a ``murl`` key (verify against the
            live page); if it does not parse as JSON the value is sliced out
            of the raw string instead.

    Returns:
        The ``murl`` value, or None when it is missing or empty.
    """
    if not metadata or "murl" not in metadata:
        return None
    try:
        parsed = json.loads(metadata)
    except ValueError:
        # Not valid JSON: fall back to plain string slicing.
        marker = '"murl":"'
        if marker not in metadata:
            return None
        return metadata.split(marker, 1)[1].split('"', 1)[0] or None
    if isinstance(parsed, dict):
        # JSON decoding also resolves escape sequences that string slicing
        # would leave inside the URL.
        return parsed.get("murl") or None
    return None


def _download_image(img_url, image_path):
    """Download ``img_url`` and write the response body to ``image_path``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status (so error pages are not saved as images).
        OSError: If the destination file cannot be written.
    """
    with requests.get(img_url, timeout=5) as response:
        response.raise_for_status()
        img_data = response.content
    with open(image_path, "wb") as f:
        f.write(img_data)


# Initialize driver
driver = create_driver()
# Global counter shared by all stores; it is part of every saved file name.
image_count = 1

try:
    for category, stores in store_categories.items():
        safe_category = category.replace("/", "_")  # Replace slash to avoid subfolders
        category_folder = os.path.join(output_folder, safe_category)
        os.makedirs(category_folder, exist_ok=True)

        for store in stores:
            search_query = f"{store} paper receipt"
            # URL-encode the query: names such as "H&M" or "Stop & Shop"
            # would otherwise cut the q parameter short at the "&".
            url = f"https://www.bing.com/images/search?q={quote_plus(search_query)}&form=HDRSC2"

            try:
                driver.get(url)
            except (InvalidSessionIdException, WebDriverException):
                print(f"⚠️ Browser session crashed! Restarting browser for {store}...")
                try:
                    driver.quit()
                except WebDriverException:
                    # The old session is already gone; nothing left to close.
                    pass
                driver = create_driver()
                driver.get(url)

            time.sleep(3)

            # Scroll in steps, pausing after each so more results can load.
            for _ in range(SCROLL_STEPS):
                driver.execute_script("window.scrollBy(0, 1000);")
                time.sleep(2)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            image_tags = soup.find_all("a", {"class": "iusc"})
            # Fallback sources for tags without a usable "murl". A generator is
            # used so each fallback image is consumed at most once instead of
            # the first <img> being downloaded again for every such tag.
            fallback_urls = (
                src
                for src in (img.get("src") or img.get("data-src") for img in soup.find_all("img"))
                if src and src.startswith("http")
            )

            count = 0
            for tag in image_tags:
                if count >= MAX_IMAGES_PER_STORE:
                    break

                img_url = _extract_image_url(tag.get("m")) or next(fallback_urls, None)
                if not img_url:
                    continue

                image_path = os.path.join(category_folder, f"{store}_receipt_{image_count}.jpg")
                try:
                    _download_image(img_url, image_path)
                except (requests.RequestException, OSError) as e:
                    # Best effort: report the failure and move on to the next result.
                    print(f"⚠️ {store} - Failed to download image:", e)
                    continue

                print(f"✅ Downloaded: {image_path}")
                count += 1
                image_count += 1

                # Short pause between successful downloads.
                time.sleep(1)

            print(f"🎉 {store} - Downloaded {count} receipt images")
finally:
    # Always release the browser, even if the run is interrupted.
    driver.quit()

print("✅ All receipts downloaded successfully!")


✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_1.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_2.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_3.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_4.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_5.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_6.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_7.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_8.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\food\Albertsons_receipt_9.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\

This step filters receipt images based on their size. It walks the input folder (skipping the output folder itself) and keeps only images that are at least 500 pixels wide and 300 pixels tall. Valid images are re-saved under the filtered_receipts folder, preserving the category subfolders; smaller or unreadable images are skipped, and the original files are left in place.

In [4]:
import os
import cv2

# Source tree to scan, and destination tree for images that pass the size check.
input_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data"
output_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\filtered_receipts"
os.makedirs(output_folder, exist_ok=True)

# Minimum accepted image dimensions, in pixels.
min_width = 500
min_height = 300

# File extensions (compared in lower case) that are treated as images.
IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")

for current_dir, _, names in os.walk(input_folder):
    # The output tree lives inside the input tree, so never re-filter it.
    inside_output = os.path.commonpath([current_dir, output_folder]) == output_folder
    if inside_output:
        continue

    image_names = [name for name in names if name.lower().endswith(IMAGE_SUFFIXES)]
    for filename in image_names:
        img_path = os.path.join(current_dir, filename)

        try:
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread returns None instead of raising when it cannot decode.
                print(f"Skipping {filename} (Unreadable)")
                continue

            height, width, _channels = img.shape
            too_small = width < min_width or height < min_height
            if too_small:
                print(f"❌ Discarded: {filename} (Size: {width}x{height})")
                continue

            # Mirror the image's sub-folder (relative to the input root) in the output tree.
            rel_path = os.path.relpath(current_dir, input_folder)
            output_subfolder = os.path.join(output_folder, rel_path)
            os.makedirs(output_subfolder, exist_ok=True)

            output_path = os.path.join(output_subfolder, filename)
            cv2.imwrite(output_path, img)
            print(f"✅ Kept: {filename} (Size: {width}x{height})")

        except Exception as e:
            # Per-file boundary: report the problem and continue with the next file.
            print(f"Error processing {filename}: {e}")

print(f"\n✅ Filtering complete! Clear receipts saved to '{output_folder}'")


✅ Kept: Adidas_receipt_8201.jpg (Size: 900x1200)
❌ Discarded: Adidas_receipt_8202.jpg (Size: 459x597)
✅ Kept: Adidas_receipt_8203.jpg (Size: 2200x3566)
✅ Kept: Adidas_receipt_8204.jpg (Size: 810x1080)
❌ Discarded: Adidas_receipt_8205.jpg (Size: 300x300)
✅ Kept: Adidas_receipt_8206.jpg (Size: 750x6079)
❌ Discarded: Adidas_receipt_8207.jpg (Size: 442x1364)
✅ Kept: Adidas_receipt_8208.jpg (Size: 1572x2099)
Skipping Adidas_receipt_8209.jpg (Unreadable)
✅ Kept: Adidas_receipt_8210.jpg (Size: 720x720)
✅ Kept: Adidas_receipt_8211.jpg (Size: 1116x1578)
✅ Kept: Adidas_receipt_8212.jpg (Size: 810x1080)
✅ Kept: Adidas_receipt_8213.jpg (Size: 1024x1024)
❌ Discarded: Adidas_receipt_8214.jpg (Size: 453x640)
❌ Discarded: Adidas_receipt_8215.jpg (Size: 650x232)
✅ Kept: Adidas_receipt_8216.jpg (Size: 950x600)
❌ Discarded: Adidas_receipt_8217.jpg (Size: 468x1024)
✅ Kept: Adidas_receipt_8218.jpg (Size: 800x534)
✅ Kept: Adidas_receipt_8219.jpg (Size: 800x800)
Skipping Adidas_receipt_8220.jpg (Unreadable)


This step filters valid receipts by checking for barcodes, keywords, and common receipt line patterns. If an image contains a barcode, it is directly classified as a receipt. Otherwise, OCR is used to extract text, and the image is kept only if the text contains both a receipt keyword (e.g., "Total", "$", "Tax") and a separator-line pattern such as "----" or "====".

In [5]:
import easyocr
import os
import cv2
import shutil
from pyzbar.pyzbar import decode

# Local import: needed for the regex-based line pattern below.
import re

# Initialize OCR reader with GPU
reader = easyocr.Reader(['en'], gpu=True)

# Input and output folders
input_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\filtered_receipts"
output_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\valid_receipts"
os.makedirs(output_folder, exist_ok=True)

# Receipt detection patterns
keywords = ["Total", "TOTAL", "Tax", "Thank you", "$", "Subtotal", "SUBTOTAL", "Balance",
            "Change", "CHANGE", "Cash", "Card", "CARD", "Amount", "Visa", "Store", "Items", "Receipt"]
# Keywords are matched case-insensitively (the list already carries several
# casings of the same word); fold them once up front.
_folded_keywords = tuple({word.casefold() for word in keywords})

# Literal separator fragments, matched as plain substrings.
line_patterns = ["----", "--", "==", "——", "____", "*", "**", "* * *", "XXX", "XXXXXXXX",
                 "- -", "= =", "~ ~ ~", "— — — —"]
# Regex separators. The dotted-leader pattern used to sit in `line_patterns`,
# where a substring test could never match it; it is now applied as a regex.
line_regexes = [re.compile(r"\.{5,}")]

# File extensions (compared in lower case) that are treated as images.
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")


def contains_barcode(image):
    """Check if the image contains a barcode.

    Args:
        image: Image passed straight to pyzbar's ``decode`` (here, the array
            returned by ``cv2.imread``).

    Returns:
        True if ``decode`` reports at least one symbol, otherwise False.
    """
    return len(decode(image)) > 0


def _has_keywords(extracted_text):
    """Return True if any receipt keyword occurs in the text, ignoring case."""
    folded_text = extracted_text.casefold()
    return any(word in folded_text for word in _folded_keywords)


def _has_line_patterns(extracted_text):
    """Return True if the text contains a literal or regex separator pattern."""
    if any(pattern in extracted_text for pattern in line_patterns):
        return True
    return any(regex.search(extracted_text) for regex in line_regexes)


def _copy_to_output(img_path, root, filename):
    """Copy an accepted image into the mirrored output sub-folder.

    Returns:
        The sub-folder path relative to ``input_folder`` (used for logging).
    """
    rel_path = os.path.relpath(root, input_folder)
    output_subfolder = os.path.join(output_folder, rel_path)
    os.makedirs(output_subfolder, exist_ok=True)
    shutil.copy(img_path, os.path.join(output_subfolder, filename))
    return rel_path


# Walk through subdirectories (e.g., food, clothing, etc.)
for root, _, files in os.walk(input_folder):
    for filename in files:
        if not filename.lower().endswith(_IMAGE_SUFFIXES):
            continue

        img_path = os.path.join(root, filename)

        try:
            img = cv2.imread(img_path)
            if img is None:
                print(f"❌ Unreadable image: {filename}")
                continue

            # Check for barcode first: a barcode alone is accepted as proof
            # of a receipt, so OCR is skipped for these images.
            if contains_barcode(img):
                rel_path = _copy_to_output(img_path, root, filename)
                print(f"✅ [BARCODE] {rel_path}/{filename}")
                continue

            # OCR inference (detail=0 returns only the recognized strings)
            text = reader.readtext(img_path, detail=0)
            extracted_text = " ".join(text)

            # Heuristic checks: both must hold for the image to be kept.
            has_keywords = _has_keywords(extracted_text)
            has_line_patterns = _has_line_patterns(extracted_text)

            if has_keywords and has_line_patterns:
                rel_path = _copy_to_output(img_path, root, filename)
                print(f"✅ [OCR] {rel_path}/{filename}")
            else:
                reason = []
                if not has_keywords:
                    reason.append("No keywords")
                if not has_line_patterns:
                    reason.append("No line patterns")
                print(f"❌ {filename} skipped ({', '.join(reason)})")

        except Exception as e:
            # Per-file boundary: report the problem and continue with the next file.
            print(f"⚠️ Error processing {filename}: {e}")


❌ Adidas_receipt_8201.jpg skipped (No line patterns)
❌ Adidas_receipt_8203.jpg skipped (No line patterns)
❌ Adidas_receipt_8204.jpg skipped (No keywords, No line patterns)
✅ [OCR] clothing/Adidas_receipt_8206.jpg
✅ [OCR] clothing/Adidas_receipt_8208.jpg
❌ Adidas_receipt_8210.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8211.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8212.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8213.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8216.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8218.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8219.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8223.jpg skipped (No line patterns)
❌ Adidas_receipt_8224.jpg skipped (No line patterns)
✅ [OCR] clothing/Adidas_receipt_8225.jpg
❌ Adidas_receipt_8227.jpg skipped (No keywords, No line patterns)
❌ Adidas_receipt_8229.jpg skipped (No line patterns)
❌ Adidas_receipt_8