This script automates the process of downloading receipt images from various North American stores using Bing Image Search. It uses Selenium to scroll through search results, BeautifulSoup to parse the HTML, and Requests to download the images. All images are saved to a single output folder with the store name as a filename prefix, and at most 35 images are saved per store.

In [4]:
import os
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager

# List of North American stores
# Each name is used both in the search query and as the filename prefix of the
# downloaded images, so every entry appears exactly once and uses a plain ASCII
# apostrophe (non-ASCII filenames can cause read failures in some image
# tooling on Windows).
stores = [
    # 🛒 Supermarkets & Retail
    "Walmart", "Costco", "Target", "Kroger", "Whole Foods", "Safeway", "Aldi", "Trader Joe's",
    "Publix", "Sam's Club", "Meijer", "Albertsons", "WinCo Foods", "BJ's Wholesale Club",
    "Food Lion", "Stop & Shop", "Piggly Wiggly", "Hy-Vee", "H-E-B", "Wegmans", "Sprouts Farmers Market",

    # 🍔 Fast Food & Coffee Shops
    "McDonald's", "Burger King", "KFC", "Taco Bell", "Subway", "Wendy's", "Dunkin' Donuts",
    "Tim Hortons", "Starbucks", "Five Guys", "Chick-fil-A", "Jack in the Box", "Popeyes",
    "Arby's", "Sonic Drive-In", "Dairy Queen", "Little Caesars", "Pizza Hut", "Domino's Pizza",
    "Jimmy John's", "Panera Bread", "Chipotle", "Shake Shack", "In-N-Out Burger",

    # 💻 Electronics & Office Supplies
    "Best Buy", "Apple Store", "Microsoft Store", "Staples", "Office Depot", "Micro Center",
    "Fry's Electronics", "Newegg", "GameStop",

    # ⛽ Gas Stations & Convenience Stores
    "7-Eleven", "Shell", "Chevron", "BP", "ExxonMobil", "Circle K", "Wawa", "Speedway", "Casey's",
    "QuikTrip", "Love's Travel Stops", "Murphy USA",

    # 🏠 Home Improvement & Furniture
    "Home Depot", "Lowe's", "IKEA", "Menards", "Ace Hardware", "Ashley Furniture",
    "Bed Bath & Beyond", "Wayfair", "Rooms To Go", "Crate & Barrel",

    # 💊 Pharmacies & Drugstores
    "CVS Pharmacy", "Walgreens", "Rite Aid", "Walmart Pharmacy", "Target Pharmacy",
    "Good Neighbor Pharmacy",

    # 👗 Clothing & Fashion
    "Nike", "Adidas", "Under Armour", "Puma", "H&M", "Zara", "Gap", "Old Navy",
    "Banana Republic", "Uniqlo", "American Eagle", "Levi's", "Victoria's Secret",
    "Nordstrom", "Macy's", "Dillard's", "Saks Fifth Avenue", "Burlington", "Ross Dress for Less",

    # 🏬 Department Stores & High-End Retail
    # (Macy's, Nordstrom, Saks Fifth Avenue and Burlington are already listed above)
    "JCPenney", "Neiman Marcus",
    "Bloomingdale's", "Kohl's", "TJ Maxx", "Marshalls", "Ross",

    # 🍽️ Chain Restaurants
    "Olive Garden", "Cheesecake Factory", "Red Lobster", "Applebee's", "Chili's", "Outback Steakhouse",
    "Texas Roadhouse", "Buffalo Wild Wings", "IHOP", "Denny's", "Cracker Barrel", "TGI Fridays",
    "LongHorn Steakhouse", "P.F. Chang's", "Carrabba's Italian Grill",
]


# Destination directory for every downloaded image (edit this path for your machine)
output_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data"
os.makedirs(output_folder, exist_ok=True)

# Start an Edge browser session: the driver manager's install() result is
# handed to the Selenium service, which in turn backs the WebDriver.
edge_driver_path = EdgeChromiumDriverManager().install()
service = EdgeService(edge_driver_path)
driver = webdriver.Edge(service=service)

# Scrape receipts for each store
# Imported here so this block stays self-contained.
import json
from urllib.parse import quote_plus

MAX_IMAGES_PER_STORE = 35  # Upper bound on images saved for a single store


def _extract_image_url(tag):
    """Return the image URL for one Bing result anchor, or None if there is none.

    The anchor's ``m`` attribute is parsed as JSON and its ``murl`` entry is
    preferred. If that is unavailable, the ``src``/``data-src`` of the
    thumbnail nested inside this same anchor is used, so two different
    results can never resolve to the same unrelated page image.
    """
    meta = tag.get("m")
    if meta:
        try:
            murl = json.loads(meta).get("murl")
        except (ValueError, AttributeError):
            # Not a JSON object: fall back to plain text extraction.
            marker = '"murl":"'
            murl = meta.split(marker)[1].split('"')[0] if marker in meta else None
        if murl:
            return murl

    thumb = tag.find("img")
    if thumb is not None:
        candidate = thumb.get("src") or thumb.get("data-src")
        if candidate and candidate.startswith("http"):
            return candidate
    return None


def _download_image(img_url, image_path):
    """Download ``img_url`` to ``image_path``.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
        ValueError: if the server answered with a text document (typically
            an HTML error page) instead of binary image data.
        OSError: if the file cannot be written.
    """
    response = requests.get(img_url, timeout=5)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "")
    if content_type.startswith("text/"):
        raise ValueError(f"not an image (Content-Type: {content_type})")
    with open(image_path, "wb") as f:
        f.write(response.content)


image_count = 1  # Unique index for all images, so filenames never collide

try:
    for store in stores:
        search_query = f"{store} paper receipt"

        # Open Bing image search. The query must be URL-encoded: store names
        # such as "Stop & Shop" or "H&M" would otherwise be cut at the "&".
        url = f"https://www.bing.com/images/search?q={quote_plus(search_query)}&form=HDRSC2"
        driver.get(url)
        time.sleep(3)

        # Scroll to load more images
        for _ in range(5):
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(2)

        # Parse HTML content and collect the result anchors
        soup = BeautifulSoup(driver.page_source, "html.parser")
        image_tags = soup.find_all("a", {"class": "iusc"})

        count = 0
        seen_urls = set()  # Avoid saving the same image twice for one store
        for tag in image_tags:
            if count >= MAX_IMAGES_PER_STORE:
                break

            img_url = _extract_image_url(tag)
            if not img_url or img_url in seen_urls:
                continue
            seen_urls.add(img_url)

            image_path = os.path.join(output_folder, f"{store}_receipt_{image_count}.jpg")
            try:
                _download_image(img_url, image_path)
            except Exception as e:
                # Deliberately best-effort: one bad image must not stop the run.
                print(f"⚠️ {store} - Error downloading:", e)
            else:
                print(f"✅ Downloaded: {image_path}")
                count += 1
                image_count += 1  # Ensure unique naming

            # Rate limit to avoid blocking
            time.sleep(1)

        print(f"🎉 {store} - Downloaded {count} receipt images")
finally:
    # Close WebDriver even if the run fails or is interrupted
    driver.quit()

print("✅ Receipt image download completed for all stores!")


✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_1.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_2.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_3.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_4.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_5.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_6.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_7.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_8.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_9.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\Walmart_receipt_10.jpg
✅ Downloaded: C:\Users\Bruce Lan\Desktop\UOT\Gith

This step filters receipt images based on their size. It checks each image in the input folder and keeps only those that are at least 500 pixels wide and 300 pixels tall. Valid images are written to the filtered_receipts folder; smaller or unreadable images are simply not written there (the original downloads are not deleted).

In [5]:
import os
import cv2

# *******Change to your own path************
input_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data"
output_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\filtered_receipts"
os.makedirs(output_folder, exist_ok=True)

# Minimum image dimensions (in pixels) for an image to be kept
min_width = 500
min_height = 300

for filename in os.listdir(input_folder):
    img_path = os.path.join(input_folder, filename)

    # The output folder is created inside the input folder, so the listing
    # also contains directories; only regular files can be images.
    if not os.path.isfile(img_path):
        continue

    try:
        img = cv2.imread(img_path)
        if img is None:
            print(f"Skipping {filename} (Unreadable)")
            continue

        # shape is (height, width[, channels]); only the first two are needed
        height, width = img.shape[:2]
        if width >= min_width and height >= min_height:
            output_path = os.path.join(output_folder, filename)
            # cv2.imwrite signals failure through its return value, not an
            # exception, so check it before reporting the file as kept.
            if not cv2.imwrite(output_path, img):
                raise OSError(f"could not write {output_path}")
            print(f"✅ Kept: {filename} (Size: {width}x{height})")
        else:
            print(f"❌ Discarded: {filename} (Size: {width}x{height})")

    except Exception as e:
        print(f"Error processing {filename}: {e}")

print(f"✅ Filtering complete! Clear receipts saved to '{output_folder}'")


Skipping 7-Eleven_receipt_1891.jpg (Unreadable)
✅ Kept: 7-Eleven_receipt_1892.jpg (Size: 3024x4032)
✅ Kept: 7-Eleven_receipt_1893.jpg (Size: 1536x2304)
✅ Kept: 7-Eleven_receipt_1894.jpg (Size: 640x853)
✅ Kept: 7-Eleven_receipt_1895.jpg (Size: 600x315)
✅ Kept: 7-Eleven_receipt_1896.jpg (Size: 628x314)
✅ Kept: 7-Eleven_receipt_1897.jpg (Size: 605x1064)
✅ Kept: 7-Eleven_receipt_1898.jpg (Size: 2448x3264)
✅ Kept: 7-Eleven_receipt_1899.jpg (Size: 768x1024)
Skipping 7-Eleven_receipt_1900.jpg (Unreadable)
✅ Kept: 7-Eleven_receipt_1901.jpg (Size: 1000x1000)
✅ Kept: 7-Eleven_receipt_1902.jpg (Size: 1200x675)
❌ Discarded: 7-Eleven_receipt_1903.jpg (Size: 300x400)
✅ Kept: 7-Eleven_receipt_1904.jpg (Size: 1100x780)
❌ Discarded: 7-Eleven_receipt_1905.jpg (Size: 472x1024)
✅ Kept: 7-Eleven_receipt_1906.jpg (Size: 676x1228)
✅ Kept: 7-Eleven_receipt_1907.jpg (Size: 1739x2000)
✅ Kept: 7-Eleven_receipt_1908.jpg (Size: 590x391)
✅ Kept: 7-Eleven_receipt_1909.jpg (Size: 800x800)
Skipping 7-Eleven_receipt_19

This step filters valid receipts by checking for barcodes, keywords, and common receipt line patterns. If an image contains a barcode, it is directly classified as a receipt. Otherwise, OCR is used to extract text, and the image is kept only if it contains both at least one keyword (e.g., "Total", "$", "Tax") and at least one line pattern such as "----" or "====".

In [6]:
import easyocr
import os
import cv2
import shutil
from pyzbar.pyzbar import decode

# *******Change to your own path************
# Source images to classify, and the destination for the ones accepted as receipts
input_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\filtered_receipts"
output_folder = r"C:\Users\Bruce Lan\Desktop\UOT\Github\MIE1517-Project\data\valid_receipts"
os.makedirs(output_folder, exist_ok=True)

# English OCR reader with GPU acceleration requested
reader = easyocr.Reader(['en'], gpu=True)

# Keywords and patterns for receipt detection
# Both lists are matched as plain, case-sensitive substrings of the OCR text,
# so every entry must be literal text (no regular-expression syntax).
keywords = ["Total","TOTAL", "Tax", "Thank you", "$", "Subtotal","SUBTOTAL", "Balance", "Change","CHANGE",
            "Cash", "Card","CARD","Amount", "Visa", "Store", "Items", "Receipt"]
# "....." (five dots) matches any run of five or more dots, i.e. dotted leader lines
line_patterns = ["----","--", "==", "——", "____","*", "**", "* * *","XXX", "XXXXXXXX","- -", "= =", "~ ~ ~", "— — — —", "....."]

def contains_barcode(image):
    """Return True when at least one barcode is decoded from the image."""
    detections = decode(image)
    return bool(detections)

# Classify every file in the input folder and copy the accepted ones
for filename in os.listdir(input_folder):
    img_path = os.path.join(input_folder, filename)
    destination = os.path.join(output_folder, filename)

    try:
        img = cv2.imread(img_path)
        if img is None:
            print(f"❌ Cannot read: {filename}")
            continue

        # A decodable barcode is accepted as sufficient evidence on its own
        if contains_barcode(img):
            shutil.copy(img_path, destination)
            print(f"✅ Receipt detected (Barcode found): {filename}")
            continue

        # No barcode: run OCR and join the recognised fragments into one string
        extracted_text = " ".join(reader.readtext(img_path, detail=0))

        # Collect every criterion the text fails; both must pass to accept
        missing = []
        if not any(word in extracted_text for word in keywords):
            missing.append("No keywords found")
        if not any(pattern in extracted_text for pattern in line_patterns):
            missing.append("No line patterns found")

        if missing:
            print(f"❌ Not a receipt: {filename} ({', '.join(missing)})")
            continue

        shutil.copy(img_path, destination)
        print(f"✅ Receipt detected: {filename}")

    except Exception as e:
        print(f"⚠️ Error processing {filename}: {e}")


✅ Receipt detected: 7-Eleven_receipt_1892.jpg
❌ Not a receipt: 7-Eleven_receipt_1893.jpg (No line patterns found)
✅ Receipt detected: 7-Eleven_receipt_1894.jpg
❌ Not a receipt: 7-Eleven_receipt_1895.jpg (No line patterns found)
❌ Not a receipt: 7-Eleven_receipt_1896.jpg (No keywords found, No line patterns found)
✅ Receipt detected: 7-Eleven_receipt_1897.jpg
✅ Receipt detected (Barcode found): 7-Eleven_receipt_1898.jpg
✅ Receipt detected: 7-Eleven_receipt_1899.jpg
✅ Receipt detected: 7-Eleven_receipt_1901.jpg
✅ Receipt detected: 7-Eleven_receipt_1902.jpg
✅ Receipt detected: 7-Eleven_receipt_1904.jpg
✅ Receipt detected: 7-Eleven_receipt_1906.jpg
❌ Not a receipt: 7-Eleven_receipt_1907.jpg (No line patterns found)
❌ Not a receipt: 7-Eleven_receipt_1908.jpg (No keywords found, No line patterns found)
❌ Not a receipt: 7-Eleven_receipt_1909.jpg (No line patterns found)
❌ Not a receipt: 7-Eleven_receipt_1911.jpg (No line patterns found)
✅ Receipt detected: 7-Eleven_receipt_1913.jpg
❌ Not a re