In [None]:
import os
import time
import random
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# HTTP headers attached to every request this notebook sends.
# NOTE(review): this is a browser-like User-Agent; Wikimedia's User-Agent policy
# asks automated clients to identify themselves (tool name + contact). Consider
# replacing it before running large crawls.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36",
}

# Maps each class label (also used as the output sub-folder name and file-name
# prefix) to the Wikimedia Commons category page it is scraped from. The URL is
# always the Commons category prefix followed by the label itself.
CATEGORIES = {
    category: f"https://commons.wikimedia.org/wiki/Category:{category}"
    for category in (
        "Toyota_Supra_(A80)",
        "Toyota_Land_Cruiser_(J40)",
        "Ford_F-150_(thirteenth_generation)",
        "Ford_Mustang_VI",
        "Renault_Twizy_Z.E.",
        "Renault_Kangoo_I",
        "Volkswagen_Type_1",
        "Volkswagen_Passat_B6",
    )
}

def ensure_dir(path):
    """Create directory ``path`` together with any missing parent directories.

    Calling it on a directory that already exists is a no-op.
    """
    os.makedirs(name=path, exist_ok=True)

def get_soup(url):
    """Fetch ``url`` and return its body parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``HEADERS`` and a 20 second
    timeout. ``raise_for_status()`` is called on the response before parsing,
    so an HTTP error status surfaces to the caller as an exception.
    """
    response = requests.get(url, timeout=20, headers=HEADERS)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "html.parser")

def get_file_pages_from_category(category_url, max_files=None):
    """Collect the ``File:`` page URLs listed in a Wikimedia Commons category.

    Starting at ``category_url``, every listing page is fetched with
    ``get_soup`` and the links inside the media gallery
    (``div#mw-category-media``) that point at ``/wiki/File:...`` are gathered
    as absolute URLs, in page order and without duplicates.

    Args:
        category_url: URL of the first category listing page.
        max_files: Optional cap. When given, crawling stops as soon as that
            many file pages have been collected, so later listing pages are
            not requested. ``None`` (the default) walks the whole category.

    Returns:
        A list of absolute file-page URLs (at most ``max_files`` long when a
        cap is given).

    Raises:
        Whatever ``get_soup`` raises when a listing page cannot be fetched.
    """
    base_url = "https://commons.wikimedia.org"

    if max_files is not None and max_files <= 0:
        return []

    file_pages = []
    seen_files = set()
    visited_pages = set()
    next_url = category_url

    # The visited set protects against a listing page that links back to itself.
    while next_url and next_url not in visited_pages:
        visited_pages.add(next_url)
        soup = get_soup(next_url)

        for link in soup.select("div#mw-category-media li.gallerybox div.gallerytext a"):
            href = link.get("href")
            if not href or not href.startswith("/wiki/File:"):
                continue
            file_url = urljoin(base_url, href)
            if file_url not in seen_files:
                seen_files.add(file_url)
                file_pages.append(file_url)

        if max_files is not None and len(file_pages) >= max_files:
            return file_pages[:max_files]

        # Category listings are split into pages. A rel="next" anchor is honoured
        # when present, but MediaWiki's "next page" link in the media section
        # does not appear to carry one; it is recognised by the ``filefrom=``
        # query parameter instead (the "previous page" link uses ``fileuntil=``).
        next_link = soup.select_one("a[rel='next']")
        if next_link is None or not next_link.get("href"):
            next_link = soup.select_one("div#mw-category-media a[href*='filefrom=']")

        if next_link is not None and next_link.get("href"):
            next_url = urljoin(base_url, next_link["href"])
        else:
            next_url = None

        # Be polite between listing requests; no need to wait after the last one.
        if next_url:
            time.sleep(random.uniform(0.5, 1.5))

    return file_pages

def get_full_image_url(file_page_url):
    """Resolve a Commons ``File:`` page to the URL of the media file itself.

    The page is fetched with ``get_soup`` and candidates are tried from best
    to worst:

    1. the first ``a.internal`` anchor (the "Original file" link),
    2. the anchor inside ``div.fullImageLink`` that wraps the preview and
       points at the full-size file,
    3. the ``src`` of the preview ``<img>`` itself (may be a scaled-down
       rendering, so it is only a last resort).

    Args:
        file_page_url: URL of the file description page.

    Returns:
        An absolute URL string, or ``None`` when none of the candidates is
        present on the page.

    Raises:
        Whatever ``get_soup`` raises when the page cannot be fetched.
    """
    base_url = "https://commons.wikimedia.org"
    soup = get_soup(file_page_url)

    # Anchors whose href is the original upload; urljoin also resolves
    # protocol-relative ("//host/...") links against the https base.
    for selector in ("a.internal", "div.fullImageLink a"):
        link = soup.select_one(selector)
        if link is not None and link.get("href"):
            return urljoin(base_url, link["href"])

    preview = soup.select_one("div.fullImageLink a img")
    if preview is not None and preview.get("src"):
        return urljoin(base_url, preview["src"])

    return None

def download_image(img_url, out_dir, prefix="img"):
    """Download ``img_url`` into ``out_dir`` and report where it was saved.

    The file is named ``<prefix>_<millisecond timestamp><ext>``. ``ext`` is the
    lower-cased extension found in the URL path, so PNG/SVG/video files are no
    longer mislabelled as JPEG; ``.jpg`` is used only when the URL has no
    extension. If a file with the chosen name already exists, the timestamp is
    bumped until the name is free, so two downloads in the same millisecond
    cannot overwrite each other.

    Args:
        img_url: Absolute URL of the media file.
        out_dir: Existing directory to write into.
        prefix: File-name prefix (defaults to ``"img"``).

    Returns:
        The path of the written file, or ``None`` when the HTTP request failed
        (a ``[WARN]`` line is printed in that case).
    """
    # Local import: the notebook's import cell only brings in ``urljoin``.
    from urllib.parse import urlparse

    try:
        r = requests.get(img_url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"[WARN] Could not download {img_url}: {e}")
        return None

    # Take the extension from the URL path only (ignores any query string).
    ext = os.path.splitext(urlparse(img_url).path)[1].lower() or ".jpg"

    ts = int(time.time() * 1000)
    path = os.path.join(out_dir, f"{prefix}_{ts}{ext}")
    while os.path.exists(path):
        ts += 1
        path = os.path.join(out_dir, f"{prefix}_{ts}{ext}")

    with open(path, "wb") as f:
        f.write(r.content)

    print(f"[OK] {path}")
    return path

def scrape_commons_category(class_name, category_url, base_dir="data", max_files=None):
    """Download the media of one Commons category into ``base_dir/class_name``.

    File pages are listed with ``get_file_pages_from_category``, each one is
    resolved with ``get_full_image_url`` and fetched with ``download_image``.
    A file page that cannot be fetched, or a download that fails, is reported
    and skipped instead of aborting the whole category; only files that were
    actually written count towards ``max_files``.

    Args:
        class_name: Label used as sub-folder name and file-name prefix.
        category_url: URL of the Commons category page.
        base_dir: Root output directory (defaults to ``"data"``).
        max_files: Stop after this many successful downloads; ``None`` means
            no limit.

    Returns:
        The number of files successfully downloaded.

    Raises:
        Whatever ``get_file_pages_from_category`` raises when the category
        listing itself cannot be fetched.
    """
    # Pause before each category so consecutive categories are not hit back to back.
    time.sleep(2)
    out_dir = os.path.join(base_dir, class_name)
    ensure_dir(out_dir)

    print(f"\n=== {class_name} ===")
    file_pages = get_file_pages_from_category(category_url)
    print(f"[INFO] Found {len(file_pages)} file pages in category")

    count = 0
    for fp in file_pages:
        if max_files is not None and count >= max_files:
            break

        try:
            img_url = get_full_image_url(fp)
        except requests.RequestException as e:
            # One bad file page should not throw away the rest of the category.
            print(f"[WARN] Could not open file page {fp}: {e}")
            img_url = None

        # download_image returns None on failure, so failed downloads are not counted.
        if img_url and download_image(img_url, out_dir, prefix=class_name):
            count += 1

        # Throttle after every file-page request, successful or not.
        time.sleep(random.uniform(0.5, 1.5))

    print(f"[DONE] Downloaded {count} images for {class_name}")
    return count

if __name__ == "__main__":
    # Scrape every configured category into cars_dataset/<class_name>,
    # requesting at most 200 files per class.
    for class_name, cat_url in CATEGORIES.items():
        scrape_commons_category(
            class_name=class_name,
            category_url=cat_url,
            base_dir="cars_dataset",
            max_files=200,
        )



=== Ford_F-150_(thirteenth_generation) ===
[INFO] Found 200 file pages in category
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247058735.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247060908.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247063102.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247065139.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247067325.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247069120.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247071241.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F-150_(thirteenth_generation)_1766247072816.jpg
[OK] cars_dataset4/Ford_F-150_(thirteenth_generation)/Ford_F

In [None]:
import shutil

# Pack the scraped dataset folder into one zip archive.
# NOTE(review): these are absolute "/content/..." paths; the scraping cell
# writes to the relative folder "cars_dataset", so the two only line up when
# the working directory is /content — confirm before running elsewhere.
folder_path = "/content/cars_dataset"
zip_output = "/content/cars_dataset_zipped"  # archive name without the extension

# make_archive appends the ".zip" suffix to base_name itself.
shutil.make_archive(base_name=zip_output, format="zip", root_dir=folder_path)

print("Completed: ", zip_output + ".zip")
