In [3]:
import os, re, csv, time, json, sys, argparse, random
from pathlib import Path
from urllib.parse import urljoin, urlparse
import urllib.robotparser as robotparser
import requests
from bs4 import BeautifulSoup

In [4]:
BASE = "https://churchillgowns.com"
INDEX_URL = f"{BASE}/pages/select-your-university"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uni-scraper/1.0; +https://example.org/)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

DEGREE_KEYWORDS = {
    # Map common keywords to your desired folder names
    "bsc": "BSc",
    "ba ": "BA",
    "bachelor": "Bachelors",
    "msc": "MSc",
    "ms ": "MSc",
    "ma ": "MA",
    "master": "Masters",
    "doctoral": "Doctoral",
    "phd": "Doctoral",
}

COLLECTION_FILTERS = {
    # Which collections to follow from each uni page
    # Keys are simple contains-matches on the collection title
    "hire": "Graduation Set (Hire)",
    "buy": "Full Graduation Set"
}

def polite_get(session, url, robots, delay=(0.5, 1.2), retries=3, backoff=1.6):
    if not robots.can_fetch(HEADERS["User-Agent"], url):
        raise PermissionError(f"robots.txt disallows fetch: {url}")
    for i in range(retries):
        try:
            resp = session.get(url, headers=HEADERS, timeout=20)
            if resp.status_code in (429, 500, 502, 503, 504):
                raise requests.HTTPError(f"retryable status {resp.status_code}")
            resp.raise_for_status()
            time.sleep(random.uniform(*delay))
            return resp
        except Exception as e:
            if i == retries - 1:
                raise
            time.sleep((backoff ** i) + random.random())

def sanitize(name):
    name = re.sub(r"\s+", " ", name.strip())
    name = re.sub(r"[\\/:*?\"<>|]", "_", name)  # Win-safe
    return name

def infer_degree_folder(text):
    t = text.lower() + " "
    for k, v in DEGREE_KEYWORDS.items():
        if k in t:
            return v
    return "Unknown"

def parse_university_links(html):
    soup = BeautifulSoup(html, "html.parser")
    links = []
    # Links are rendered as headings with anchor; grab all anchors under the “Select Your University” listing
    for a in soup.select("a[href*='/pages/']"):
        href = a.get("href") or ""
        text = a.get_text(strip=True)
        if "/pages/" in href and text and "university" in text.lower():
            links.append((text, urljoin(BASE, href)))
    # De-dup and preserve order
    seen, out = set(), []
    for name, href in links:
        if href not in seen:
            seen.add(href)
            out.append((name, href))
    return out

def find_degree_collections(html):
    soup = BeautifulSoup(html, "html.parser")
    items = []
    # university “Select Your Package” tiles link to collections
    for a in soup.select("a[href*='/collections/']"):
        title = a.get_text(" ", strip=True)
        href = urljoin(BASE, a.get("href"))
        if title:
            items.append((title, href))
    return items

def extract_product_links(collection_html):
    soup = BeautifulSoup(collection_html, "html.parser")
    prods = []
    for a in soup.select("a[href*='/products/']"):
        title = a.get_text(" ", strip=True)
        href = urljoin(BASE, a.get("href"))
        if title:
            prods.append((title, href))
    # de-dup
    seen, out = set(), []
    for t, h in prods:
        if h not in seen:
            seen.add(h)
            out.append((t, h))
    return out

def extract_image_urls(product_html):
    soup = BeautifulSoup(product_html, "html.parser")
    urls = set()
    # 1) typical Shopify product image <img> or <meta property="og:image">
    for tag in soup.select("img[src], source[srcset], meta[property='og:image']"):
        if tag.name == "meta":
            src = tag.get("content")
        else:
            src = tag.get("src") or tag.get("srcset")
            if src and " " in src and "," in src:
                # srcset: take first URL
                src = src.split(",")[0].strip().split(" ")[0]
        if not src:
            continue
        full = src if src.startswith("http") else urljoin(BASE, src)
        # Filter for Shopify CDN images or site-hosted images likely relevant
        if any(k in full for k in ("cdn.shopify.com", "churchillgowns.com")):
            urls.add(full)
    return sorted(urls)

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="output", help="Output folder")
    ap.add_argument("--include", nargs="+", default=["hire", "buy"],
                    help="Which collection types to include: hire, buy")
    ap.add_argument("--max-unis", type=int, default=9999, help="Limit universities crawled")
    ap.add_argument("--dry-run", action="store_true", help="List actions without downloading")
    args = ap.parse_args()

    outdir = Path(args.out)
    outdir.mkdir(parents=True, exist_ok=True)

    # robots.txt
    robots = robotparser.RobotFileParser()
    robots.set_url(urljoin(BASE, "/robots.txt"))
    try:
        robots.read()
    except Exception:
        print("Warning: could not read robots.txt; proceed with caution.", file=sys.stderr)

    session = requests.Session()

    # 1) index -> list universities
    idx = polite_get(session, INDEX_URL, robots)
    unis = parse_university_links(idx.text)
    if not unis:
        print("No universities parsed — site structure might have changed.", file=sys.stderr)
        return

    # Manifest
    manifest_path = outdir / "manifest.csv"
    mf = open(manifest_path, "w", newline="", encoding="utf-8")
    writer = csv.writer(mf)
    writer.writerow(["university", "degree_folder", "product_title", "image_url", "saved_path"])

    uni_count = 0
    for uni_name, uni_url in unis:
        uni_count += 1
        if uni_count > args.max_unis:
            break
        u_name = sanitize(uni_name)
        print(f"\n== {uni_name} == {uni_url}")

        try:
            uresp = polite_get(session, uni_url, robots)
        except PermissionError as e:
            print(f"Skip (robots): {e}")
            continue
        except Exception as e:
            print(f"Skip (fetch error): {e}")
            continue

        collections = find_degree_collections(uresp.text)
        # Filter by include types (hire/buy) using title contains heuristics
        filtered = []
        for title, href in collections:
            t = title.lower()
            take = False
            if "hire" in args.include and "hire" in t:
                take = True
            if "buy" in args.include and ("buy" in t or "purchase" in t):
                take = True
            if take:
                filtered.append((title, href))

        for coll_title, coll_href in filtered:
            try:
                cresp = polite_get(session, coll_href, robots)
            except Exception as e:
                print(f"  - Skip collection ({coll_title}): {e}")
                continue
            products = extract_product_links(cresp.text)
            if not products:
                # some collections may render products via JS; still try the collection page images
                products = [(coll_title, coll_href)]

            for prod_title, prod_href in products:
                try:
                    presp = polite_get(session, prod_href, robots)
                except Exception as e:
                    print(f"    - Skip product ({prod_title}): {e}")
                    continue

                imgs = extract_image_urls(presp.text)
                if not imgs:
                    continue

                degree_folder = infer_degree_folder(prod_title + " " + coll_title)
                # Normalize specific Masters/Bachelors into MA/MSc/BA/BSc if keywords found
                # (We keep Masters/Bachelors if subtype not present)
                pf_uni = outdir / u_name / degree_folder
                pf_uni.mkdir(parents=True, exist_ok=True)

                for i, url in enumerate(imgs, 1):
                    # Make filename from product + index
                    ext = os.path.splitext(urlparse(url).path)[1] or ".jpg"
                    fname = sanitize(f"{prod_title}-{i}")[:150] + ext
                    dest = pf_uni / fname

                    if args.dry_run:
                        print(f"DRY-RUN save: {dest} <- {url}")
                        writer.writerow([uni_name, degree_folder, prod_title, url, str(dest)])
                        continue

                    if dest.exists():
                        writer.writerow([uni_name, degree_folder, prod_title, url, str(dest)])
                        continue

                    try:
                        rimg = polite_get(session, url, robots, delay=(0.3, 0.8))
                        with open(dest, "wb") as f:
                            f.write(rimg.content)
                        print(f"Saved: {dest.name}")
                        writer.writerow([uni_name, degree_folder, prod_title, url, str(dest)])
                    except Exception as e:
                        print(f"      - Image failed: {url} :: {e}")

    mf.close()
    print(f"\nDone. Manifest: {manifest_path}")


In [5]:
main()

usage: ipykernel_launcher.py [-h] [--out OUT]
                             [--include INCLUDE [INCLUDE ...]]
                             [--max-unis MAX_UNIS] [--dry-run]
ipykernel_launcher.py: error: unrecognized arguments: --f="/Users/Haichen Shi/Library/Jupyter/runtime/kernel-v3445293826b2734c48148186687630e4636daeafd.json"


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
