In [23]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin

no_of_ads = 1150
BASE_URL = "https://www.pakwheels.com/used-cars/toyota-corolla-islamabad/1571"
brand = "Toyota"
car = "Corolla"

DOMAIN = "https://www.pakwheels.com"
HEADERS = {"User-Agent": "Mozilla/5.0"}


def scrape_ad(url):
    """Scrape a single ad page and return structured car data."""
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print(f"❌ Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(r.text, "html.parser")

    # -----------------------------
    # JSON-LD block
    # -----------------------------
    scripts = soup.find_all("script", type="application/ld+json")
    data = None
    for script in scripts:
        try:
            parsed = json.loads(script.string.strip())
            if isinstance(parsed, dict) and (parsed.get("@type") == "Product" or "Product" in parsed.get("@type", [])):
                offers = parsed.get("offers", {})
                if isinstance(offers, dict) and offers.get("url") == url:  # match the correct ad
                    data = parsed
                    break
        except Exception:
            continue

    if not data:
        print(f"⚠️ JSON-LD not found on {url}")
        return None

    # Handle brand safely
    brand = data.get("brand")
    if isinstance(brand, dict):
        brand_name = brand.get("name", "")
    elif isinstance(brand, str):
        brand_name = brand
    else:
        brand_name = ""

    car_data = {
        "Name": data.get("name", ""),
        "City": data.get("name", "").split("for sale in")[-1].strip() if "for sale in" in data.get("name", "") else "",
        "Brand": brand_name,
        "Model": data.get("model", ""),
        "Year": data.get("modelDate", ""),
        "Mileage": data.get("mileageFromOdometer", ""),
        "Fuel": data.get("fuelType", ""),
        "Transmission": data.get("vehicleTransmission", ""),
        "Engine Capacity": data.get("vehicleEngine", {}).get("engineDisplacement", ""),
        "Color": data.get("color", ""),
        "Body Type": data.get("bodyType", ""),
        "Price": f"{data['offers'].get('price', '')} {data['offers'].get('priceCurrency', '')}" if "offers" in data else "",
        "Ad URL": data.get("offers", {}).get("url", ""),
        "Description": data.get("description", "")
    }

    # -----------------------------
    # Extra specs
    # -----------------------------
    specs_ul = soup.find("ul", id="scroll_car_detail")
    if specs_ul:
        spec_list = specs_ul.find_all("li")
        for i in range(0, len(spec_list), 2):
            key = spec_list[i].get_text(strip=True)
            val = spec_list[i + 1].get_text(strip=True)
            car_data[key] = val

    # -----------------------------
    # Features
    # -----------------------------
    features = soup.find("ul", class_="car-feature-list")
    if features:
        car_data["Features"] = [li.get_text(strip=True) for li in features.find_all("li")]
    else:
        car_data["Features"] = []

    # -----------------------------
    # Seller comments
    # -----------------------------
    comments_heading = soup.find("h2", id="scroll_seller_comments")
    if comments_heading:
        comments_div = comments_heading.find_next("div")
        car_data["Seller Comments"] = comments_div.get_text(separator=" ", strip=True)

    return car_data


def scrape_all_ads(limit=100):
    ads = []
    visited = set()
    page = 1

    while len(ads) < limit:
        save_path = f"data/{car}_ads_data.csv"

        url = f"{BASE_URL}?page={page}"
        print(f"Scraping search page: {url}")
        r = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(r.text, "html.parser")

        ad_links = [urljoin(DOMAIN, a["href"]) for a in soup.select("a.car-name") if a.get("href")]
        if not ad_links:
            break

        for ad_url in ad_links:
            print(f"Scraping ad page: {ad_url}")
            if ad_url in visited:
                continue
            visited.add(ad_url)

            car_data = scrape_ad(ad_url)
            if car_data:
                ads.append(car_data)

            if len(ads) >= limit:
                break

        page += 1
        df = pd.DataFrame(ads)
        df.to_csv(save_path, index=False, encoding="utf-8-sig")

    return ads


# Example usage
all_ads = scrape_all_ads(limit=no_of_ads)
print(f"Scraped {len(all_ads)} {car} ads")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-2019-for-sale-in-islamabad-10358934
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-2014-for-sale-in-islamabad-10358555
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-2022-for-sale-in-islamabad-10357684
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-2014-for-sale-in-islamabad-10357651
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-1988-for-sale-in-islamabad-9989233
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-2004-for-sale-in-islamabad-9989153
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-2010-for-sale-in-islamabad-9989141
Scraping ad page: https://www.pakwheels.com/used-cars/toyota-corolla-2010-for-sale-in-islamabad-9988846
Scraping search page: https://www.pakwheels.com/used-cars/toyota-corolla-islamabad/1571?page=1003
S

KeyboardInterrupt: 