<a href="https://colab.research.google.com/github/wamiqulislam/Car_Price_Predictor/blob/main/PakWheels_webscraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin
import pandas as pd

#=======================Make Changes Here=======================================
# Maximum number of ads to scrape.
no_of_ads = 1150

# URL of the search results page that lists all the ads.
BASE_URL = "https://www.pakwheels.com/used-cars/search/-/mk_suzuki/md_mehran/ct_islamabad/ca_jinnah-garden/"
# Car name. It only labels the output file and the final summary message, so
# keep it in sync with the model selected in BASE_URL (md_mehran).
car = "Mehran"

# Path of the CSV file the scraped data is saved to.
save_path = f"{car}_ads_data.csv"

#===============================================================================

# Site root used to turn relative ad links into absolute URLs.
DOMAIN = "https://www.pakwheels.com"
# Request headers carrying a browser-like User-Agent.
HEADERS = {"User-Agent": "Mozilla/5.0"}


def scrape_ad(url):
    """Scrape a single ad page and return structured car data.

    Args:
        url: Absolute URL of the ad page. The JSON-LD ``Product`` block whose
            ``offers.url`` equals this value is the one that gets used.

    Returns:
        A dict with the fields read from the JSON-LD block, any key/value
        pairs found in the ``scroll_car_detail`` list, a ``Features`` list
        and, when the section exists, ``Seller Comments``. Returns ``None``
        if the page cannot be fetched or has no matching JSON-LD block.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(r.text, "html.parser")

    # Load data from the JSON-LD block that describes this exact ad.
    data = None
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # Empty tag (or one without a single text child): nothing to parse.
            continue
        try:
            parsed = json.loads(raw.strip())
        except ValueError:
            # Malformed JSON in one block must not abort the whole ad.
            continue
        if not isinstance(parsed, dict):
            continue

        ld_type = parsed.get("@type")
        if not (isinstance(ld_type, (str, list)) and "Product" in ld_type):
            continue

        offers = parsed.get("offers", {})
        if isinstance(offers, dict) and offers.get("url") == url:  # match the correct ad
            data = parsed
            break

    if not data:
        print(f"⚠️ JSON-LD not found on {url}")
        return None

    # The brand may be a nested object or a plain string.
    brand = data.get("brand")
    if isinstance(brand, dict):
        brand_name = brand.get("name", "")
    elif isinstance(brand, str):
        brand_name = brand
    else:
        brand_name = ""

    # The city is taken from the text after "for sale in" in the ad name.
    name = data.get("name", "")
    city = name.split("for sale in")[-1].strip() if "for sale in" in name else ""

    # vehicleEngine is only usable when it is a nested object.
    engine = data.get("vehicleEngine")
    engine_capacity = engine.get("engineDisplacement", "") if isinstance(engine, dict) else ""

    # The selection loop above guarantees offers is a dict for the chosen block.
    offers = data["offers"]

    # Save all the data into car_data.
    car_data = {
        "Name": name,
        "City": city,
        "Brand": brand_name,
        "Model": data.get("model", ""),
        "Year": data.get("modelDate", ""),
        "Mileage": data.get("mileageFromOdometer", ""),
        "Fuel": data.get("fuelType", ""),
        "Transmission": data.get("vehicleTransmission", ""),
        "Engine Capacity": engine_capacity,
        "Color": data.get("color", ""),
        "Body Type": data.get("bodyType", ""),
        "Price": f"{offers.get('price', '')} {offers.get('priceCurrency', '')}",
        "Ad URL": offers.get("url", ""),
        "Description": data.get("description", ""),
    }

    # Save any extra specs: the <li> items alternate key, value, key, value...
    specs_ul = soup.find("ul", id="scroll_car_detail")
    if specs_ul is not None:
        spec_list = specs_ul.find_all("li")
        # zip() drops a trailing unpaired item instead of raising IndexError.
        for key_li, val_li in zip(spec_list[::2], spec_list[1::2]):
            car_data[key_li.get_text(strip=True)] = val_li.get_text(strip=True)

    # Save the features as a list (empty when the section is missing).
    features = soup.find("ul", class_="car-feature-list")
    if features is not None:
        car_data["Features"] = [li.get_text(strip=True) for li in features.find_all("li")]
    else:
        car_data["Features"] = []

    # Save the seller comments: the first <div> following the section heading.
    comments_heading = soup.find("h2", id="scroll_seller_comments")
    if comments_heading is not None:
        comments_div = comments_heading.find_next("div")
        if comments_div is not None:
            car_data["Seller Comments"] = comments_div.get_text(separator=" ", strip=True)

    return car_data


def scrape_all_ads(limit=100):
    """Walk the search result pages and scrape ads until ``limit`` is reached.

    Each search page is fetched from ``BASE_URL``, every ad link on it is
    scraped with ``scrape_ad`` and the accumulated results are written to
    ``save_path`` after every page, so partial progress survives a failure.

    Args:
        limit: Maximum number of ads to collect.

    Returns:
        A list with one dict per successfully scraped ad. The crawl stops
        early when a search page cannot be fetched, has no ads, or yields
        no newly scraped ad.
    """
    ads = []
    visited = set()
    page = 1

    while len(ads) < limit:
        # Add the page number without clobbering a query string that
        # BASE_URL may already carry.
        separator = "&" if "?" in BASE_URL else "?"
        url = f"{BASE_URL}{separator}page={page}"
        print(f"Scraping search page: {url}")
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            r = requests.get(url, headers=HEADERS, timeout=10)
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"❌ Failed to fetch {url}: {e}")
            break
        soup = BeautifulSoup(r.text, "html.parser")

        # Get all the ad links on the page.
        ad_links = [urljoin(DOMAIN, a["href"]) for a in soup.select("a.car-name") if a.get("href")]
        print(f"Found {len(ad_links)} ads on page {page}")
        if not ad_links or "No ads found" in soup.get_text():
            print("No more ads found. Stopping.")
            break

        new_ads_count = 0
        # Scrape the data from each ad page.
        for ad_url in ad_links:
            if ad_url in visited:
                continue
            visited.add(ad_url)

            # Log only the ads that are actually fetched.
            print(f"Scraping ad page: {ad_url}")
            car_data = scrape_ad(ad_url)
            if car_data:
                ads.append(car_data)
                new_ads_count += 1

            if len(ads) >= limit:
                break

        page += 1

        # Save everything collected so far into the CSV file.
        df = pd.DataFrame(ads)
        df.to_csv(save_path, index=False, encoding="utf-8-sig")

        # If this page added nothing new, stop.
        if new_ads_count == 0:
            print("No new ads scraped on this page. Stopping.")
            break
    return ads


# Run the crawl with the configured ad limit, then report how many ads were
# collected for the configured car name.
all_ads = scrape_all_ads(no_of_ads)
print(f"Scraped {len(all_ads)} {car} ads")


Scraping search page: https://www.pakwheels.com/used-cars/search/-/mk_suzuki/md_mehran/ct_islamabad/ca_jinnah-garden/?page=1
Found 25 ads on page 1
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2018-for-sale-in-islamabad-10437685
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2018-for-sale-in-islamabad-10493932
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2018-for-sale-in-islamabad-10490350
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2011-for-sale-in-islamabad-10490159
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2008-for-sale-in-islamabad-10481922
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2019-for-sale-in-islamabad-10477033
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2017-for-sale-in-islamabad-10468935
Scraping ad page: https://www.pakwheels.com/used-cars/suzuki-mehran-2016-for-sale-in-islamabad-10433465
Scraping ad page: ht