In [157]:
from bs4 import BeautifulSoup #we use beatuiful soup to extract data
import time, re, requests, pandas as pd
import urllib.request
from urllib.parse import urljoin
from pprint import pprint
import numpy as np

In [158]:
BASE = "https://www.kijiji.ca"
SEARCH_URL = "https://www.kijiji.ca/b-cars-vehicles/canada/page-{page}/c27l0?view=list"
HEADERS = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-CA,en;q=0.9"}
ID_RE = re.compile(r"/v-[^/]+/(\d+)")

SESSION = requests.Session()
SESSION.headers.update(HEADERS)
DELAY = 0  # set to 1–2 if you start getting low counts or repeats

In [159]:
start_page = 1
end_page = 50
for page in range(start_page, end_page + 1):
    url = SEARCH_URL.format(page=page)                 # e.g., .../page-1/ , .../page-2/
    resp = requests.get(url, headers=HEADERS, timeout=50)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

In [160]:
raw_links = []
for ad in soup.select('li[data-testid^="listing-card-list-item"] a[href]'):
    href = ad.get("href")
    if not href:
        continue
    if "/v-" in href:  # new site still uses /v- inside href
        raw_links.append(urljoin(BASE, href.split("?")[0]))

In [161]:
# 4) De-duplicate while keeping order (article: avoid repeated ads)
#id_re = re.compile(r"/v-[^/]+/(\d+)")
seen = set()
ad_links = []
for u in raw_links:
    if u not in seen:
        ad_links.append(u)
        seen.add(u)


In [162]:
len(ad_links), ad_links[:5]

(46,
 ['https://www.kijiji.ca/v-heavy-equipment-machinery/ottawa/doosan-300lx/1726752829',
  'https://www.kijiji.ca/v-cars-trucks/calgary/2007-jeep-wrangler-x-no-reported-accidents-4wd/1725559543',
  'https://www.kijiji.ca/v-cars-trucks/calgary/2023-toyota-tundra-hybrid-limited/1724548010',
  'https://www.kijiji.ca/v-heavy-equipment-machinery/calgary/2019-caterpillar-950gc/1724481185',
  'https://www.kijiji.ca/v-tires-rims/mississauga-peel-region/sale-235-45-18-new-winter-tires-free-installed-no-tax-cash/1725766137'])

In [163]:
def get_soup(url):
    r = requests.get(url, headers=HEADERS, timeout=50)
    return BeautifulSoup(r.text, "lxml")

In [164]:
def first_text(soup, sels):
    for s in sels:
        el = soup.select_one(s)
        if el and el.get_text(strip=True):
            return el.get_text(strip=True)
    return None

In [165]:
def price_num(txt):
    if not txt:
        return None
    # remove currency symbols and text
    clean = (
        txt.replace("\xa0", " ")
           .replace("$", "")
           .replace(",", "")
           .replace("CAD", "")
           .strip()
    )
    m = re.search(r"\d+(\.\d+)?", clean)
    return float(m.group()) if m else None

In [166]:
num_re = re.compile(r"[\d,\.]+")
rows = []
for i, url in enumerate(ad_links, 1):
    try:
        s = get_soup(url)
        title = first_text(s, ["h1", '[data-testid="item-title"]', 'h1[itemprop="name"]'])
        # try visible price then meta
        ptxt = first_text(s,['p[data-testid="vip-price"]',   # ← matches what you just showed
        '[data-testid="price"]',
        '[itemprop="price"]','span.price'])
        attrs = {}
        for grp in s.select('div[data-testid="vip-attributes-group"]'):
            for blk in grp.select(':scope > div'):
                ps = blk.find_all('p')
                if len(ps) >= 2:
                    k = ps[0].get_text(strip=True).rstrip(':')
                    v = " ".join(p.get_text(" ", strip=True) for p in ps[1:])
                    if k and v:
                        attrs[k] = v

        rows.append({
            "title": title,
            "price_text": ptxt,
            "price_num": price_num(ptxt),
            "url": url,
            **attrs
        })
    except Exception as e:
        rows.append({"title": None, "price_text": None, "price_num": None, "url": url, "error": str(e)})

In [167]:
df = pd.DataFrame(rows)
print(len(df), "ads scraped")
df.head(20)

46 ads scraped


Unnamed: 0,title,price_text,price_num,url,Condition,Kilometres,Transmission,Drivetrain,Fuel
0,doosan 300lx,"$230,000",230000.0,https://www.kijiji.ca/v-heavy-equipment-machin...,,,,,
1,2007 Jeep Wrangler X | No Reported Accidents |...,"$11,924",11924.0,https://www.kijiji.ca/v-cars-trucks/calgary/20...,Used,274890.0,Semi-Automatic,4 x 4,
2,2023 Toyota Tundra Hybrid Limited,"$70,727",70727.0,https://www.kijiji.ca/v-cars-trucks/calgary/20...,Used,61882.0,Automatic,4 x 4,Hybrid-Electric
3,2019 Caterpillar 950GC,"$215,000",215000.0,https://www.kijiji.ca/v-heavy-equipment-machin...,,,,,
4,SALE! 235/45/18 NEW WINTER TIRES FREE INSTALLE...,$120,120.0,https://www.kijiji.ca/v-tires-rims/mississauga...,,,,,
5,2024 Jeep Grand Cherokee L Summit,"$68,306",68306.0,https://www.kijiji.ca/v-cars-trucks/grande-pra...,Used,26049.0,Automatic,4 x 4,Gas
6,2020 Toyota RAV4 Hybrid XLE | Sunroof | Climat...,"$33,988",33988.0,https://www.kijiji.ca/v-cars-trucks/mississaug...,Used,63000.0,Automatic,All-wheel drive (AWD),Hybrid-Electric
7,2026 Kia Seltos X-Line AWD | Push Start | HUD ...,"$36,988",36988.0,https://www.kijiji.ca/v-cars-trucks/mississaug...,Used,1805.0,Automatic,All-wheel drive (AWD),Gas
8,2025 Kia Seltos LX AWD | Blindspot Alert | Hea...,"$27,188",27188.0,https://www.kijiji.ca/v-cars-trucks/mississaug...,Used,6752.0,Automatic,All-wheel drive (AWD),Gas
9,2019 Toyota Prius Prime Navigation | Climate C...,"$22,688",22688.0,https://www.kijiji.ca/v-cars-trucks/mississaug...,Used,78653.0,Automatic,Front-wheel drive (FWD),Gas
