In [90]:
# -- Imports
import json
import pathlib
from typing import Any, Optional
from urllib.parse import urljoin

import httpx
import polars as pl
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from httpx._models import Response

In [91]:
# -- URLs for Jiji Kenya: the landing page plus one listing page per category.
# Each constant is passed to fetch_page() in the cells further down.
JIJI_HOMEPAGE = "https://jiji.co.ke/"
JIJI_VEHICLES = "https://jiji.co.ke/vehicles"
JIJI_PROPERTY = "https://jiji.co.ke/real-estate"
JIJI_ELECTRONICS = "https://jiji.co.ke/electronics"
JIJI_HOME = "https://jiji.co.ke/home-garden" # Category: Home, Furniture & Appliances

In [92]:
# -- Fetch page content using httpx; returns the HTML content as a string
def fetch_page(url: str, timeout: float = 3) -> str:
    """Download a page with a GET request and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Value forwarded to ``httpx.get`` as its ``timeout``.
            Defaults to 3, the value that used to be hard-coded here.

    Returns:
        The response body as a string (``response.text``).

    Raises:
        httpx.HTTPStatusError: Propagated from ``raise_for_status()`` when the
            server answers with an error status.
    """
    # Annotate with the public httpx.Response rather than the private
    # httpx._models path, which is not part of the library's public API.
    response: httpx.Response = httpx.get(url, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text

In [93]:
# -- Homepage: fetch the landing page HTML once and keep it in `homepage`
homepage: str = fetch_page(JIJI_HOMEPAGE)
print(homepage[:1000])  # Preview only the first 1000 characters to keep the output short

<!DOCTYPE html><html  dir="ltr" lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<title>Free classifieds in Kenya ▷ Post free classified ads on Jiji.co.ke</title>
<link nonce="I7BsANDKmFasxntEjESptHmx" rel="preconnect" href="https://assets.jijistatic.com">
<link nonce="I7BsANDKmFasxntEjESptHmx" rel="stylesheet" href="https://assets.jijistatic.com/nuxt/entry.DQ4eE3Mv.css" crossorigin>
<link nonce="I7BsANDKmFasxntEjESptHmx" rel="stylesheet" href="https://assets.jijistatic.com/nuxt/shared.O78TMMWs.css" crossorigin>
<link nonce="I7BsANDKmFasxntEjESptHmx" rel="stylesheet" href="https://assets.jijistatic.com/nuxt/default-custom-header.DL93suWI.css" crossorigin>
<meta name="description" content="Free classifieds in Kenya ➔ ➔ ➔ Sell to Real People in Kenya on Jiji.co.ke ➔ Post Ad in Kenya ➔ local classified ads, more than online marketplace">
<link nonce="I7BsANDKmFasxntEjESptHmx" rel="icon" type="image/x-icon" href="/

In [94]:
# -- Custom function to parse ads from the HTML content
# Check README for expected structure from inspection using browser dev tools
def parse_ads(html: str, base_url: str = "https://jiji.co.ke") -> list[Any]:
    """Extract ad listings from the static HTML of a Jiji page.

    Every ``div.b-list-advert__gallery__item`` card becomes one dict with the
    keys ``title``, ``price``, ``url`` and ``description``. A field whose
    element (or ``href`` attribute) is missing from the card is ``None``.

    Location and image are deliberately not extracted. According to the
    earlier notes in this notebook, the location text is rendered through a
    CSS ``::after`` pseudo-element, so it is not present in the fetched HTML
    and would need a browser-driven scraper such as Selenium.

    Args:
        html: Raw HTML of a listing page.
        base_url: Site root used to resolve the ad links. Relative links are
            joined onto it; links that are already absolute are kept as-is.

    Returns:
        One dict per ad card, in the order ``select`` returned the cards.
        Empty list when no card matches.
    """
    soup = BeautifulSoup(markup=html, features="html.parser")
    ad_cards: ResultSet[Tag] = soup.select("div.b-list-advert__gallery__item")

    def text_of(node: Optional[Tag]) -> Optional[str]:
        """Return the stripped text of ``node``, or None when it is absent."""
        return node.get_text(strip=True) if node is not None else None

    ads: list[Any] = []
    for card in ad_cards:
        a_tag: Optional[Tag] = card.select_one("a.qa-advert-list-item")  # anchor holding the ad URL

        # urljoin handles "/path", "path" and fully-qualified hrefs alike,
        # where plain concatenation would produce a malformed URL for the
        # latter two.
        url: Optional[str] = None
        if a_tag is not None and a_tag.has_attr("href"):
            url = urljoin(base_url, str(a_tag["href"]))

        ads.append(
            {
                "title": text_of(card.select_one("div.qa-advert-title")),
                "price": text_of(card.select_one("div.qa-advert-price")),
                "url": url,
                "description": text_of(
                    card.select_one("div.b-list-advert-base__description-text")
                ),
            }
        )

    return ads

In [95]:
# -- Homepage: https://jiji.co.ke/
homepage_ads: list[Any] = parse_ads(homepage)
print(f"Found {len(homepage_ads)} ads on the homepage.")
# Plain loop: a list comprehension used only for printing builds a throwaway
# list of None values that the notebook then echoes as cell output.
for ad in homepage_ads[:5]:
    print(ad)

Found 60 ads on the homepage.
{'title': 'Toyota Supra 3.0 Premium 2020 Black', 'price': 'KSh 10,999,999', 'url': 'https://jiji.co.ke/kilimani/cars/toyota-supra-3-0-premium-2020-black-7qj2Z7pnLR28HMb6pErHVJfi.html?indexPosition=0', 'description': '2019 Toyota Supra GR \nGT4 concept\n3000cc Petrol \n2 door coupe\n8 speed automatic \nRWD\nGet in touch...'}
{'title': 'Lexus IS 250 Automatic 2012 Black', 'price': 'KSh 1,550,000', 'url': 'https://jiji.co.ke/upperhill/cars/lexus-is-250-automatic-2012-black-1oHOmNJwc2k8gd2CvEjpKRB7.html?indexPosition=4', 'description': 'Lexus IS// 2500cc// Petrol// front wheel drive// bose speaker// unregistered// beige interior//...'}
{'title': 'Volvo S90 2018 White', 'price': 'KSh 3,350,000', 'url': 'https://jiji.co.ke/kilimani/cars/volvo-s90-2018-white-uiimpift1lNecEdk89KZAIt6.html?indexPosition=8', 'description': '2018 volvo s90 t5_leather\n _specs_\n2000cc petrol :: 360 degres camera :: bowers & wilkins music...'}
{'title': 'Honda Fit 2011 Silver', 'price

[None, None, None, None, None]

In [96]:
# -- Vehicles: https://jiji.co.ke/vehicles
# Fetch and parse in one step; parse_ads returns a list of ad dicts, not a str.
vehicle_ads: list[Any] = parse_ads(fetch_page(JIJI_VEHICLES))
print(f"Found {len(vehicle_ads)} vehicle ads.")
# Plain loop avoids echoing a list of None values as the cell output.
for ad in vehicle_ads[:5]:
    print(ad)

Found 20 vehicle ads.
{'title': 'Fuel Pump Honda', 'price': 'KSh 8,300', 'url': 'https://jiji.co.ke/ngara/car-parts-and-accessories/fuel-pump-honda-96qwpuXgysNllpcOx2ii4Jlo.html', 'description': 'All fuel pumps for Honda Accord, Civic, stream all available at simrose Autospares, Nairobi.'}
{'title': 'Toyota Hiace 7L New Aluminium Plastic Radiator on Sale', 'price': 'KSh 18,000', 'url': 'https://jiji.co.ke/viwandani-makadara/car-parts-and-accessories/toyota-hiace-7l-new-aluminium-plastic-radiator-on-sale-wdlm5LMI3jHIBBd2c25oM1a9.html', 'description': 'Toyota hiace new aluminium plastic radiator on sale \n- still in a carton, brand new\n- we stock most...'}
{'title': 'Weather-resistant Black Rubber Boot Seal 5m Roll For Cars', 'price': 'KSh 1,500', 'url': 'https://jiji.co.ke/ridgeways/car-parts-and-accessories/weather-resistant-black-rubber-boot-seal-5m-roll-for-cars-nQ6AWDSO2NLieuw5eaPxN0HD.html', 'description': 'Weatherproof Black Rubber Boot Seal – 5 Meters for Perfect Fit\nUpgrade yo

[None, None, None, None, None]

In [97]:
# -- Property: https://jiji.co.ke/real-estate
# parse_ads returns a list of ad dicts, so the annotation is a list, not str.
property_ads: list[Any] = parse_ads(fetch_page(JIJI_PROPERTY))
print(f"Found {len(property_ads)} property ads.")
# Plain loop avoids echoing a list of None values as the cell output.
for ad in property_ads[:5]:
    print(ad)

Found 20 property ads.
{'title': '2bdrm Apartment in Upperhill for rent', 'price': 'KSh 65,000', 'url': 'https://jiji.co.ke/upperhill/houses-apartments-for-rent/2bdrm-apartment-in-upperhill-for-rent-3fLwVnooSMHvQZlG8gdOEp9t.html', 'description': 'This spacious 2 bedroom both ensuite apartment in a very secure apartment complex with ample parking...'}
{'title': '4bdrm Maisonette in Kitengela for rent', 'price': 'KSh 55,000', 'url': 'https://jiji.co.ke/kitengela/houses-apartments-for-rent/4bdrm-maisonette-in-kitengela-for-rent-hVeihzQdXBLONtYsGkqL9Nsw.html', 'description': 'IN KITENGELA\n4 Bedroom All En-suite For Rent in Kitengela, Milimani Area \n Call or...'}
{'title': 'Furnished Studio Apartment', 'price': 'KSh 3,000', 'url': 'https://jiji.co.ke/nairobi-west/temporary-and-vacation-rentals/furnished-studio-apartment-bdsnPtPqkkk7LnZ7BJTggu1F.html', 'description': 'Welcome to Your Home Away from Home!\nRelax and unwind in our stylish and cozy studio apartment...'}
{'title': 'Furnished S

[None, None, None, None, None]

In [98]:
# -- Electronics: https://jiji.co.ke/electronics
# parse_ads returns a list of ad dicts, so the annotation is a list, not str.
electronic_ads: list[Any] = parse_ads(fetch_page(JIJI_ELECTRONICS))
print(f"Found {len(electronic_ads)} electronic ads.")
# Plain loop avoids echoing a list of None values as the cell output.
for ad in electronic_ads[:5]:
    print(ad)

Found 20 electronic ads.
{'title': 'On Pole Fiber Cable Storage Bracket', 'price': 'KSh 1,799', 'url': 'https://jiji.co.ke/nairobi-central/accessories-and-supplies-for-electronics/on-pole-fiber-cable-storage-bracket-3nWcr0Qs0DGyh36OEtgkHlPC.html', 'description': 'Fiber Cable Storage Bracket is used for winding extra cable on pole/tower, normally used with...'}
{'title': 'Hp Sh03xl Laptop Battery', 'price': 'KSh 5,500', 'url': 'https://jiji.co.ke/nairobi-central/computer-accessories/hp-sh03xl-laptop-battery-vVs8OvPA2SpHjyq1E686gApq.html', 'description': 'Battery Type: Lithium polymer\r\nVoltage: 11.55V\r\nCapacity: 57.9WH\r\nCells: 6 Cells\r\nColor:...'}
{'title': 'Capture Card USB to HDMI', 'price': 'KSh 1,800', 'url': 'https://jiji.co.ke/nairobi-central/audio-and-music-equipment/capture-card-usb-to-hdmi-k9VFi5uI6YMR0IAvMpejGFK0.html', 'description': 'Capture card usb to hdmi available'}
{'title': '32 Inch Frameless Smart Android TV 3200S', 'price': 'KSh 11,299', 'url': 'https://jiji.c

[None, None, None, None, None]

In [99]:
# -- Home, Furniture & Appliances: https://jiji.co.ke/home-garden
# parse_ads returns a list of ad dicts, so the annotation is a list, not str.
home_garden_ads: list[Any] = parse_ads(fetch_page(JIJI_HOME))
print(f"Found {len(home_garden_ads)} home and garden ads.")
# Plain loop avoids echoing a list of None values as the cell output.
for ad in home_garden_ads[:5]:
    print(ad)

Found 20 home and garden ads.
{'title': '12pcs Set 6 Mugs 6 Dinner Plates', 'price': 'KSh 3,200', 'url': 'https://jiji.co.ke/nairobi-central/kitchen-and-dining/12pcs-set-6-mugs-6-dinner-plates-iscHpYnrVmgKe2hkijVFTYlx.html', 'description': '12pcs Set ,6 Mugs, 6 Dinner Plates'}
{'title': 'Coffee Maker Machine - Sokany', 'price': 'KSh 2,600', 'url': 'https://jiji.co.ke/nairobi-central/kitchen-appliances/coffee-maker-machine-sokany-17VmLAQAl22m2z4R5XrX5zBC.html', 'description': 'Coffee maker machine - sokany'}
{'title': 'Roch 20ltrs Manual Microwave', 'price': 'KSh 7,300', 'url': 'https://jiji.co.ke/nairobi-central/kitchen-appliances/roch-20ltrs-manual-microwave-zLPn5d0FzSD52Aul6kjSedRA.html', 'description': 'Roch manual microwave available'}
{'title': 'Black Aliyons Kettle', 'price': 'KSh 1,200', 'url': 'https://jiji.co.ke/nairobi-central/kitchen-appliances/black-aliyons-kettle-g7TXKOr5yTH1sn2elu0QBLcw.html', 'description': 'Located at Nairobi tearoom junction of Accra and Munyu road Sco

[None, None, None, None, None]

In [100]:
# -- Write ads to a JSON file
def write_to_json(ads: list, filename: str) -> None:
    """Save ads as a JSON object keyed by their zero-padded, 1-based position.

    Args:
        ads: Ads to serialise; the first one is stored under ``"0001"``.
        filename: Path of the JSON file to create or overwrite (UTF-8).

    The file is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped. A confirmation line is printed afterwards.
    """
    numbered_ads = {}
    for position, ad in enumerate(ads, start=1):
        numbered_ads[f"{position:04}"] = ad

    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(numbered_ads, out_file, indent=2, ensure_ascii=False)

    print(f"✅ {len(ads)} ads written to {filename}")

In [101]:
# -- Save each category's ads to its own JSON file in the output directory
# The directory is resolved relative to the kernel's working directory
# (its parent folder), so it depends on where the notebook was started.
OUTPUT_DIR: pathlib.Path = pathlib.Path.cwd().parent / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# File stem -> scraped ads. One loop replaces five copy-pasted calls, and the
# paths are built with pathlib instead of string concatenation.
ADS_BY_CATEGORY: dict[str, list] = {
    "homepage_ads": homepage_ads,
    "vehicle_ads": vehicle_ads,
    "property_ads": property_ads,
    "electronic_ads": electronic_ads,
    "home_garden_ads": home_garden_ads,
}
for file_stem, category_ads in ADS_BY_CATEGORY.items():
    write_to_json(category_ads, str(OUTPUT_DIR / f"{file_stem}.json"))

✅ 60 ads written to /home/neo/Projects/phoenixke-masterclass/output/homepage_ads.json
✅ 20 ads written to /home/neo/Projects/phoenixke-masterclass/output/vehicle_ads.json
✅ 20 ads written to /home/neo/Projects/phoenixke-masterclass/output/property_ads.json
✅ 20 ads written to /home/neo/Projects/phoenixke-masterclass/output/electronic_ads.json
✅ 20 ads written to /home/neo/Projects/phoenixke-masterclass/output/home_garden_ads.json


In [102]:
# -- Write ads to dataframe; use unique IDs
def write_to_dataframe(metalist: list) -> "pl.DataFrame":
    """Flatten several ad lists into one DataFrame with a sequential ``ad_id``.

    Args:
        metalist: A list of ad lists (one per scraped page); each ad is a
            dict as produced by ``parse_ads``.

    Returns:
        A DataFrame with one row per ad, in the order the lists were given,
        plus an ``ad_id`` string column holding the 1-based row number
        zero-padded to five digits (``"00001"``, ``"00002"``, ...).
    """
    # Flatten the list of lists into a single list of ad dicts.
    all_ads = [ad for ad_list in metalist for ad in ad_list]
    ads_df = pl.DataFrame(all_ads)

    # The dtype is pinned so the column is a string column even when there
    # are no ads and the id list is empty.
    ad_ids: list[str] = [f"{row_number:05}" for row_number in range(1, ads_df.height + 1)]
    return ads_df.with_columns(pl.Series("ad_id", ad_ids, dtype=pl.Utf8))

# -- Write df to CSV file
def write_to_csv(df: pl.DataFrame, filename: str) -> None:
    """Write ``df`` to ``filename`` as CSV and print a confirmation line.

    Args:
        df: DataFrame to export via its ``write_csv`` method.
        filename: Destination path of the CSV file.
    """
    df.write_csv(filename)
    confirmation = f"✅ DataFrame written to {filename}"
    print(confirmation)

In [103]:
# Combine the five scraped ad lists into one DataFrame; write_to_dataframe adds the ad_id column
data_frame = write_to_dataframe([homepage_ads, vehicle_ads, property_ads, electronic_ads, home_garden_ads])
print(f"DataFrame shape: {data_frame.shape}")  # (rows, columns)
print(data_frame.head(5))  # Print the first 5 rows of the DataFrame

DataFrame shape: (140, 5)
shape: (5, 5)
┌────────────────────────┬────────────────┬────────────────────────┬───────────────────────┬───────┐
│ title                  ┆ price          ┆ url                    ┆ description           ┆ ad_id │
│ ---                    ┆ ---            ┆ ---                    ┆ ---                   ┆ ---   │
│ str                    ┆ str            ┆ str                    ┆ str                   ┆ str   │
╞════════════════════════╪════════════════╪════════════════════════╪═══════════════════════╪═══════╡
│ Toyota Supra 3.0       ┆ KSh 10,999,999 ┆ https://jiji.co.ke/kil ┆ 2019 Toyota Supra GR  ┆ 00001 │
│ Premium 2020 …         ┆                ┆ imani/ca…              ┆ GT4 conc…             ┆       │
│ Lexus IS 250 Automatic ┆ KSh 1,550,000  ┆ https://jiji.co.ke/upp ┆ Lexus IS// 2500cc//   ┆ 00002 │
│ 2012 Bl…               ┆                ┆ erhill/c…              ┆ Petrol// f…           ┆       │
│ Volvo S90 2018 White   ┆ KSh 3,350,000  ┆ https:/

In [104]:
# Export the combined DataFrame next to the per-category JSON files
write_to_csv(data_frame, f"{OUTPUT_DIR}/jiji_ads.csv")

✅ DataFrame written to /home/neo/Projects/phoenixke-masterclass/output/jiji_ads.csv
