This notebook scrapes classified ads from [Jiji.co.ke](https://www.jiji.co.ke), including vehicles, electronics, property, and home appliances. It uses BeautifulSoup to parse HTML and extract key ad data such as title, price, and description. The extracted data is saved to structured JSON files and combined into a single CSV, all stored in an `output` directory.

In [None]:
# -- Imports
import json
import pathlib
from typing import Any, Optional
from urllib.parse import urljoin

import httpx
import polars as pl # pandas alternative
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from httpx._models import Response

In [None]:
# -- Jiji Kenya entry points: the homepage plus one landing page per category
JIJI_HOMEPAGE = "https://jiji.co.ke/"
# Category pages hang directly off the homepage URL (which ends with "/").
JIJI_VEHICLES = JIJI_HOMEPAGE + "vehicles"
JIJI_PROPERTY = JIJI_HOMEPAGE + "real-estate"
JIJI_ELECTRONICS = JIJI_HOMEPAGE + "electronics"
JIJI_HOME = JIJI_HOMEPAGE + "home-garden" # Home, Furniture & Appliances

In [None]:
# -- Fetch page content using httpx; returns the HTML content as a string
def fetch_page(url: str, timeout: float = 3.0) -> str:
    """Download ``url`` with a GET request and return the response body as text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Timeout in seconds handed to ``httpx.get``. Defaults to 3,
            the value this notebook has always used.

    Returns:
        The response body decoded to ``str``.

    Raises:
        httpx.HTTPStatusError: If the final response is not a 2xx status.
        httpx.RequestError: On transport failures such as timeouts or
            connection errors.
    """
    # httpx does not follow redirects unless asked to, and raise_for_status()
    # treats a 3xx response as an error, so follow them explicitly.
    response: httpx.Response = httpx.get(url, timeout=timeout, follow_redirects=True)
    response.raise_for_status()
    return response.text

In [None]:
# -- Homepage: download the HTML and preview its first 1000 characters
homepage: str = fetch_page(JIJI_HOMEPAGE)
homepage_preview: str = homepage[:1000]  # a short slice is enough to eyeball the markup
print(homepage_preview)

In [None]:
# -- Custom function to parse ads from the HTML content
# Check README for expected structure from inspection using browser dev tools
def parse_ads(html: str, base_url: str = "https://jiji.co.ke") -> list[dict[str, Any]]:
    """Extract one record per ad card found in a Jiji listing page.

    Args:
        html: Raw HTML of a listing page.
        base_url: Site root used to resolve each ad's ``href`` into an
            absolute URL.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``url`` and
        ``description`` (in that order). A value is ``None`` when the
        corresponding element is missing from the card.
    """

    def text_or_none(tag: Optional[Tag]) -> Optional[str]:
        # Whitespace-stripped text of the element, or None if it was not found.
        return tag.get_text(strip=True) if tag else None

    soup = BeautifulSoup(markup=html, features="html.parser")
    ad_cards: ResultSet[Tag] = soup.select("div.b-list-advert__gallery__item")

    ads: list[dict[str, Any]] = []
    for card in ad_cards:
        a_tag: Optional[Tag] = card.select_one("a.qa-advert-list-item") # anchor tag; contains the URL
        title: Optional[Tag] = card.select_one("div.qa-advert-title") # title of the ad/product
        price: Optional[Tag] = card.select_one("div.qa-advert-price") # price
        desc: Optional[Tag] = card.select_one("div.b-list-advert-base__description-text")

        # NOTE: location ("span.b-list-advert__region__text") and the first
        # image ("img") are intentionally not extracted. Per the original
        # author's note the location text is rendered via ::after, so it would
        # need a browser driver such as Selenium.

        href = a_tag.get("href") if a_tag else None

        ads.append(
            {
                "title": text_or_none(title),
                "price": text_or_none(price),
                # urljoin keeps root-relative hrefs working and does not
                # mangle hrefs that are already absolute.
                "url": urljoin(base_url, href) if href is not None else None,
                "description": text_or_none(desc),
            }
        )

    return ads

In [None]:
# -- Homepage: https://jiji.co.ke/
homepage_ads: list[Any] = parse_ads(homepage)
print(f"Found {len(homepage_ads)} ads on the homepage.")
# Preview the first five ads. A plain loop avoids building a throwaway list of
# None values that the notebook would otherwise echo as cell output.
for ad in homepage_ads[:5]:
    print(ad)

In [None]:
# -- Vehicles: https://jiji.co.ke/vehicles
# parse_ads returns a list of ad records, so annotate it as a list (not str).
vehicle_ads: list[Any] = parse_ads(fetch_page(JIJI_VEHICLES)) # shortened for brevity
print(f"Found {len(vehicle_ads)} vehicle ads.")
# Preview the first five ads with a plain loop (no throwaway list of None).
for ad in vehicle_ads[:5]:
    print(ad)

In [None]:
# -- Property: https://jiji.co.ke/real-estate
# parse_ads returns a list of ad records, so annotate it as a list (not str).
property_ads: list[Any] = parse_ads(fetch_page(JIJI_PROPERTY))
print(f"Found {len(property_ads)} property ads.")
# Preview the first five ads with a plain loop (no throwaway list of None).
for ad in property_ads[:5]:
    print(ad)

In [None]:
# -- Electronics: https://jiji.co.ke/electronics
# parse_ads returns a list of ad records, so annotate it as a list (not str).
electronic_ads: list[Any] = parse_ads(fetch_page(JIJI_ELECTRONICS))
print(f"Found {len(electronic_ads)} electronic ads.")
# Preview the first five ads with a plain loop (no throwaway list of None).
for ad in electronic_ads[:5]:
    print(ad)

In [None]:
# -- Home, Furniture & Appliances: https://jiji.co.ke/home-garden
# parse_ads returns a list of ad records, so annotate it as a list (not str).
home_garden_ads: list[Any] = parse_ads(fetch_page(JIJI_HOME))
print(f"Found {len(home_garden_ads)} home and garden ads.")
# Preview the first five ads with a plain loop (no throwaway list of None).
for ad in home_garden_ads[:5]:
    print(ad)

In [None]:
# -- Dump a list of ads to a JSON file, keyed by zero-padded position
def write_to_json(ads: list, filename: str) -> None:
    """Write ``ads`` to ``filename`` as a JSON object and print a confirmation.

    Each ad is stored under a 4-digit, 1-based key ("0001", "0002", ...).
    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is rather than escaped.
    """
    numbered_ads = {}
    for position, ad in enumerate(ads, start=1):
        key = f"{position:04}"
        numbered_ads[key] = ad

    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(numbered_ads, out_file, indent=2, ensure_ascii=False)

    total = len(ads)
    print(f"✅ {total} ads written to {filename}")

In [None]:
# -- Save each category's ads to its own JSON file in the output directory
# The output directory sits next to (not inside) the current working directory.
OUTPUT_DIR: pathlib.Path = pathlib.Path.cwd().parent / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
for file_stem, category_ads in (
    ("homepage_ads", homepage_ads),
    ("vehicle_ads", vehicle_ads),
    ("property_ads", property_ads),
    ("electronic_ads", electronic_ads),
    ("home_garden_ads", home_garden_ads),
):
    write_to_json(category_ads, f"{OUTPUT_DIR}/{file_stem}.json")

In [None]:
# -- Write ads to dataframe; use unique IDs
def write_to_dataframe(metalist: list) -> "pl.DataFrame":
    """Flatten several ad lists into one DataFrame with a unique ``ad_id``.

    Args:
        metalist: A list of lists of ad records (one inner list per source
            page), concatenated in the order given.

    Returns:
        A polars DataFrame with one row per ad and, as its last column,
        ``ad_id``: a 5-digit, zero-padded, 1-based row number stored as a
        string ("00001", "00002", ...).
    """
    # Flatten the list of lists into a single list of ad records.
    all_ads = [ad for ad_list in metalist for ad in ad_list]
    df = pl.DataFrame(all_ads)

    # IDs follow row order; the explicit dtype keeps the column a string
    # column even when there are no rows to infer it from.
    ad_ids: list[str] = [f"{row_number:05}" for row_number in range(1, df.height + 1)]
    return df.with_columns(pl.Series("ad_id", ad_ids, dtype=pl.Utf8))

# -- Export a DataFrame as a CSV file
def write_to_csv(df: pl.DataFrame, filename: str) -> None:
    """Write ``df`` to ``filename`` in CSV format and print a confirmation."""
    df.write_csv(filename)
    confirmation = f"✅ DataFrame written to {filename}"
    print(confirmation)

In [None]:
# -- Combine every category's ads into one DataFrame and preview it
all_ad_lists = [homepage_ads, vehicle_ads, property_ads, electronic_ads, home_garden_ads]
data_frame = write_to_dataframe(all_ad_lists)
print(f"DataFrame shape: {data_frame.shape}")
preview_rows = data_frame.head(5)  # first 5 rows of the DataFrame
print(preview_rows)

In [None]:
# -- Export the combined DataFrame as a single CSV in the output directory
csv_path = f"{OUTPUT_DIR}/jiji_ads.csv"
write_to_csv(data_frame, csv_path)