### Intro to Web Scraping - Assignment

#### Imports

In [22]:
from typing import Any, Optional
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from httpx._models import Response


#### Scraping the Jiji homepage for classifieds

In [23]:
# Site root (keeps its trailing slash); every category URL below is built from it
# so the host name is written in exactly one place.
JIJI_HOMEPAGE = "https://jiji.co.ke/"

# Category landing pages.
JIJI_VEHICLES = f"{JIJI_HOMEPAGE}vehicles"
JIJI_PROPERTY = f"{JIJI_HOMEPAGE}real-estate"
JIJI_ELECTRONICS = f"{JIJI_HOMEPAGE}electronics"
JIJI_HOME = f"{JIJI_HOMEPAGE}home-garden"

In [24]:
def fetch_page(url: str, timeout: float = 3.0) -> str:
    """Download a page over HTTP and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Timeout, in seconds, handed to ``httpx.get``. Defaults to 3,
            the value this function previously hard-coded.

    Returns:
        The response body decoded to ``str`` (``response.text``).

    Raises:
        httpx.HTTPStatusError: If the final response is not successful
            (raised by ``response.raise_for_status()``).
        httpx.HTTPError: For transport-level failures such as timeouts.
    """
    # httpx does not follow redirects unless asked to, and raise_for_status()
    # rejects a 3xx response, so without follow_redirects any URL that
    # redirects (e.g. to a canonical or trailing-slash form) would fail here.
    response: Response = httpx.get(url, timeout=timeout, follow_redirects=True)
    response.raise_for_status()
    return response.text

In [25]:
# Download the landing page once; later cells parse this same HTML string.
homepage: str = fetch_page(JIJI_HOMEPAGE)

# Peek at the start of the document to confirm real HTML came back.
preview_length = 1000
print(homepage[:preview_length])

<!DOCTYPE html><html  dir="ltr" lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<title>Free classifieds in Kenya ▷ Post free classified ads on Jiji.co.ke</title>
<link nonce="5gmd9Xps15Z+8qNqMZsych+i" rel="preconnect" href="https://assets.jijistatic.com">
<link nonce="5gmd9Xps15Z+8qNqMZsych+i" rel="stylesheet" href="https://assets.jijistatic.com/nuxt/entry.DQ4eE3Mv.css" crossorigin>
<link nonce="5gmd9Xps15Z+8qNqMZsych+i" rel="stylesheet" href="https://assets.jijistatic.com/nuxt/shared.O78TMMWs.css" crossorigin>
<link nonce="5gmd9Xps15Z+8qNqMZsych+i" rel="stylesheet" href="https://assets.jijistatic.com/nuxt/default-custom-header.DL93suWI.css" crossorigin>
<meta name="description" content="Free classifieds in Kenya ➔ ➔ ➔ Sell to Real People in Kenya on Jiji.co.ke ➔ Post Ad in Kenya ➔ local classified ads, more than online marketplace">
<link nonce="5gmd9Xps15Z+8qNqMZsych+i" rel="icon" type="image/x-icon" href="/

In [26]:
def _text_or_none(node: "Optional[Tag]") -> Optional[str]:
    """Return the whitespace-stripped text of ``node``, or ``None`` if it is missing."""
    return node.get_text(strip=True) if node else None


def parse_ads(html: str, base_url: str = "https://jiji.co.ke") -> list[dict[str, Any]]:
    """Extract advert cards from a Jiji listing page.

    Every ``div.b-list-advert__gallery__item`` element in ``html`` becomes one
    dictionary. A field whose element is not found in the card is ``None``
    (``verified_id`` is ``False`` instead).

    Args:
        html: Page markup to parse.
        base_url: Base that each advert's ``href`` is resolved against.
            Defaults to the Jiji Kenya site root.

    Returns:
        One dict per advert card with the keys ``title``, ``price``,
        ``location``, ``condition``, ``transmission``, ``verified_id``,
        ``description``, ``image_url`` and ``url``. The list is empty when no
        card matches.
    """
    soup = BeautifulSoup(html, "html.parser")
    ads: list[dict[str, Any]] = []

    ad_cards: ResultSet[Tag] = soup.select("div.b-list-advert__gallery__item")

    for card in ad_cards:
        a_tag: Optional[Tag] = card.select_one("a.qa-advert-list-item")
        image: Optional[Tag] = card.select_one("img")
        verified: Optional[Tag] = card.select_one("div.b-list-advert-base__label--blue")

        # Attribute chips are read by position: the first is stored as
        # "condition" and the second as "transmission".
        # NOTE(review): that labelling fits vehicle adverts; cards from other
        # categories may carry different attributes in these slots — confirm.
        attrs: ResultSet[Tag] = card.select("div.b-list-advert-base__item-attr")

        ad: dict[str, Any] = {
            "title": _text_or_none(card.select_one("div.qa-advert-title")),
            "price": _text_or_none(card.select_one("div.qa-advert-price")),
            "location": _text_or_none(card.select_one("span.b-list-advert__region__text")),
            "condition": _text_or_none(attrs[0]) if len(attrs) > 0 else None,
            "transmission": _text_or_none(attrs[1]) if len(attrs) > 1 else None,
            "verified_id": bool(verified),
            "description": _text_or_none(
                card.select_one("div.b-list-advert-base__description-text")
            ),
            "image_url": image["src"] if image and image.has_attr("src") else None,
            # urljoin leaves an already-absolute href untouched and inserts the
            # missing "/" for a bare relative path; plain string concatenation
            # would corrupt both of those cases.
            "url": urljoin(base_url, a_tag["href"]) if a_tag and a_tag.has_attr("href") else None,
        }

        ads.append(ad)

    return ads

In [27]:
# Turn the downloaded homepage HTML into a list of advert dictionaries.
homepage_ads: list[Any] = parse_ads(homepage)
print(f"Found {len(homepage_ads)} ads on the homepage.")

# Show a small sample. A plain loop is used rather than a list comprehension:
# a comprehension of print() calls builds a throwaway list of None values that
# the notebook then echoes as the cell's output.
for ad in homepage_ads[:5]:
    print(ad)


Found 60 ads on the homepage.
{'title': 'Toyota Supra 3.0 Premium 2020 Black', 'price': 'KSh 10,999,999', 'location': None, 'condition': 'Foreign Used', 'transmission': 'Automatic', 'verified_id': False, 'description': '2019 Toyota Supra GR \nGT4 concept\n3000cc Petrol \n2 door coupe\n8 speed automatic \nRWD\nGet in touch...', 'image_url': 'https://pictures-kenya.jijistatic.com/76535591_MzAwLTQwMC1kYjM4NTU2ZGJk.webp', 'url': 'https://jiji.co.ke/kilimani/cars/toyota-supra-3-0-premium-2020-black-7qj2Z7pnLR28HMb6pErHVJfi.html?indexPosition=0'}
{'title': 'Lexus IS 250 Automatic 2012 Black', 'price': 'KSh 1,550,000', 'location': None, 'condition': 'Local Used', 'transmission': 'Automatic', 'verified_id': False, 'description': 'Lexus IS// 2500cc// Petrol// front wheel drive// bose speaker// unregistered// beige interior//...', 'image_url': 'https://pictures-kenya.jijistatic.com/76672738_MzAwLTIwMC02Yzg0MWY4ODQx.webp', 'url': 'https://jiji.co.ke/upperhill/cars/lexus-is-250-automatic-2012-blac

[None, None, None, None, None]