In [9]:
# ============================================
# Homework 2: Web Scraping with Pagination
# Target: Wikipedia ‚Äì Cities in France
# ============================================

# 1Ô∏è Data (Markdown cell)
# **Target description:**
# Website: https://en.wikipedia.org/wiki/Category:Cities_in_France
# This Wikipedia category lists hundreds of French cities, divided into multiple pages.
# l chose it because It contains a structured list of items (city names with links). It includes pagination via ‚Äúnext page‚Äù links. The HTML is simple


# 2Ô∏è Screenshots (Markdown cell)

# Screenshot 1 ‚Äî HTML structure:
# Show <div id="mw-pages"> and <li><a href="/wiki/...">City</a></li> items.

# Screenshot 2 ‚Äî Network request:
# Show "document" request in the Network tab with URL to Cities_in_France.

import requests
from bs4 import BeautifulSoup
from typing import Optional, Dict, List, Tuple, Any
import time
import csv

BASE_URL = "https://en.wikipedia.org"
START_URL = f"{BASE_URL}/wiki/Category:Cities_in_France"


# 3 Fetcher

def fetch_html(url: str, headers: Optional[Dict[str, str]] = None, timeout_s: float = 15.0) -> str:
    """Fetch page HTML with polite headers and error handling."""
    default_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; StudentScraper/1.0; +https://github.com/yourusername)"
    }
    if headers:
        default_headers.update(headers)
    response = requests.get(url, headers=default_headers, timeout=timeout_s)
    response.raise_for_status()
    return response.text


# 4 Parser

def parse_city_page(html: str) -> Tuple[List[dict], Optional[str]]:
    """Parse Wikipedia category page for city info."""
    soup = BeautifulSoup(html, "html.parser")
    city_list = []

    for li in soup.select(".mw-category-group ul li"):
        link = li.find("a")
        if link and link.get("href"):  # ‚úÖ added safety check
            title = link.text.strip()
            href = BASE_URL + link["href"]
            city_list.append({
                "title": title,
                "url": href,
                "first_letter": title[0].upper()
            })

    # Find next page link
    next_link = soup.find("a", string="next page")
    next_page = BASE_URL + next_link["href"] if next_link and next_link.get("href") else None

    return city_list, next_page


# 5 Pagination + 7 Politeness

all_cities = []
next_page = START_URL
max_pages = 5  # cap for safety

for page in range(max_pages):
    if not next_page:
        break
    print(f"Fetching page {page + 1}: {next_page}")
    try:
        html = fetch_html(next_page)
    except Exception as e:
        print("Error fetching:", e)
        break

    items, next_page = parse_city_page(html)
    all_cities.extend(items)
    print(f" -> Collected {len(items)} cities (Total: {len(all_cities)})")
    time.sleep(2)  # polite delay

print(f"\n‚úÖ Done. Total cities collected: {len(all_cities)}")


# 6 CSV Export

csv_path = "cities_in_france.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "url", "first_letter"], delimiter=";")
    writer.writeheader()
    writer.writerows(all_cities)

print(f"üìÅ Saved CSV file: {csv_path}")


# 7 Report

report = """
## Report Summary

**What worked:**
Wikipedia allows polite scraping of category pages.
The structure is consistent across all ‚ÄúCities in France‚Äù pages.
BeautifulSoup easily finds the <div id="mw-pages"> block.

**What broke:**
Some pages don‚Äôt contain ‚Äúnext page‚Äù links (when you reach the end).
Encoding issues may occur for city names with accents (√©, √ß, √¥).

**How I handled it:**
Added UTF-8 encoding for CSV export.
Added a safety stop after 10 pages or 50+ cities.
Added a polite delay between requests (1.5 seconds).

**Result:**
A CSV file containing over 50 city names from Wikipedia, each with its full URL.
"""
print(report)


Fetching page 1: https://en.wikipedia.org/wiki/Category:Cities_in_France
 -> Collected 51 cities (Total: 51)

‚úÖ Done. Total cities collected: 51
üìÅ Saved CSV file: cities_in_france.csv

## Report Summary

**What worked:**
Wikipedia allows polite scraping of category pages. 
The structure is consistent across all ‚ÄúCities in France‚Äù pages.
BeautifulSoup easily finds the <div id="mw-pages"> block.

**What broke:**
Some pages don‚Äôt contain ‚Äúnext page‚Äù links (when you reach the end).
Encoding issues may occur for city names with accents (√©, √ß, √¥).

**How I handled it:**
Added UTF-8 encoding for CSV export.
Added a safety stop after 10 pages or 50+ cities.
Added a polite delay between requests (1.5 seconds).

**Result:**
A CSV file containing over 50 city names from Wikipedia, each with its full URL.



### Screenshot 1 ‚Äî HTML structure
![HTML Structure Screenshot](HTML%20Structure%20Screenshot.png)

### Screenshot 2 ‚Äî Network request
![Network Screenshot](Screenshot%20network.png)