# Data Scrapping Project

In [35]:
!pip -q install requests beautifulsoup4 pandas lxml


In [36]:
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

## Configuration

In [37]:
BASE = "https://daftarsekolah.net"

LEVELS = ["mts", "mi", "ma"]

CITIES = {
    "jakarta-timur": "kota-jakarta-timur",
    "jakarta-barat": "kota-jakarta-barat",
    "jakarta-selatan": "kota-jakarta-selatan",
    "jakarta-utara": "kota-jakarta-utara",
    "jakarta-pusat": "kota-jakarta-pusat"
    # kepulauan-seribu removed
}


## HTTP Session

In [38]:
session = requests.Session()
session.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120 Safari/537.36"
    )
})


## Fetch HTML

In [39]:
def fetch_soup(url: str) -> BeautifulSoup:
    r = session.get(url, timeout=30)
    r.raise_for_status()
    return BeautifulSoup(r.text, "lxml")


## Pagination Detection

In [40]:
def get_max_page(soup: BeautifulSoup) -> int:
    pages = []
    for a in soup.select('a[href*="/page/"]'):
        m = re.search(r"/page/(\d+)", a.get("href", ""))
        if m:
            pages.append(int(m.group(1)))
    return max(pages) if pages else 1


## Parse Listing

In [41]:
def parse_listing_page(url, level, city):
    soup = fetch_soup(url)
    rows = []

    for h2 in soup.select("h2"):
        a = h2.find("a", href=True)
        if not a:
            continue

        school_name = h2.get_text(strip=True)
        profile_url = urljoin(BASE, a["href"])

        block = h2
        while block and "Profil Sekolah" not in block.get_text(" ", strip=True):
            block = block.parent

        if not block:
            continue

        texts = list(block.stripped_strings)

        accreditation = next(
            (t for t in texts if t.startswith("Terakreditasi") or t.startswith("Belum Terakreditasi")),
            None
        )

        address = None
        if accreditation and accreditation in texts:
            idx = texts.index(accreditation)
            if idx + 1 < len(texts):
                address = texts[idx + 1]

        rows.append({
            "NPSN": None,
            "Nama Sekolah": school_name,
            "Jenjang Pendidikan": level.upper(),
            "Kab. / Kota / Negara": city.replace("-", " ").title(),
            "Akreditasi": accreditation,
            "Alamat": address,
            "profile_url": profile_url
        })

    return rows, soup


Normalize Keys

In [42]:
def _norm_key(s: str) -> str:
    s = s.strip().lower()
    s = re.sub(r"[\u00A0\s]+", " ", s)
    s = re.sub(r"[.:,-]+", "", s)
    return s

## Extract Key-Value Pairs

In [43]:
def extract_kv_from_tables(soup: BeautifulSoup) -> dict:
    kv = {}

    for tr in soup.select("tr"):
        th = tr.find("th")
        tds = tr.find_all("td")

        if th and len(tds) >= 1:
            kv[_norm_key(th.get_text())] = tds[0].get_text(" ", strip=True)
        elif len(tds) >= 2:
            kv[_norm_key(tds[0].get_text())] = tds[1].get_text(" ", strip=True)

    return kv


Parse School Profiles

In [44]:
def parse_school_profile(profile_url: str) -> dict:
    soup = fetch_soup(profile_url)
    kv = extract_kv_from_tables(soup)

    key_map = {
        "npsn": "NPSN",
        "akreditasi": "Akreditasi",
        "status sekolah": "Status Sekolah",
        "naungan": "Naungan",
        "kepala sekolah": "Kepala Sekolah",
        "operator": "Operator",
    }

    out = {v: None for v in key_map.values()}

    for k_norm, target in key_map.items():
        if k_norm in kv:
            out[target] = kv[k_norm]

    # Fallback: regex for NPSN
    if not out["NPSN"]:
        m = re.search(r"\bNPSN\b\s*[:\-]?\s*(\d{8})\b", soup.get_text())
        if m:
            out["NPSN"] = m.group(1)

    # Normalize Akreditasi
    if out["Akreditasi"]:
        out["Akreditasi"] = out["Akreditasi"].strip()
    else:
        m = re.search(r"Terakreditasi\s*([A-C])", soup.get_text(), re.I)
        if m:
            out["Akreditasi"] = m.group(1).upper()

    return out

## Master Scrapper

In [45]:
def scrape_mts_mi_ma_jakarta(delay_listing=0.7, delay_profile=0.4):
    listing_rows = []

    print("▶ Stage 1: Listing pages")

    for level in LEVELS:
        for city_name, city_slug in CITIES.items():
            start_url = f"{BASE}/sekolah/{level}/all/jakarta/{city_slug}"
            rows, soup = parse_listing_page(start_url, level, city_name)
            max_page = get_max_page(soup)
            listing_rows.extend(rows)

            for p in range(2, max_page + 1):
                rows, _ = parse_listing_page(f"{start_url}/page/{p}", level, city_name)
                listing_rows.extend(rows)
                time.sleep(delay_listing)

    df_list = (
        pd.DataFrame(listing_rows)
        .drop_duplicates(subset=["profile_url"])
        .reset_index(drop=True)
    )

    print(f"▶ Stage 2: Profile pages ({len(df_list)})")

    full_rows = []

    for i, row in df_list.iterrows():
        try:
            profile_data = parse_school_profile(row["profile_url"])
        except Exception:
            profile_data = {}

        full_rows.append({**row.to_dict(), **profile_data})

        if i % 100 == 0:
            print(f"  Scraped {i}/{len(df_list)}")

        time.sleep(delay_profile)

    return pd.DataFrame(full_rows)



## Run Scrapper

In [46]:
df = scrape_mts_mi_ma_jakarta()
print("TOTAL SCHOOLS:", len(df))


▶ Stage 1: Listing pages
▶ Stage 2: Profile pages (827)
  Scraped 0/827
  Scraped 100/827
  Scraped 200/827
  Scraped 300/827
  Scraped 400/827
  Scraped 500/827
  Scraped 600/827
  Scraped 700/827
  Scraped 800/827
TOTAL SCHOOLS: 827


## Clean and Export

In [47]:
df = df.drop_duplicates(subset=["profile_url"]).reset_index(drop=True)

df.to_csv(
    "mts_mi_ma_jakarta_npsn_akreditasi.csv",
    index=False,
    encoding="utf-8-sig"
)

df.head()


Unnamed: 0,NPSN,Nama Sekolah,Jenjang Pendidikan,Kab. / Kota / Negara,Akreditasi,Alamat,profile_url,Status Sekolah,Naungan,Kepala Sekolah,Operator
0,70025907,MTs pesantren modern pkp jis,MTS,Jakarta Timur,,"Jl. Raya pkp, kelapa dua wetan, ciracas Jakart...",https://daftarsekolah.net/sekolah/3918/mts-pes...,Swasta,Kementerian Agama,,-
1,20178099,MTSS AL KAHFI,MTS,Jakarta Timur,A,Jl. Raya Bogor Km. 22 No. 22 Rambutan Kec. Cir...,https://daftarsekolah.net/sekolah/3920/mtss-al...,Swasta,Kementerian Agama,,Dede Ruspandi
2,20178100,MTSS AL WAHYU,MTS,Jakarta Timur,B,Jl. Madrasah No. 24 RT/RW. 004/014 Cibubur Kec...,https://daftarsekolah.net/sekolah/3921/mtss-al...,Swasta,Kementerian Agama,,"Mardani, S. Ag"
3,20178071,MTSS AL WATHONIYAH 07,MTS,Jakarta Timur,B,Jl. Krt. Radjiman Wd/ Kp. Rawa Badung Rt 02/07...,https://daftarsekolah.net/sekolah/4849/mtss-al...,Swasta,Kementerian Agama,,Sarah Aini
4,20178133,MTSN 14 JAKARTA,MTS,Jakarta Timur,A,Jl. Elang Komp. Rajawali Halim Perdana Kusumah...,https://daftarsekolah.net/sekolah/4164/mtsn-14...,Negeri,Kementerian Agama,,Dian Kusumadewi
