# **Data Scraper: DaftarSekolah.net**

**Install Dependencies & Import Libraries**

In [97]:
!pip -q install requests beautifulsoup4 pandas lxml


In [98]:
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


## Configuration

This section of the code is called the configuration block, and its purpose is to define the base parameters that the scraper will use when collecting data from the website **daftarsekolah.net**


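This cell stores the main website URL, the school levels to scrape (SD, SMP, SMA), and the five Jakarta cities together with their URL slugs. The scraper will use the base URL as the starting point for building complete links (for example, combining it later with city and school level paths to access specific pages).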

In [99]:
# Root URL of the site; listing URLs and relative profile links are resolved against it.
BASE = "https://daftarsekolah.net"

# School level codes, used verbatim as a path segment of the listing URL.
LEVELS = ["sd", "smp", "sma"]

# Short city name -> URL slug. Every slug is the city name prefixed with "kota-".
CITIES = {
    city_name: f"kota-{city_name}"
    for city_name in (
        "jakarta-timur",
        "jakarta-barat",
        "jakarta-selatan",
        "jakarta-utara",
        "jakarta-pusat",
    )
}


## HTTP Session

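This cell creates a single `requests.Session` with a browser-like User-Agent header, so the scraper can use the requests library to handle web requests more efficiently and reliably.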

By using the User-Agent, the website is less likely to block or reject the requests, since it looks like normal browser activity rather than automated scraping. In short, this step ensures smoother, faster, and more stable communication between the scraper and the website.

In [100]:
# One shared Session for the whole notebook; every request made through it
# carries the browser-like User-Agent header set below.
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120 Safari/537.36"
)


## Fetch HTML

In [101]:
def fetch_soup(url: str) -> BeautifulSoup:
    """Download *url* through the shared session and parse it with lxml.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body parsed into a BeautifulSoup tree.

    Raises:
        requests.HTTPError: If the response has an error status
            (raised by ``raise_for_status``).
    """
    response = session.get(url, timeout=30)
    # Fail before parsing so an error page is never treated as real data.
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "lxml")


This function gets the HTML content from a web page so the program can read and extract data from it.

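It works by:

Sending a GET request to the URL through the shared session (with a 30-second timeout).

Checking if the request was successful, and raising an error if it was not.

Converting the page’s HTML into a BeautifulSoup object, which makes it easy to find and extract specific parts of the page.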

In simple terms, it’s like asking a website for a page and turning it into a format the computer can “read” easily.

## Pagination Detection

In [102]:
def get_max_page(soup: BeautifulSoup) -> int:
    """Return the highest page number linked from *soup*.

    Every ``<a>`` whose href contains ``/page/`` is inspected and the number
    following ``/page/`` is read from it.

    Returns:
        The largest page number found, or 1 when no such link carries a number.
    """
    hrefs = (link.get("href", "") for link in soup.select('a[href*="/page/"]'))
    matches = (re.search(r"/page/(\d+)", href) for href in hrefs)
    return max((int(found.group(1)) for found in matches if found), default=1)


## Parse Listing

In [103]:
def parse_listing_page(url, level, city):
    """Fetch one listing page and build a row for every school heading on it.

    Args:
        url: URL of the listing page to download.
        level: School level code; stored upper-cased under "Jenjang Pendidikan".
        city: Hyphenated city name; stored with spaces and title case under
            "Kab. / Kota / Negara".

    Returns:
        A ``(rows, soup)`` tuple: the list of row dicts and the parsed page,
        which the caller can reuse (e.g. to read pagination links).
    """
    soup = fetch_soup(url)
    accreditation_prefixes = ("Terakreditasi", "Belum Terakreditasi")
    rows = []

    for heading in soup.select("h2"):
        link = heading.find("a", href=True)
        if not link:
            # Headings without a link are not school entries.
            continue

        # Starting at the heading, walk outwards to the nearest element whose
        # text contains "Profil Sekolah"; that element is treated as the
        # school's card.
        card = next(
            (
                node
                for node in (heading, *heading.parents)
                if "Profil Sekolah" in node.get_text(" ", strip=True)
            ),
            None,
        )
        if card is None:
            continue

        texts = list(card.stripped_strings)

        # Position of the first text fragment that states the accreditation.
        hit = next(
            (
                pos
                for pos, text in enumerate(texts)
                if text.startswith(accreditation_prefixes)
            ),
            None,
        )
        if hit is None:
            accreditation = None
            address = None
        else:
            accreditation = texts[hit]
            # The fragment right after the accreditation is taken as the address.
            address = texts[hit + 1] if hit + 1 < len(texts) else None

        rows.append({
            "NPSN": None,
            "Nama Sekolah": heading.get_text(strip=True),
            "Jenjang Pendidikan": level.upper(),
            "Kab. / Kota / Negara": city.replace("-", " ").title(),
            "Akreditasi": accreditation,
            "Alamat": address,
            "profile_url": urljoin(BASE, link["href"]),
        })

    return rows, soup


## Normalize Keys

In [104]:
def _norm_key(s: str) -> str:
    s = s.strip().lower()
    # Replace ANY non-alphanumeric (including /, (), ., etc.) with spaces
    s = re.sub(r"[^0-9a-z]+", " ", s)
    # Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s


## Extract Key-Value Pairs

In [105]:
def extract_kv_from_tables(soup: BeautifulSoup) -> dict:
    """Collect label/value pairs from every table row in *soup*.

    For each ``<tr>`` with at least two ``<th>``/``<td>`` cells, the first
    cell's text (normalized with ``_norm_key``) is the key and the second
    cell's text is the value. Rows whose key or value is empty are skipped;
    when a key repeats, the later row wins.
    """
    pairs = {}

    for table_row in soup.select("tr"):
        cells = table_row.find_all(["th", "td"])
        if len(cells) < 2:
            continue

        label = _norm_key(cells[0].get_text(" ", strip=True))
        value = cells[1].get_text(" ", strip=True)
        if not (label and value):
            continue

        pairs[label] = value

    return pairs



## Parse School Profiles

In [106]:
def parse_school_profile(profile_url: str) -> dict:
    """Fetch a school profile page and return its fields under fixed column names.

    Table rows on the page are read with ``extract_kv_from_tables`` and their
    normalized labels are mapped to output columns. If the tables do not yield
    an NPSN or an accreditation, the page's full text is searched instead.

    Args:
        profile_url: URL of the school's profile page.

    Returns:
        A dict holding every output column; columns not found stay ``None``.
    """
    soup = fetch_soup(profile_url)
    kv = extract_kv_from_tables(soup)  # keys are already normalized by _norm_key

    # Normalized table label -> output column. Several labels may feed the
    # same column; the first one present in the page wins.
    key_map = {
        "npsn": "NPSN",
        "akreditasi": "Akreditasi",
        "status sekolah": "Status Sekolah",
        "naungan": "Naungan",

        # Location fields
        "desa kelurahan": "Desa / Kelurahan",
        "kecamatan kota ln": "Kecamatan / Kota (LN)",
        "kab kota negara ln": "Kab. / Kota / Negara (LN)",
        "provinsi ln": "Provinsi / LN",

        # Shorter label variants for the same location columns
        "kecamatan kota": "Kecamatan / Kota (LN)",
        "kab kota": "Kab. / Kota / Negara (LN)",
        "provinsi": "Provinsi / LN",

        # Contacts/admin
        "kepala sekolah": "Kepala Sekolah",
        "operator": "Operator",
        "email": "Email",
        "website": "Website",
        "no telepon": "No Telepon",
        "telepon": "No Telepon",
        "fax": "Fax",
    }

    # One None-initialised entry per distinct output column, in the order the
    # columns first appear in key_map.
    out = dict.fromkeys(key_map.values())

    for source_key, column in key_map.items():
        if out[column] or source_key not in kv:
            continue
        out[column] = kv[source_key]

    if out["NPSN"] and out["Akreditasi"]:
        return out

    # Fallbacks: search the whole page text for whatever the tables lacked.
    page_text = soup.get_text(" ", strip=True)

    if not out["NPSN"]:
        npsn_match = re.search(r"\bNPSN\b\D*(\d{8})\b", page_text, re.I)
        if npsn_match:
            out["NPSN"] = npsn_match.group(1)

    if not out["Akreditasi"]:
        grade_match = re.search(r"Terakreditasi\s*([A-C])\b", page_text, re.I)
        if grade_match:
            out["Akreditasi"] = grade_match.group(1).upper()
        elif re.search(r"Belum\s+Terakreditasi", page_text, re.I):
            out["Akreditasi"] = "Belum Terakreditasi"

    return out


## Master Scraper

In [107]:
def _merge_profile(listing_row: dict, profile_data: dict) -> dict:
    """Overlay profile fields on a listing row without erasing known values."""
    merged = dict(listing_row)
    for column, value in profile_data.items():
        # None means "not found on the profile page": keep whatever the listing
        # already provided, but still create the column when it is new.
        if value is not None or column not in merged:
            merged[column] = value
    return merged


def scrape_sd_smp_sma_jakarta(delay_listing=0.7, delay_profile=0.4):
    """Scrape every configured level/city listing, then enrich rows from profile pages.

    Stage 1 walks all listing pages for each combination of ``LEVELS`` and
    ``CITIES`` and keeps one row per distinct ``profile_url``. Stage 2 fetches
    each profile page and merges its fields into the listing row.

    Profile values take precedence over listing values, except that a profile
    field that was not found (``None``) never overwrites a value the listing
    already supplied. The two sources may therefore use different wording
    for "Akreditasi".

    Args:
        delay_listing: Seconds to sleep after each listing page request.
        delay_profile: Seconds to sleep after each profile page request.

    Returns:
        A DataFrame with one row per school; empty if no listing rows were found.

    Raises:
        Exception: Errors from listing pages propagate unchanged. Errors from
            individual profile pages are reported and that school keeps its
            listing data only.
    """
    listing_rows = []

    print("▶ Stage 1: Listing pages")

    for level in LEVELS:
        for city_name, city_slug in CITIES.items():
            start_url = f"{BASE}/sekolah/{level}/all/jakarta/{city_slug}"
            rows, soup = parse_listing_page(start_url, level, city_name)
            listing_rows.extend(rows)
            # Throttle after the first page as well, so no two listing
            # requests are sent back to back.
            time.sleep(delay_listing)

            for p in range(2, get_max_page(soup) + 1):
                rows, _ = parse_listing_page(f"{start_url}/page/{p}", level, city_name)
                listing_rows.extend(rows)
                time.sleep(delay_listing)

    if not listing_rows:
        # Without rows there is no "profile_url" column to de-duplicate on.
        print("▶ Stage 2: Profile pages (0)")
        return pd.DataFrame()

    df_list = (
        pd.DataFrame(listing_rows)
        .drop_duplicates(subset=["profile_url"])
        .reset_index(drop=True)
    )

    records = df_list.to_dict("records")
    total = len(records)

    print(f"▶ Stage 2: Profile pages ({total})")

    full_rows = []
    failed = 0

    for i, listing_row in enumerate(records):
        url = listing_row["profile_url"]
        try:
            profile_data = parse_school_profile(url)
        except Exception as exc:
            # Best effort: one bad profile must not abort the whole run, but
            # the failure is reported instead of being dropped silently.
            failed += 1
            print(f"  ! Profile failed, keeping listing data only: {url} ({exc!r})")
            profile_data = {}

        full_rows.append(_merge_profile(listing_row, profile_data))

        if i % 100 == 0:
            print(f"  Scraped {i}/{total}")

        time.sleep(delay_profile)

    if failed:
        print(f"  {failed} profile page(s) could not be scraped")

    return pd.DataFrame(full_rows)


## Run Scraper

In [108]:
# Run both scraping stages with the default delays and report the row count.
df = scrape_sd_smp_sma_jakarta()
school_count = len(df)
print("TOTAL SCHOOLS:", school_count)


▶ Stage 1: Listing pages
▶ Stage 2: Profile pages (3794)
  Scraped 0/3794
  Scraped 100/3794
  Scraped 200/3794
  Scraped 300/3794
  Scraped 400/3794
  Scraped 500/3794
  Scraped 600/3794
  Scraped 700/3794
  Scraped 800/3794
  Scraped 900/3794
  Scraped 1000/3794
  Scraped 1100/3794
  Scraped 1200/3794
  Scraped 1300/3794
  Scraped 1400/3794
  Scraped 1500/3794
  Scraped 1600/3794
  Scraped 1700/3794
  Scraped 1800/3794
  Scraped 1900/3794
  Scraped 2000/3794
  Scraped 2100/3794
  Scraped 2200/3794
  Scraped 2300/3794
  Scraped 2400/3794
  Scraped 2500/3794
  Scraped 2600/3794
  Scraped 2700/3794
  Scraped 2800/3794
  Scraped 2900/3794
  Scraped 3000/3794
  Scraped 3100/3794
  Scraped 3200/3794
  Scraped 3300/3794
  Scraped 3400/3794
  Scraped 3500/3794
  Scraped 3600/3794
  Scraped 3700/3794
TOTAL SCHOOLS: 3794


## Clean and Export CSV

In [109]:
# Keep one row per profile URL, then renumber the index from 0.
df = df.drop_duplicates(subset=["profile_url"])
df = df.reset_index(drop=True)

# "utf-8-sig" writes a byte-order mark at the start of the file
# (presumably so spreadsheet tools detect the encoding - confirm).
df.to_csv("sd_smp_sma_jakarta_npsn_akreditasi.csv", encoding="utf-8-sig", index=False)

# Last expression of the cell: shows the first rows as the cell output.
df.head()


Unnamed: 0,NPSN,Nama Sekolah,Jenjang Pendidikan,Kab. / Kota / Negara,Akreditasi,Alamat,profile_url,Status Sekolah,Naungan,Desa / Kelurahan,Kecamatan / Kota (LN),Kab. / Kota / Negara (LN),Provinsi / LN,Kepala Sekolah,Operator,Email,Website,No Telepon,Fax
0,20121008,SD EMBUN PG,SD,Jakarta Timur,A,"Jl Raya Kapin No. 8, Kalimalang, Jakarta Timur...",https://daftarsekolah.net/sekolah/4590/sd-embu...,Swasta,Kementerian Pendidikan dan Kebudayaan,Pondok Kelapa,Kec. Duren Sawit,Kota Jakarta Timur,D.K.I. Jakarta,Yargustiwan,Hafidz D Wahyudi,[email protected],www.embunpagischool.com,0218651578,218602025.0
1,69994898,SDIT Darul Maarif Islamic School 3,SD,Jakarta Timur,,"Jl. Pule No. 27 RT 17/09 Ciracas, Jakarta Timu...",https://daftarsekolah.net/sekolah/3935/sdit-da...,Swasta,Kementerian Pendidikan dan Kebudayaan,Ciracas,Kec. Ciracas,Kota Jakarta Timur,D.K.I. Jakarta,"Nasir, S. Pd. I",Mahmud,,,Belum Tersedia,
2,20104403,SDN Cibubur 01 Pagi,SD,Jakarta Timur,A,Jl. Masjid Fathul Ghofur Rt.002/04 Cibubur Kec...,https://daftarsekolah.net/sekolah/3936/sdn-cib...,Negeri,Kementerian Pendidikan dan Kebudayaan,Cibubur,Kec. Ciracas,Kota Jakarta Timur,D.K.I. Jakarta,"H. Miftahul Hidayat, S. P. D. I",Ambar Nugroho,[email protected],,02187706530,
3,20109086,SD LABORATORIUM JAKARTA,SD,Jakarta Timur,A,Jl. Rawa Jaya No.37 PONDOK KOPI Kec. Duren Saw...,https://daftarsekolah.net/sekolah/4599/sd-labo...,Swasta,Kementerian Pendidikan dan Kebudayaan,PONDOK KOPI,Kec. Duren Sawit,Kota Jakarta Timur,D.K.I. Jakarta,Fieta Faristianty,Vina Putri Faisal,[email protected],sd.laboratorium-jakarta.com,0218646965,2170932596.0
4,69959107,SD GLOBAL MANDIRI,SD,Jakarta Timur,A,JAKARTA GARDEN CITY JL. RAYA BEKASI KM 24 Caku...,https://daftarsekolah.net/sekolah/4872/sd-glob...,Swasta,Kementerian Pendidikan dan Kebudayaan,Cakung Timur,Kec. Cakung,Kota Jakarta Timur,D.K.I. Jakarta,Daryono,Cheppy Parlindungan,[email protected],www.globalmandiri.sch.id,-,81287150060.0


# Save in Excel / .xlsx Format

In [110]:
!pip -q install openpyxl xlsxwriter


In [111]:
# Write the same DataFrame to an .xlsx workbook with the openpyxl engine,
# leaving out the index column.
output_path = "sd_smp_sma_jakarta_npsn_akreditasi.xlsx"
df.to_excel(output_path, engine="openpyxl", index=False)

print(f"✅ Data saved to {output_path}")


✅ Data saved to sd_smp_sma_jakarta_npsn_akreditasi.xlsx
