In [1]:
# jalankan ini di Colab / environment Python
%pip install pymupdf tqdm gensim

import os
import re
import fitz  # pymupdf
import pandas as pd
from tqdm import tqdm
from datetime import datetime

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting gensim
  Downloading gensim-4.4.0-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart_open>=1.8.1->gensim)
  Downloading wrapt-2.0.1-cp311-cp311-win_amd64.whl.metadata (9.2 kB)
Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl (18.4 MB)
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
    --------------------------------------- 0.3/18.4 MB ? eta -:--:--
   - -------------------------------------- 0.8/18.4 MB 1.7 MB/s eta 0:00:11
   -- ------------------------------------- 1.0/18.4 MB 1.4 MB/s eta 0:00:13
   -- ------------------------------------- 1.3/18.4 MB 1.4 MB/s eta 0:00:13
   --- ------


[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: C:\Users\ainun\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [26]:
# path folder yang berisi PDF (sesuaikan jika berbeda)
PDF_DIR = "pdf"   # misal: "/content/pdf" di Colab, atau "./pdf" di lokal
os.makedirs(PDF_DIR, exist_ok=True)

# cek berapa file pdf
pdf_files = [f for f in os.listdir(PDF_DIR) if f.lower().endswith('.pdf')]
print("Jumlah PDF di folder:", len(pdf_files))
pdf_files[:20]


Jumlah PDF di folder: 50


['InfoBencana_bandung_11jan2024.pdf',
 'InfoBencana_bandung_15feb2020.pdf',
 'InfoBencana_bandung_16jan2021.pdf',
 'InfoBencana_bandung_16mar2020.pdf',
 'InfoBencana_bandung_20jun2021.pdf',
 'InfoBencana_bandung_23jan2020.pdf',
 'InfoBencana_bandung_24des2020.pdf',
 'Infobencana_bandung_24mar2021.pdf',
 'InfoBencana_bandung_25nov2021.pdf',
 'InfoBencana_bandung_27nov2021.pdf',
 'InfoBencana_bandung_28feb2020.pdf',
 'InfoBencana_bandung_7feb2020.pdf',
 'InfoBencana_bekasi_14des2024.pdf',
 'InfoBencana_bekasi_20feb2021.pdf',
 'InfoBencana_bekasi_24feb2023.pdf',
 'InfoBencana_bekasi_25feb2020.pdf',
 'InfoBencana_bekasi_28jan2025.pdf',
 'InfoBencana_bekasi_7jul2025.pdf',
 'InfoBencana_bogor_24okto2020.pdf',
 'InfoBencana_bogor_2maret2025.pdf']

In [27]:
def extract_text_from_pdf(path):
    try:
        doc = fitz.open(path)
        texts = []
        for page in doc:
            texts.append(page.get_text("text"))
        doc.close()
        return "\n".join(texts).strip()
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return ""


In [28]:
# contoh nama: InfoBencana_bandung_7feb2020.pdf, info_bencana_karawang_februari2021.pdf, InfoBencana_karawang_10jan2020.pdf
MONTH_MAP = {
    'jan':'01','januari':'01',
    'feb':'02','februari':'02',
    'mar':'03','maret':'03',
    'apr':'04','april':'04',
    'mei':'05',
    'jun':'06','juni':'06',
    'jul':'07','juli':'07',
    'agu':'08','agustus':'08','aug':'08',
    'sep':'09','september':'09',
    'okt':'10','oktober':'10',
    'nov':'11','november':'11',
    'des':'12','desember':'12'
}

def parse_filename(fname):
    base = os.path.splitext(fname)[0].lower()
    # try extract date patterns like 10jan2020, 7feb2020, 10-01-2020, 2020-01-10
    date = None
    # pattern1: ddmonYYYY or dmonYYYY (e.g., 7feb2020, 10jan2020)
    m = re.search(r'(\d{1,2})(?:[\-_ ]?)([a-z]+)(?:[\-_ ]?)(\d{4})', base)
    if m:
        d, mon_raw, y = m.groups()
        mon = MONTH_MAP.get(mon_raw[:3], MONTH_MAP.get(mon_raw, None))
        if mon:
            date = f"{y}-{mon}-{int(d):02d}"
    # pattern2: YYYY-mm-dd or dd-mm-YYYY
    if not date:
        m2 = re.search(r'(\d{4})[^\d](\d{1,2})[^\d](\d{1,2})', base)
        if m2:
            y, mth, d = m2.groups()
            date = f"{y}-{int(mth):02d}-{int(d):02d}"
    # try find place/kabupaten: pick token that is not 'info','bencana','sitrep','laporan'
    # split by underscore/space/hyphen
    tokens = re.split(r'[_\-\s]+', base)
    tokens = [t for t in tokens if t not in ('info','info','infoBencana'.lower(),'bencana','sitrep','laporan','report','pdf')]
    place = None
    if tokens:
        # prefer tokens that are alphabetic and not month words
        for t in tokens:
            if t.isalpha() and t[:3] not in MONTH_MAP.keys():
                place = t
                break
    return place, date


In [29]:
rows = []
for fname in tqdm(pdf_files):
    path = os.path.join(PDF_DIR, fname)
    raw = extract_text_from_pdf(path)
    place, date = parse_filename(fname)
    rows.append({
        "id_pdf": fname,
        "kabupaten_pdf": place,
        "tanggal_pdf": date,
        "raw_text": raw
    })

tabel_pdf_raw = pd.DataFrame(rows)
print("Sampel hasil parsing filename + ringkasan teks:")
display(tabel_pdf_raw.head(10))


100%|██████████| 50/50 [00:03<00:00, 16.47it/s]

Sampel hasil parsing filename + ringkasan teks:





Unnamed: 0,id_pdf,kabupaten_pdf,tanggal_pdf,raw_text
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,"Beranda / Banjir Genangi Bandung, 600 Jiwa Ter..."
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,B\nandung (ANTARA) - Banjir yang melanda sebag...
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,Beranda / Banjir Rendam Empat Kecamatan di Kab...
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,\nAdvertisement\nSurabaya: Hunian Modern\nyan...
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,Beranda / 76 KK Terdampak Banjir Bandang di Ka...
5,InfoBencana_bandung_23jan2020.pdf,bandung,2020-01-23,Beranda / Lima Kecamatan di Kabupaten Bandung ...
6,InfoBencana_bandung_24des2020.pdf,bandung,2020-12-24,Beranda / Sembilan Orang Hilang Pada Peristiwa...
7,Infobencana_bandung_24mar2021.pdf,bandung,2021-03-24,\nSembilan Warga Hilang Akibat Banjir dan Lon...
8,InfoBencana_bandung_25nov2021.pdf,bandung,2021-11-25,"Hujan Deras Melanda Kota Bandung,\nSejumlah Ti..."
9,InfoBencana_bandung_27nov2021.pdf,bandung,2021-11-27,Beranda / Empat Kecamatan di Wilayah Kabupaten...


In [30]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stopw = set(stopwords.words('indonesian'))
extra_noise = {"info","bencana","sitrep","laporan","bnpb","bpbd"}

def clean_text_basic(text):
    if not text or pd.isna(text):
        return ""
    txt = text.lower()
    # Normalize whitespace
    txt = re.sub(r'\s+', ' ', txt)
    # remove URLs
    txt = re.sub(r'http\S+|www\.\S+', ' ', txt)
    # remove punctuation except keep -/ (if needed)
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    # remove isolated short tokens (1-2 chars) except known abbreviations
    tokens = [t for t in txt.split() if len(t) > 2]
    # remove stopwords + extra noise
    tokens = [t for t in tokens if t not in stopw and t not in extra_noise]
    return " ".join(tokens)

tabel_pdf_raw['Teks_Bersih'] = tabel_pdf_raw['raw_text'].apply(clean_text_basic)
tabel_pdf_raw.to_csv("tabel_pdf_raw_extracted.csv", index=False)
print("cleaning selesai. contoh Teks_Bersih:")
display(tabel_pdf_raw[['id_pdf','kabupaten_pdf','tanggal_pdf','Teks_Bersih']].head(10))


cleaning selesai. contoh Teks_Bersih:


[nltk_data] Downloading package stopwords to C:\Users\Rendy Devano
[nltk_data]     Danendr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id_pdf,kabupaten_pdf,tanggal_pdf,Teks_Bersih
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,beranda banjir genangi bandung 600 jiwa terdam...
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,andung banjir melanda wilayah kabupaten bandun...
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,beranda banjir rendam kecamatan kabupaten band...
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,advertisement surabaya hunian modern cocok usi...
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,beranda terdampak banjir bandang kabupaten ban...
5,InfoBencana_bandung_23jan2020.pdf,bandung,2020-01-23,beranda kecamatan kabupaten bandung dilanda ba...
6,InfoBencana_bandung_24des2020.pdf,bandung,2020-12-24,beranda sembilan orang hilang peristiwa banjir...
7,Infobencana_bandung_24mar2021.pdf,bandung,2021-03-24,sembilan warga hilang akibat banjir longsor ba...
8,InfoBencana_bandung_25nov2021.pdf,bandung,2021-11-25,hujan deras melanda kota bandung titik tergena...
9,InfoBencana_bandung_27nov2021.pdf,bandung,2021-11-27,beranda kecamatan wilayah kabupaten bandung te...


In [31]:
from gensim.utils import simple_preprocess

def tokenize(text):
    return simple_preprocess(text, deacc=True)  # deacc removes accents/punct

tabel_pdf_raw['Teks_Tokenized'] = tabel_pdf_raw['Teks_Bersih'].apply(tokenize)
display(tabel_pdf_raw[['id_pdf','Teks_Tokenized']].head(6))


Unnamed: 0,id_pdf,Teks_Tokenized
0,InfoBencana_bandung_11jan2024.pdf,"[beranda, banjir, genangi, bandung, jiwa, terd..."
1,InfoBencana_bandung_15feb2020.pdf,"[andung, banjir, melanda, wilayah, kabupaten, ..."
2,InfoBencana_bandung_16jan2021.pdf,"[beranda, banjir, rendam, kecamatan, kabupaten..."
3,InfoBencana_bandung_16mar2020.pdf,"[advertisement, surabaya, hunian, modern, coco..."
4,InfoBencana_bandung_20jun2021.pdf,"[beranda, terdampak, banjir, bandang, kabupate..."
5,InfoBencana_bandung_23jan2020.pdf,"[beranda, kecamatan, kabupaten, bandung, dilan..."


In [32]:
# kita simpankan kolom id_pdf sebagai identifier sementara
tabel_teks_final = tabel_pdf_raw[['id_pdf','kabupaten_pdf','tanggal_pdf','raw_text','Teks_Bersih','Teks_Tokenized']].copy()
tabel_teks_final.to_csv("tabel_teks_final.csv", index=False)
print("tabel_teks_final dibuat dan disimpan: tabel_teks_final.csv")
tabel_teks_final.head()


tabel_teks_final dibuat dan disimpan: tabel_teks_final.csv


Unnamed: 0,id_pdf,kabupaten_pdf,tanggal_pdf,raw_text,Teks_Bersih,Teks_Tokenized
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,"Beranda / Banjir Genangi Bandung, 600 Jiwa Ter...",beranda banjir genangi bandung 600 jiwa terdam...,"[beranda, banjir, genangi, bandung, jiwa, terd..."
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,B\nandung (ANTARA) - Banjir yang melanda sebag...,andung banjir melanda wilayah kabupaten bandun...,"[andung, banjir, melanda, wilayah, kabupaten, ..."
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,Beranda / Banjir Rendam Empat Kecamatan di Kab...,beranda banjir rendam kecamatan kabupaten band...,"[beranda, banjir, rendam, kecamatan, kabupaten..."
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,\nAdvertisement\nSurabaya: Hunian Modern\nyan...,advertisement surabaya hunian modern cocok usi...,"[advertisement, surabaya, hunian, modern, coco..."
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,Beranda / 76 KK Terdampak Banjir Bandang di Ka...,beranda terdampak banjir bandang kabupaten ban...,"[beranda, terdampak, banjir, bandang, kabupate..."


In [33]:
import pandas as pd

tabel_bencana_cuaca = pd.read_csv("tabel_bencana_cuaca_clean.csv")
print(tabel_bencana_cuaca.head())


                                     id Tanggal___Waktu_Kejadian  Minggu  \
0  fa5b474a-2804-45bd-b267-88e05cc3b663  2025-10-13 15:00:00.177      42   
1  5286a193-f56a-4efb-91d3-d5664ac0a351  2025-10-13 00:30:00.967      42   
2  4ea374b8-6e31-4898-9b61-6b72d714e0d5  2025-09-30 15:30:00.667      40   
3  20563f27-09f7-4403-bc59-0084ed9f0241  2025-09-26 19:30:00.167      39   
4  e4f66518-3149-4ae8-b4f9-23ff30525e26  2025-09-18 09:45:00.714      38   

   Bulan  Tahun  Kode_Provinsi    Provinsi  Kode_Kabupaten    Kabupaten  \
0     10   2025             32  Jawa Barat           32.18  Pangandaran   
1     10   2025             32  Jawa Barat           32.06  Tasikmalaya   
2      9   2025             32  Jawa Barat           32.01        Bogor   
3      9   2025             32  Jawa Barat           32.01        Bogor   
4      9   2025             32  Jawa Barat           32.15     Karawang   

   Kode_Jenis_Kejadian  ... Rumah_Rusak_Ringan Rumah_Terendam  \
0                 1010  ...

In [36]:
tabel_bencana_cuaca['Tanggal_Kejadian_Key'] = (
    pd.to_datetime(tabel_bencana_cuaca['Tanggal___Waktu_Kejadian'])
    .dt.date
    .astype(str)
)
print(tabel_bencana_cuaca.head())

                                     id Tanggal___Waktu_Kejadian  Minggu  \
0  fa5b474a-2804-45bd-b267-88e05cc3b663  2025-10-13 15:00:00.177      42   
1  5286a193-f56a-4efb-91d3-d5664ac0a351  2025-10-13 00:30:00.967      42   
2  4ea374b8-6e31-4898-9b61-6b72d714e0d5  2025-09-30 15:30:00.667      40   
3  20563f27-09f7-4403-bc59-0084ed9f0241  2025-09-26 19:30:00.167      39   
4  e4f66518-3149-4ae8-b4f9-23ff30525e26  2025-09-18 09:45:00.714      38   

   Bulan  Tahun  Kode_Provinsi    Provinsi  Kode_Kabupaten    Kabupaten  \
0     10   2025             32  Jawa Barat           32.18  Pangandaran   
1     10   2025             32  Jawa Barat           32.06  Tasikmalaya   
2      9   2025             32  Jawa Barat           32.01        Bogor   
3      9   2025             32  Jawa Barat           32.01        Bogor   
4      9   2025             32  Jawa Barat           32.15     Karawang   

   Kode_Jenis_Kejadian  ... Rumah_Rusak_Ringan Rumah_Terendam  \
0                 1010  ...

In [38]:
# contoh: buat kolom tanggal_key pada tabel_bencana_cuaca
tabel_bencana_cuaca['Tanggal_Kejadian_Key'] = pd.to_datetime(tabel_bencana_cuaca['Tanggal___Waktu_Kejadian']).dt.date.astype(str)
# pada tabel_teks_final pastikan tanggal_pdf dalam format YYYY-MM-DD string
tabel_teks_final['tanggal_pdf'] = pd.to_datetime(tabel_teks_final['tanggal_pdf'], errors='coerce').dt.date.astype(str)

# coba merge berdasar kabupaten (lower-case) + tanggal
tabel_bencana_cuaca['kab_lower'] = tabel_bencana_cuaca['Kabupaten'].str.lower().str.strip()
tabel_teks_final['kab_lower'] = tabel_teks_final['kabupaten_pdf'].str.lower().str.strip()

merged = pd.merge(
    tabel_bencana_cuaca,
    tabel_teks_final,
    left_on=['kab_lower','Tanggal_Kejadian_Key'],
    right_on=['kab_lower','tanggal_pdf'],
    how='left'
)
print("Baris dengan teks ter-merge:", merged['Teks_Bersih'].notna().sum())
merged[['id','Tanggal_Kejadian_Key','kab_lower','id_pdf','Teks_Bersih']].head()


Baris dengan teks ter-merge: 50


Unnamed: 0,id,Tanggal_Kejadian_Key,kab_lower,id_pdf,Teks_Bersih
0,fa5b474a-2804-45bd-b267-88e05cc3b663,2025-10-13,pangandaran,,
1,5286a193-f56a-4efb-91d3-d5664ac0a351,2025-10-13,tasikmalaya,,
2,4ea374b8-6e31-4898-9b61-6b72d714e0d5,2025-09-30,bogor,,
3,20563f27-09f7-4403-bc59-0084ed9f0241,2025-09-26,bogor,,
4,e4f66518-3149-4ae8-b4f9-23ff30525e26,2025-09-18,karawang,,


In [42]:
Kunci_Kebutuhan = ["logistik","selimut","air bersih","makanan siap saji","alat kebersihan","sekop"]
Kunci_Infrastruktur = ["jembatan","jalan terputus","terisolir","fasilitas ibadah","mushola","jembatan putus","jalan putus"]

def flag_keywords(text, keyword_list):
    if not text or pd.isna(text):
        return 0
    t = text.lower()
    for kw in keyword_list:
        if kw in t:
            return 1
    return 0

tabel_teks_final['Fitur_Butuh_Logistik'] = tabel_teks_final['Teks_Bersih'].apply(lambda x: flag_keywords(x, Kunci_Kebutuhan))
tabel_teks_final['Fitur_Infrastruktur_Rusak'] = tabel_teks_final['Teks_Bersih'].apply(lambda x: flag_keywords(x, Kunci_Infrastruktur))

tabel_teks_final.to_csv("tabel_teks_final_with_flags.csv", index=False)
display(tabel_teks_final[['id_pdf','Fitur_Butuh_Logistik','Fitur_Infrastruktur_Rusak']].head(52))


Unnamed: 0,id_pdf,Fitur_Butuh_Logistik,Fitur_Infrastruktur_Rusak
0,InfoBencana_bandung_11jan2024.pdf,1,0
1,InfoBencana_bandung_15feb2020.pdf,1,1
2,InfoBencana_bandung_16jan2021.pdf,0,0
3,InfoBencana_bandung_16mar2020.pdf,0,0
4,InfoBencana_bandung_20jun2021.pdf,0,0
5,InfoBencana_bandung_23jan2020.pdf,1,0
6,InfoBencana_bandung_24des2020.pdf,1,0
7,Infobencana_bandung_24mar2021.pdf,1,1
8,InfoBencana_bandung_25nov2021.pdf,0,0
9,InfoBencana_bandung_27nov2021.pdf,0,1


In [44]:
import pandas as pd

pdf_mapping = pd.read_csv("pdf_mapping.csv")
print(pdf_mapping.head(50))


                                          id;filename
0   eee63243-5005-4ad4-9506-05d1771f737d;InfoBenca...
1   d9a79ec3-9dad-42e9-bbe7-a5866067a5c0;InfoBenca...
2   764cd4a6-c47f-4eb2-9a3c-1c520992808b;InfoBenca...
3   0ea7a9d9-74f4-4ef4-b96d-d1b85bc6abb4;info_benc...
4   165587a2-130e-4da3-b7db-0bac4804a63a;info_benc...
5   0380c116-20b5-4e15-ba6e-f2e4f7d9ed69;info_benc...
6   2102fd81-93c9-47ad-9cea-e09dd8c3fb50;info_benc...
7   5619370b-059e-4e47-bce9-8db8f32311ef;info_benc...
8   4d2add39-f367-4d01-b423-2e7f824a97ec;info_benc...
9   d3d1714d-862b-4d96-b55f-8c7f28ac8915;info_benc...
10  e5eaa81c-1cfd-49c7-a5c6-19b429fc03ec;info_benc...
11  2e8e70c3-a118-4023-a091-a55da6eba33f;info_benc...
12  fcaa14bd-1519-4bfb-87ad-32eaabbab921;info_benc...
13  c337acfc-d62c-4e73-a7c2-1f947e0ccea1;info_benc...
14  3fe185f8-f4fe-407c-b6de-3b1fd6ca13b7;info_benc...
15  be731dfb-7d2d-4d50-a429-f64a9c425f94;info_benc...
16  7e6c24f8-b78e-40f1-a7c7-2e47875e6958;info_benc...
17  33cf0cf4-800a-4a51-bf2a-

In [45]:
df_teks = pd.read_csv("tabel_teks_final.csv")
df_dibi = pd.read_csv("tabel_bencana_cuaca_clean.csv")

df_teks.head(5)

Unnamed: 0,id_pdf,kabupaten_pdf,tanggal_pdf,raw_text,Teks_Bersih,Teks_Tokenized
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,"Beranda / Banjir Genangi Bandung, 600 Jiwa Ter...",beranda banjir genangi bandung 600 jiwa terdam...,"['beranda', 'banjir', 'genangi', 'bandung', 'j..."
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,B\nandung (ANTARA) - Banjir yang melanda sebag...,andung banjir melanda wilayah kabupaten bandun...,"['andung', 'banjir', 'melanda', 'wilayah', 'ka..."
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,Beranda / Banjir Rendam Empat Kecamatan di Kab...,beranda banjir rendam kecamatan kabupaten band...,"['beranda', 'banjir', 'rendam', 'kecamatan', '..."
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,\nAdvertisement\nSurabaya: Hunian Modern\nyan...,advertisement surabaya hunian modern cocok usi...,"['advertisement', 'surabaya', 'hunian', 'moder..."
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,Beranda / 76 KK Terdampak Banjir Bandang di Ka...,beranda terdampak banjir bandang kabupaten ban...,"['beranda', 'terdampak', 'banjir', 'bandang', ..."


In [46]:
df_dibi.head(5)

Unnamed: 0,id,Tanggal___Waktu_Kejadian,Minggu,Bulan,Tahun,Kode_Provinsi,Provinsi,Kode_Kabupaten,Kabupaten,Kode_Jenis_Kejadian,...,Rumah_Rusak_Ringan,Rumah_Terendam,Satuan_Pendidikan_Rusak,Rumah_Ibadat_Rusak,Fasilitas_Pelayanan_Kesehatan_Rusak,Kantor_Rusak,Jembatan_Rusak,Tanggal,Tanggal_Kejadian_Key,kabupaten_lower
0,fa5b474a-2804-45bd-b267-88e05cc3b663,2025-10-13 15:00:00.177,42,10,2025,32,Jawa Barat,32.18,Pangandaran,1010,...,0,457,0,0,0,0,0,2025-10-13 15:00:00.177,2025-10-13,pangandaran
1,5286a193-f56a-4efb-91d3-d5664ac0a351,2025-10-13 00:30:00.967,42,10,2025,32,Jawa Barat,32.06,Tasikmalaya,1010,...,0,19,0,0,0,0,0,2025-10-13 00:30:00.967,2025-10-13,tasikmalaya
2,4ea374b8-6e31-4898-9b61-6b72d714e0d5,2025-09-30 15:30:00.667,40,9,2025,32,Jawa Barat,32.01,Bogor,1010,...,0,40,0,0,0,0,0,2025-09-30 15:30:00.667,2025-09-30,bogor
3,20563f27-09f7-4403-bc59-0084ed9f0241,2025-09-26 19:30:00.167,39,9,2025,32,Jawa Barat,32.01,Bogor,1020,...,7,0,0,0,0,0,0,2025-09-26 19:30:00.167,2025-09-26,bogor
4,e4f66518-3149-4ae8-b4f9-23ff30525e26,2025-09-18 09:45:00.714,38,9,2025,32,Jawa Barat,32.15,Karawang,1010,...,0,45,0,0,0,0,0,2025-09-18 09:45:00.714,2025-09-18,karawang


In [47]:
import pandas as pd

# Load file
teks = pd.read_csv("tabel_teks_final.csv")
bencana = pd.read_csv("tabel_bencana_cuaca_clean.csv")

print("Jumlah PDF:", len(teks))
print("Jumlah baris DIBI:", len(bencana))

# --- Normalisasi kolom kabupaten ---
teks['kabupaten_pdf'] = teks['kabupaten_pdf'].astype(str).str.lower().str.strip()
bencana['kabupaten_lower'] = bencana['kabupaten_lower'].astype(str).str.lower().str.strip()

# --- Normalisasi tanggal ---
teks['tanggal_pdf'] = pd.to_datetime(teks['tanggal_pdf'], errors='coerce')
bencana['Tanggal_Kejadian_Key'] = pd.to_datetime(bencana['Tanggal_Kejadian_Key'], errors='coerce')

# Cek nilai unik kabupaten PDF vs DIBI
print("Kabupaten pdf unik:", teks['kabupaten_pdf'].unique())
print("Kabupaten DIBI unik contoh:", bencana['kabupaten_lower'].unique()[:20])


Jumlah PDF: 50
Jumlah baris DIBI: 2507
Kabupaten pdf unik: ['bandung' 'bekasi' 'bogor' 'cianjur' 'garut' 'indramayu' 'karawang'
 'subang' 'sukabumi']
Kabupaten DIBI unik contoh: ['pangandaran' 'tasikmalaya' 'bogor' 'karawang' 'bandung barat' 'sukabumi'
 'garut' 'cianjur' 'kota cimahi' 'bekasi' 'indramayu' 'bandung' 'subang'
 'kota bandung' 'kuningan' 'sumedang' 'purwakarta' 'kota tasikmalaya']


In [48]:
from tqdm import tqdm

matches = []

for i, row in teks.iterrows():
    kab = row['kabupaten_pdf']
    tgl = row['tanggal_pdf']
    
    # Filter berdasarkan kabupaten dulu
    subset = bencana[bencana['kabupaten_lower'] == kab]
    
    if len(subset) == 0:
        matches.append({
            'id_pdf': row['id_pdf'],
            'kabupaten_pdf': kab,
            'tanggal_pdf': tgl,
            'match_status': 'NO_KABUPATEN_MATCH',
            'id_dibi': None
        })
        continue
    
    # Filter tanggal ± 2 hari
    subset2 = subset[
        (subset['Tanggal_Kejadian_Key'] >= tgl - pd.Timedelta(days=2)) &
        (subset['Tanggal_Kejadian_Key'] <= tgl + pd.Timedelta(days=2))
    ]
    
    if len(subset2) == 0:
        matches.append({
            'id_pdf': row['id_pdf'],
            'kabupaten_pdf': kab,
            'tanggal_pdf': tgl,
            'match_status': 'NO_DATE_MATCH',
            'id_dibi': None
        })
        continue
    
    # Ambil tanggal paling dekat
    subset2['diff'] = (subset2['Tanggal_Kejadian_Key'] - tgl).abs()
    best_match = subset2.sort_values('diff').iloc[0]
    
    matches.append({
        'id_pdf': row['id_pdf'],
        'kabupaten_pdf': kab,
        'tanggal_pdf': tgl,
        'match_status': 'OK',
        'id_dibi': best_match['id']
    })

tabel_match = pd.DataFrame(matches)
tabel_match.to_csv("pdf_dibi_match.csv", index=False)
tabel_match.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset2['diff'] = (subset2['Tanggal_Kejadian_Key'] - tgl).abs()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset2['diff'] = (subset2['Tanggal_Kejadian_Key'] - tgl).abs()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset2['diff'] = (subset2['Tanggal_Kejadian_Key'] - tgl).abs()
A value is tr

Unnamed: 0,id_pdf,kabupaten_pdf,tanggal_pdf,match_status,id_dibi
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,OK,0bed2012-92ab-438d-9d40-933a1dd5f4e5
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,OK,764cd4a6-c47f-4eb2-9a3c-1c520992808b
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,OK,27bf1148-b9b3-47ab-8d9c-6ecf7058594e
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,OK,8b6e6695-aad2-4803-9a60-60e979094720
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,OK,b0fae1f1-0915-43f5-a05f-7235b27013aa


In [49]:
master = tabel_match.merge(
    bencana,
    left_on='id_dibi',
    right_on='id',
    how='left'
).merge(
    teks,
    on='id_pdf',
    how='left'
)

master.to_csv("tabel_analitik_master.csv", index=False)
master.head()


Unnamed: 0,id_pdf,kabupaten_pdf_x,tanggal_pdf_x,match_status,id_dibi,id,Tanggal___Waktu_Kejadian,Minggu,Bulan,Tahun,...,Kantor_Rusak,Jembatan_Rusak,Tanggal,Tanggal_Kejadian_Key,kabupaten_lower,kabupaten_pdf_y,tanggal_pdf_y,raw_text,Teks_Bersih,Teks_Tokenized
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,OK,0bed2012-92ab-438d-9d40-933a1dd5f4e5,0bed2012-92ab-438d-9d40-933a1dd5f4e5,2024-01-11 18:45:00.000,2.0,1.0,2024.0,...,0.0,0.0,2024-01-11 18:45:00.000,2024-01-11,bandung,bandung,2024-01-11,"Beranda / Banjir Genangi Bandung, 600 Jiwa Ter...",beranda banjir genangi bandung 600 jiwa terdam...,"['beranda', 'banjir', 'genangi', 'bandung', 'j..."
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,OK,764cd4a6-c47f-4eb2-9a3c-1c520992808b,764cd4a6-c47f-4eb2-9a3c-1c520992808b,2020-02-15 00:01:00.000,7.0,2.0,2020.0,...,0.0,0.0,2020-02-15 00:01:00.000,2020-02-15,bandung,bandung,2020-02-15,B\nandung (ANTARA) - Banjir yang melanda sebag...,andung banjir melanda wilayah kabupaten bandun...,"['andung', 'banjir', 'melanda', 'wilayah', 'ka..."
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,OK,27bf1148-b9b3-47ab-8d9c-6ecf7058594e,27bf1148-b9b3-47ab-8d9c-6ecf7058594e,2021-01-16 20:00:00.000,2.0,1.0,2021.0,...,0.0,0.0,2021-01-16 20:00:00.000,2021-01-16,bandung,bandung,2021-01-16,Beranda / Banjir Rendam Empat Kecamatan di Kab...,beranda banjir rendam kecamatan kabupaten band...,"['beranda', 'banjir', 'rendam', 'kecamatan', '..."
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,OK,8b6e6695-aad2-4803-9a60-60e979094720,8b6e6695-aad2-4803-9a60-60e979094720,2020-03-16 00:01:00.000,12.0,3.0,2020.0,...,0.0,0.0,2020-03-16 00:01:00.000,2020-03-16,bandung,bandung,2020-03-16,\nAdvertisement\nSurabaya: Hunian Modern\nyan...,advertisement surabaya hunian modern cocok usi...,"['advertisement', 'surabaya', 'hunian', 'moder..."
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,OK,b0fae1f1-0915-43f5-a05f-7235b27013aa,b0fae1f1-0915-43f5-a05f-7235b27013aa,2021-06-20 12:00:00.000,24.0,6.0,2021.0,...,0.0,0.0,2021-06-20 12:00:00.000,2021-06-20,bandung,bandung,2021-06-20,Beranda / 76 KK Terdampak Banjir Bandang di Ka...,beranda terdampak banjir bandang kabupaten ban...,"['beranda', 'terdampak', 'banjir', 'bandang', ..."


In [50]:
df_master = pd.read_csv("tabel_analitik_master.csv")
df_pdf = pd.read_csv("pdf_dibi_match.csv")

In [51]:
df_pdf.head(10)

Unnamed: 0,id_pdf,kabupaten_pdf,tanggal_pdf,match_status,id_dibi
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,OK,0bed2012-92ab-438d-9d40-933a1dd5f4e5
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,OK,764cd4a6-c47f-4eb2-9a3c-1c520992808b
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,OK,27bf1148-b9b3-47ab-8d9c-6ecf7058594e
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,OK,8b6e6695-aad2-4803-9a60-60e979094720
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,OK,b0fae1f1-0915-43f5-a05f-7235b27013aa
5,InfoBencana_bandung_23jan2020.pdf,bandung,2020-01-23,OK,eb714d4e-adf7-4c0a-9a33-22a2a948aff5
6,InfoBencana_bandung_24des2020.pdf,bandung,2020-12-24,OK,3d26b483-3e90-48c4-8e80-74b63111c0ec
7,Infobencana_bandung_24mar2021.pdf,bandung,2021-03-24,OK,ddd875dd-8059-4d1c-9036-cc958c5cfce9
8,InfoBencana_bandung_25nov2021.pdf,bandung,2021-11-25,OK,d1bb7cdd-a262-4660-800b-1289f642191e
9,InfoBencana_bandung_27nov2021.pdf,bandung,2021-11-27,OK,8b3af6ea-5313-4180-a48d-2afb2340028d


In [52]:
df_master.head(10)

Unnamed: 0,id_pdf,kabupaten_pdf_x,tanggal_pdf_x,match_status,id_dibi,id,Tanggal___Waktu_Kejadian,Minggu,Bulan,Tahun,...,Kantor_Rusak,Jembatan_Rusak,Tanggal,Tanggal_Kejadian_Key,kabupaten_lower,kabupaten_pdf_y,tanggal_pdf_y,raw_text,Teks_Bersih,Teks_Tokenized
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,OK,0bed2012-92ab-438d-9d40-933a1dd5f4e5,0bed2012-92ab-438d-9d40-933a1dd5f4e5,2024-01-11 18:45:00.000,2.0,1.0,2024.0,...,0.0,0.0,2024-01-11 18:45:00.000,2024-01-11,bandung,bandung,2024-01-11,"Beranda / Banjir Genangi Bandung, 600 Jiwa Ter...",beranda banjir genangi bandung 600 jiwa terdam...,"['beranda', 'banjir', 'genangi', 'bandung', 'j..."
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,OK,764cd4a6-c47f-4eb2-9a3c-1c520992808b,764cd4a6-c47f-4eb2-9a3c-1c520992808b,2020-02-15 00:01:00.000,7.0,2.0,2020.0,...,0.0,0.0,2020-02-15 00:01:00.000,2020-02-15,bandung,bandung,2020-02-15,B\nandung (ANTARA) - Banjir yang melanda sebag...,andung banjir melanda wilayah kabupaten bandun...,"['andung', 'banjir', 'melanda', 'wilayah', 'ka..."
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,OK,27bf1148-b9b3-47ab-8d9c-6ecf7058594e,27bf1148-b9b3-47ab-8d9c-6ecf7058594e,2021-01-16 20:00:00.000,2.0,1.0,2021.0,...,0.0,0.0,2021-01-16 20:00:00.000,2021-01-16,bandung,bandung,2021-01-16,Beranda / Banjir Rendam Empat Kecamatan di Kab...,beranda banjir rendam kecamatan kabupaten band...,"['beranda', 'banjir', 'rendam', 'kecamatan', '..."
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,OK,8b6e6695-aad2-4803-9a60-60e979094720,8b6e6695-aad2-4803-9a60-60e979094720,2020-03-16 00:01:00.000,12.0,3.0,2020.0,...,0.0,0.0,2020-03-16 00:01:00.000,2020-03-16,bandung,bandung,2020-03-16,\nAdvertisement\nSurabaya: Hunian Modern\nyan...,advertisement surabaya hunian modern cocok usi...,"['advertisement', 'surabaya', 'hunian', 'moder..."
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,OK,b0fae1f1-0915-43f5-a05f-7235b27013aa,b0fae1f1-0915-43f5-a05f-7235b27013aa,2021-06-20 12:00:00.000,24.0,6.0,2021.0,...,0.0,0.0,2021-06-20 12:00:00.000,2021-06-20,bandung,bandung,2021-06-20,Beranda / 76 KK Terdampak Banjir Bandang di Ka...,beranda terdampak banjir bandang kabupaten ban...,"['beranda', 'terdampak', 'banjir', 'bandang', ..."
5,InfoBencana_bandung_23jan2020.pdf,bandung,2020-01-23,OK,eb714d4e-adf7-4c0a-9a33-22a2a948aff5,eb714d4e-adf7-4c0a-9a33-22a2a948aff5,2020-01-23 20:30:00.000,4.0,1.0,2020.0,...,0.0,0.0,2020-01-23 20:30:00.000,2020-01-23,bandung,bandung,2020-01-23,Beranda / Lima Kecamatan di Kabupaten Bandung ...,beranda kecamatan kabupaten bandung dilanda ba...,"['beranda', 'kecamatan', 'kabupaten', 'bandung..."
6,InfoBencana_bandung_24des2020.pdf,bandung,2020-12-24,OK,3d26b483-3e90-48c4-8e80-74b63111c0ec,3d26b483-3e90-48c4-8e80-74b63111c0ec,2020-12-24 19:20:00.000,52.0,12.0,2020.0,...,0.0,0.0,2020-12-24 19:20:00.000,2020-12-24,bandung,bandung,2020-12-24,Beranda / Sembilan Orang Hilang Pada Peristiwa...,beranda sembilan orang hilang peristiwa banjir...,"['beranda', 'sembilan', 'orang', 'hilang', 'pe..."
7,Infobencana_bandung_24mar2021.pdf,bandung,2021-03-24,OK,ddd875dd-8059-4d1c-9036-cc958c5cfce9,ddd875dd-8059-4d1c-9036-cc958c5cfce9,2021-03-24 15:00:00.000,12.0,3.0,2021.0,...,0.0,0.0,2021-03-24 15:00:00.000,2021-03-24,bandung,bandung,2021-03-24,\nSembilan Warga Hilang Akibat Banjir dan Lon...,sembilan warga hilang akibat banjir longsor ba...,"['sembilan', 'warga', 'hilang', 'akibat', 'ban..."
8,InfoBencana_bandung_25nov2021.pdf,bandung,2021-11-25,OK,d1bb7cdd-a262-4660-800b-1289f642191e,d1bb7cdd-a262-4660-800b-1289f642191e,2021-11-25 19:00:00.000,47.0,11.0,2021.0,...,0.0,0.0,2021-11-25 19:00:00.000,2021-11-25,bandung,bandung,2021-11-25,"Hujan Deras Melanda Kota Bandung,\nSejumlah Ti...",hujan deras melanda kota bandung titik tergena...,"['hujan', 'deras', 'melanda', 'kota', 'bandung..."
9,InfoBencana_bandung_27nov2021.pdf,bandung,2021-11-27,OK,8b3af6ea-5313-4180-a48d-2afb2340028d,8b3af6ea-5313-4180-a48d-2afb2340028d,2021-11-27 21:40:00.000,47.0,11.0,2021.0,...,0.0,0.0,2021-11-27 21:40:00.000,2021-11-27,bandung,bandung,2021-11-27,Beranda / Empat Kecamatan di Wilayah Kabupaten...,beranda kecamatan wilayah kabupaten bandung te...,"['beranda', 'kecamatan', 'wilayah', 'kabupaten..."


In [53]:
import pandas as pd
import numpy as np
import re

# coba load tabel_analitik_master jika sudah ada, kalau tidak buat dari dua file yang sudah kamu punya
try:
    master = pd.read_csv("tabel_analitik_master.csv")
    print("Loaded tabel_analitik_master.csv")
except FileNotFoundError:
    print("tabel_analitik_master.csv not found — akan coba gabungkan dari tabel_bencana_cuaca_clean.csv + tabel_teks_final.csv")
    bencana = pd.read_csv("tabel_bencana_cuaca_clean.csv")
    teks = pd.read_csv("tabel_teks_final.csv")
    # jika teks punya kolom id_pdf dan bencana punya id, gunakan matching file yang sudah kamu buat (pdf_dibi_match.csv)
    try:
        match = pd.read_csv("pdf_dibi_match.csv")
        # gabungkan: match -> bencana -> teks
        merged = match.merge(bencana, left_on="id_dibi", right_on="id", how="left").merge(teks, on="id_pdf", how="left")
        master = merged.copy()
        print("Merged from pdf_dibi_match.csv + bencana + teks")
    except FileNotFoundError:
        # fallback: try join by kabupaten + tanggal if present (less robust)
        print("pdf_dibi_match.csv not found — fallback simple merge by kabupaten+date (may be imperfect).")
        bencana['kab_lower'] = bencana['kabupaten_lower'].astype(str).str.lower().str.strip()
        teks['kab_lower'] = teks['kabupaten_pdf'].astype(str).str.lower().str.strip()
        bencana['Tanggal_Kejadian_Key'] = pd.to_datetime(bencana['Tanggal_Kejadian_Key'], errors='coerce').dt.date
        teks['tanggal_pdf'] = pd.to_datetime(teks['tanggal_pdf'], errors='coerce').dt.date
        master = pd.merge(bencana, teks, left_on=['kab_lower','Tanggal_Kejadian_Key'], right_on=['kab_lower','tanggal_pdf'], how='left')
        print("Fallback merge done (check results).")

print("master shape:", master.shape)
master.head(2)


Loaded tabel_analitik_master.csv
master shape: (50, 41)


Unnamed: 0,id_pdf,kabupaten_pdf_x,tanggal_pdf_x,match_status,id_dibi,id,Tanggal___Waktu_Kejadian,Minggu,Bulan,Tahun,...,Kantor_Rusak,Jembatan_Rusak,Tanggal,Tanggal_Kejadian_Key,kabupaten_lower,kabupaten_pdf_y,tanggal_pdf_y,raw_text,Teks_Bersih,Teks_Tokenized
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,OK,0bed2012-92ab-438d-9d40-933a1dd5f4e5,0bed2012-92ab-438d-9d40-933a1dd5f4e5,2024-01-11 18:45:00.000,2.0,1.0,2024.0,...,0.0,0.0,2024-01-11 18:45:00.000,2024-01-11,bandung,bandung,2024-01-11,"Beranda / Banjir Genangi Bandung, 600 Jiwa Ter...",beranda banjir genangi bandung 600 jiwa terdam...,"['beranda', 'banjir', 'genangi', 'bandung', 'j..."
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,OK,764cd4a6-c47f-4eb2-9a3c-1c520992808b,764cd4a6-c47f-4eb2-9a3c-1c520992808b,2020-02-15 00:01:00.000,7.0,2.0,2020.0,...,0.0,0.0,2020-02-15 00:01:00.000,2020-02-15,bandung,bandung,2020-02-15,B\nandung (ANTARA) - Banjir yang melanda sebag...,andung banjir melanda wilayah kabupaten bandun...,"['andung', 'banjir', 'melanda', 'wilayah', 'ka..."


In [54]:
# kamus dasar — kamu bisa tambahkan kata lain sesuai kebutuhan
KAMUS_LOGISTIK = [
    "logistik","air bersih","airbersih","selimut","makanan","makanan siap saji","makanan_siap_saji",
    "obat","airminum","air minum","makanan instan","matras","tenda","evakuasi","evakuasi massal","evakuasi mandiri"
]
KAMUS_INFRA = [
    "jembatan","jalan putus","jalan terputus","terisolir","jaringan listrik putus","listrik padam",
    "sungai meluap","terendam","jembatan putus","jembatan rusak","tanah longsor","longsor","jalan rusak","akses terganggu"
]
KAMUS_KEBUTUHAN = [
    "kebutuhan mendesak","butuh","membutuhkan","darurat","tanggap darurat"
]

def contains_any(text, keywords):
    if not isinstance(text, str) or text.strip()=="":
        return 0
    t = text.lower()
    for kw in keywords:
        if kw in t:
            return 1
    return 0


In [55]:
# buat kolom fitur biner
master['butuh_logistik'] = master['Teks_Bersih'].apply(lambda x: contains_any(x, KAMUS_LOGISTIK))
master['infrastruktur_rusak_flag'] = master['Teks_Bersih'].apply(lambda x: contains_any(x, KAMUS_INFRA))
master['kebutuhan_umum_flag'] = master['Teks_Bersih'].apply(lambda x: contains_any(x, KAMUS_KEBUTUHAN))

# summary cepat
print("Butuh logistik:", master['butuh_logistik'].sum())
print("Infra rusak flag:", master['infrastruktur_rusak_flag'].sum())
print("Kebutuhan umum flag:", master['kebutuhan_umum_flag'].sum())


Butuh logistik: 35
Infra rusak flag: 43
Kebutuhan umum flag: 24


In [None]:
def extract_number_patterns(text):
    results = {
        "jumlah_kk": np.nan,
        "jumlah_jiwa": np.nan,
        "jumlah_rumah": np.nan,
        "jumlah_orang": np.nan
    }

    if not isinstance(text, str):
        return results

    text = text.lower()
    
    patterns = {
        "jumlah_kk": r"(\d+)\s*kk",
        "jumlah_jiwa": r"(\d+)\s*jiwa",
        "jumlah_rumah": r"(\d+)\s*rumah",
        "jumlah_orang": r"(\d+)\s*orang"
    }
    
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            results[key] = int(match.group(1))
    
    return results

# Terapkan ke teks
numbers_extracted = master['Teks_Bersih'].apply(extract_number_patterns)

# Expand dictionary ke kolom dataframe
num_df = pd.DataFrame(numbers_extracted.tolist())
master = pd.concat([master, num_df], axis=1)

print("Ekstraksi angka selesai!")
master[['Teks_Bersih','jumlah_kk','jumlah_jiwa','jumlah_rumah','jumlah_orang']].head(5)


Ekstraksi angka selesai!


Unnamed: 0,Teks_Bersih,jumlah_kk,jumlah_jiwa,jumlah_rumah,jumlah_orang
0,beranda banjir genangi bandung 600 jiwa terdam...,,600.0,600.0,
1,andung banjir melanda wilayah kabupaten bandun...,,,1880.0,
2,beranda banjir rendam kecamatan kabupaten band...,,,361.0,
3,advertisement surabaya hunian modern cocok usi...,,10739.0,2638.0,
4,beranda terdampak banjir bandang kabupaten ban...,,,,


In [57]:
output_path = "tabel_analitik_master_enriched.csv"
master.to_csv(output_path, index=False)

print("File saved to:", output_path)


File saved to: tabel_analitik_master_enriched.csv


In [58]:
print("\n===== SUMMARY FLAG =====")
print(master[['butuh_logistik','infrastruktur_rusak_flag','kebutuhan_umum_flag']].sum())

print("\n===== SAMPLE ROWS YANG ADA FLAG =====")
print(master[master['butuh_logistik']==1].head(3)[['Teks_Bersih','butuh_logistik']])
print(master[master['infrastruktur_rusak_flag']==1].head(3)[['Teks_Bersih','infrastruktur_rusak_flag']])



===== SUMMARY FLAG =====
butuh_logistik              35
infrastruktur_rusak_flag    43
kebutuhan_umum_flag         24
dtype: int64

===== SAMPLE ROWS YANG ADA FLAG =====
                                         Teks_Bersih  butuh_logistik
0  beranda banjir genangi bandung 600 jiwa terdam...               1
1  andung banjir melanda wilayah kabupaten bandun...               1
2  beranda banjir rendam kecamatan kabupaten band...               1
                                         Teks_Bersih  infrastruktur_rusak_flag
1  andung banjir melanda wilayah kabupaten bandun...                         1
3  advertisement surabaya hunian modern cocok usi...                         1
4  beranda terdampak banjir bandang kabupaten ban...                         1


In [60]:
# fungsi ekstraksi angka korban/rumah/kk/jiwa/pengungsi
def extract_number_patterns(text):
    if not isinstance(text, str) or text.strip()=="":
        return np.nan

    pattern = r"(\d+)\s*(kk|jiwa|orang|pengungsi|rumah|kepala keluarga)"
    matches = re.findall(pattern, text.lower())
    if not matches:
        return np.nan

    result = []
    for num, unit in matches:
        result.append(f"{num} {unit}")
    return "; ".join(result)

master['angka_korban_extracted'] = master['Teks_Bersih'].apply(extract_number_patterns)

print(master[['Teks_Bersih','angka_korban_extracted']].head())


                                         Teks_Bersih  \
0  beranda banjir genangi bandung 600 jiwa terdam...   
1  andung banjir melanda wilayah kabupaten bandun...   
2  beranda banjir rendam kecamatan kabupaten band...   
3  advertisement surabaya hunian modern cocok usi...   
4  beranda terdampak banjir bandang kabupaten ban...   

                              angka_korban_extracted  
0  600 jiwa; 600 jiwa; 600 jiwa; 600 rumah; 150 jiwa  
1                   1413 kepala keluarga; 1880 rumah  
2                     237 kepala keluarga; 361 rumah  
3  2638 rumah; 2638 rumah; 2638 rumah; 2638 rumah...  
4                                                NaN  


In [61]:
output_path = "tabel_analitik_master_enriched1.csv"
master.to_csv(output_path, index=False)
print("Saved to:", output_path)


Saved to: tabel_analitik_master_enriched1.csv


In [62]:
print("\n===== Summary Fitur Baru =====")
print(master[['butuh_logistik','infrastruktur_rusak_flag','kebutuhan_umum_flag']].sum())

print("\n===== Contoh 10 Laporan yang Butuh Logistik =====")
print(master[master['butuh_logistik'] == 1][['id_pdf','Teks_Bersih']].head(10))

print("\n===== Contoh 10 Laporan dengan Angka Korban =====")
print(master[master['angka_korban_extracted'].notna()][['id_pdf','angka_korban_extracted']].head(10))



===== Summary Fitur Baru =====
butuh_logistik              35
infrastruktur_rusak_flag    43
kebutuhan_umum_flag         24
dtype: int64

===== Contoh 10 Laporan yang Butuh Logistik =====
                               id_pdf  \
0   InfoBencana_bandung_11jan2024.pdf   
1   InfoBencana_bandung_15feb2020.pdf   
2   InfoBencana_bandung_16jan2021.pdf   
5   InfoBencana_bandung_23jan2020.pdf   
6   InfoBencana_bandung_24des2020.pdf   
7   Infobencana_bandung_24mar2021.pdf   
8   InfoBencana_bandung_25nov2021.pdf   
11   InfoBencana_bandung_7feb2020.pdf   
13   InfoBencana_bekasi_20feb2021.pdf   
14   InfoBencana_bekasi_24feb2023.pdf   

                                          Teks_Bersih  
0   beranda banjir genangi bandung 600 jiwa terdam...  
1   andung banjir melanda wilayah kabupaten bandun...  
2   beranda banjir rendam kecamatan kabupaten band...  
5   beranda kecamatan kabupaten bandung dilanda ba...  
6   beranda sembilan orang hilang peristiwa banjir...  
7   sembilan warga hila

In [64]:
!pip install --quiet gensim nltk

import os, re, ast
import pandas as pd
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess


In [63]:
# Ganti path kalau beda
candidates = [
    "tabel_analitik_master_enriched1.csv",
    "tabel_analitik_master.csv",
    "tabel_analitik_master.csv",
    "tabel_teks_final.csv"
]

for p in candidates:
    if os.path.exists(p):
        file_path = p
        break
else:
    raise FileNotFoundError("Tidak menemukan file master/enriched di /mnt/data. Upload dulu atau sesuaikan path.")

print("Menggunakan file:", file_path)
df = pd.read_csv(file_path)
print("Shape:", df.shape)
df.columns


Menggunakan file: tabel_analitik_master_enriched1.csv
Shape: (50, 49)


Index(['id_pdf', 'kabupaten_pdf_x', 'tanggal_pdf_x', 'match_status', 'id_dibi',
       'id', 'Tanggal___Waktu_Kejadian', 'Minggu', 'Bulan', 'Tahun',
       'Kode_Provinsi', 'Provinsi', 'Kode_Kabupaten', 'Kabupaten',
       'Kode_Jenis_Kejadian', 'Nama_Kejadian', 'Jenis_Bencana', 'is_bencana',
       'Jumlah_Kejadian', 'Meninggal', 'Hilang', 'Luka___Sakit', 'Menderita',
       'Mengungsi', 'Rumah_Rusak_Berat', 'Rumah_Rusak_Sedang',
       'Rumah_Rusak_Ringan', 'Rumah_Terendam', 'Satuan_Pendidikan_Rusak',
       'Rumah_Ibadat_Rusak', 'Fasilitas_Pelayanan_Kesehatan_Rusak',
       'Kantor_Rusak', 'Jembatan_Rusak', 'Tanggal', 'Tanggal_Kejadian_Key',
       'kabupaten_lower', 'kabupaten_pdf_y', 'tanggal_pdf_y', 'raw_text',
       'Teks_Bersih', 'Teks_Tokenized', 'butuh_logistik',
       'infrastruktur_rusak_flag', 'kebutuhan_umum_flag', 'jumlah_kk',
       'jumlah_jiwa', 'jumlah_rumah', 'jumlah_orang',
       'angka_korban_extracted'],
      dtype='object')

In [65]:
# Tentukan kolom token atau buat dari Teks_Bersih
if 'Teks_Tokenized' in df.columns:
    token_col = 'Teks_Tokenized'
elif 'Teks_Tokenized_clean' in df.columns:
    token_col = 'Teks_Tokenized_clean'
elif 'Teks_Bersih' in df.columns:
    token_col = None
else:
    raise ValueError("Tidak menemukan kolom Teks_Tokenized atau Teks_Bersih di file. Cek nama kolom.")

# fungsi bantu konversi
def ensure_list(x):
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    s = str(x).strip()
    if s.startswith('[') and s.endswith(']'):
        try:
            val = ast.literal_eval(s)
            if isinstance(val, list):
                return [str(t) for t in val]
        except Exception:
            pass
    # fallback: tokenize Teks_Bersih if x is raw text
    return simple_preprocess(s, deacc=True)

if token_col is None:
    df['Teks_Tokenized_clean'] = df['Teks_Bersih'].fillna("").apply(lambda x: simple_preprocess(str(x), deacc=True))
    token_col = 'Teks_Tokenized_clean'
else:
    df['Teks_Tokenized_clean'] = df[token_col].apply(ensure_list)

# dokumen yang punya token
df_lda = df[df['Teks_Tokenized_clean'].apply(len) > 0].copy()
print("Documents with tokens:", len(df_lda), "of", len(df))


Documents with tokens: 50 of 50


In [66]:
texts = df_lda['Teks_Tokenized_clean'].tolist()
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=2, no_above=0.5)   # sesuaikan no_below jika dokumen sedikit
corpus = [dictionary.doc2bow(t) for t in texts]
print("Tokens (vocab size):", len(dictionary))


Tokens (vocab size): 1852


In [67]:
NUM_TOPICS = 7
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=15, random_state=42, eval_every=None)
print("LDA trained. Top topics (words):")
for idx, topic in lda.print_topics(num_words=12):
    print(f"Topic {idx}: {topic}")


LDA trained. Top topics (words):
Topic 0: 0.082*"bandung" + 0.019*"dayeuhkolot" + 0.014*"baleendah" + 0.013*"raya" + 0.012*"bekasi" + 0.010*"bojongsoang" + 0.010*"sentimeter" + 0.009*"deras" + 0.008*"citarum" + 0.008*"ketinggian" + 0.008*"tma" + 0.008*"tergenang"
Topic 1: 0.030*"longsor" + 0.021*"kejadian" + 0.021*"tanah" + 0.018*"korban" + 0.012*"unit" + 0.011*"mengalami" + 0.010*"rusak" + 0.009*"gambar" + 0.009*"provinsi" + 0.009*"meninggal" + 0.008*"tim" + 0.008*"mei"
Topic 2: 0.081*"bekasi" + 0.030*"titik" + 0.011*"pusat" + 0.011*"meter" + 0.010*"evakuasi" + 0.010*"cikarang" + 0.009*"margahayu" + 0.009*"sekolah" + 0.008*"korban" + 0.007*"sabtu" + 0.007*"meluap" + 0.006*"senin"
Topic 3: 0.022*"karawang" + 0.016*"kompascom" + 0.016*"advertisement" + 0.013*"bogor" + 0.012*"new" + 0.010*"longsor" + 0.010*"komentar" + 0.008*"news" + 0.007*"iklan" + 0.007*"travel" + 0.007*"video" + 0.007*"meter"
Topic 4: 0.018*"subang" + 0.016*"bandung" + 0.014*"sumedang" + 0.013*"rusak" + 0.013*"cikeruh

In [68]:
coh = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
coh_score = coh.get_coherence()
print("Coherence (c_v):", coh_score)


Coherence (c_v): 0.40761846265310414


In [69]:
# buat mapping doc->topic
def get_doc_topic(bow):
    topics = lda.get_document_topics(bow)
    if not topics:
        return -1, 0.0, "No Topic"
    topics = sorted(topics, key=lambda x: x[1], reverse=True)
    tnum, prop = topics[0]
    wp = lda.show_topic(tnum, topn=10)
    keywords = ", ".join([w for w,_ in wp])
    return int(tnum), float(prop), keywords

doc_topics = [get_doc_topic(b) for b in corpus]
topic_df = pd.DataFrame(doc_topics, columns=['Dominant_Topic','Perc_Contribution','Topic_Keywords'], index=df_lda.index)

# merge ke df asli
df_out = df.copy()
df_out[['Dominant_Topic','Perc_Contribution','Topic_Keywords']] = (df_out
    .join(topic_df[['Dominant_Topic','Perc_Contribution','Topic_Keywords']], how='left')
    [['Dominant_Topic','Perc_Contribution','Topic_Keywords']]
)

df_out['Dominant_Topic'] = df_out['Dominant_Topic'].fillna(-1).astype(int)
df_out['Perc_Contribution'] = df_out['Perc_Contribution'].fillna(0.0)
df_out['Topic_Keywords'] = df_out['Topic_Keywords'].fillna('No Topic')

out_path = "tabel_analitik_master_with_topics.csv"
df_out.to_csv(out_path, index=False)
print("Saved file:", out_path)


Saved file: tabel_analitik_master_with_topics.csv


In [71]:
print("Topic distribution:")
print(df_out['Dominant_Topic'].value_counts().sort_index())

# contoh 3 dokumen tiap topik
for t in sorted([x for x in df_out['Dominant_Topic'].unique() if x!=-1]):
    print("\n--- Topic", t, "---")
    display(df_out[df_out['Dominant_Topic']==t][['id_pdf','Perc_Contribution','Topic_Keywords','Teks_Bersih']].head(10))


Topic distribution:
Dominant_Topic
0    13
1    12
2     4
3     7
4     4
5     6
6     4
Name: count, dtype: int64

--- Topic 0 ---


Unnamed: 0,id_pdf,Perc_Contribution,Topic_Keywords,Teks_Bersih
1,InfoBencana_bandung_15feb2020.pdf,0.991499,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",andung banjir melanda wilayah kabupaten bandun...
5,InfoBencana_bandung_23jan2020.pdf,0.786918,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",beranda kecamatan kabupaten bandung dilanda ba...
8,InfoBencana_bandung_25nov2021.pdf,0.996617,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",hujan deras melanda kota bandung titik tergena...
9,InfoBencana_bandung_27nov2021.pdf,0.497881,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",beranda kecamatan wilayah kabupaten bandung te...
11,InfoBencana_bandung_7feb2020.pdf,0.992195,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",beranda banjir rendam enam kecamatan kabupaten...
12,InfoBencana_bekasi_14des2024.pdf,0.987447,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",beranda bekasi bekasi banjir rob rendam desa m...
18,InfoBencana_bogor_24okto2020.pdf,0.880167,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",home peristiwa diguyur hujan sore ratusan ruma...
33,info_bencana_bandung_feb2020.pdf,0.994943,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",english terkini terpopuler top news pilihan ed...
34,info_bencana_bandung_januari2020_2.pdf,0.995306,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",english terkini terpopuler top news pilihan ed...
35,info_bencana_bandung_maret2020.pdf,0.995225,"bandung, dayeuhkolot, baleendah, raya, bekasi,...",beranda banjir rendam 9285 rumah kabupaten ban...



--- Topic 1 ---


Unnamed: 0,id_pdf,Perc_Contribution,Topic_Keywords,Teks_Bersih
0,InfoBencana_bandung_11jan2024.pdf,0.78707,"longsor, kejadian, tanah, korban, unit, mengal...",beranda banjir genangi bandung 600 jiwa terdam...
2,InfoBencana_bandung_16jan2021.pdf,0.67547,"longsor, kejadian, tanah, korban, unit, mengal...",beranda banjir rendam kecamatan kabupaten band...
4,InfoBencana_bandung_20jun2021.pdf,0.872369,"longsor, kejadian, tanah, korban, unit, mengal...",beranda terdampak banjir bandang kabupaten ban...
6,InfoBencana_bandung_24des2020.pdf,0.681555,"longsor, kejadian, tanah, korban, unit, mengal...",beranda sembilan orang hilang peristiwa banjir...
7,Infobencana_bandung_24mar2021.pdf,0.715672,"longsor, kejadian, tanah, korban, unit, mengal...",sembilan warga hilang akibat banjir longsor ba...
16,InfoBencana_bekasi_28jan2025.pdf,0.982669,"longsor, kejadian, tanah, korban, unit, mengal...",beranda perkembangan situasi penanganan tangga...
20,InfoBencana_cianjur_3des2024.pdf,0.995385,"longsor, kejadian, tanah, korban, unit, mengal...",beranda update banjir tanah longsor kabupaten ...
22,InfoBencana_indramayu_13des2024.pdf,0.899671,"longsor, kejadian, tanah, korban, unit, mengal...",beranda banjir rob terjang kabupaten indramayu...
24,InfoBencana_indramayu_7feb2021.pdf,0.997033,"longsor, kejadian, tanah, korban, unit, mengal...",beranda sungai meluap sebabkan banjir kabupate...
30,InfoBencana_subang_7feb2021.pdf,0.658266,"longsor, kejadian, tanah, korban, unit, mengal...",beranda banjir berangsur surut warga subang ru...



--- Topic 2 ---


Unnamed: 0,id_pdf,Perc_Contribution,Topic_Keywords,Teks_Bersih
13,InfoBencana_bekasi_20feb2021.pdf,0.992468,"bekasi, titik, pusat, meter, evakuasi, cikaran...",redaksi kontak rabu november 2025 pencarian vi...
14,InfoBencana_bekasi_24feb2023.pdf,0.993181,"bekasi, titik, pusat, meter, evakuasi, cikaran...",berita olahraga gaya hidup ekonomi hukum berit...
15,InfoBencana_bekasi_25feb2020.pdf,0.995729,"bekasi, titik, pusat, meter, evakuasi, cikaran...",kegiatan banjir kota bekasi sebabkan titik ban...
17,InfoBencana_bekasi_7jul2025.pdf,0.824602,"bekasi, titik, pusat, meter, evakuasi, cikaran...",scroll membaca berita penyebab tambang ilegal ...



--- Topic 3 ---


Unnamed: 0,id_pdf,Perc_Contribution,Topic_Keywords,Teks_Bersih
19,InfoBencana_bogor_2maret2025.pdf,0.998409,"karawang, kompascom, advertisement, bogor, new...",baca berita iklan gabung kompascom banjir long...
26,InfoBencana_karawang_1jan2020.pdf,0.99771,"karawang, kompascom, advertisement, bogor, new...",baca berita iklan gabung kompascom 423 rumah k...
28,InfoBencana_karawang_4mar2025.pdf,0.59366,"karawang, kompascom, advertisement, bogor, new...",advertisement advertisement kecamatan karawang...
43,info_bencana_bogor_januari2020.pdf,0.99785,"karawang, kompascom, advertisement, bogor, new...",baca berita iklan gabung kompascom banjir long...
47,info_bencana_karawang_februari2023_2.pdf,0.99793,"karawang, kompascom, advertisement, bogor, new...",baca berita iklan gabung kompascom kecamatan t...
48,info_bencana_karawang_februari2023_3.pdf,0.96299,"karawang, kompascom, advertisement, bogor, new...",install baca artikel idn times idn app home ne...
49,info_bencana_subang_februari2021.pdf,0.997752,"karawang, kompascom, advertisement, bogor, new...",baca berita iklan gabung kompascom orang tewas...



--- Topic 4 ---


Unnamed: 0,id_pdf,Perc_Contribution,Topic_Keywords,Teks_Bersih
3,InfoBencana_bandung_16mar2020.pdf,0.995146,"subang, bandung, sumedang, rusak, cikeruh, feb...",advertisement surabaya hunian modern cocok usi...
10,InfoBencana_bandung_28feb2020.pdf,0.995039,"subang, bandung, sumedang, rusak, cikeruh, feb...",scroll membaca berita foto meluapnya sungai ci...
23,InfoBencana_indramayu_6des2020.pdf,0.992598,"subang, bandung, sumedang, rusak, cikeruh, feb...",advertisement cirebon purwakarta kab bandung k...
29,InfoBencana_subang_27feb2023.pdf,0.995171,"subang, bandung, sumedang, rusak, cikeruh, feb...",intensitas hujan kecamatan subang terendam air...



--- Topic 5 ---


Unnamed: 0,id_pdf,Perc_Contribution,Topic_Keywords,Teks_Bersih
21,InfoBencana_garut_15jul2022.pdf,0.99781,"karawang, news, garut, ang, bola, bandung, ban...",terpopuler koleksi pilihan detikbali berita fa...
27,InfoBencana_karawang_27feb2023.pdf,0.997677,"karawang, news, garut, ang, bola, bandung, ban...",explore cantonese flavors curated chinese teas...
32,info_bencana_bandung_10januari2021.pdf,0.944146,"karawang, news, garut, ang, bola, bandung, ban...",advertisement temukan berita lingkungan kota b...
44,info_bencana_karawang_februari2020.pdf,0.995908,"karawang, news, garut, ang, bola, bandung, ban...",karawang menetapkan status tanggap darurat ala...
45,info_bencana_karawang_februari2021 (1).pdf,0.998088,"karawang, news, garut, ang, bola, bandung, ban...",adsmart terpopuler daerah hoax not suara pemba...
46,info_bencana_karawang_februari2021.pdf,0.998088,"karawang, news, garut, ang, bola, bandung, ban...",adsmart terpopuler daerah hoax not suara pemba...



--- Topic 6 ---


Unnamed: 0,id_pdf,Perc_Contribution,Topic_Keywords,Teks_Bersih
25,InfoBencana_karawang_10jan2020.pdf,0.995807,"bekasi, ribuan, infrastruktur, nov, rusak, par...",home jawa barat banjir terjang karawang ribuan...
36,info_bencana_bandung_maret2020_3.pdf,0.997552,"bekasi, ribuan, infrastruktur, nov, rusak, par...",home regional jawa barat bandung dikepung banj...
41,info_bencana_bekasi_februari2021.pdf,0.562843,"bekasi, ribuan, infrastruktur, nov, rusak, par...",redaksi kontak selasa november 2025 pencarian ...
42,info_bencana_bekasi_maret2025.pdf,0.998558,"bekasi, ribuan, infrastruktur, nov, rusak, par...",home news peristiwa akibat banjir bekasi ribua...


In [72]:
extra_stopwords = {
    "beranda", "scroll", "advertisement", "iklan", "baca",
    "home", "news", "update", "artikel", "english", "foto",
    "times", "idn", "kompascom", "detikbali", "bola",
    "kab", "kec", "kecamatan", "kabupaten",
    "terpopuler", "pilihan", "editor", "lainnya",
    "info", "bencana", "bpbd", "bnpb", "banj"
}
stopw = stopw.union(extra_stopwords)  # update stopwords utama


In [75]:
tabel_pdf_raw['Teks_Bersih'] = tabel_pdf_raw['raw_text'].apply(clean_text_basic)
tabel_pdf_raw['Teks_Tokenized'] = tabel_pdf_raw['Teks_Bersih'].apply(tokenize)

# simpan lagi hasil baru
tabel_pdf_raw.to_csv("tabel_pdf_raw_extracted_v2.csv", index=False)
print("Cleaning ulang selesai.")


Cleaning ulang selesai.


In [74]:
tabel_pdf_raw.head(60)

Unnamed: 0,id_pdf,kabupaten_pdf,tanggal_pdf,raw_text,Teks_Bersih,Teks_Tokenized
0,InfoBencana_bandung_11jan2024.pdf,bandung,2024-01-11,"Beranda / Banjir Genangi Bandung, 600 Jiwa Ter...",banjir genangi bandung 600 jiwa terdampak tim ...,"[banjir, genangi, bandung, jiwa, terdampak, ti..."
1,InfoBencana_bandung_15feb2020.pdf,bandung,2020-02-15,B\nandung (ANTARA) - Banjir yang melanda sebag...,andung banjir melanda wilayah bandung jawa bar...,"[andung, banjir, melanda, wilayah, bandung, ja..."
2,InfoBencana_bandung_16jan2021.pdf,bandung,2021-01-16,Beranda / Banjir Rendam Empat Kecamatan di Kab...,banjir rendam bandung puluhan warga mengungsi ...,"[banjir, rendam, bandung, puluhan, warga, meng..."
3,InfoBencana_bandung_16mar2020.pdf,bandung,2020-03-16,\nAdvertisement\nSurabaya: Hunian Modern\nyan...,surabaya hunian modern cocok usia emas telusur...,"[surabaya, hunian, modern, cocok, usia, emas, ..."
4,InfoBencana_bandung_20jun2021.pdf,bandung,2021-06-20,Beranda / 76 KK Terdampak Banjir Bandang di Ka...,terdampak banjir bandang bandung kondisi rumah...,"[terdampak, banjir, bandang, bandung, kondisi,..."
5,InfoBencana_bandung_23jan2020.pdf,bandung,2020-01-23,Beranda / Lima Kecamatan di Kabupaten Bandung ...,bandung dilanda banjir bandung dilanda banjir ...,"[bandung, dilanda, banjir, bandung, dilanda, b..."
6,InfoBencana_bandung_24des2020.pdf,bandung,2020-12-24,Beranda / Sembilan Orang Hilang Pada Peristiwa...,sembilan orang hilang peristiwa banjir bandang...,"[sembilan, orang, hilang, peristiwa, banjir, b..."
7,Infobencana_bandung_24mar2021.pdf,bandung,2021-03-24,\nSembilan Warga Hilang Akibat Banjir dan Lon...,sembilan warga hilang akibat banjir longsor ba...,"[sembilan, warga, hilang, akibat, banjir, long..."
8,InfoBencana_bandung_25nov2021.pdf,bandung,2021-11-25,"Hujan Deras Melanda Kota Bandung,\nSejumlah Ti...",hujan deras melanda kota bandung titik tergena...,"[hujan, deras, melanda, kota, bandung, titik, ..."
9,InfoBencana_bandung_27nov2021.pdf,bandung,2021-11-27,Beranda / Empat Kecamatan di Wilayah Kabupaten...,wilayah bandung terdampak banjir korban jiwa b...,"[wilayah, bandung, terdampak, banjir, korban, ..."
