In [25]:
from collections import Counter
import re, pandas as pd, json, os, pathlib
from collections import defaultdict
from pathlib import Path
from collections import Counter


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords_id = set(stopwords.words('indonesian'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aufii\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# -------------------------------------------------
# 0) KONFIGURASI INPUT / OUTPUT
# -------------------------------------------------
DESC_PATH = "data/tokopedia_500.csv"   
REV_PATH  = "data/reviews_only.csv" 
COL_DESC  = "description"
COL_REVIEW= "review"

OUT_DIR = Path("reports")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [27]:
# -------------------------------
# 1) LOAD CSV
# -------------------------------
def try_load_csv(path, encoding=None):
    try:
        return pd.read_csv(path, encoding=encoding)
    except FileNotFoundError:
        print(f"[WARN] File tidak ditemukan: {path}. Menghasilkan DataFrame kosong.")
        return pd.DataFrame()
    except Exception as e:
        print(f"[WARN] Gagal membaca {path}: {e}. Menghasilkan DataFrame kosong.")
        return pd.DataFrame()

df_desc    = try_load_csv(DESC_PATH)                # biasanya UTF-8 default
df_reviews = try_load_csv(REV_PATH, encoding="utf-8")

In [28]:
df_desc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3972 entries, 0 to 3971
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    3972 non-null   int64  
 1   product_name  3972 non-null   object 
 2   url           3972 non-null   object 
 3   description   3868 non-null   object 
 4   price         3972 non-null   int64  
 5   sold_count    3969 non-null   float64
 6   rating        3380 non-null   float64
 7   total_stock   3969 non-null   float64
 8   category      3972 non-null   object 
 9   keyword       3972 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 310.4+ KB


In [29]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1676 non-null   object
dtypes: object(1)
memory usage: 13.2+ KB


In [30]:
# -------------------------------------------------
# 2) GABUNGKAN KE SATU DATAFRAME KERJA
#     -> pre-clean diterapkan ke SEMUA TEKS
# -------------------------------------------------
frames = []
if not df_desc.empty:
    frames.append(pd.DataFrame({"source":"description", "text": df_desc[COL_DESC].astype(str)}))
if not df_reviews.empty:
    frames.append(pd.DataFrame({"source":"review", "text": df_reviews[COL_REVIEW].astype(str)}))

df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=["source","text"])
print(f"Rows description: {len(df_desc)}; rows reviews: {len(df_reviews)}; rows working: {len(df)}")

Rows description: 3972; rows reviews: 1676; rows working: 5648


In [31]:
# -------------------------------------------------
# 3) PRE-CLEAN (UNTUK SEMUA TEKS)
# -------------------------------------------------
TAG_RE = re.compile(r'<[^>]+>')
SPACES_RE = re.compile(r'\s+')
DIGIT_SEP_RE = re.compile(r'(?<=\d)[\s\.\-]+(?=\d)')  # hapus spasi/titik/strip antar digit

def clean_html_spaces(s: str) -> str:
    s = TAG_RE.sub(' ', s)
    s = SPACES_RE.sub(' ', s)
    return s.strip()

def preclean_digit_runs(s: str) -> str:
    return DIGIT_SEP_RE.sub('', s)

if not df.empty:
    df["text_clean"]  = df["text"].astype(str).map(clean_html_spaces)
    df["text_digpre"] = df["text_clean"].map(preclean_digit_runs)

In [32]:
raw = "NIK saya 3276 1207 0501 0003 atau 3276-1207-0501-0003"
print("Raw:", raw)

# Bersihin digit-run
cleaned = preclean_digit_runs(raw)
print("Cleaned:", cleaned)

# Cari kandidat KTP/NIK v3
PATTERN_KTP = r'(?<!\d)(1[1-9]|21|[37][1-6]|5[1-3]|6[1-5]|[89][12])\d{2}\d{2}([04][1-9]|[1256][0-9]|[37][01])(0[1-9]|1[0-2])\d{2}\d{4}(?!\d)'
matches = re.findall(PATTERN_KTP, cleaned)
print("Matches:", matches)


Raw: NIK saya 3276 1207 0501 0003 atau 3276-1207-0501-0003
Cleaned: NIK saya 3276120705010003 atau 3276120705010003
Matches: [('32', '07', '05'), ('32', '07', '05')]


## Regex for IDs (KTP) ans Address

In [33]:
df_desc = df_desc[['description']].dropna().rename(columns={'description':'text'})
print("Total baris:", len(df_desc))

Total baris: 3868


define regex umum nik (IDs)

In [34]:
PATTERN_KTP = r'(?<!\d)(1[1-9]|21|[37][1-6]|5[1-3]|6[1-5]|[89][12])\d{2}\d{2}([04][1-9]|[1256][0-9]|[37][01])(0[1-9]|1[0-2])\d{2}\d{4}(?!\d)'
PAT_KTP = re.compile(PATTERN_KTP)


define regex baseline Address (alamat)

In [35]:
# Alamat Indonesia (versi ringan)
PATTERN_ADDR_LIGHT = (
    r'\b(?:Jl\.?|Jalan)\s+[A-Za-z0-9 .\-]+'   # Jl/Jalan + nama jalan
    r'(?:No\.?\s*\d+)?'                       # No 12
    r'(?:\s*RT\s*\d{1,2}\s*RW\s*\d{1,2})?'    # RT/RW
)
PAT_ADDR_LIGHT   = re.compile(PATTERN_ADDR_LIGHT, flags=re.IGNORECASE)
PAT_ADDR_TRIGGER = re.compile(r'\b(Jl\.?|Jalan)\b', flags=re.IGNORECASE)

In [36]:
# -------------------------------------------------
# 5) EKSTRAKSI KANDIDAT (KTP & ALAMAT)
# -------------------------------------------------
ktp_rows = []
addr_span_rows = []
addr_window_rows = []

for i, row in df.iterrows():
    txt_clean  = row["text_clean"]
    txt_digpre = row["text_digpre"]

    # KTP/NIK (gunakan text_digpre)
    for m in PAT_KTP.finditer(txt_digpre):
        token = m.group()
        idx = txt_clean.find(token)
        left = txt_clean[max(0, idx-30): idx] if idx >= 0 else ""
        right = txt_clean[idx+len(token): idx+len(token)+30] if idx >= 0 else ""
        ktp_rows.append({"row": i, "source": row["source"], "match": token, "left": left, "right": right})

    # Alamat (span langsung)
    for sp in PAT_ADDR_LIGHT.finditer(txt_clean):
        addr_span_rows.append({"row": i, "source": row["source"], "span": sp.group()})

    # Alamat (window untuk EDA pola)
    for t in PAT_ADDR_TRIGGER.finditer(txt_clean):
        start = max(0, t.start()-60)
        end   = min(len(txt_clean), t.end()+120)
        addr_window_rows.append({"row": i, "source": row["source"], "window": txt_clean[start:end]})

df_ktp          = pd.DataFrame(ktp_rows)
df_addr_spans   = pd.DataFrame(addr_span_rows)
df_addr_windows = pd.DataFrame(addr_window_rows)

print("KTP candidates:", len(df_ktp), "| Address spans:", len(df_addr_spans), "| Address windows:", len(df_addr_windows))


KTP candidates: 0 | Address spans: 79 | Address windows: 135


In [37]:
# -------------------------------------------------
# 6) EDA: TOP SEQUENCE SETELAH 'Jl/Jalan'
# -------------------------------------------------
def tokens_after_trigger(window: str, n_after=6):
    toks = re.findall(r'[A-Za-z0-9]+', window)
    idxs = [i for i, t in enumerate(toks) if t.lower() in ('jl','jalan')]
    results = []
    for idx in idxs:
        after = toks[idx+1: idx+1+n_after]
        if after:
            results.append(tuple(after))
    return results

seq_counter = Counter()
for w in df_addr_windows.get("window", []):
    for seq in tokens_after_trigger(w, n_after=6):
        seq_counter[seq] += 1

df_addr_topseq = pd.DataFrame(
    [{"sequence": " ".join(seq), "count": cnt} for seq, cnt in seq_counter.most_common(30)]
)

In [38]:
df_addr_topseq

Unnamed: 0,sequence,count
0,Raya Kuta Tuban No 7 Jl,5
1,Raya Sesetan No 277 Jl Raya,5
2,Raya Canggu Tibubeneng No 17 Jl,5
3,Teuku Umar No 76 Jl Raya,4
4,jalan dengan berbagai pilihan warna yang,4
5,dengan berbagai pilihan warna yang mudah,4
6,jalan ke mall jalan jalan ke,4
7,ke mall jalan jalan ke pantai,4
8,jalan ke pantai dan pasti nya,4
9,ke pantai dan pasti nya cocok,4


In [39]:
# -------------------------------------------------
# 8) OPSIONAL: TAMPILKAN SAMPEL CEPAT
# -------------------------------------------------
# print("\n[Sample KTP matches]")
# print(df_ktp.head(10))

print("\n[Sample Address spans]")
print(df_addr_spans.head(10))

# print("\n[Top Address sequences after Jl/Jalan]")
# print(df_addr_topseq.head(10))


[Sample Address spans]
   row       source  \
0  163  description   
1  210  description   
2  210  description   
3  210  description   
4  210  description   
5  210  description   
6  250  description   
7  379  description   
8  379  description   
9  618  description   

                                                                                                                                                                                                                                                                         span  
0                                                                                                                                                                                                                      JL MT HARYONO NO 44 KM4 DERETAN FITNESS FITCORE BATU 4  
1  jalan tol. Keuntungan menggunakan kartu e toll adalah tidak perlu membawa uang tunai dan dapat digunakan untuk pembayaran tol dan parkir di jalan tol. Produk ini baru dan dija

oke, buat fungsi dan pipeline lain untuk cek regex alamat, masi banyak yang kelewat

In [40]:
# 1) Trigger window (biar admin-only juga keambil)
PAT_ADDR_TRIGGER = re.compile(
    r'\b(Jl\.?|Jalan|Gg\.?|Gang|RT|RW|Kel(?:urahan)?\.?|Kec(?:amatan)?\.?|Kab(?:upaten)?\.?|Kota|Prov(?:insi)?\.?|K\.?\s?P\.?|KP)\b',
    re.IGNORECASE
)

# 2) Komponen (PERKETAT: wajib ada isi setelah label)
PAT_STREET  = re.compile(r'\b(?:Jl\.?|Jalan|Gg\.?|Gang)\b\s+[A-Z0-9][A-Za-z0-9 .\-]*', re.IGNORECASE)

# House number: No./No/Nomor/Nomer/Nmr/# + isi (angka/huruf)
PAT_HOUSE   = re.compile(r'\b(?:No\.?|No|Nomor|Nomer|Nmr|#)\s*\w+\b', re.IGNORECASE)

# RT/RW: wajib angka keduanya
PAT_RTRW    = re.compile(r'\bRT\s*\d{1,2}\s*RW\s*\d{1,2}\b', re.IGNORECASE)

# Kel/Kec/Kab/Kota/Prov: wajib ada NAMA sesudah label (minimal satu huruf)
PAT_KEL     = re.compile(r'\bKel(?:urahan)?\.?\s+[A-Za-z][A-Za-z .\-]*\b', re.IGNORECASE)
PAT_KEC     = re.compile(r'\bKec(?:amatan)?\.?\s+[A-Za-z][A-Za-z .\-]*\b', re.IGNORECASE)
PAT_KABKOTA = re.compile(r'\b(?:Kab(?:upaten)?|Kota)\.?\s+[A-Za-z][A-Za-z .\-]*\b', re.IGNORECASE)
PAT_PROV    = re.compile(r'\bProv(?:insi)?\.?\s+[A-Za-z][A-Za-z .\-]*\b', re.IGNORECASE)

# ZIP: tetap umum, nanti divalidasi konteksnya
PAT_ZIP     = re.compile(r'\b\d{5}\b')

# PO BOX
PAT_POBOX   = re.compile(r'\b(?:Kotak\s+Pos|K\.?\s?P\.?|KP)\s*\d+\b', re.IGNORECASE)

In [41]:
# Normalisasi koma/titik → spasi
PUNCT_TO_SPACE = re.compile(r'[,\.;:]+')
def norm_span(s: str) -> str:
    return PUNCT_TO_SPACE.sub(' ', s)

# ---- Tokenizer sederhana (alnum + underscore) ----
TOKEN_RE = re.compile(r'\w+')

def tokens_with_index(text: str):
    """Return list of (token, start, end) untuk text."""
    return [(m.group(), m.start(), m.end()) for m in TOKEN_RE.finditer(text)]

# ---- Cari span komponen & konversi ke indeks token ----
ADMIN_KEYS = ["house","rtrw","kel","kec","kabkota","prov"]  # street tidak dihitung admin
MAX_TOKEN_GAP = 4  # atur sesuai data: semakin kecil semakin ketat


In [42]:
def find_component_spans(text: str):
    t = norm_span(text)
    spans = {
        "street":  [m.span() for m in PAT_STREET.finditer(t)],
        "house":   [m.span() for m in PAT_HOUSE.finditer(t)],
        "rtrw":    [m.span() for m in PAT_RTRW.finditer(t)],
        "kel":     [m.span() for m in PAT_KEL.finditer(t)],
        "kec":     [m.span() for m in PAT_KEC.finditer(t)],
        "kabkota": [m.span() for m in PAT_KABKOTA.finditer(t)],
        "prov":    [m.span() for m in PAT_PROV.finditer(t)],
        "zip":     [m.span() for m in PAT_ZIP.finditer(t)],
        "pobox":   [m.span() for m in PAT_POBOX.finditer(t)],
    }
    return t, spans

def char_pos_to_token_idx(tokens, pos):
    """Map posisi karakter -> index token terdekat (leftmost covering, else token pertama di kanan)."""
    for i, (_, s, e) in enumerate(tokens):
        if s <= pos < e:
            return i
    for i, (_, s, e) in enumerate(tokens):
        if pos < s:
            return i
    return len(tokens) - 1 if tokens else 0

def admin_token_indices(text_norm: str, spans: dict):
    toks = tokens_with_index(text_norm)
    idxs = []
    for key in ADMIN_KEYS:
        for st, en in spans[key]:
            idxs.append(char_pos_to_token_idx(toks, st))
    return sorted(set(idxs))

def is_valid_address_token_proximity(text: str, max_token_gap: int = MAX_TOKEN_GAP) -> bool:
    t, spans = find_component_spans(text)

    # PO BOX → auto-accept
    if spans["pobox"]:
        return True

    # ❌ filter: RT/RW tanpa angka
    if re.search(r'\bRT\s*/?\s*RW\b', t, re.I):
        return False

    # ❌ filter: Kelurahan/Kecamatan tanpa nama
    # di dalam is_valid_address_token_proximity, sebelum cek token gap:
    if re.search(r'\bRT\s*/?\s*RW\b', t, re.I): 
        return False  # RT/RW tanpa angka

    if re.search(r'\bKelurahan\b(?!\s+[A-Za-z])', t, re.I): return False
    if re.search(r'\bKecamatan\b(?!\s+[A-Za-z])', t, re.I): return False
    if re.search(r'\b(Kabupaten|Kota|Prov(?:insi)?)\b(?!\s+[A-Za-z])', t, re.I): return False

    # ❌ filter: ZIP sendirian (ga ditemani Kel/Kec/Kab/Prov)
    if spans["zip"] and not (spans["kel"] or spans["kec"] or spans["kabkota"] or spans["prov"]):
        return False

    # Ambil indeks token
    idxs = admin_token_indices(t, spans)
    if len(idxs) < 2:
        return False

    # Cek jarak antar komponen admin
    for i in range(len(idxs) - 1):
        if (idxs[i+1] - idxs[i]) <= max_token_gap:
            return True

    return False


In [43]:
# 1) bangun df_addr_windows (kalau belum):
addr_window_rows = []
for i, row in df.iterrows():
    txt = row["text_clean"]
    for t in PAT_ADDR_TRIGGER.finditer(txt):
        start = max(0, t.start()-60)
        end   = min(len(txt), t.end()+120)
        addr_window_rows.append({"row": i, "source": row["source"], "window": txt[start:end]})
df_addr_windows = pd.DataFrame(addr_window_rows)

In [44]:
# 2) validasi per window dengan token proximity:
addr_valid_rows = []
for _, r in df_addr_windows.iterrows():
    w = r["window"]
    if is_valid_address_token_proximity(w, MAX_TOKEN_GAP):
        addr_valid_rows.append({"row": r["row"], "source": r["source"], "span": w})

df_addr_valid_spans = pd.DataFrame(addr_valid_rows)


In [45]:
# Tandai valid per-window (bukan per-row)
df_addr_windows['is_valid'] = df_addr_windows['window'].apply(
    lambda w: is_valid_address_token_proximity(w, MAX_TOKEN_GAP)
)

# Ambil yang valid
df_addr_valid_spans = df_addr_windows[df_addr_windows['is_valid']].copy()

# Contoh tampilkan
print("Contoh address VALID:")
print(df_addr_valid_spans.head(20)['window'])


Contoh address VALID:
50     eyboard + mouse GARANSI TOKO 2 MINGGU Alamat : RR COMPUTER, Jl. Pandeyan No.32, Pandeyan, Kec. Umbulharjo, Kota Yogyakarta, Daerah Istimewa Yogyakarta 55161
51                                    MINGGU Alamat : RR COMPUTER, Jl. Pandeyan No.32, Pandeyan, Kec. Umbulharjo, Kota Yogyakarta, Daerah Istimewa Yogyakarta 55161
52                                                    RR COMPUTER, Jl. Pandeyan No.32, Pandeyan, Kec. Umbulharjo, Kota Yogyakarta, Daerah Istimewa Yogyakarta 55161
74        nakan asuransi (safety can be fun) HiFi Computer (Bekasi) : Jl. Celepuk 2 No.43, RT.004/RW.012, Kel. Jatimakmur, Kec. Pondok Gede, Bekasi 0852-16-777-543
75                             y can be fun) HiFi Computer (Bekasi) : Jl. Celepuk 2 No.43, RT.004/RW.012, Kel. Jatimakmur, Kec. Pondok Gede, Bekasi 0852-16-777-543
76                                    e fun) HiFi Computer (Bekasi) : Jl. Celepuk 2 No.43, RT.004/RW.012, Kel. Jatimakmur, Kec. Pondok Gede, Bekasi 0852-16-77

In [46]:
# 3. Tampilkan contoh address yang TIDAK valid
print("Contoh address TIDAK valid:")
print(df_addr_windows[~df_addr_windows['is_valid']].head(10)['window'])

Contoh address TIDAK valid:
0     Anda perasaan mendalam. Menjadi kehadiran paling terang di jalan. 6. Hadiah Pendidikan Terbaik Untuk Anak: Mempromosikan perkembangan intelektual anak, dan baik untuk penanaman minat a
1     SA PILIH JASA PENGIRIMAN INSTANT CAR , JIKA PENGIRIMAN LUAR KOTA BISA PILIH JNT CARGO ATAU JTR DAN HARAP PESAN DOUBLE BUBBLE WARP MINIMAL 5 AGAR PENGIRIMAN SAMPAI TUJUAN AMAN fitur: sp
2     SI KURIR TOKO -PENGIRIMAN BARANG VIA KURIR TOKO HANYA DALAM KOTA JOGJA DAN SEKITARNYA. ONGKIR CHAT ADMIN. -HARGA DAN STOK DAPAT BERUBAH SEWAKTU-WAKTU. MOHON KONFRIMASI STOK TERLEBIH DA
3      AKTU. MOHON KONFRIMASI STOK TERLEBIH DAHULU - ONGKOS KIRIM: KAB SLEMAN : GRATIS ONGKIR KOTA YOGYA : GRATIS ONGKIR KAB BANTUL : GRATIS ONGKIR KAB GUNUNG KIDUL: Rp150.000 KAB KULON PROG
4      TERLEBIH DAHULU - ONGKOS KIRIM: KAB SLEMAN : GRATIS ONGKIR KOTA YOGYA : GRATIS ONGKIR KAB BANTUL : GRATIS ONGKIR KAB GUNUNG KIDUL: Rp150.000 KAB KULON PROGO : Rp100.000 -UNTUK PRODUK 
5      IRIM: KAB 

In [47]:
# 4. (Opsional) Hitung jumlah valid/tidak valid
print("Jumlah address valid:", df_addr_windows['is_valid'].sum())
print("Jumlah address tidak valid:", (~df_addr_windows['is_valid']).sum())

Jumlah address valid: 12
Jumlah address tidak valid: 269


In [48]:
# 5. (Opsional) Tampilkan count kemunculan window (misal, groupby window)
window_counts = df_addr_windows.groupby(['window', 'is_valid']).size().reset_index(name='count')
print(window_counts.sort_values('count', ascending=False).head(20))

                                                                                                                                                                                        window  \
215  ngan tahan lama, membuat kenyamanan menemani Anda sepanjang jalan. Pewarnaan permanen tidak mudah pudar. Pewarnaan set, setelah proses kompleks multi-lapisan, tingkat pewarnaan yang bai   
82                     Nah , jadi Besti nggak perlu bingung lagi cari outfit untuk jalan bareng sama teman ataupun pasangan , semua masalah Ammara Bliss solusinya HAPPY SHOPPING Ammara Bliss   
68    Indonesia terpercaya sejak 2002. Memiliki 50 distributor di kota-kota Indonesia dan Department Store ternama. 100% produksi Indonesia, menggunakan bahan premium berkualitas tinggi dan    
73    KTU RAKIT, INSTALASI DAN PACKAGING FREE PACKING KAYU ( LUAR KOTA/PULAU ), INSTANT AKAN DIPACKING BIASA BISA GANTI SPESIFIKASI SESUAI KEINGINAN SPESIFIKASI : # MAINBOARD CHIPSET H61 # P   
130  dan Perempuan ) , Cocok u

## Making the final json file and documentation

In [49]:
regex_patterns = [
    # PII IDs/KTP
    {
        "name": "ID_NIK",
        "pattern": r"(?<!\d)(1[1-9]|21|[37][1-6]|5[1-3]|6[1-5]|[89][12])\d{2}\d{2}([04][1-9]|[1256][0-9]|[37][01])(0[1-9]|1[0-2])\d{2}\d{4}(?!\d)",
        "flags": "",
        "replace": "[ID]",
        "role": "detector",
        "desc": "KTP/NIK v3. Harus pre-clean digit-run."
    },
    
    # PII ADDRESS
    {
        "name": "ADDR_TRIGGER",
        "pattern": r"\b(Jl\.?|Jalan|Gg\.?|Gang|RT|RW|Kel(?:urahan)?\.?|Kec(?:amatan)?\.?|Kab(?:upaten)?\.?|Kota|Prov(?:insi)?\.?|K\.?\s?P\.?|KP)\b",
        "flags": "i",
        "replace": "",
        "role": "trigger",
        "desc": "Trigger window alamat. ZIP sengaja tidak disertakan."
    },
    {
        "name": "ADDR_STREET",
        "pattern": r"\b(?:Jl\.?|Jalan|Gg\.?|Gang)\b\s+[A-Z0-9][A-Za-z0-9 .\-]*",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "Nama jalan/gang. Tidak dihitung admin."
    },
    {
        "name": "ADDR_HOUSE",
        "pattern": r"\b(?:No\.?|No|Nomor|Nomer|Nmr|#)\s*\w+\b",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "Nomor rumah/gedung. Wajib ada isinya."
    },
    {
        "name": "ADDR_RTRW",
        "pattern": r"\bRT\s*\d{1,2}\s*RW\s*\d{1,2}\b",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "RT/RW; keduanya wajib angka."
    },
    {
        "name": "ADDR_KEL",
        "pattern": r"\bKel(?:urahan)?\.?\s+[A-Za-z][A-Za-z .\-]*\b",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "Kelurahan + nama; label kosong ditolak."
    },
    {
        "name": "ADDR_KEC",
        "pattern": r"\bKec(?:amatan)?\.?\s+[A-Za-z][A-Za-z .\-]*\b",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "Kecamatan + nama; label kosong ditolak."
    },
    {
        "name": "ADDR_KABKOTA",
        "pattern": r"\b(?:Kab(?:upaten)?|Kota)\.?\s+[A-Za-z][A-Za-z .\-]*\b",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "Kabupaten/Kota + nama; label kosong ditolak."
    },
    {
        "name": "ADDR_PROV",
        "pattern": r"\bProv(?:insi)?\.?\s+[A-Za-z][A-Za-z .\-]*\b",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "Provinsi + nama; tidak perlu daftar provinsi satu-satu."
    },
    {
        "name": "ADDR_ZIP",
        "pattern": r"\b\d{5}\b",
        "flags": "",
        "replace": "",
        "role": "component",
        "desc": "Kode pos 5 digit. Wajib ditemani Kel/Kec/Kab/Prov."
    },
    {
        "name": "ADDR_POBOX",
        "pattern": r"\b(?:Kotak\s+Pos|K\.?\s?P\.?|KP)\s*\d+\b",
        "flags": "i",
        "replace": "",
        "role": "component",
        "desc": "PO BOX. Auto-accept."
    }
]


In [50]:
with open("pii_regex_patterns.json", "w", encoding="utf-8") as f:
    json.dump(regex_patterns, f, ensure_ascii=False, indent=2)