In [None]:
# Address normalization and matching pipeline with evaluation and CV
import re, unicodedata
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# Dependencies
try:
    from rapidfuzz import fuzz, process
except ImportError:
    import sys
    !{sys.executable} -m pip -q install rapidfuzz
    from rapidfuzz import fuzz, process

try:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import (
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score,
        accuracy_score,
        classification_report,
    )
    from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
except ImportError:
    import sys
    !{sys.executable} -m pip -q install scikit-learn
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import (
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score,
        accuracy_score,
        classification_report,
    )
    from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# 1) Normalization
# Ordered (pattern, replacement) rules expanding common Turkish address
# abbreviations. Rules run in sequence, each on the output of the previous one.
ABBREV = [
    (r"\bmah\.?\b", "mahalle"),
    (r"\bmh\b", "mahalle"),
    (r"\bcad(de|d(e|e)?si)?\b", "cadde"),
    (r"\bcad\.?\b", "cadde"),
    (r"\bcd\.?\b", "cadde"),
    (r"\bsok(ak)?\b", "sokak"),
    (r"\bsk\.?\b", "sokak"),
    (r"\bapt\.?\b", "apartman"),
    (r"\bap\.?\b", "apartman"),
]
# Letters that NFKD does not decompose (dotless i) plus explicit fallbacks.
TR_MAP = str.maketrans({"ç":"c","ğ":"g","ı":"i","ö":"o","ş":"s","ü":"u","â":"a","î":"i","û":"u"})
PUNCT = re.compile(r"[^a-z0-9\s]")
MULTISPACE = re.compile(r"\s+")
# Unit keyword followed by an optional separator. The negative lookahead keeps
# the keyword from matching the first letters of an ordinary word
# ("katip", "noter") while still accepting glued digits ("no12").
UNIT_KEYWORD = re.compile(r"\b(no|kat|daire)(?![a-z])\s*[:#=.-]?\s*")

def normalize_address(s: str) -> str:
    """Return a lowercase, ASCII-folded, whitespace-collapsed form of an address.

    Steps, in order: lowercase and strip diacritics, fold Turkish letters,
    turn '/', '\\', '-' and '_' into spaces, expand the ABBREV rules, isolate
    the unit keywords 'no', 'kat' and 'daire' as separate tokens, drop every
    remaining character outside [a-z0-9 whitespace], and collapse whitespace.

    Args:
        s: raw address text; None or an empty string yields "".

    Returns:
        The normalized address string.
    """
    s = s or ""
    s = unicodedata.normalize("NFKD", s.lower())
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.translate(TR_MAP)
    s = s.replace("/", " ").replace("\\", " ").replace("-", " ").replace("_", " ")
    for pat, repl in ABBREV:
        s = re.sub(pat, repl, s)
    # "no:12" / "kat=2" / "daire5" -> " no 12" / " kat 2" / " daire 5"
    s = UNIT_KEYWORD.sub(r" \1 ", s)
    s = PUNCT.sub(" ", s)
    s = MULTISPACE.sub(" ", s).strip()
    return s

# 2) Gazetteer + fuzzy matching
@dataclass
class Gazetteer:
    """Reference name lists used to recognise the parts of an address.

    The four fields hold the raw names. After construction each list also has
    a ``*_norm`` twin (same order, passed through normalize_address), and the
    mahalle and sokak lists get an inverted index mapping every token of at
    least three characters to the positions of the entries containing it.
    """
    iller: List[str]
    ilceler: List[str]
    mahalleler: List[str]
    sokaklar: List[str]

    def __post_init__(self):
        from collections import defaultdict

        def token_index(entries):
            # token -> set of positions in `entries` whose text contains it
            index = defaultdict(set)
            for position, entry in enumerate(entries):
                for token in entry.split():
                    if len(token) >= 3:
                        index[token].add(position)
            return index

        # Normalized copies are what the fuzzy matcher compares against.
        self.iller_norm = list(map(normalize_address, self.iller))
        self.ilceler_norm = list(map(normalize_address, self.ilceler))
        self.mahalleler_norm = list(map(normalize_address, self.mahalleler))
        self.sokaklar_norm = list(map(normalize_address, self.sokaklar))
        # Inverted indexes used to prune fuzzy-match candidates.
        self.idx_mahalle = token_index(self.mahalleler_norm)
        self.idx_sokak = token_index(self.sokaklar_norm)


def fuzzy_best(query: str, choices: List[str], score_cutoff: int = 80) -> Tuple[Optional[str], int]:
    """Return the best WRatio match for ``query`` among ``choices``.

    Returns:
        ``(choice, score)`` with the score truncated to int, or ``(None, 0)``
        when the query or the choice list is empty or no choice reaches
        ``score_cutoff``.
    """
    if query and choices:
        hit = process.extractOne(query, choices, scorer=fuzz.WRatio, score_cutoff=score_cutoff)
        if hit is not None:
            best_choice, best_score = hit[0], hit[1]
            return best_choice, int(best_score)
    return None, 0

# 3) Phonetic (very light TR-compatible soundex)
VOWELS = set("aeiou")

def turkish_soundex(word: str) -> str:
    """Return a four-character Soundex-like code for ``word``.

    The input is normalized first; an empty result yields "". The first
    character is kept verbatim and the remaining consonants are mapped to digit
    groups. Vowels, 'h', 'w', 'y' and spaces are skipped without resetting the
    previous digit, so a repeated group is emitted once even when vowels sit
    between its letters. The code is padded with zeros and cut to four
    characters.
    """
    normalized = normalize_address(word)
    if not normalized:
        return ""
    consonant_groups = (
        ("bfpv", "1"),
        ("cgjkqsxz", "2"),
        ("dt", "3"),
        ("l", "4"),
        ("mn", "5"),
        ("r", "6"),
    )
    digit_of = {letter: digit for letters, digit in consonant_groups for letter in letters}
    digits = []
    last_digit = ""
    for letter in normalized[1:]:
        if letter in VOWELS or letter in "hwy ":
            continue
        digit = digit_of.get(letter, "")
        # unmapped characters (e.g. digits) contribute nothing
        if digit and digit != last_digit:
            digits.append(digit)
            last_digit = digit
    padded = normalized[0] + "".join(digits) + "0000"
    return padded[:4]

def phonetic_similarity(a: str, b: str) -> float:
    """Binary phonetic agreement: 1.0 when both inputs have a non-empty
    turkish_soundex code and the codes are equal, otherwise 0.0."""
    code_a = turkish_soundex(a)
    code_b = turkish_soundex(b)
    both_coded = bool(code_a) and bool(code_b)
    if both_coded and code_a == code_b:
        return 1.0
    return 0.0

# 4) Extract parts via gazetteer with caching
@dataclass
class MatchResult:
    """Gazetteer components found in one normalized address.

    gazetteer_match fills every field. A component with no match at or above
    the cutoff is None, and the corresponding score is 0.
    """
    # Matched province / district / neighbourhood / street, or None.
    il: Optional[str]
    ilce: Optional[str]
    mahalle: Optional[str]
    sokak: Optional[str]
    # Fuzzy scores of the mahalle and sokak matches as returned by fuzzy_best.
    mahalle_score: int
    sokak_score: int

# simple cache to speed up repeated matches on same address
# gazetteer_match builds the key from id(gaz) and the normalized address.
# Nothing in this section evicts entries, so call GAZ_CACHE.clear() whenever a
# gazetteer object is replaced.
GAZ_CACHE: Dict[Tuple[int, str], MatchResult] = {}

# Default upper bound on how many gazetteer entries are handed to the fuzzy matcher.
MAX_CHOICES = 400

def _candidate_list(
    query_tokens: List[str],
    norm_list: List[str],
    idx_map: Dict[str, set],
    max_choices: Optional[int] = None,
) -> List[str]:
    """Pick the gazetteer entries worth fuzzy-matching against a query.

    Entries sharing at least one token with the query are collected through the
    inverted index. Collection stops once the cap is reached, and the result is
    cut to the cap in ascending position order, so the choice is reproducible
    and favours entries earlier in ``norm_list``.
    NOTE(review): this assumes ``norm_list`` is ordered most-frequent-first, as
    the fallback below already does; confirm against the gazetteer builder.

    Args:
        query_tokens: tokens of the normalized query address.
        norm_list: normalized gazetteer entries.
        idx_map: token -> set of positions in ``norm_list``.
        max_choices: cap on the result length; defaults to MAX_CHOICES.

    Returns:
        At most ``max_choices`` entries. When no token hits the index, the
        first ``max_choices`` entries of ``norm_list``.
    """
    limit = MAX_CHOICES if max_choices is None else max_choices
    cand_idx = set()
    for t in query_tokens:
        cand_idx.update(idx_map.get(t, ()))
        if len(cand_idx) >= limit:
            break
    if not cand_idx:
        # no token overlap: fall back to the head of the list
        return norm_list[:limit]
    # Sort before capping: set iteration order is an implementation detail and
    # would otherwise decide which candidates survive the cut.
    return [norm_list[i] for i in sorted(cand_idx)[:limit]]

def gazetteer_match(addr_norm: str, gaz: Gazetteer, cutoff: int = 80) -> MatchResult:
    """Find the province, district, neighbourhood and street named in an address.

    The whole normalized address is fuzzy-matched against each gazetteer list.
    The mahalle and sokak lists are first pruned through the gazetteer's
    inverted token indexes. Matches are reported using the raw (un-normalized)
    gazetteer spelling.

    Results are memoized in GAZ_CACHE. The key includes ``cutoff`` so a lookup
    never returns a result computed under a different threshold. The gazetteer
    is identified by ``id(gaz)``, which Python may reuse once an object has been
    garbage-collected, so clear GAZ_CACHE whenever a gazetteer is replaced.

    Args:
        addr_norm: address already passed through normalize_address.
        gaz: gazetteer to match against.
        cutoff: minimum fuzzy score for a match to be accepted.

    Returns:
        A MatchResult; unmatched components are None with score 0.
    """
    key = (id(gaz), addr_norm, cutoff)
    cached = GAZ_CACHE.get(key)
    if cached is not None:
        return cached
    q = addr_norm
    # tokens shorter than three characters are not indexed, so skip them here too
    q_tokens = [t for t in q.split() if len(t) >= 3]
    il, _ = fuzzy_best(q, gaz.iller_norm, score_cutoff=cutoff)
    ilce, _ = fuzzy_best(q, gaz.ilceler_norm, score_cutoff=cutoff)
    # prune candidates for mahalle & sokak via inverted index
    mah_choices = _candidate_list(q_tokens, gaz.mahalleler_norm, gaz.idx_mahalle)
    sok_choices = _candidate_list(q_tokens, gaz.sokaklar_norm, gaz.idx_sokak)
    mahalle, mahalle_score = fuzzy_best(q, mah_choices, score_cutoff=cutoff)
    sokak, sokak_score = fuzzy_best(q, sok_choices, score_cutoff=cutoff)

    def denorm(choice: Optional[str], raw: List[str], norm: List[str]) -> Optional[str]:
        # Map a normalized match back to the raw entry at the same position;
        # fall back to the normalized text if it is not in the list.
        if choice is None:
            return None
        try:
            idx = norm.index(choice)
        except ValueError:
            return choice
        return raw[idx]

    res = MatchResult(
        il=denorm(il, gaz.iller, gaz.iller_norm),
        ilce=denorm(ilce, gaz.ilceler, gaz.ilceler_norm),
        mahalle=denorm(mahalle, gaz.mahalleler, gaz.mahalleler_norm),
        sokak=denorm(sokak, gaz.sokaklar, gaz.sokaklar_norm),
        mahalle_score=mahalle_score,
        sokak_score=sokak_score,
    )
    GAZ_CACHE[key] = res
    return res

# Utilities
# House-number pattern: "no" followed by digits with an optional letter suffix.
# A trailing "/unit" part is matched when present but kept in a second group.
ADDR_NO_RE = re.compile(r"\bno\s*(\d+[a-zA-Z]?)(?:/(\d+[a-zA-Z]?))?\b")

def extract_house_no(s: str) -> Optional[str]:
    """Return the building number following the first "no" token, or None.

    Only the first capture group is returned; any "/unit" suffix is discarded.
    """
    found = ADDR_NO_RE.search(s)
    if found is None:
        return None
    return found.group(1)

def token_jaccard(a: str, b: str) -> float:
    """Jaccard overlap of the whitespace-separated token sets of two strings.

    Returns 0.0 when neither string contains a token.
    """
    tokens_a, tokens_b = set(a.split()), set(b.split())
    combined = tokens_a | tokens_b
    if not combined:
        return 0.0
    shared = tokens_a & tokens_b
    return float(len(shared)) / float(len(combined))

# Feature engineering
# Column order of the vector returned by pair_features.
FEATURE_NAMES = [
    "il_eq", "ilce_eq", "mahalle_fuzzy", "sokak_fuzzy", "no_eq",
    "ph_mahalle", "ph_sokak", "tok_jaccard",
]

def pair_features(a: str, b: str, gaz: Gazetteer) -> np.ndarray:
    """Compute the similarity feature vector for one pair of addresses.

    Both addresses are normalized and matched against ``gaz``. The result is a
    float32 vector whose entries follow FEATURE_NAMES:
    equality flags for il, ilce and house number (two missing values compare
    equal), WRatio scores for mahalle and sokak (0 unless both sides matched),
    phonetic agreement for mahalle and sokak, and the token Jaccard overlap of
    the normalized addresses.
    """
    norm_a = normalize_address(a)
    norm_b = normalize_address(b)
    match_a = gazetteer_match(norm_a, gaz)
    match_b = gazetteer_match(norm_b, gaz)

    def same(left, right):
        # None is treated as "", so two missing values count as equal
        return int((left or "") == (right or ""))

    def fuzzy(left, right):
        if not (left and right):
            return 0
        return fuzz.WRatio(normalize_address(left), normalize_address(right))

    values = [
        same(match_a.il, match_b.il),
        same(match_a.ilce, match_b.ilce),
        fuzzy(match_a.mahalle, match_b.mahalle),
        fuzzy(match_a.sokak, match_b.sokak),
        same(extract_house_no(norm_a), extract_house_no(norm_b)),
        phonetic_similarity(match_a.mahalle or "", match_b.mahalle or ""),
        phonetic_similarity(match_a.sokak or "", match_b.sokak or ""),
        token_jaccard(norm_a, norm_b),
    ]
    return np.array(values, dtype=np.float32)

# Dataset -> features
def build_feature_matrix(df_pairs: pd.DataFrame, gaz: Gazetteer) -> Tuple[np.ndarray, np.ndarray]:
    """Turn a frame of labelled address pairs into ``(X, y)`` arrays.

    Args:
        df_pairs: frame with address columns 'a' and 'b' and a 'label' column.
        gaz: gazetteer forwarded to pair_features.

    Returns:
        X: float32 array of shape ``(len(df_pairs), len(FEATURE_NAMES))`` with
           one row per pair, in frame order.
        y: ``df_pairs['label']`` cast to int.

    The matrix is allocated once and filled row by row. This avoids holding a
    Python list of one small array per pair, and an empty frame yields a
    ``(0, n_features)`` matrix instead of the ValueError np.vstack raises on an
    empty sequence.
    """
    X = np.empty((len(df_pairs), len(FEATURE_NAMES)), dtype=np.float32)
    for row, (a, b) in enumerate(zip(df_pairs['a'], df_pairs['b'])):
        X[row] = pair_features(a, b, gaz)
    y = df_pairs['label'].astype(int).values
    return X, y

# 6) Model with split, evaluation, and CV
class AddressMatcher:
    """Random-forest classifier deciding whether two addresses refer to the same place.

    Attributes:
        clf: the underlying RandomForestClassifier.
        gaz: gazetteer used for feature extraction; set by fit().
        metrics_: hold-out metrics computed by the last fit() call.
        random_state: seed shared by the forest, the hold-out split and the CV folds.
    """

    def __init__(self, n_estimators: int = 300, max_depth: int = 18, random_state: int = 42, n_jobs: int = -1):
        self.random_state = random_state
        self.clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
            n_jobs=n_jobs,
            class_weight='balanced_subsample',
        )
        self.gaz: Optional[Gazetteer] = None
        self.metrics_: Dict[str, float] = {}

    def fit(self, df_pairs: pd.DataFrame, gaz: Gazetteer, test_size: float = 0.2, stratify: bool = True):
        """Train on a split of ``df_pairs`` and record metrics on the held-out part.

        Args:
            df_pairs: frame with columns 'a', 'b' and 'label'.
            gaz: gazetteer used for feature extraction, kept for later predictions.
            test_size: fraction of pairs held out for ``metrics_``.
            stratify: keep the label ratio equal in both parts of the split.

        Returns:
            self. The classifier is fitted on the training part only.
        """
        self.gaz = gaz
        X, y = build_feature_matrix(df_pairs, gaz)
        strat = y if stratify else None
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state, stratify=strat
        )
        self.clf.fit(X_tr, y_tr)
        self.metrics_ = self._evaluate(X_te, y_te)
        return self

    def _evaluate(self, X: np.ndarray, y: np.ndarray) -> Dict[str, float]:
        """Score the fitted classifier on ``(X, y)`` at a 0.5 probability threshold."""
        proba = self.clf.predict_proba(X)[:, 1]
        pred = (proba >= 0.5).astype(int)
        out = {
            'accuracy': accuracy_score(y, pred),
            'precision': precision_score(y, pred, zero_division=0),
            'recall': recall_score(y, pred, zero_division=0),
            'f1': f1_score(y, pred, zero_division=0),
        }
        try:
            out['roc_auc'] = roc_auc_score(y, proba)
        except ValueError:
            # AUC is undefined when y contains a single class
            out['roc_auc'] = float('nan')
        return out

    def cross_validate(self, df_pairs: pd.DataFrame, gaz: Gazetteer, k: int = 5) -> Dict[str, float]:
        """Return mean stratified k-fold scores for f1, precision, recall, roc_auc and accuracy.

        All metrics are computed from a single pass over the folds, so the
        forest is fitted k times rather than once per metric per fold.
        cross-validation works on clones of ``self.clf``; the estimator held by
        this object is not fitted or modified.
        """
        # Local alias: the method name shadows nothing at module level, but the
        # alias keeps the call unambiguous for readers.
        from sklearn.model_selection import cross_validate as _sk_cross_validate

        X, y = build_feature_matrix(df_pairs, gaz)
        cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=self.random_state)
        metric_names = ['f1', 'precision', 'recall', 'roc_auc', 'accuracy']
        results = _sk_cross_validate(self.clf, X, y, cv=cv, scoring=metric_names)
        return {name: results[f'test_{name}'].mean() for name in metric_names}

    def predict_proba(self, a: str, b: str) -> float:
        """Return the predicted probability that ``a`` and ``b`` are the same address.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if self.gaz is None:
            raise RuntimeError("AddressMatcher is not fitted; call fit() first")
        x = pair_features(a, b, self.gaz).reshape(1, -1)
        return float(self.clf.predict_proba(x)[0, 1])

    def predict(self, a: str, b: str, threshold: float = 0.5) -> int:
        """Return 1 when predict_proba(a, b) reaches ``threshold``, else 0."""
        return int(self.predict_proba(a, b) >= threshold)

In [None]:
# Load dataset from data/ and mine a gazetteer from train (use external ilceler list)
import os
from collections import Counter, defaultdict
# Province (il) names; mine_gazetteer passes this list to the Gazetteer as `iller`.
iller = ["Adana", "Adıyaman", "Afyonkarahisar", "Ağrı", "Aksaray", "Amasya", "Ankara", "Antalya", "Ardahan", "Artvin", "Aydın", "Balıkesir", "Bartın", "Batman", "Bayburt", "Bilecik", "Bingöl", "Bitlis", "Bolu", "Burdur", "Bursa", "Çanakkale", "Çankırı", "Çorum", "Denizli", "Diyarbakır", "Düzce", "Edirne", "Elazığ", "Erzincan", "Erzurum", "Eskişehir", "Gaziantep", "Giresun", "Gümüşhane", "Hakkâri", "Hatay", "Iğdır", "Isparta", "İstanbul", "İzmir", "Kahramanmaraş", "Karabük", "Karaman", "Kars", "Kastamonu", "Kayseri", "Kilis", "Kırıkkale", "Kırklareli", "Kırşehir", "Kocaeli", "Konya", "Kütahya", "Malatya", "Manisa", "Mardin", "Mersin", "Muğla", "Muş", "Nevşehir", "Niğde", "Ordu", "Osmaniye", "Rize", "Sakarya", "Samsun", "Şanlıurfa", "Siirt", "Sinop", "Sivas", "Şırnak", "Tekirdağ", "Tokat", "Trabzon", "Tunceli", "Uşak", "Van", "Yalova", "Yozgat", "Zonguldak"]
# Input locations. The defaults are the original machine-specific paths; set
# TEKNOFEST_DATA_DIR / TEKNOFEST_ILCELER_TXT to run the notebook elsewhere.
DATA_DIR = os.environ.get('TEKNOFEST_DATA_DIR', '/home/yusuf/teknofest/data')
ILCELER_TXT = os.environ.get('TEKNOFEST_ILCELER_TXT', '/home/yusuf/teknofest/ilceler_unique.txt')

# helpers
def read_csv_safe(path: str) -> pd.DataFrame:
    """Load a UTF-8 CSV with every column read as text.

    Default NA detection is switched off, so empty cells stay "" and literal
    values such as "NA" are kept as written.
    """
    read_options = dict(dtype=str, keep_default_na=False, encoding='utf-8', engine='python')
    return pd.read_csv(path, **read_options)

def read_list_from_txt(path: str) -> List[str]:
    """Return the non-blank lines of a UTF-8 text file, stripped of surrounding
    whitespace. A path that does not exist yields an empty list."""
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        stripped = [raw.strip() for raw in f]
    return [entry for entry in stripped if entry]

# Locate the two input files under DATA_DIR
train_path, test_path = (os.path.join(DATA_DIR, name) for name in ('train.csv', 'test.csv'))
assert os.path.exists(train_path) and os.path.exists(test_path), 'train.csv/test.csv not found under data/'

# Read both as all-text frames and report their layout
train_df, test_df = read_csv_safe(train_path), read_csv_safe(test_path)
print('train shape:', train_df.shape, 'cols:', list(train_df.columns))
print('test shape:', test_df.shape, 'cols:', list(test_df.columns))

# Columns the rest of the notebook relies on: train needs address and label,
# test needs address.
addr_col, label_col = 'address', 'label'
assert addr_col in train_df.columns and label_col in train_df.columns
assert addr_col in test_df.columns

# Normalize every address once up front; later cells read 'address_norm'
train_df['address_norm'] = train_df[addr_col].apply(normalize_address)
test_df['address_norm'] = test_df[addr_col].apply(normalize_address)

# District names come from an external list rather than being mined
ilceler_ext = read_list_from_txt(ILCELER_TXT)
print('Loaded external ilceler:', len(ilceler_ext))

# Mine gazetteer candidates from normalized text
# Type keywords whose preceding token(s) are treated as a place name.
KEYS = {'mahalle', 'sokak', 'cadde', 'bulvar'}

def mine_gazetteer(
    df: pd.DataFrame,
    external_ilceler: Optional[List[str]] = None,
    max_items: int = 5000,
    min_freq: int = 2,
    use_bigrams: bool = True,
) -> Gazetteer:
    """Build a Gazetteer by harvesting names that precede type keywords.

    For every token in KEYS found in ``df['address_norm']``, the token just
    before it (and, with ``use_bigrams``, the two tokens before it joined by a
    space) is counted as a candidate: under 'mahalle' as a neighbourhood, under
    'sokak' / 'cadde' / 'bulvar' as a street.

    Args:
        df: frame with a normalized 'address_norm' column.
        external_ilceler: district names to use as-is; None means no districts.
        max_items: maximum number of names kept per category.
        min_freq: minimum occurrence count for a candidate to be kept.
        use_bigrams: also count two-token names.

    Returns:
        A Gazetteer whose mahalleler and sokaklar are ordered by descending
        frequency (ties keep first-seen order) and whose iller is the
        module-level ``iller`` list.
    """
    street_keys = {'sokak', 'cadde', 'bulvar'}
    mah_cand = Counter()
    sok_cand = Counter()
    for s in df['address_norm'].values:
        toks = s.split()
        for i, t in enumerate(toks):
            if i == 0 or t not in KEYS:
                continue
            if t == 'mahalle':
                target = mah_cand
            elif t in street_keys:
                target = sok_cand
            else:
                continue
            target[toks[i - 1]] += 1
            if use_bigrams and i > 1:
                target[toks[i - 2] + ' ' + toks[i - 1]] += 1

    def top_names(counts: Counter) -> List[str]:
        # Counter keys are already unique, so no separate de-duplication pass
        # is needed; the sort is stable, keeping first-seen order among ties.
        frequent = [(name, n) for name, n in counts.items() if n >= min_freq]
        frequent.sort(key=lambda item: -item[1])
        return [name for name, _ in frequent[:max_items] if name]

    mahalleler = top_names(mah_cand)
    sokaklar = top_names(sok_cand)
    ilceler = external_ilceler or []
    return Gazetteer(iller=iller, ilceler=ilceler, mahalleler=mahalleler, sokaklar=sokaklar)

# build gazetteer
# Cached matches are keyed by gazetteer identity, so start from an empty cache.
GAZ_CACHE.clear()
mined_gaz = mine_gazetteer(
    train_df,
    external_ilceler=ilceler_ext,
    max_items=3000,
    min_freq=3,
    use_bigrams=False,
)
print(
    'Mined gazetteer sizes -> ilceler:', len(mined_gaz.ilceler),
    'mahalle:', len(mined_gaz.mahalleler),
    'sokak:', len(mined_gaz.sokaklar),
)

# Normalized addresses grouped per label (one list of addresses per label)
addr_by_label = train_df.groupby(label_col)['address_norm'].apply(list)
# Order-insensitive keys of every positive pair emitted so far. The set lives at
# module level, so it also suppresses repeats across separate calls.
unique_pairs = set()

def make_pairs_from_groups(groups: Dict[str, List[str]], max_pos_per_label: int = 400) -> List[Tuple[str, str, int]]:
    """Build positive (same-label) address pairs.

    For each label, all 2-combinations of its first ``max_pos_per_label``
    addresses are considered. Pairs of identical strings are skipped, as are
    pairs already recorded in the module-level ``unique_pairs`` set, which this
    function updates.

    Returns:
        A list of ``(a, b, 1)`` tuples in generation order.
    """
    from itertools import combinations
    collected = []
    for _label, members in groups.items():
        for first, second in combinations(members[:max_pos_per_label], 2):
            if first == second:
                continue
            signature = (min(first, second), max(first, second), 1)
            if signature not in unique_pairs:
                unique_pairs.add(signature)
                collected.append((first, second, 1))
    return collected

train shape: (848237, 2) cols: ['address', 'label']
test shape: (217241, 2) cols: ['id', 'address']
Loaded external ilceler: 894
Mined gazetteer sizes -> ilceler: 894 mahalle: 5000 sokak: 5000


In [None]:
# Build labeled pairs from a label-level split to avoid leakage; add balanced negatives
from itertools import combinations
from collections import defaultdict

# Split labels first, then create pairs inside train split only
labels = train_df[label_col].unique().tolist()
labels.sort()

# Random (not stratified) 80/20 split of the label ids themselves
from sklearn.model_selection import train_test_split as _tts
label_tr, label_te = _tts(labels, test_size=0.2, random_state=42)

by_label = train_df.groupby(label_col)['address_norm'].apply(list)

# At most this many addresses per label take part in pair generation
max_per_label = 200
# label -> the address subset used for both positive and negative pairs
addr_map = {lab: by_label.get(lab, [])[:max_per_label] for lab in label_tr}

# Positive pairs from TRAIN labels only: all 2-combinations within a label,
# skipping pairs whose two normalized addresses are identical
pos_pairs = []
for lab in label_tr:
    for a, b in combinations(addr_map[lab], 2):
        if a != b:
            pos_pairs.append((a, b, 1))

# Balanced negative sampling: sample pairs across two different labels
rng = np.random.default_rng(42)
neg_pairs = []
label_tr_list = [lab for lab in label_tr if len(addr_map[lab]) >= 1]
# Convert once: rng.choice would otherwise rebuild this array from the Python
# list on every draw. The sampled values are unchanged.
label_tr_arr = np.asarray(label_tr_list)
num_negs_target = len(pos_pairs)
# Bound the rejection loop so degenerate data (every cross-label draw yielding
# identical addresses) cannot spin forever.
max_attempts = 10 * num_negs_target + 1000
attempts = 0
while len(neg_pairs) < num_negs_target and len(label_tr_list) >= 2 and attempts < max_attempts:
    attempts += 1
    la, lb = rng.choice(label_tr_arr, size=2, replace=False)
    a_list, b_list = addr_map[la], addr_map[lb]
    a = a_list[rng.integers(0, len(a_list))]
    b = b_list[rng.integers(0, len(b_list))]
    if a == b:
        continue
    neg_pairs.append((a, b, 0))

pairs_df = pd.DataFrame(pos_pairs + neg_pairs, columns=['a', 'b', 'label']).drop_duplicates()
print('pairs_df:', pairs_df.shape, 'pos:', (pairs_df.label==1).sum(), 'neg:', (pairs_df.label==0).sum())

pairs_df: (39243085, 3) pos: 39241085 neg: 2000


In [None]:
# Train/evaluate with mined gazetteer and constructed pairs
matcher2 = AddressMatcher()
# pairs_df already has the 'a', 'b', 'label' columns fit() expects; pass it
# directly instead of through a no-op rename that copies the whole frame.
matcher2.fit(pairs_df, mined_gaz, test_size=0.2)
print('Holdout metrics (mined gazetteer):', matcher2.metrics_)
print('CV(3) metrics:', matcher2.cross_validate(pairs_df, mined_gaz, k=3))

# Evaluate generalization on HELD-OUT labels (simple sanity):
# Create some pairs from held-out labels and score using the already-fit model
held_pos, held_neg = [], []
for lab in label_te[:50]:  # limit for speed
    addrs = by_label.get(lab, [])
    if len(addrs) >= 2:
        held_pos.append((addrs[0], addrs[1], 1))
# negatives: first address of each of up to 50 consecutive held-out label pairs
labs = list(label_te)
for la, lb in zip(labs[:50], labs[1:]):
    a_list = by_label.get(la, [])
    b_list = by_label.get(lb, [])
    if a_list and b_list:
        held_neg.append((a_list[0], b_list[0], 0))

if held_pos or held_neg:
    held_df = pd.DataFrame(held_pos + held_neg, columns=['a','b','label'])
    Xh, yh = build_feature_matrix(held_df, mined_gaz)
    proba = matcher2.clf.predict_proba(Xh)[:,1]
    pred = (proba >= 0.5).astype(int)
    print('Held-out labels metrics:', {
        'accuracy': accuracy_score(yh, pred),
        'precision': precision_score(yh, pred, zero_division=0),
        'recall': recall_score(yh, pred, zero_division=0),
        'f1': f1_score(yh, pred, zero_division=0),
    })

# Example: score similarity between two arbitrary test rows
if len(test_df) >= 2:
    a = test_df.iloc[0]['address']
    b = test_df.iloc[1]['address']
    print('Example test pair prob:', matcher2.predict_proba(a, b))

KeyboardInterrupt: 