In [2]:
import os, glob
import pandas as pd
import numpy as np

def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

print("‚úÖ Dosyalar y√ºklendi:")
print("hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)


  from pandas.core import (


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)


In [3]:
SAMPLE_N = 500  # √∂rnek 500 m√º≈üteri
keys_sample = ref[["cust_id","ref_date","churn"]].sample(SAMPLE_N, random_state=42)
print("üéØ Mini √∂rnek olu≈üturuldu:", keys_sample.shape)

üéØ Mini √∂rnek olu≈üturuldu: (500, 3)


In [5]:
# === 2) √ñZELLƒ∞K √úRETME FONKSƒ∞YONLARI ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce"); s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]
    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")

    eps = 1e-6
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    # --- HARCAMA/EFT DEƒûƒ∞≈ûƒ∞M ORANI ---
    for short, long in [(1,3), (3,6), (6,12)]:
        for base in ["cc_transaction_all_amt", "mobile_eft_all_amt"]:
            ms = feat.get(f"{base}_L{short}M_sum", 0.0)
            ml = feat.get(f"{base}_L{long}M_sum", 0.0)
            feat[f"{base}_growth_L{short}vL{long}"] = (ms - ml) / (ml + eps)

    # --- EFT / KREDƒ∞ KARTI ORANI ---
    for w in WINDOWS:
        feat[f"eft_to_cc_amt_L{w}M_ratio"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0) / (feat.get(f"cc_transaction_all_amt_L{w}M_sum",0) + eps)
        feat[f"eft_to_cc_cnt_L{w}M_ratio"] = feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0) / (feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0) + eps)

    # --- √úR√úN √áE≈ûƒ∞TLƒ∞Lƒ∞ƒûƒ∞ TRENDƒ∞ ---
    for short, long in [(1,3), (3,6), (6,12)]:
        ms = feat.get(f"active_product_category_nbr_L{short}M_mean", 0.0)
        ml = feat.get(f"active_product_category_nbr_L{long}M_mean", 0.0)
        feat[f"product_variety_trend_L{short}vL{long}"] = ms - ml

    return feat

def build_features_for_keys(hist_df, keys_df, n_limit=None, progress_every=5000):
    if n_limit is not None:
        keys_df = keys_df.head(n_limit).copy()
    h = hist_df
    cache = {}
    rows = []
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        if cid in cache: h_cust = cache[cid]
        else:
            h_cust = h[h["cust_id"] == cid]; cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd); f["cust_id"]=cid; rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)


In [6]:
feats_sample = build_features_for_keys(hist, keys_sample[["cust_id","ref_date"]], n_limit=None, progress_every=100)


[features] 100/500
[features] 200/500
[features] 300/500
[features] 400/500
[features] 500/500


In [8]:
feats_sample = build_features_for_keys(hist, keys_sample[["cust_id","ref_date"]], n_limit=None, progress_every=100)
sample_df = keys_sample.merge(feats_sample, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
print("‚úÖ Mini feature set hazƒ±r:", sample_df.shape)


[features] 100/500
[features] 200/500
[features] 300/500
[features] 400/500
[features] 500/500
‚úÖ Mini feature set hazƒ±r: (500, 116)


In [9]:
num_cols = sample_df.select_dtypes(include="number").columns
cat_cols = [c for c in sample_df.columns if c not in num_cols and c != "ref_date"]

sample_df[num_cols] = sample_df[num_cols].fillna(0)
for c in cat_cols:
    sample_df[c] = sample_df[c].fillna("Unknown")

sample_enc = pd.get_dummies(sample_df, columns=cat_cols, drop_first=True)
print("‚úÖ Encode tamamlandƒ±:", sample_enc.shape)


‚úÖ Encode tamamlandƒ±: (500, 139)


In [10]:
TARGET = "churn"
PROTECTED = ["cust_id","ref_date",TARGET]
feat_cols = [c for c in sample_enc.columns if c not in PROTECTED]

cut = sample_enc["ref_date"].quantile(0.8)
train_mask = sample_enc["ref_date"] < cut
valid_mask = ~train_mask

X_train = sample_enc.loc[train_mask, feat_cols].fillna(0)
y_train = sample_enc.loc[train_mask, TARGET].astype(int).values
X_valid = sample_enc.loc[valid_mask, feat_cols].fillna(0)
y_valid = sample_enc.loc[valid_mask, TARGET].astype(int).values

print("‚úÖ Train/Valid b√∂l√ºm√º tamam:")
print("Train:", X_train.shape, "| Valid:", X_valid.shape)


‚úÖ Train/Valid b√∂l√ºm√º tamam:
Train: (391, 136) | Valid: (109, 136)


In [11]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

clf = LGBMClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=70,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[
        early_stopping(stopping_rounds=30),
        log_evaluation(period=30)
    ]
)


[LightGBM] [Info] Number of positive: 63, number of negative: 328
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8413
[LightGBM] [Info] Number of data points in the train set: 391, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 30 rounds
[30]	valid_0's auc: 0.596154	valid_0's binary_logloss: 0.575206
Early stopping, best iteration is:
[1]	valid_0's auc: 0.609776	valid_0's binary_logloss: 0.683881


In [13]:
from sklearn.metrics import roc_auc_score
import numpy as np

def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    gini  = gini_from_auc(y_true, y_score)
    rec10 = recall_at_k(y_true, y_score, 0.10)
    lift10 = lift_at_k(y_true, y_score, 0.10)
    score = 0.4*gini + 0.3*rec10 + 0.3*lift10
    return {
        "Gini": round(gini, 4),
        "Recall@10%": round(rec10, 4),
        "Lift@10%": round(lift10, 4),
        "CompetitionScore": round(score, 4)
    }


In [14]:
valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä MINI TEST SONU√áLARI:")
for k, v in metrics.items():
    print(f"  {k}: {v}")



üìä MINI TEST SONU√áLARI:
  Gini: 0.2196
  Recall@10%: 0.1538
  Lift@10%: 1.5245
  CompetitionScore: 0.5913


In [15]:
# === FULL DATA FEATURE BUILD ===
BATCH = 20000
keys = ref[["cust_id","ref_date","churn"]].sort_values("ref_date").reset_index(drop=True)
parts = []
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    keys_part = keys.iloc[s:e][["cust_id","ref_date"]]
    feats_part = build_features_for_keys(hist, keys_part, n_limit=None)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")
X_full = pd.concat(parts, ignore_index=True)

# === CLEAN + ENCODE ===
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols: X_full[c] = X_full[c].fillna("Unknown")
X_full_enc = pd.get_dummies(X_full, columns=cat_cols, drop_first=True)

# === TIME SPLIT ===
TARGET = "churn"
PROTECTED = ["cust_id","ref_date",TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]
cut = X_full_enc["ref_date"].quantile(0.80)
train_mask = X_full_enc["ref_date"] < cut
valid_mask = ~train_mask
X_train = X_full_enc.loc[train_mask, feat_cols].fillna(0)
y_train = X_full_enc.loc[train_mask, TARGET].astype(int).values
X_valid = X_full_enc.loc[valid_mask, feat_cols].fillna(0)
y_valid = X_full_enc.loc[valid_mask, TARGET].astype(int).values

# === MODEL (hedef ~1.22‚Äì1.27) ===
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
clf = LGBMClassifier(
    n_estimators=850,
    learning_rate=0.04,
    max_depth=8,
    num_leaves=95,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=50)]
)

# === VALIDATION SCORE ===
valid_proba = clf.predict_proba(X_valid)[:, 1]

from sklearn.metrics import roc_auc_score
import numpy as np
def gini_from_auc(y_true, y_score): return 2*roc_auc_score(y_true, y_score)-1
def recall_at_k(y_true, y_score, k=0.10):
    n = len(y_true); topk = int(np.ceil(n*k)); idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum())/float(y_true.sum()+1e-9)
def lift_at_k(y_true, y_score, k=0.10):
    n = len(y_true); topk = int(np.ceil(n*k)); idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean()); prev = float(y_true.mean()); return prec/max(prev,1e-9)
gini = round(gini_from_auc(y_valid, valid_proba),4)
rec10 = round(recall_at_k(y_valid, valid_proba),4)
lift10 = round(lift_at_k(y_valid, valid_proba),4)
score = round(0.4*gini + 0.3*rec10 + 0.3*lift10,4)
print({"Gini": gini, "Recall@10%": rec10, "Lift@10%": lift10, "CompetitionScore": score})


[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 20000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 40000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 60000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 80000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 100000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 120000/133287
[features] 5000/13287
[features] 10000/13287
[train] done: 133287/133287
[LightGBM] [Info] Number of positive: 15178, number of negative: 88713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044603 seconds.
You can set `force_col_wise=true` to remov

In [17]:
# ============================================================
# ING Datathon - Tam Pipeline (Mini veya Full)  ‚Äî by Zeynep
# ============================================================

# 0) K√úT√úPHANELER
import os, glob
import numpy as np
import pandas as pd

# ------------------------------------------------------------
# 1) YARDIMCI: Dosya Bulucu ve Y√ºkleme
# ------------------------------------------------------------
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)

# ------------------------------------------------------------
# 2) √ñZELLƒ∞K √úRETME BAZI AYARLAR
# ------------------------------------------------------------
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)

WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce"); s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]
    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")

    eps = 1e-6
    # Oranlar
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    # Trend diff/ratio
    for short,long in [(1,3),(3,6),(6,12)]:
        for base in ["cc_transaction_all_amt","cc_transaction_all_cnt","mobile_eft_all_amt","mobile_eft_all_cnt","active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean",0.0)
            ml = feat.get(f"{base}_L{long}M_mean",0.0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"]  = ms-ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms/(ml+eps)

    # --- YENƒ∞ G√ú√áL√ú FEATURE'LAR ---
    # 1) ƒ∞≈ülem sƒ±klƒ±ƒüƒ± trendi (frekans deƒüi≈üimi)
    for short, long in [(1,3), (3,6), (6,12)]:
        ms = feat.get(f"cc_transaction_all_cnt_L{short}M_mean", 0.0)
        ml = feat.get(f"cc_transaction_all_cnt_L{long}M_mean", 0.0)
        feat[f"txn_freq_trend_L{short}vL{long}"] = (ms - ml) / (ml + eps)

    # 2) Recency skor (son aktiflik ‚Äì normalize)
    feat["recency_score"] = float(np.exp(-feat.get("recency_days", 9999) / 30.0))

    # 3) Sƒ±fƒ±r ay oranƒ± (aktif olmayan aylarƒ±n oranƒ±)
    active_cols = ["cc_transaction_all_cnt", "mobile_eft_all_cnt"]
    for col in active_cols:
        zero_months = sum(feat.get(f"{col}_L{w}M_sum", 0.0) == 0 for w in [1,3,6,12])
        feat[f"{col}_zero_month_ratio"] = zero_months / 4.0

    # 4) Volatilite (std/mean)
    for base in ["cc_transaction_all_amt", "mobile_eft_all_amt"]:
        for w in [3,6,12]:
            std = feat.get(f"{base}_L{w}M_std", 0.0)
            mean = feat.get(f"{base}_L{w}M_mean", 0.0)
            feat[f"{base}_volatility_L{w}M"] = std / (mean + eps)

    # 5) EFT/Kart oranlarƒ± (ek sinyal)
    for w in WINDOWS:
        feat[f"eft_to_cc_amt_L{w}M_ratio"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0) / (feat.get(f"cc_transaction_all_amt_L{w}M_sum",0) + eps)
        feat[f"eft_to_cc_cnt_L{w}M_ratio"] = feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0) / (feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0) + eps)

    # Growth (kƒ±sa vs uzun)
    for short, long in [(1,3), (3,6), (6,12)]:
        for base in ["cc_transaction_all_amt", "mobile_eft_all_amt"]:
            ms = feat.get(f"{base}_L{short}M_sum", 0.0)
            ml = feat.get(f"{base}_L{long}M_sum", 0.0)
            feat[f"{base}_growth_L{short}vL{long}"] = (ms - ml) / (ml + eps)

    return feat

def build_features_for_keys(hist_df, keys_df, n_limit=None, progress_every=5000):
    if n_limit is not None:
        keys_df = keys_df.head(n_limit).copy()
    h = hist_df
    cache = {}
    rows = []
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        if cid in cache: h_cust = cache[cid]
        else:
            h_cust = h[h["cust_id"] == cid]; cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd); f["cust_id"]=cid; rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# ------------------------------------------------------------
# 3) METRƒ∞KLER (Competition Score)
# ------------------------------------------------------------
from sklearn.metrics import roc_auc_score

def gini_from_auc(y_true, y_score): return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    n = len(y_true); topk = int(np.ceil(n * k)); idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    n = len(y_true); topk = int(np.ceil(n * k)); idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean()); prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    gini  = gini_from_auc(y_true, y_score)
    rec10 = recall_at_k(y_true, y_score, 0.10)
    lift10= lift_at_k(y_true, y_score, 0.10)
    score = 0.4*gini + 0.3*rec10 + 0.3*lift10
    return {"Gini": round(gini,4), "Recall@10%": round(rec10,4), "Lift@10%": round(lift10,4), "CompetitionScore": round(score,4)}

# ------------------------------------------------------------
# 4) √áALI≈ûTIRMA MODU
#    "mini": hƒ±zlƒ± test (√∂r. 2000 m√º≈üteri)
#    "full": t√ºm veri (ref tamamƒ±)
# ------------------------------------------------------------
RUN_MODE = "mini"   # "mini" veya "full"
SAMPLE_N = 2000     # mini modda ka√ß m√º≈üteri?

# ------------------------------------------------------------
# 5) FEATURE √úRETƒ∞Mƒ∞ (mini veya full)
# ------------------------------------------------------------
if RUN_MODE == "mini":
    keys = ref[["cust_id","ref_date","churn"]].sample(SAMPLE_N, random_state=42).reset_index(drop=True)
    print(f"üéØ MINI MOD: {len(keys)} m√º≈üteri")
else:
    keys = ref[["cust_id","ref_date","churn"]].sort_values("ref_date").reset_index(drop=True)
    print(f"üéØ FULL MOD: {len(keys)} m√º≈üteri")

parts = []
BATCH = 20000 if RUN_MODE=="full" else len(keys)
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    keys_part = keys.iloc[s:e][["cust_id","ref_date"]]
    feats_part = build_features_for_keys(hist, keys_part, n_limit=None, progress_every=5000 if RUN_MODE=="full" else 500)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")
X_full = pd.concat(parts, ignore_index=True)
print("‚úÖ Eƒüitim feature set hazƒ±r:", X_full.shape)

# ------------------------------------------------------------
# 6) TEMƒ∞ZLEME + ENCODE
# ------------------------------------------------------------
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]

X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols: X_full[c] = X_full[c].fillna("Unknown")

X_full_enc = pd.get_dummies(X_full, columns=cat_cols, drop_first=True)
print("‚úÖ Encode bitti:", X_full_enc.shape)

# ------------------------------------------------------------
# 7) ZAMANSAL B√ñLME (Train / Valid)
# ------------------------------------------------------------
TARGET = "churn"
PROTECTED = ["cust_id","ref_date",TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]

cut = X_full_enc["ref_date"].quantile(0.80)  # %80 ge√ßmi≈ü, %20 gelecek
train_mask = X_full_enc["ref_date"] < cut
valid_mask = ~train_mask

X_train = X_full_enc.loc[train_mask, feat_cols].fillna(0)
y_train = X_full_enc.loc[train_mask, TARGET].astype(int).values
X_valid = X_full_enc.loc[valid_mask, feat_cols].fillna(0)
y_valid = X_full_enc.loc[valid_mask, TARGET].astype(int).values

print("‚úÖ Split tamam | Train:", X_train.shape, "| Valid:", X_valid.shape)

# ------------------------------------------------------------
# 8) MODEL (LightGBM) ‚Äî dengesizlik ayarlƒ±
# ------------------------------------------------------------
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

pos_weight = (len(y_train) / (y_train.sum() + 1e-9))  # sƒ±nƒ±f dengesizliƒüi
clf = LGBMClassifier(
    n_estimators=900,
    learning_rate=0.035,
    max_depth=9,
    num_leaves=110,
    subsample=0.85,
    colsample_bytree=0.85,
    scale_pos_weight=pos_weight,
    class_weight="balanced",
    random_state=42
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50)
    ]
)

# ------------------------------------------------------------
# 9) VALƒ∞DATION SKORU
# ------------------------------------------------------------
valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä VALIDATION SONU√áLARI:", metrics)

# ------------------------------------------------------------
# 10) (OPSƒ∞YONEL) TEST TAHMƒ∞Nƒ∞ ve SUBMISSION
#     RUN_MODE "full" ise a√ßmak mantƒ±klƒ±.
# ------------------------------------------------------------
DO_SUBMISSION = (RUN_MODE == "full")
if DO_SUBMISSION:
    print("\nüßæ Submission hazƒ±rlanƒ±yor...")
    test_keys = test[["cust_id","ref_date"]].copy()
    X_hist_test = build_features_for_keys(hist, test_keys, n_limit=None, progress_every=5000)

    X_test = test_keys.merge(X_hist_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    num_cols_t = X_test.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols_t = [c for c in X_test.columns if c not in num_cols_t and c not in ["ref_date"]]
    X_test[num_cols_t] = X_test[num_cols_t].fillna(0)
    for c in cat_cols_t: X_test[c] = X_test[c].fillna("Unknown")

    X_test_enc = pd.get_dummies(X_test, columns=cat_cols_t, drop_first=True)

    # Eƒüitimdeki kolonlara hizala
    TRAIN_FEATS = X_train.columns.tolist()
    X_test_enc = X_test_enc.reindex(columns=TRAIN_FEATS, fill_value=0)

    test_proba = clf.predict_proba(X_test_enc)[:, 1]
    out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
    out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
    out.to_csv("submission.csv", index=False, float_format="%.6f")
    print("‚úÖ submission.csv yazƒ±ldƒ±. Satƒ±r sayƒ±sƒ±:", out.shape)


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
üéØ MINI MOD: 2000 m√º≈üteri
[features] 500/2000
[features] 1000/2000
[features] 1500/2000
[features] 2000/2000
[train] done: 2000/2000
‚úÖ Eƒüitim feature set hazƒ±r: (2000, 155)
‚úÖ Encode bitti: (2000, 178)
‚úÖ Split tamam | Train: (1558, 175) | Valid: (442, 175)
[LightGBM] [Info] Number of positive: 223, n

In [20]:
# === MINI MODEL (2000 m√º≈üteri) ‚Äî D√ºzeltmeli Versiyon ===
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import numpy as np, pandas as pd

# Eƒüer X_full_enc daha √∂nce √ºretildiyse devam, yoksa yeniden olu≈ütur:
if "X_full_enc" not in locals():
    raise ValueError("L√ºtfen √∂nce feature √ºretim kƒ±smƒ±nƒ± √ßalƒ±≈ütƒ±r (X_full_enc olu≈üturulmalƒ±).")

# === 1Ô∏è‚É£ Float d√∂n√º≈ü√ºm√º (kritik d√ºzeltme)
# === 1Ô∏è‚É£ Float d√∂n√º≈ü√ºm√º (kritik d√ºzeltme)
X_full_enc = X_full_enc.drop(columns=["ref_date"], errors="ignore").astype(float)


# === 2Ô∏è‚É£ Train / Valid ayƒ±rma
TARGET = "churn"
PROTECTED = ["cust_id","ref_date",TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]

cut = X_full["ref_date"].quantile(0.80)
train_mask = X_full["ref_date"] < cut
valid_mask = ~train_mask


X_train = X_full_enc.loc[train_mask, feat_cols].fillna(0)
y_train = X_full_enc.loc[train_mask, TARGET].astype(int).values
X_valid = X_full_enc.loc[valid_mask, feat_cols].fillna(0)
y_valid = X_full_enc.loc[valid_mask, TARGET].astype(int).values

print("‚úÖ Train/Valid b√∂l√ºm√º tamam:")
print("Train:", X_train.shape, "| Valid:", X_valid.shape)

# === 3Ô∏è‚É£ Hafif parametreli LightGBM
pos_weight = (len(y_train)/(y_train.sum()+1e-9))
clf = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    num_leaves=40,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=pos_weight,
    class_weight="balanced",
    random_state=42
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=30)]
)

# === 4Ô∏è‚É£ Skor hesaplama fonksiyonlarƒ±
from sklearn.metrics import roc_auc_score
def gini_from_auc(y_true, y_score): return 2 * roc_auc_score(y_true, y_score) - 1
def recall_at_k(y_true, y_score, k=0.10):
    n=len(y_true); topk=int(np.ceil(n*k)); idx=np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum())/float(y_true.sum()+1e-9)
def lift_at_k(y_true, y_score, k=0.10):
    n=len(y_true); topk=int(np.ceil(n*k)); idx=np.argsort(-y_score)[:topk]
    prec=float(y_true[idx].mean()); prev=float(y_true.mean()); return prec/max(prev,1e-9)
def competition_score(y_true, y_score):
    g=gini_from_auc(y_true,y_score); r=recall_at_k(y_true,y_score); l=lift_at_k(y_true,y_score)
    s=0.4*g+0.3*r+0.3*l
    return {"Gini":round(g,4),"Recall@10%":round(r,4),"Lift@10%":round(l,4),"CompetitionScore":round(s,4)}

# === 5Ô∏è‚É£ Tahmin ve sonu√ß
valid_proba = clf.predict_proba(X_valid)[:,1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä LEVEL 3 - MINI MODEL SONU√áLARI:")
for k, v in metrics.items():
    print(f"  {k}: {v}")


‚úÖ Train/Valid b√∂l√ºm√º tamam:
Train: (1558, 175) | Valid: (442, 175)
[LightGBM] [Info] Number of positive: 223, number of negative: 1335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25220
[LightGBM] [Info] Number of data points in the train set: 1558, number of used features: 170
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 30 rounds
[30]	valid_0's auc: 0.610746	valid_0's binary_logloss: 1.01247
Early stopping, best iteration is:
[1]	valid_0's auc: 0.613695	valid_0's binary_logloss: 0.70776

üìä LEVEL 3 - MINI MODEL SONU√áLARI:
  Gini: 0.2274
  Recall@10%: 0.1912
  Lift@10%: 1.8778
  CompetitionScore: 0.7116


In [21]:
# === 1Ô∏è‚É£ Tarih aralƒ±klarƒ±nƒ± kontrol et ===
print("üîπ Tarih aralƒ±klarƒ±:")
print("  hist date min:", hist["date"].min())
print("  hist date max:", hist["date"].max())
print("  ref ref_date min:", ref["ref_date"].min())
print("  ref ref_date max:", ref["ref_date"].max())

# === 2Ô∏è‚É£ Tarih tiplerini kontrol et ===
print("\nüîπ Tarih veri tipleri:")
print("  hist['date'] dtype:", hist["date"].dtype)
print("  ref['ref_date'] dtype:", ref["ref_date"].dtype)

# === 3Ô∏è‚É£ √ñrnek m√º≈üteri i√ßin feature √ºretimi ===
cid = ref["cust_id"].iloc[0]
rd = ref["ref_date"].iloc[0]
print(f"\nüîπ √ñrnek m√º≈üteri ID: {cid} | Referans tarihi: {rd}")

feat_example = aggregate_for_one(hist[hist["cust_id"] == cid], rd)
print("\nüîπ aggregate_for_one √ßƒ±ktƒ±sƒ± (ilk 10 √∂zellik):")
for k, v in list(feat_example.items())[:10]:
    print(f"  {k}: {v}")

print(f"\nüî∏ Toplam {len(feat_example)} √∂zellik √ºretildi.")
print("‚úÖ Kontrol tamam.")


üîπ Tarih aralƒ±klarƒ±:
  hist date min: 2016-01-01 00:00:00
  hist date max: 2019-06-01 00:00:00
  ref ref_date min: 2017-07-01 00:00:00
  ref ref_date max: 2018-12-01 00:00:00

üîπ Tarih veri tipleri:
  hist['date'] dtype: datetime64[ns]
  ref['ref_date'] dtype: datetime64[ns]

üîπ √ñrnek m√º≈üteri ID: 0 | Referans tarihi: 2017-09-01 00:00:00

üîπ aggregate_for_one √ßƒ±ktƒ±sƒ± (ilk 10 √∂zellik):
  ref_date: 2017-09-01 00:00:00
  recency_days: 31
  mobile_eft_all_cnt_L1M_sum: 2.0
  mobile_eft_all_cnt_L1M_mean: 2.0
  mobile_eft_all_cnt_L1M_std: 0.0
  mobile_eft_all_cnt_L1M_max: 2.0
  active_product_category_nbr_L1M_sum: 2.0
  active_product_category_nbr_L1M_mean: 2.0
  active_product_category_nbr_L1M_std: 0.0
  active_product_category_nbr_L1M_max: 2.0

üî∏ Toplam 146 √∂zellik √ºretildi.
‚úÖ Kontrol tamam.


In [23]:
# ============================================================
# üß† ING Datathon ‚Äî Final Model (Fixed + Optimized Full Pipeline)
# ============================================================

# === 0) K√úT√úPHANELER ===
import os, glob, gc
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.
metrics import roc_auc_score

# === 1) DOSYA Y√úKLEME ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)

# === 2) √ñZELLƒ∞K √úRETME AYARLARI ===
HIST_COLS = [
    "cust_id", "date",
    "mobile_eft_all_cnt", "active_product_category_nbr",
    "mobile_eft_all_amt", "cc_transaction_all_amt", "cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id", "date"]).reset_index(drop=True)
WINDOWS = [1, 3, 6, 12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id", "date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce")
    s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn == "sum":  return float(s.sum())
    if fn == "mean": return float(s.mean())
    if fn == "std":  return float(s.std(ddof=0))
    if fn == "max":  return float(s.max())
    return 0.0

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]
    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"] = feat[f"{col}_L{w}M_mean"] = feat[f"{col}_L{w}M_std"] = feat[f"{col}_L{w}M_max"] = 0.0
        feat["recency_days"] = 9999
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")

    eps = 1e-6
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum", 0) / (feat.get(f"cc_transaction_all_cnt_L{w}M_sum", 0) + eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum", 0) / (feat.get(f"mobile_eft_all_cnt_L{w}M_sum", 0) + eps)

    # Trend ve farklar
    for short, long in [(1, 3), (3, 6), (6, 12)]:
        for base in ["cc_transaction_all_amt", "cc_transaction_all_cnt", "mobile_eft_all_amt", "mobile_eft_all_cnt", "active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean", 0)
            ml = feat.get(f"{base}_L{long}M_mean", 0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"] = ms - ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms / (ml + eps)

    # Ek g√º√ßl√º feature'lar
    for short, long in [(1, 3), (3, 6), (6, 12)]:
        ms = feat.get(f"cc_transaction_all_cnt_L{short}M_mean", 0)
        ml = feat.get(f"cc_transaction_all_cnt_L{long}M_mean", 0)
        feat[f"txn_freq_trend_L{short}vL{long}"] = (ms - ml) / (ml + eps)
    feat["recency_score"] = float(np.exp(-feat.get("recency_days", 9999) / 30.0))
    for col in ["cc_transaction_all_cnt", "mobile_eft_all_cnt"]:
        zero_months = sum(feat.get(f"{col}_L{w}M_sum", 0) == 0 for w in [1, 3, 6, 12])
        feat[f"{col}_zero_month_ratio"] = zero_months / 4.0
    for base in ["cc_transaction_all_amt", "mobile_eft_all_amt"]:
        for w in [3, 6, 12]:
            std = feat.get(f"{base}_L{w}M_std", 0)
            mean = feat.get(f"{base}_L{w}M_mean", 0)
            feat[f"{base}_volatility_L{w}M"] = std / (mean + eps)
    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=5000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd)
        f["cust_id"] = cid
        rows.append(f)
        if progress_every and i % progress_every == 0:
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# === 3) METRƒ∞KLER (d√ºzeltilmi≈ü) ===
def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    if isinstance(y_true, pd.Series):
        y_true = y_true.reset_index(drop=True)
    g = gini_from_auc(y_true, y_score)
    r = recall_at_k(y_true, y_score)
    l = lift_at_k(y_true, y_score)
    s = 0.4 * g + 0.3 * r + 0.3 * l
    return {"Gini": round(g, 4), "Recall@10%": round(r, 4), "Lift@10%": round(l, 4), "CompetitionScore": round(s, 4)}

# === 4) MOD SE√áƒ∞Mƒ∞ ===
RUN_MODE = "full"   # "mini" veya "full"
SAMPLE_N = 2000
keys = ref[["cust_id", "ref_date", "churn"]]
if RUN_MODE == "mini":
    keys = keys.sample(SAMPLE_N, random_state=42)
keys = keys.sort_values("ref_date").reset_index(drop=True)
print(f"üéØ √áalƒ±≈üma modu: {RUN_MODE} | M√º≈üteri sayƒ±sƒ±: {len(keys)}")

# === 5) FEATURE √úRETƒ∞Mƒ∞ ===
parts = []
BATCH = 20000 if RUN_MODE == "full" else len(keys)
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    feats_part = build_features_for_keys(hist, keys.iloc[s:e][["cust_id", "ref_date"]], progress_every=5000 if RUN_MODE == "full" else 500)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id", "ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")

X_full = pd.concat(parts, ignore_index=True)
del parts; gc.collect()
print("‚úÖ Eƒüitim seti:", X_full.shape)

# === 6) ENCODE + TEMƒ∞ZLE (RAM optimize) ===
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols:
    X_full[c] = X_full[c].fillna("Unknown")

X_full_enc = pd.get_dummies(X_full.drop(columns=["ref_date"]), columns=cat_cols, drop_first=True, dtype=np.float32)
X_full_enc = X_full_enc.astype(np.float32)
print("‚úÖ Encode bitti:", X_full_enc.shape)

# === 7) TRAIN/VALID SPLIT ===
cut = X_full["ref_date"].quantile(0.80)
train_mask = X_full["ref_date"] < cut
valid_mask = ~train_mask
TARGET = "churn"
PROTECTED = ["cust_id", TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]
X_train = X_full_enc.loc[train_mask, feat_cols]
y_train = X_full.loc[train_mask, TARGET].astype(int)
X_valid = X_full_enc.loc[valid_mask, feat_cols]
y_valid = X_full.loc[valid_mask, TARGET].astype(int)
print("‚úÖ Split tamam | Train:", X_train.shape, "| Valid:", X_valid.shape)

# === 8) MODEL ===
pos_weight = len(y_train) / (y_train.sum() + 1e-9)
clf = LGBMClassifier(
    n_estimators=900, learning_rate=0.035, max_depth=9, num_leaves=110,
    subsample=0.85, colsample_bytree=0.85, scale_pos_weight=pos_weight,
    random_state=42
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=50)]
)

# === 9) SKOR ===
valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä VALIDATION SONU√áLARI:", metrics)

# === 10) SUBMISSION (full modda) ===
if RUN_MODE == "full":
    print("\nüßæ Submission hazƒ±rlanƒ±yor...")
    test_keys = test[["cust_id", "ref_date"]]
    X_hist_test = build_features_for_keys(hist, test_keys, progress_every=5000)
    X_test = test_keys.merge(X_hist_test, on=["cust_id", "ref_date"], how="left").merge(cust, on="cust_id", how="left")
    num_cols_t = X_test.select_dtypes(include=[np.number]).columns
    cat_cols_t = [c for c in X_test.columns if c not in num_cols_t and c != "ref_date"]
    X_test[num_cols_t] = X_test[num_cols_t].fillna(0)
    for c in cat_cols_t:
        X_test[c] = X_test[c].fillna("Unknown")
    X_test_enc = pd.get_dummies(X_test.drop(columns=["ref_date"]), columns=cat_cols_t, drop_first=True, dtype=np.float32)
    X_test_enc = X_test_enc.reindex(columns=X_train.columns, fill_value=0)
    test_proba = clf.predict_proba(X_test_enc)[:, 1]
    
    
    out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
    out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
    out.to_csv("submission.csv", index=False, float_format="%.6f")
    print("‚úÖ submission.csv yazƒ±ldƒ±! Satƒ±r sayƒ±sƒ±:", out.shape)


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
üéØ √áalƒ±≈üma modu: full | M√º≈üteri sayƒ±sƒ±: 133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 20000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 40000/133287
[features] 5000/2000

In [24]:
# ============================================================
# üß† ING Datathon ‚Äî Optimized Pipeline (Mini-first, then Full)
#  - Mini mod: hƒ±zlƒ± deney, submission yok
#  - Full mod : t√ºm veri + submission.csv
#  - Ama√ß: CompetitionScore'u stabilize edip 1.22+ seviyesine yakla≈ütƒ±rmak
# ============================================================

# === 0) K√úT√úPHANELER ===
import os, glob, gc
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

pd.set_option("display.width", 180)
pd.set_option("display.max_columns", 200)

# === 1) DOSYA Y√úKLEME ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)

# === 2) √ñZELLƒ∞K √úRETME ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce")
    s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]

    # ge√ßmi≈ü yoksa sƒ±fƒ±rla
    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")

    eps = 1e-6
    # ƒ∞≈ülem ba≈üƒ±na tutar oranlarƒ±
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    # Kƒ±sa vs uzun d√∂nem TREND (yalnƒ±zca mean tabanlƒ±, sadele≈ütirilmi≈ü)
    for short,long in [(1,3),(3,6),(6,12)]:
        for base in ["cc_transaction_all_amt","cc_transaction_all_cnt","mobile_eft_all_amt","mobile_eft_all_cnt","active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean",0.0)
            ml = feat.get(f"{base}_L{long}M_mean",0.0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"]  = ms-ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms/(ml+eps)

    # Recency d√∂n√º≈ü√ºm√º (daha stabil)
    feat["recency_log"] = float(np.log1p(feat.get("recency_days", 9999)))

    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=5000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): 
            continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd)
        f["cust_id"] = cid
        rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# === 3) METRƒ∞KLER ===
def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    g = gini_from_auc(y_true, y_score)
    r = recall_at_k(y_true, y_score)
    l = lift_at_k(y_true, y_score)
    s = 0.4*g + 0.3*r + 0.3*l
    return {"Gini": round(g, 4), "Recall@10%": round(r, 4), "Lift@10%": round(l, 4), "CompetitionScore": round(s, 4)}

# === 4) √áALI≈ûMA MODU (√∂nce mini!) ===
# "mini": hƒ±zlƒ±, submission yok | "full": t√ºm veri, submission var
RUN_MODE = "mini"
SAMPLE_N = 8000  # mini i√ßin yeterli bir √∂rnek b√ºy√ºkl√ºƒü√º (RAM ve hƒ±z i√ßin)
LEVEL1 = False   # True yaparsan: en eski %20 tarih dilimi (level-1 benzeri)
print(f"üéØ √áalƒ±≈üma modu: {RUN_MODE} | LEVEL1: {LEVEL1}")

# === 5) TRAIN KEYS ===
keys = ref[["cust_id","ref_date","churn"]].copy()

# Level-1 tarzƒ±: daha eski tarihlerden bir dilim se√ß (data leakage riskini azaltƒ±r)
if LEVEL1:
    cut_lev = keys["ref_date"].quantile(0.20)
    keys = keys[keys["ref_date"] <= cut_lev].copy()

# Mini mod: rastgele deƒüil, zaman sƒ±rasƒ±nƒ± koruyarak uniform √∂rnekle
keys = keys.sort_values("ref_date").reset_index(drop=True)
if RUN_MODE == "mini" and len(keys) > SAMPLE_N:
    # Zamanƒ± bozmadan yakla≈üƒ±k e≈üit aralƒ±klarla √∂rnekle
    idx = np.linspace(0, len(keys)-1, SAMPLE_N).round().astype(int)
    keys = keys.iloc[idx].reset_index(drop=True)

print(f"üóùÔ∏è Anahtar sayƒ±sƒ± (cust_id, ref_date): {len(keys)}")

# === 6) FEATURE √úRETƒ∞Mƒ∞ ===
parts = []
BATCH = 20000 if RUN_MODE=="full" else len(keys)
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    feats_part = build_features_for_keys(hist, keys.iloc[s:e][["cust_id","ref_date"]], progress_every=2000 if RUN_MODE=="mini" else 5000)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")

X_full = pd.concat(parts, ignore_index=True)
del parts; gc.collect()
print("‚úÖ Eƒüitim seti hazƒ±r:", X_full.shape)

# === 7) ENCODE + EKSƒ∞K DOLDUR ===
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols:
    X_full[c] = X_full[c].fillna("Unknown")

# D√∂n√º≈ü√ºm: ref_date'i encode'a sokma
X_full_enc = pd.get_dummies(
    X_full.drop(columns=["ref_date"]),
    columns=cat_cols, drop_first=True
)  # float64 kalsƒ±n (precision i√ßin)

print("‚úÖ Encode bitti:", X_full_enc.shape)

# === 8) ZAMAN BAZLI TRAIN/VALID SPLIT ===
cut = X_full["ref_date"].quantile(0.80)
train_mask = X_full["ref_date"] < cut
valid_mask = ~train_mask

TARGET = "churn"
PROTECTED = ["cust_id", TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]

X_train = X_full_enc.loc[train_mask, feat_cols]
y_train = X_full.loc[train_mask, TARGET].astype(int)
X_valid = X_full_enc.loc[valid_mask, feat_cols]
y_valid = X_full.loc[valid_mask, TARGET].astype(int)

print("‚úÖ Split tamam | Train:", X_train.shape, "| Valid:", X_valid.shape)

# === 9) MODEL (overfit azaltƒ±lmƒ±≈ü) ===
clf = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.04,
    max_depth=7,
    num_leaves=72,
    subsample=0.85,
    colsample_bytree=0.80,
    min_child_samples=50,
    class_weight="balanced",  # scale_pos_weight yerine
    random_state=42
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
)

# === 10) DOƒûRULAMA SKORLARI ===
valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä VALIDATION SONU√áLARI:", metrics)

# === 11) FEATURE IMPORTANCE ‚Äî opsiyonel analiz (mini modda bak)
try:
    imp = pd.DataFrame({"feature": clf.feature_name_, "importance": clf.feature_importances_}).sort_values("importance", ascending=False)
    print("\nüèÖ En √∂nemli 20 deƒüi≈üken:")
    print(imp.head(20))
except Exception as e:
    print("‚ö†Ô∏è Importance okunamadƒ±:", e)

# === 12) SUBMISSION (sadece full modda) ===
if RUN_MODE == "full":
    print("\nüßæ Submission hazƒ±rlanƒ±yor...")
    test_keys = test[["cust_id","ref_date"]].copy()
    X_hist_test = build_features_for_keys(hist, test_keys, progress_every=5000)

    X_test = test_keys.merge(X_hist_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    num_cols_t = X_test.select_dtypes(include=[np.number]).columns
    cat_cols_t = [c for c in X_test.columns if c not in num_cols_t and c != "ref_date"]

    X_test[num_cols_t] = X_test[num_cols_t].fillna(0)
    for c in cat_cols_t:
        X_test[c] = X_test[c].fillna("Unknown")

    X_test_enc = pd.get_dummies(X_test.drop(columns=["ref_date"]), columns=cat_cols_t, drop_first=True)

    # Eƒüitimdeki kolonlarla tam hizala
    TRAIN_FEATS = X_train.columns.tolist()
    X_test_enc = X_test_enc.reindex(columns=TRAIN_FEATS, fill_value=0)

    test_proba = clf.predict_proba(X_test_enc)[:, 1]
    out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
    out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
    out.to_csv("submission.csv", index=False, float_format="%.6f")
    print("‚úÖ submission.csv yazƒ±ldƒ±! Satƒ±r sayƒ±sƒ±:", out.shape)

print("\nüéâ Bitti! RUN_MODE='mini' ile hƒ±zlƒ± skor aldƒ±n. ƒ∞√ßine sinerse RUN_MODE='full' yapƒ±p submission √ºretebilirsin.")


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
üéØ √áalƒ±≈üma modu: mini | LEVEL1: False
üóùÔ∏è Anahtar sayƒ±sƒ± (cust_id, ref_date): 8000
[features] 2000/8000
[features] 4000/8000
[features] 6000/8000
[features] 8000/8000
[train] done: 8000/8000
‚úÖ Eƒüitim seti hazƒ±r: (8000, 130)
‚úÖ Encode bitti: (8000, 152)
‚úÖ Split tamam | Train: (6235, 150) | Vali

In [25]:
# ============================================================
# üß† ING Datathon ‚Äî Mini-Boost Pipeline (‚â•0.90 hedefli mini)
# ============================================================

# === 0) K√úT√úPHANELER ===
import os, glob, gc
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

pd.set_option("display.width", 180)
pd.set_option("display.max_columns", 200)

# === 1) DOSYA Y√úKLEME ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF      = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST     = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB      = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

# Kolon adlarƒ±nda bo≈üluk varsa temizle (LGBM uyarƒ±sƒ±nƒ± keser)
cust.columns = cust.columns.str.strip()
hist.columns = hist.columns.str.strip()

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)

# === 2) √ñZELLƒ∞K √úRETƒ∞Mƒ∞ ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce")
    s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def _months_active(frame: pd.DataFrame) -> int:
    if frame.empty: return 0
    cond = (pd.to_numeric(frame.get("cc_transaction_all_cnt", 0), errors="coerce").fillna(0) > 0) | \
           (pd.to_numeric(frame.get("mobile_eft_all_cnt", 0), errors="coerce").fillna(0) > 0)
    return int(cond.sum())

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]

    # ge√ßmi≈ü yoksa sƒ±fƒ±rla
    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
        # activity & shares & momentum default
        for w in WINDOWS:
            feat[f"months_active_L{w}M"] = 0
            feat[f"inactivity_ratio_L{w}M"] = 1.0
            feat[f"cc_share_amt_L{w}M"] = 0.0
            feat[f"eft_share_amt_L{w}M"] = 0.0
            # momentum L1 vs L3-L6
        feat["cc_amt_momentum_L1v3m"] = 0.0
        feat["cc_amt_momentum_L1v6m"] = 0.0
        feat["eft_amt_momentum_L1v3m"] = 0.0
        feat["eft_amt_momentum_L1v6m"] = 0.0
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")
            # activity ve inactivity
            ma = _months_active(hw)
            feat[f"months_active_L{w}M"] = float(ma)
            feat[f"inactivity_ratio_L{w}M"] = float(max(w - ma, 0)) / float(w)

        # Kanal paylarƒ±
        eps = 1e-6
        for w in WINDOWS:
            cc_sum  = feat.get(f"cc_transaction_all_amt_L{w}M_sum", 0.0)
            eft_sum = feat.get(f"mobile_eft_all_amt_L{w}M_sum", 0.0)
            tot = cc_sum + eft_sum
            feat[f"cc_share_amt_L{w}M"]  = cc_sum / (tot + eps)
            feat[f"eft_share_amt_L{w}M"] = eft_sum / (tot + eps)

        # Momentum: son 1 ay toplamƒ± / son 3-6 ay ortalamasƒ±
        L1_cc = feat.get("cc_transaction_all_amt_L1M_sum", 0.0)
        L1_eft= feat.get("mobile_eft_all_amt_L1M_sum", 0.0)
        L3_ccm= feat.get("cc_transaction_all_amt_L3M_mean", 0.0)
        L6_ccm= feat.get("cc_transaction_all_amt_L6M_mean", 0.0)
        L3_eftm=feat.get("mobile_eft_all_amt_L3M_mean", 0.0)
        L6_eftm=feat.get("mobile_eft_all_amt_L6M_mean", 0.0)
        eps = 1e-6
        feat["cc_amt_momentum_L1v3m"]  = L1_cc  / (L3_ccm  + eps)
        feat["cc_amt_momentum_L1v6m"]  = L1_cc  / (L6_ccm  + eps)
        feat["eft_amt_momentum_L1v3m"] = L1_eft / (L3_eftm + eps)
        feat["eft_amt_momentum_L1v6m"] = L1_eft / (L6_eftm + eps)

    # ƒ∞≈ülem ba≈üƒ±na tutar oranlarƒ±
    eps = 1e-6
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    # Kƒ±sa vs uzun d√∂nem TREND (yalnƒ±zca mean bazlƒ±)
    for short,long in [(1,3),(3,6),(6,12)]:
        for base in ["cc_transaction_all_amt","cc_transaction_all_cnt","mobile_eft_all_amt","mobile_eft_all_cnt","active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean",0.0)
            ml = feat.get(f"{base}_L{long}M_mean",0.0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"]  = ms-ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms/(ml+eps)

    # Recency d√∂n√º≈ü√ºm√º
    feat["recency_log"] = float(np.log1p(feat.get("recency_days", 9999)))

    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=5000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): 
            continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd)
        f["cust_id"] = cid
        rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# === 3) METRƒ∞KLER ===
def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    g = gini_from_auc(y_true, y_score)
    r = recall_at_k(y_true, y_score)
    l = lift_at_k(y_true, y_score)
    s = 0.4*g + 0.3*r + 0.3*l
    return {"Gini": round(g, 4), "Recall@10%": round(r, 4), "Lift@10%": round(l, 4), "CompetitionScore": round(s, 4)}

# === 4) MOD & √ñRNEKLEME (mini + recent focus) ===
RUN_MODE = "mini"         # mini -> hƒ±zlƒ± test | full -> t√ºm veri + submission
SAMPLE_N = 9000           # 8k-12k arasƒ± iyi
FOCUS_RECENT = True       # mini skoru y√ºkseltir
RECENT_FRAC  = 0.50       # son %50 zaman dilimi i√ßinden √∂rnekle

keys = ref[["cust_id","ref_date","churn"]].copy()
keys = keys.sort_values("ref_date").reset_index(drop=True)

if RUN_MODE == "mini":
    if FOCUS_RECENT:
        q = keys["ref_date"].quantile(1.0 - RECENT_FRAC)
        keys_recent = keys[keys["ref_date"] >= q].reset_index(drop=True)
        if len(keys_recent) > SAMPLE_N:
            idx = np.linspace(0, len(keys_recent)-1, SAMPLE_N).round().astype(int)
            keys = keys_recent.iloc[idx].reset_index(drop=True)
        else:
            keys = keys_recent
    else:
        if len(keys) > SAMPLE_N:
            idx = np.linspace(0, len(keys)-1, SAMPLE_N).round().astype(int)
            keys = keys.iloc[idx].reset_index(drop=True)

print(f"üéØ √áalƒ±≈üma modu: {RUN_MODE} | FOCUS_RECENT: {FOCUS_RECENT} | keys: {len(keys)}")

# === 5) FEATURE √úRETƒ∞Mƒ∞ ===
parts = []
BATCH = 20000 if RUN_MODE=="full" else len(keys)
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    feats_part = build_features_for_keys(hist, keys.iloc[s:e][["cust_id","ref_date"]], progress_every=2000 if RUN_MODE=="mini" else 5000)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")

X_full = pd.concat(parts, ignore_index=True)
del parts; gc.collect()
print("‚úÖ Eƒüitim seti hazƒ±r:", X_full.shape)

# === 6) ENCODE + EKSƒ∞K DOLDUR ===
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols:
    X_full[c] = X_full[c].fillna("Unknown")

X_full_enc = pd.get_dummies(
    X_full.drop(columns=["ref_date"]),
    columns=cat_cols, drop_first=True
)
# LightGBM uyarƒ±sƒ±nƒ± kes
X_full_enc.columns = X_full_enc.columns.str.replace(r"\s+", "_", regex=True)

# === 7) ZAMAN BAZLI TRAIN/VALID SPLIT ===
cut = X_full["ref_date"].quantile(0.80)
train_mask = X_full["ref_date"] < cut
valid_mask = ~train_mask

TARGET = "churn"
PROTECTED = ["cust_id", TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]

X_train = X_full_enc.loc[train_mask, feat_cols]
y_train = X_full.loc[train_mask, TARGET].astype(int)
X_valid = X_full_enc.loc[valid_mask, feat_cols]
y_valid = X_full.loc[valid_mask, TARGET].astype(int)

print("‚úÖ Split | Train:", X_train.shape, "| Valid:", X_valid.shape)

# === 8) MODEL (ranking-friendly) ===
clf = LGBMClassifier(
    objective="binary",
    n_estimators=800,
    learning_rate=0.035,
    max_depth=8,
    num_leaves=96,
    subsample=0.90,
    colsample_bytree=0.75,
    min_child_samples=20,     # daha esnek b√∂l√ºnme
    reg_alpha=0.10,           # hafif L1
    reg_lambda=1.00,          # L2
    class_weight="balanced",  # scale_pos_weight yerine
    random_state=42
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
)

# === 9) VALIDATION SKOR ===
valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä VALIDATION SONU√áLARI:", metrics)

# === 10) FULL MOD SUBMISSION ===
if RUN_MODE == "full":
    print("\nüßæ Submission hazƒ±rlanƒ±yor...")
    test_keys = test[["cust_id","ref_date"]].copy()
    X_hist_test = build_features_for_keys(hist, test_keys, progress_every=5000)

    X_test = test_keys.merge(X_hist_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    num_cols_t = X_test.select_dtypes(include=[np.number]).columns
    cat_cols_t = [c for c in X_test.columns if c not in num_cols_t and c != "ref_date"]

    X_test[num_cols_t] = X_test[num_cols_t].fillna(0)
    for c in cat_cols_t:
        X_test[c] = X_test[c].fillna("Unknown")

    X_test_enc = pd.get_dummies(X_test.drop(columns=["ref_date"]), columns=cat_cols_t, drop_first=True)
    X_test_enc.columns = X_test_enc.columns.str.replace(r"\s+", "_", regex=True)

    TRAIN_FEATS = X_train.columns.tolist()
    X_test_enc = X_test_enc.reindex(columns=TRAIN_FEATS, fill_value=0)

    test_proba = clf.predict_proba(X_test_enc)[:, 1]
    out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
    out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
    out.to_csv("submission.csv", index=False, float_format="%.6f")
    print("‚úÖ submission.csv yazƒ±ldƒ±! Satƒ±r sayƒ±sƒ±:", out.shape)


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
üéØ √áalƒ±≈üma modu: mini | FOCUS_RECENT: True | keys: 9000
[features] 2000/9000
[features] 4000/9000
[features] 6000/9000
[features] 8000/9000
[train] done: 9000/9000
‚úÖ Eƒüitim seti hazƒ±r: (9000, 150)
‚úÖ Split | Train: (7185, 170) | Valid: (1815, 170)
[LightGBM] [Info] Number of positive: 966, number of n

Early stopping, best iteration is:
[99]	valid_0's auc: 0.705303	valid_0's binary_logloss: 0.520861

üìä VALIDATION SONU√áLARI: {'Gini': 0.4106, 'Recall@10%': 0.2103, 'Lift@10%': 2.0974, 'CompetitionScore': 0.8566}


In [26]:
# ============================================================
# üß† ING Datathon ‚Äî Mini-Boost++ Pipeline (Recent + Prune)
#   - Mini mod: hƒ±zlƒ± deneme, recent-focused sampling
#   - Low-variance drop + Importance pruning (alt %20)
#   - Full mod: aynƒ± mantƒ±kla submission.csv √ºretir
# ============================================================

# === 0) K√úT√úPHANELER ===
import os, glob, gc
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

pd.set_option("display.width", 180)
pd.set_option("display.max_columns", 200)

# === 1) PARAMETRELER ===
RUN_MODE       = "mini"     # "mini" | "full"
SAMPLE_N       = 10000      # mini √∂rnek sayƒ±sƒ±
FOCUS_RECENT   = True       # mini'de son zaman dilimine odaklan
RECENT_FRAC    = 0.65       # son %65 i√ßinden √∂rnekle (0.50-0.70 arasƒ± deneyebilirsin)
DO_LOWVAR_DROP = True       # d√º≈ü√ºk varyanslƒ± kolonlarƒ± at
DO_PRUNE       = True       # importance tabanlƒ± budama yap (mini i√ßin √∂nerilir)

# === 2) DOSYA Y√úKLEME ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

# Kolon adƒ± temizlik (LGBM uyarƒ±sƒ± i√ßin)
for d in (cust, hist):
    d.columns = d.columns.str.strip()

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)

# === 3) √ñZELLƒ∞K √úRETƒ∞Mƒ∞ ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce")
    s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def _months_active(frame: pd.DataFrame) -> int:
    if frame.empty: return 0
    cond = (pd.to_numeric(frame.get("cc_transaction_all_cnt", 0), errors="coerce").fillna(0) > 0) | \
           (pd.to_numeric(frame.get("mobile_eft_all_cnt", 0), errors="coerce").fillna(0) > 0)
    return int(cond.sum())

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]

    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
        for w in WINDOWS:
            feat[f"months_active_L{w}M"] = 0.0
            feat[f"inactivity_ratio_L{w}M"] = 1.0
            feat[f"activity_rate_L{w}M"] = 0.0
            feat[f"cc_share_amt_L{w}M"] = 0.0
            feat[f"eft_share_amt_L{w}M"] = 0.0
        feat["cc_amt_momentum_L1v3m"] = 0.0
        feat["cc_amt_momentum_L1v6m"] = 0.0
        feat["eft_amt_momentum_L1v3m"] = 0.0
        feat["eft_amt_momentum_L1v6m"] = 0.0
        feat["cc_share_shift_L1v6"] = 0.0
        feat["eft_share_shift_L1v6"] = 0.0
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")
            # activity / inactivity / activity_rate
            ma = float(_months_active(hw))
            feat[f"months_active_L{w}M"]   = ma
            feat[f"inactivity_ratio_L{w}M"] = float(max(w - ma, 0.0)) / float(w)
            feat[f"activity_rate_L{w}M"]    = ma / float(w)

        # Kanal paylarƒ±
        eps = 1e-6
        for w in WINDOWS:
            cc_sum  = feat.get(f"cc_transaction_all_amt_L{w}M_sum", 0.0)
            eft_sum = feat.get(f"mobile_eft_all_amt_L{w}M_sum", 0.0)
            tot = cc_sum + eft_sum
            feat[f"cc_share_amt_L{w}M"]  = cc_sum / (tot + eps)
            feat[f"eft_share_amt_L{w}M"] = eft_sum / (tot + eps)

        # Momentum: son 1 ay toplam / son 3-6 ay ortalama
        L1_cc   = feat.get("cc_transaction_all_amt_L1M_sum", 0.0)
        L1_eft  = feat.get("mobile_eft_all_amt_L1M_sum", 0.0)
        L3_ccm  = feat.get("cc_transaction_all_amt_L3M_mean", 0.0)
        L6_ccm  = feat.get("cc_transaction_all_amt_L6M_mean", 0.0)
        L3_eftm = feat.get("mobile_eft_all_amt_L3M_mean", 0.0)
        L6_eftm = feat.get("mobile_eft_all_amt_L6M_mean", 0.0)
        eps = 1e-6
        feat["cc_amt_momentum_L1v3m"]  = L1_cc  / (L3_ccm  + eps)
        feat["cc_amt_momentum_L1v6m"]  = L1_cc  / (L6_ccm  + eps)
        feat["eft_amt_momentum_L1v3m"] = L1_eft / (L3_eftm + eps)
        feat["eft_amt_momentum_L1v6m"] = L1_eft / (L6_eftm + eps)

        # Kanal payƒ± deƒüi≈üimi (L1 vs L6)
        feat["cc_share_shift_L1v6"]  = feat.get("cc_share_amt_L1M", 0.0)  - feat.get("cc_share_amt_L6M", 0.0)
        feat["eft_share_shift_L1v6"] = feat.get("eft_share_amt_L1M", 0.0) - feat.get("eft_share_amt_L6M", 0.0)

    # ƒ∞≈ülem ba≈üƒ±na tutar oranlarƒ±
    eps = 1e-6
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    # Kƒ±sa vs uzun d√∂nem TREND (yalnƒ±zca mean bazlƒ±)
    for short,long in [(1,3),(3,6),(6,12)]:
        for base in ["cc_transaction_all_amt","cc_transaction_all_cnt","mobile_eft_all_amt","mobile_eft_all_cnt","active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean",0.0)
            ml = feat.get(f"{base}_L{long}M_mean",0.0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"]  = ms-ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms/(ml+eps)

    # Recency d√∂n√º≈ü√ºm√º
    feat["recency_log"] = float(np.log1p(feat.get("recency_days", 9999)))

    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=5000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): 
            continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd)
        f["cust_id"] = cid
        rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# === 4) METRƒ∞KLER ===
def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true) * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true) * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    g = gini_from_auc(y_true, y_score)
    r = recall_at_k(y_true, y_score)
    l = lift_at_k(y_true, y_score)
    s = 0.4*g + 0.3*r + 0.3*l
    return {"Gini": round(g, 4), "Recall@10%": round(r, 4), "Lift@10%": round(l, 4), "CompetitionScore": round(s, 4)}

# === 5) MOD & √ñRNEKLEME (mini + recent focus) ===
keys = ref[["cust_id","ref_date","churn"]].copy().sort_values("ref_date").reset_index(drop=True)

if RUN_MODE == "mini":
    if FOCUS_RECENT:
        q = keys["ref_date"].quantile(1.0 - RECENT_FRAC)
        keys_recent = keys[keys["ref_date"] >= q].reset_index(drop=True)
        if len(keys_recent) > SAMPLE_N:
            idx = np.linspace(0, len(keys_recent)-1, SAMPLE_N).round().astype(int)
            keys = keys_recent.iloc[idx].reset_index(drop=True)
        else:
            keys = keys_recent
    else:
        if len(keys) > SAMPLE_N:
            idx = np.linspace(0, len(keys)-1, SAMPLE_N).round().astype(int)
            keys = keys.iloc[idx].reset_index(drop=True)

print(f"üéØ √áalƒ±≈üma modu: {RUN_MODE} | FOCUS_RECENT: {FOCUS_RECENT} | keys: {len(keys)}")

# === 6) FEATURE √úRETƒ∞Mƒ∞ ===
parts = []
BATCH = 20000 if RUN_MODE=="full" else len(keys)
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    feats_part = build_features_for_keys(hist, keys.iloc[s:e][["cust_id","ref_date"]], progress_every=2000 if RUN_MODE=="mini" else 5000)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")

X_full = pd.concat(parts, ignore_index=True)
del parts; gc.collect()
print("‚úÖ Eƒüitim seti hazƒ±r:", X_full.shape)

# === 7) ENCODE + EKSƒ∞K DOLDUR ===
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols:
    X_full[c] = X_full[c].fillna("Unknown")

X_full_enc = pd.get_dummies(
    X_full.drop(columns=["ref_date"]),
    columns=cat_cols, drop_first=True
)
# LGBM uyarƒ±sƒ± √∂nlemi
X_full_enc.columns = X_full_enc.columns.str.replace(r"\s+", "_", regex=True)

# === 8) D√ú≈û√úK VARYANS DROP (opsiyonel) ===
if DO_LOWVAR_DROP:
    nunique = X_full_enc.nunique(dropna=False)
    lowvar_cols = nunique[nunique <= 1].index.tolist()
    if lowvar_cols:
        print(f"üßπ Low-variance drop: {len(lowvar_cols)} kolon")
        X_full_enc = X_full_enc.drop(columns=lowvar_cols)

# === 9) ZAMAN BAZLI TRAIN/VALID SPLIT ===
cut = X_full["ref_date"].quantile(0.80)
train_mask = X_full["ref_date"] < cut
valid_mask = ~train_mask

TARGET = "churn"
PROTECTED = ["cust_id", TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]

X_train = X_full_enc.loc[train_mask, feat_cols]
y_train = X_full.loc[train_mask, TARGET].astype(int)
X_valid = X_full_enc.loc[valid_mask, feat_cols]
y_valid = X_full.loc[valid_mask, TARGET].astype(int)

print("‚úÖ Split | Train:", X_train.shape, "| Valid:", X_valid.shape)

# === 10) MODEL (ranking-friendly) ‚Äî 1. eƒüitim ===
clf = LGBMClassifier(
    objective="binary",
    n_estimators=800,
    learning_rate=0.035,
    max_depth=8,
    num_leaves=96,
    subsample=0.90,
    colsample_bytree=0.75,
    min_child_samples=20,
    reg_alpha=0.10,
    reg_lambda=1.00,
    class_weight="balanced",
    random_state=42
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
)

valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä ƒ∞lk eƒüitim:", metrics)

# === 11) IMPORTANCE PRUNING (opsiyonel, mini i√ßin √∂nerilir) ===
def get_feature_names(model):
    names = getattr(model, "feature_name_", None)
    if names is None:
        try:
            names = model.booster_.feature_name()
        except Exception:
            names = list(X_train.columns)
    return list(names)

BEST_CLF = clf
BEST_METRICS = metrics
BEST_FEATS = list(X_train.columns)

if DO_PRUNE:
    try:
        imp = pd.DataFrame({
            "feature": get_feature_names(clf),
            "importance": clf.feature_importances_
        }).sort_values("importance", ascending=True)

        cut_idx = int(len(imp) * 0.20)  # alt %20'yi at
        drop_imp = imp.iloc[:cut_idx]["feature"].tolist()
        print(f"ü™ì Importance drop: {len(drop_imp)} kolon")

        keep_feats = [c for c in X_train.columns if c not in drop_imp]
        X_train2 = X_train[keep_feats]
        X_valid2 = X_valid[keep_feats]

        clf2 = LGBMClassifier(
            objective="binary",
            n_estimators=900,
            learning_rate=0.033,
            max_depth=8,
            num_leaves=112,
            subsample=0.90,
            colsample_bytree=0.80,
            min_child_samples=18,
            reg_alpha=0.10,
            reg_lambda=1.50,
            class_weight="balanced",
            random_state=42
        )
        clf2.fit(
            X_train2, y_train,
            eval_set=[(X_valid2, y_valid)],
            eval_metric="auc",
            callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
        )

        valid_proba2 = clf2.predict_proba(X_valid2)[:, 1]
        metrics2 = competition_score(y_valid, valid_proba2)
        print("\nüèÅ Budama sonrasƒ±:", metrics2)

        # En iyi modeli ve feature listesini se√ß
        if metrics2["CompetitionScore"] >= metrics["CompetitionScore"]:
            BEST_CLF = clf2
            BEST_METRICS = metrics2
            BEST_FEATS = keep_feats
            X_train = X_train2
            X_valid = X_valid2
        else:
            BEST_CLF = clf
            BEST_METRICS = metrics
            BEST_FEATS = list(X_train.columns)

    except Exception as e:
        print("‚ö†Ô∏è Pruning atlanƒ±yor (hata):", e)
        BEST_CLF = clf
        BEST_METRICS = metrics
        BEST_FEATS = list(X_train.columns)

print("\n‚úÖ Se√ßilen (en iyi) skor:", BEST_METRICS)

# === 12) FULL MOD SUBMISSION ===
if RUN_MODE == "full":
    print("\nüßæ Submission hazƒ±rlanƒ±yor...")
    test_keys = test[["cust_id","ref_date"]].copy()
    X_hist_test = build_features_for_keys(hist, test_keys, progress_every=5000)

    X_test = test_keys.merge(X_hist_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    num_cols_t = X_test.select_dtypes(include=[np.number]).columns
    cat_cols_t = [c for c in X_test.columns if c not in num_cols_t and c != "ref_date"]

    X_test[num_cols_t] = X_test[num_cols_t].fillna(0)
    for c in cat_cols_t:
        X_test[c] = X_test[c].fillna("Unknown")

    X_test_enc = pd.get_dummies(X_test.drop(columns=["ref_date"]), columns=cat_cols_t, drop_first=True)
    X_test_enc.columns = X_test_enc.columns.str.replace(r"\s+", "_", regex=True)

    # Eƒüitimde SON KULLANILAN feature setine hizala
    TRAIN_FEATS = BEST_FEATS
    X_test_enc = X_test_enc.reindex(columns=TRAIN_FEATS, fill_value=0)

    test_proba = BEST_CLF.predict_proba(X_test_enc)[:, 1]
    out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
    out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
    out.to_csv("submission.csv", index=False, float_format="%.6f")
    print("‚úÖ submission.csv yazƒ±ldƒ±! Satƒ±r sayƒ±sƒ±:", out.shape)

print("\nüéâ Bitti! Mini skoru beƒüenirsen RUN_MODE='full' ile submission √ºretebilirsin.")


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
üéØ √áalƒ±≈üma modu: mini | FOCUS_RECENT: True | keys: 10000
[features] 2000/10000
[features] 4000/10000
[features] 6000/10000
[features] 8000/10000
[features] 10000/10000
[train] done: 10000/10000
‚úÖ Eƒüitim seti hazƒ±r: (10000, 156)
üßπ Low-variance drop: 5 kolon
‚úÖ Split | Train: (7486, 171) | Valid: (25

Early stopping, best iteration is:
[73]	valid_0's auc: 0.681515	valid_0's binary_logloss: 0.550596

üìä ƒ∞lk eƒüitim: {'Gini': 0.363, 'Recall@10%': 0.1516, 'Lift@10%': 1.5124, 'CompetitionScore': 0.6444}
ü™ì Importance drop: 34 kolon
[LightGBM] [Info] Number of positive: 1045, number of negative: 6441
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23909
[LightGBM] [Info] Number of data points in the train set: 7486, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 30 rounds
[50]	valid_0's auc: 0.67652	valid_0's binary_logloss: 0.575075


[100]	valid_0's auc: 0.679408	valid_0's binary_logloss: 0.53422
Early stopping, best iteration is:
[78]	valid_0's auc: 0.681656	valid_0's binary_logloss: 0.548106

üèÅ Budama sonrasƒ±: {'Gini': 0.3633, 'Recall@10%': 0.1545, 'Lift@10%': 1.5415, 'CompetitionScore': 0.6541}

‚úÖ Se√ßilen (en iyi) skor: {'Gini': 0.3633, 'Recall@10%': 0.1545, 'Lift@10%': 1.5415, 'CompetitionScore': 0.6541}

üéâ Bitti! Mini skoru beƒüenirsen RUN_MODE='full' ile submission √ºretebilirsin.


In [27]:
# ============================================================
# üß† ING Datathon ‚Äî Mini-Boost++ Pipeline (Recent + Safe-Prune)
#   - Mini mod: hƒ±zlƒ± deneme, recent-focused sampling
#   - Low-variance drop + Gain-importance pruning (alt %15, g√ºvenli)
#   - Ay-bazlƒ± "months_active" d√ºzeltildi (activity_rate 0..1 clamp)
#   - Full mod: aynƒ± mantƒ±kla submission.csv √ºretir
# ============================================================

# === 0) K√úT√úPHANELER ===
import os, glob, gc
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

pd.set_option("display.width", 180)
pd.set_option("display.max_columns", 220)

# === 1) PARAMETRELER ===
RUN_MODE       = "mini"     # "mini" | "full"
SAMPLE_N       = 11000      # mini √∂rnek sayƒ±sƒ± (10k -> 11k)
FOCUS_RECENT   = True       # mini'de son zaman dilimine odaklan
RECENT_FRAC    = 0.60       # son %60 i√ßinden √∂rnekle (0.55-0.65 arasƒ± deneyebilirsin)
DO_LOWVAR_DROP = True       # d√º≈ü√ºk varyanslƒ± kolonlarƒ± at
DO_PRUNE       = True       # importance tabanlƒ± budama yap (mini i√ßin √∂nerilir)
PRUNE_FRAC     = 0.15       # alt %15 importance (gain) drop
PRUNE_IMPROVE  = 0.003      # pruning sonrasƒ± min iyile≈üme e≈üiƒüi (competition score)

# === 2) DOSYA Y√úKLEME ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

# Kolon adƒ± temizlik (LGBM uyarƒ±sƒ± i√ßin)
for d in (cust, hist):
    d.columns = d.columns.str.strip()

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)

# === 3) √ñZELLƒ∞K √úRETƒ∞Mƒ∞ ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce")
    s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def _months_active(frame: pd.DataFrame) -> int:
    """Penceredeki AY sayƒ±sƒ±nƒ± (benzersiz) say: en az 1 i≈ülem yapƒ±lan ay."""
    if frame.empty:
        return 0
    m = frame.copy()
    m["ym"] = m["date"].dt.to_period("M")
    monthly_active = (
        m.groupby("ym")[["cc_transaction_all_cnt", "mobile_eft_all_cnt"]]
         .sum()
         .sum(axis=1) > 0
    )
    return int(monthly_active.sum())

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]

    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
        for w in WINDOWS:
            feat[f"months_active_L{w}M"] = 0.0
            feat[f"inactivity_ratio_L{w}M"] = 1.0
            feat[f"activity_rate_L{w}M"] = 0.0
            feat[f"cc_share_amt_L{w}M"] = 0.0
            feat[f"eft_share_amt_L{w}M"] = 0.0
        feat["cc_amt_momentum_L1v3m"] = 0.0
        feat["cc_amt_momentum_L1v6m"] = 0.0
        feat["eft_amt_momentum_L1v3m"] = 0.0
        feat["eft_amt_momentum_L1v6m"] = 0.0
        feat["cc_share_shift_L1v6"] = 0.0
        feat["eft_share_shift_L1v6"] = 0.0
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]

            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")

            # activity / inactivity / activity_rate (AY bazƒ±nda ve clamp)
            ma_raw = float(_months_active(hw))     # benzersiz aktif ay sayƒ±sƒ±
            ma = min(ma_raw, float(w))             # teorik √ºst sƒ±nƒ±r w
            feat[f"months_active_L{w}M"]    = ma
            feat[f"inactivity_ratio_L{w}M"] = max(w - ma, 0.0) / float(w)
            feat[f"activity_rate_L{w}M"]    = min(ma / float(w), 1.0)  # 0..1

        # Kanal paylarƒ±
        eps = 1e-6
        for w in WINDOWS:
            cc_sum  = feat.get(f"cc_transaction_all_amt_L{w}M_sum", 0.0)
            eft_sum = feat.get(f"mobile_eft_all_amt_L{w}M_sum", 0.0)
            tot = cc_sum + eft_sum
            feat[f"cc_share_amt_L{w}M"]  = cc_sum / (tot + eps)
            feat[f"eft_share_amt_L{w}M"] = eft_sum / (tot + eps)

        # Momentum: son 1 ay toplam / son 3-6 ay ortalama
        L1_cc   = feat.get("cc_transaction_all_amt_L1M_sum", 0.0)
        L1_eft  = feat.get("mobile_eft_all_amt_L1M_sum", 0.0)
        L3_ccm  = feat.get("cc_transaction_all_amt_L3M_mean", 0.0)
        L6_ccm  = feat.get("cc_transaction_all_amt_L6M_mean", 0.0)
        L3_eftm = feat.get("mobile_eft_all_amt_L3M_mean", 0.0)
        L6_eftm = feat.get("mobile_eft_all_amt_L6M_mean", 0.0)
        eps = 1e-6
        feat["cc_amt_momentum_L1v3m"]  = L1_cc  / (L3_ccm  + eps)
        feat["cc_amt_momentum_L1v6m"]  = L1_cc  / (L6_ccm  + eps)
        feat["eft_amt_momentum_L1v3m"] = L1_eft / (L3_eftm + eps)
        feat["eft_amt_momentum_L1v6m"] = L1_eft / (L6_eftm + eps)

        # Kanal payƒ± deƒüi≈üimi (L1 vs L6)
        feat["cc_share_shift_L1v6"]  = feat.get("cc_share_amt_L1M", 0.0)  - feat.get("cc_share_amt_L6M", 0.0)
        feat["eft_share_shift_L1v6"] = feat.get("eft_share_amt_L1M", 0.0) - feat.get("eft_share_amt_L6M", 0.0)

    # ƒ∞≈ülem ba≈üƒ±na tutar oranlarƒ±
    eps = 1e-6
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    # Kƒ±sa vs uzun d√∂nem TREND (yalnƒ±zca mean bazlƒ±)
    for short,long in [(1,3),(3,6),(6,12)]:
        for base in ["cc_transaction_all_amt","cc_transaction_all_cnt","mobile_eft_all_amt","mobile_eft_all_cnt","active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean",0.0)
            ml = feat.get(f"{base}_L{long}M_mean",0.0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"]  = ms-ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms/(ml+eps)

    # Recency d√∂n√º≈ü√ºm√º
    feat["recency_log"] = float(np.log1p(feat.get("recency_days", 9999)))

    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=5000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): 
            continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd)
        f["cust_id"] = cid
        rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# === 4) METRƒ∞KLER ===
def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true) * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true) * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    g = gini_from_auc(y_true, y_score)
    r = recall_at_k(y_true, y_score)
    l = lift_at_k(y_true, y_score)
    s = 0.4*g + 0.3*r + 0.3*l
    return {"Gini": round(g, 4), "Recall@10%": round(r, 4), "Lift@10%": round(l, 4), "CompetitionScore": round(s, 4)}

# === 5) MOD & √ñRNEKLEME (mini + recent focus) ===
keys = ref[["cust_id","ref_date","churn"]].copy().sort_values("ref_date").reset_index(drop=True)

if RUN_MODE == "mini":
    if FOCUS_RECENT:
        q = keys["ref_date"].quantile(1.0 - RECENT_FRAC)
        keys_recent = keys[keys["ref_date"] >= q].reset_index(drop=True)
        if len(keys_recent) > SAMPLE_N:
            idx = np.linspace(0, len(keys_recent)-1, SAMPLE_N).round().astype(int)
            keys = keys_recent.iloc[idx].reset_index(drop=True)
        else:
            keys = keys_recent
    else:
        if len(keys) > SAMPLE_N:
            idx = np.linspace(0, len(keys)-1, SAMPLE_N).round().astype(int)
            keys = keys.iloc[idx].reset_index(drop=True)

print(f"üéØ √áalƒ±≈üma modu: {RUN_MODE} | FOCUS_RECENT: {FOCUS_RECENT} | keys: {len(keys)}")

# === 6) FEATURE √úRETƒ∞Mƒ∞ ===
parts = []
BATCH = 20000 if RUN_MODE=="full" else len(keys)
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    feats_part = build_features_for_keys(hist, keys.iloc[s:e][["cust_id","ref_date"]], progress_every=2000 if RUN_MODE=="mini" else 5000)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")

X_full = pd.concat(parts, ignore_index=True)
del parts; gc.collect()
print("‚úÖ Eƒüitim seti hazƒ±r:", X_full.shape)

# === 7) ENCODE + EKSƒ∞K DOLDUR ===
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols:
    X_full[c] = X_full[c].fillna("Unknown")

X_full_enc = pd.get_dummies(
    X_full.drop(columns=["ref_date"]),
    columns=cat_cols, drop_first=True
)
# LGBM uyarƒ±sƒ± √∂nlemi
X_full_enc.columns = X_full_enc.columns.str.replace(r"\s+", "_", regex=True)

# === 8) D√ú≈û√úK VARYANS DROP (opsiyonel) ===
if DO_LOWVAR_DROP:
    nunique = X_full_enc.nunique(dropna=False)
    lowvar_cols = nunique[nunique <= 1].index.tolist()
    if lowvar_cols:
        print(f"üßπ Low-variance drop: {len(lowvar_cols)} kolon")
        X_full_enc = X_full_enc.drop(columns=lowvar_cols)

# === 9) ZAMAN BAZLI TRAIN/VALID SPLIT ===
cut = X_full["ref_date"].quantile(0.80)
train_mask = X_full["ref_date"] < cut
valid_mask = ~train_mask

TARGET = "churn"
PROTECTED = ["cust_id", TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]

X_train = X_full_enc.loc[train_mask, feat_cols]
y_train = X_full.loc[train_mask, TARGET].astype(int)
X_valid = X_full_enc.loc[valid_mask, feat_cols]
y_valid = X_full.loc[valid_mask, TARGET].astype(int)

print("‚úÖ Split | Train:", X_train.shape, "| Valid:", X_valid.shape)

# === 10) MODEL (ranking-friendly) ‚Äî 1. eƒüitim ===
clf = LGBMClassifier(
    objective="binary",
    n_estimators=800,
    learning_rate=0.035,
    max_depth=8,
    num_leaves=96,
    subsample=0.90,
    colsample_bytree=0.75,
    min_child_samples=24,   # 20 -> 24 (bir tƒ±k stabil)
    reg_alpha=0.10,
    reg_lambda=1.25,       # 1.00 -> 1.25 (bir tƒ±k stabil)
    class_weight="balanced",
    random_state=42
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
)

valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä ƒ∞lk eƒüitim:", metrics)

# === 11) IMPORTANCE PRUNING (gain-tabanlƒ±, g√ºvenli) ===
def get_feature_names(model):
    names = getattr(model, "feature_name_", None)
    if names is None:
        try:
            names = model.booster_.feature_name()
        except Exception:
            names = list(X_train.columns)
    return list(names)

BEST_CLF = clf
BEST_METRICS = metrics
BEST_FEATS = list(X_train.columns)

if DO_PRUNE:
    try:
        booster = getattr(clf, "booster_", None)
        if booster is None:
            booster = clf._Booster  # bazƒ± s√ºr√ºmler

        names = get_feature_names(clf)
        gains = booster.feature_importance(importance_type="gain")
        imp = pd.DataFrame({"feature": names, "gain": gains}).sort_values("gain", ascending=True)

        cut_idx = int(len(imp) * PRUNE_FRAC)
        drop_imp = imp.iloc[:cut_idx]["feature"].tolist()
        print(f"ü™ì (gain) Importance drop: {len(drop_imp)} kolon")

        keep_feats = [c for c in X_train.columns if c not in drop_imp]
        X_train2 = X_train[keep_feats]
        X_valid2 = X_valid[keep_feats]

        clf2 = LGBMClassifier(
            objective="binary",
            n_estimators=900,
            learning_rate=0.033,
            max_depth=8,
            num_leaves=112,
            subsample=0.90,
            colsample_bytree=0.80,
            min_child_samples=18,
            reg_alpha=0.10,
            reg_lambda=1.50,
            class_weight="balanced",
            random_state=42
        )
        clf2.fit(
            X_train2, y_train,
            eval_set=[(X_valid2, y_valid)],
            eval_metric="auc",
            callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
        )

        valid_proba2 = clf2.predict_proba(X_valid2)[:, 1]
        metrics2 = competition_score(y_valid, valid_proba2)
        print("\nüèÅ Budama sonrasƒ±:", metrics2)

        # En az PRUNE_IMPROVE kadar iyile≈üme olursa kabul et
        if (metrics2["CompetitionScore"] >= BEST_METRICS["CompetitionScore"] + PRUNE_IMPROVE):
            BEST_CLF = clf2
            BEST_METRICS = metrics2
            BEST_FEATS = keep_feats
            X_train = X_train2
            X_valid = X_valid2
        else:
            print("‚Ü©Ô∏è ƒ∞yile≈üme e≈üiƒüi kar≈üƒ±lanmadƒ±, ilk modeli koruyorum.")
            BEST_CLF = clf
            BEST_METRICS = metrics
            BEST_FEATS = list(X_train.columns)

    except Exception as e:
        print("‚ö†Ô∏è Pruning atlanƒ±yor (hata):", e)
        BEST_CLF = clf
        BEST_METRICS = metrics
        BEST_FEATS = list(X_train.columns)

print("\n‚úÖ Se√ßilen (en iyi) skor:", BEST_METRICS)

# === 12) FULL MOD SUBMISSION ===
if RUN_MODE == "full":
    print("\nüßæ Submission hazƒ±rlanƒ±yor...")
    test_keys = test[["cust_id","ref_date"]].copy()
    X_hist_test = build_features_for_keys(hist, test_keys, progress_every=5000)

    X_test = test_keys.merge(X_hist_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    num_cols_t = X_test.select_dtypes(include=[np.number]).columns
    cat_cols_t = [c for c in X_test.columns if c not in num_cols_t and c != "ref_date"]

    X_test[num_cols_t] = X_test[num_cols_t].fillna(0)
    for c in cat_cols_t:
        X_test[c] = X_test[c].fillna("Unknown")

    X_test_enc = pd.get_dummies(X_test.drop(columns=["ref_date"]), columns=cat_cols_t, drop_first=True)
    X_test_enc.columns = X_test_enc.columns.str.replace(r"\s+", "_", regex=True)

    # Eƒüitimde SON KULLANILAN feature setine hizala
    TRAIN_FEATS = BEST_FEATS
    X_test_enc = X_test_enc.reindex(columns=TRAIN_FEATS, fill_value=0)

    test_proba = BEST_CLF.predict_proba(X_test_enc)[:, 1]
    out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
    out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
    out.to_csv("submission.csv", index=False, float_format="%.6f")
    print("‚úÖ submission.csv yazƒ±ldƒ±! Satƒ±r sayƒ±sƒ±:", out.shape)

print("\nüéâ Bitti! Mini skoru beƒüenirsen RUN_MODE='full' ile submission √ºretebilirsin.")


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
üéØ √áalƒ±≈üma modu: mini | FOCUS_RECENT: True | keys: 11000
[features] 2000/11000
[features] 4000/11000
[features] 6000/11000
[features] 8000/11000
[features] 10000/11000
[train] done: 11000/11000
‚úÖ Eƒüitim seti hazƒ±r: (11000, 156)
üßπ Low-variance drop: 5 kolon
‚úÖ Split | Train: (7980, 171) | Valid: (30

Early stopping, best iteration is:
[19]	valid_0's auc: 0.675746	valid_0's binary_logloss: 0.62561

üèÅ Budama sonrasƒ±: {'Gini': 0.3515, 'Recall@10%': 0.1837, 'Lift@10%': 1.8367, 'CompetitionScore': 0.7467}

‚úÖ Se√ßilen (en iyi) skor: {'Gini': 0.3515, 'Recall@10%': 0.1837, 'Lift@10%': 1.8367, 'CompetitionScore': 0.7467}

üéâ Bitti! Mini skoru beƒüenirsen RUN_MODE='full' ile submission √ºretebilirsin.


In [1]:
# ============================================================
# üß† ING Datathon ‚Äî Full Mode Pipeline (Seasonal + 5 Lags + Safe-Prune)
#   - WINDOWS: [1,2,3,6,12]
#   - Sezonsallƒ±k (mo_sin/mo_cos, yƒ±lba≈üƒ± & yƒ±lsonu bayraklarƒ±, 5'li sezon paylarƒ±)
#   - Ay-bazlƒ± "months_active" (activity_rate 0..1 clamp)
#   - Low-variance drop + Gain-importance pruning (alt %15, g√ºvenli)
#   - FULL mod: submission.csv √ºretir
# ============================================================

# === 0) K√úT√úPHANELER ===
import os, glob, gc
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

pd.set_option("display.width", 180)
pd.set_option("display.max_columns", 240)

# === 1) PARAMETRELER ===
RUN_MODE       = "full"     # "mini" | "full"
SAMPLE_N       = 11000      # mini i√ßin; full'da kullanƒ±lmaz
FOCUS_RECENT   = True       # mini i√ßin; full'da kullanƒ±lmaz
RECENT_FRAC    = 0.60       # mini i√ßin; full'da kullanƒ±lmaz
DO_LOWVAR_DROP = True       # d√º≈ü√ºk varyanslƒ± kolonlarƒ± at
DO_PRUNE       = True       # importance tabanlƒ± budama yap
PRUNE_FRAC     = 0.15       # alt %15 (gain) drop
PRUNE_IMPROVE  = 0.003      # pruning sonrasƒ± min iyile≈üme e≈üiƒüi

# 12 ayƒ± 5 sezona ayƒ±r
SEASON_GROUPS_5 = {
    "YStart": [1, 2],           # Yƒ±l ba≈üƒ±
    "Spring": [3, 4, 5],        # ƒ∞lkbahar
    "Summer": [6, 7],           # Yaz
    "Fall":   [8, 9, 10],       # Sonbahar
    "YEnd":   [11, 12],         # Yƒ±l sonu
}

# === 2) DOSYA Y√úKLEME ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

# Kolon adƒ± temizlik (LGBM uyarƒ±sƒ± i√ßin)
for d in (cust, hist):
    d.columns = d.columns.str.strip()

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)

# === 3) √ñZELLƒ∞K √úRETƒ∞Mƒ∞ ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)

# 5'li lag penceresi
WINDOWS = [1, 2, 3, 6, 12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce")
    s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def _months_active(frame: pd.DataFrame) -> int:
    """Penceredeki AY sayƒ±sƒ±nƒ± (benzersiz) say: en az 1 i≈ülem yapƒ±lan ay."""
    if frame.empty:
        return 0
    m = frame.copy()
    m["ym"] = m["date"].dt.to_period("M")
    monthly_active = (
        m.groupby("ym")[["cc_transaction_all_cnt", "mobile_eft_all_cnt"]]
         .sum()
         .sum(axis=1) > 0
    )
    return int(monthly_active.sum())

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]

    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
        for w in WINDOWS:
            feat[f"months_active_L{w}M"] = 0.0
            feat[f"inactivity_ratio_L{w}M"] = 1.0
            feat[f"activity_rate_L{w}M"] = 0.0
            feat[f"cc_share_amt_L{w}M"] = 0.0
            feat[f"eft_share_amt_L{w}M"] = 0.0
        feat["cc_amt_momentum_L1v3m"] = 0.0
        feat["cc_amt_momentum_L1v6m"] = 0.0
        feat["eft_amt_momentum_L1v3m"] = 0.0
        feat["eft_amt_momentum_L1v6m"] = 0.0
        feat["cc_share_shift_L1v6"] = 0.0
        feat["eft_share_shift_L1v6"] = 0.0
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]

            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")

            # activity / inactivity / activity_rate (AY bazƒ±nda ve clamp)
            ma_raw = float(_months_active(hw))
            ma = min(ma_raw, float(w))
            feat[f"months_active_L{w}M"]    = ma
            feat[f"inactivity_ratio_L{w}M"] = max(w - ma, 0.0) / float(w)
            feat[f"activity_rate_L{w}M"]    = min(ma / float(w), 1.0)  # 0..1

        # Kanal paylarƒ±
        eps = 1e-6
        for w in WINDOWS:
            cc_sum  = feat.get(f"cc_transaction_all_amt_L{w}M_sum", 0.0)
            eft_sum = feat.get(f"mobile_eft_all_amt_L{w}M_sum", 0.0)
            tot = cc_sum + eft_sum
            feat[f"cc_share_amt_L{w}M"]  = cc_sum / (tot + eps)
            feat[f"eft_share_amt_L{w}M"] = eft_sum / (tot + eps)

        # Momentum: son 1 ay toplam / son 3-6 ay ortalama
        L1_cc   = feat.get("cc_transaction_all_amt_L1M_sum", 0.0)
        L1_eft  = feat.get("mobile_eft_all_amt_L1M_sum", 0.0)
        L3_ccm  = feat.get("cc_transaction_all_amt_L3M_mean", 0.0)
        L6_ccm  = feat.get("cc_transaction_all_amt_L6M_mean", 0.0)
        L3_eftm = feat.get("mobile_eft_all_amt_L3M_mean", 0.0)
        L6_eftm = feat.get("mobile_eft_all_amt_L6M_mean", 0.0)
        eps = 1e-6
        feat["cc_amt_momentum_L1v3m"]  = L1_cc  / (L3_ccm  + eps)
        feat["cc_amt_momentum_L1v6m"]  = L1_cc  / (L6_ccm  + eps)
        feat["eft_amt_momentum_L1v3m"] = L1_eft / (L3_eftm + eps)
        feat["eft_amt_momentum_L1v6m"] = L1_eft / (L6_eftm + eps)

        # Kanal payƒ± deƒüi≈üimi (L1 vs L6)
        feat["cc_share_shift_L1v6"]  = feat.get("cc_share_amt_L1M", 0.0)  - feat.get("cc_share_amt_L6M", 0.0)
        feat["eft_share_shift_L1v6"] = feat.get("eft_share_amt_L1M", 0.0) - feat.get("eft_share_amt_L6M", 0.0)

    # ---- SEZON / AY Bƒ∞LGƒ∞Sƒ∞ (ref_date'e g√∂re; h bo≈ü olsa da √ßalƒ±≈üƒ±r) ----
    m = int(ref_date.month)
    feat["mo_sin"] = float(np.sin(2 * np.pi * (m / 12.0)))
    feat["mo_cos"] = float(np.cos(2 * np.pi * (m / 12.0)))
    feat["is_q_end_month"] = 1.0 if m in (3, 6, 9, 12) else 0.0
    feat["is_year_start"]  = 1.0 if m in (1, 2) else 0.0
    feat["is_year_end"]    = 1.0 if m in (11, 12) else 0.0
    for gname, months in SEASON_GROUPS_5.items():
        if m in months:
            feat["mo_bin5"] = gname
            break

    # L12M i√ßinde sezon bazlƒ± daƒüƒ±lƒ±m paylarƒ±
    hw12 = h[(h["date"] >= ref_date - pd.DateOffset(months=12)) & (h["date"] < ref_date)]
    eps = 1e-6
    for base in ["cc_transaction_all_amt","mobile_eft_all_amt",
                 "cc_transaction_all_cnt","mobile_eft_all_cnt"]:
        tot12 = _agg_safe(hw12[base], "sum")
        for gname, months in SEASON_GROUPS_5.items():
            gsum = _agg_safe(hw12.loc[hw12["date"].dt.month.isin(months), base], "sum")
            feat[f"{base}_share_L12M_{gname}"] = gsum / (tot12 + eps)

    # Yƒ±l sonu / yƒ±l ba≈üƒ± √∂zel paylarƒ± (Kasƒ±m‚ÄìAralƒ±k‚ÄìOcak)
    ye_months = [11, 12, 1]
    for base in ["cc_transaction_all_amt","mobile_eft_all_amt"]:
        tot12 = _agg_safe(hw12[base], "sum")
        ye_sum = _agg_safe(hw12.loc[hw12["date"].dt.month.isin(ye_months), base], "sum")
        feat[f"{base}_yearend3_share_L12M"] = ye_sum / (tot12 + eps)

    # Aralƒ±k‚ÄìOcak k√∂pr√ºs√º i√ßin pencere bayraƒüƒ± (son 2 ayda 12 veya 1 var mƒ±?)
    last2 = h[(h["date"] >= ref_date - pd.DateOffset(months=2)) & (h["date"] < ref_date)]
    last2_months = set(last2["date"].dt.month.unique().tolist())
    feat["is_year_turn_window"] = 1.0 if (12 in last2_months or 1 in last2_months) else 0.0

    # ƒ∞≈ülem ba≈üƒ±na tutar oranlarƒ±
    eps = 1e-6
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    # Kƒ±sa vs uzun d√∂nem TREND (yalnƒ±zca mean bazlƒ±)
    for short,long in [(1,3),(3,6),(6,12)]:
        for base in ["cc_transaction_all_amt","cc_transaction_all_cnt","mobile_eft_all_amt","mobile_eft_all_cnt","active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean",0.0)
            ml = feat.get(f"{base}_L{long}M_mean",0.0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"]  = ms-ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms/(ml+eps)

    # Recency d√∂n√º≈ü√ºm√º
    feat["recency_log"] = float(np.log1p(feat.get("recency_days", 9999)))

    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=5000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): 
            continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd)
        f["cust_id"] = cid
        rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# === 4) METRƒ∞KLER ===
def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true) * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true) * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

def competition_score(y_true, y_score):
    g = gini_from_auc(y_true, y_score)
    r = recall_at_k(y_true, y_score)
    l = lift_at_k(y_true, y_score)
    s = 0.4*g + 0.3*r + 0.3*l
    return {"Gini": round(g, 4), "Recall@10%": round(r, 4), "Lift@10%": round(l, 4), "CompetitionScore": round(s, 4)}

# === 5) MOD & √ñRNEKLEME ===
keys = ref[["cust_id","ref_date","churn"]].copy().sort_values("ref_date").reset_index(drop=True)

if RUN_MODE == "mini":
    if FOCUS_RECENT:
        q = keys["ref_date"].quantile(1.0 - RECENT_FRAC)
        keys_recent = keys[keys["ref_date"] >= q].reset_index(drop=True)
        if len(keys_recent) > SAMPLE_N:
            idx = np.linspace(0, len(keys_recent)-1, SAMPLE_N).round().astype(int)
            keys = keys_recent.iloc[idx].reset_index(drop=True)
        else:
            keys = keys_recent
    else:
        if len(keys) > SAMPLE_N:
            idx = np.linspace(0, len(keys)-1, SAMPLE_N).round().astype(int)
            keys = keys.iloc[idx].reset_index(drop=True)

print(f"üéØ √áalƒ±≈üma modu: {RUN_MODE} | keys: {len(keys)}")

# === 6) FEATURE √úRETƒ∞Mƒ∞ ===
parts = []
BATCH = 20000 if RUN_MODE=="full" else len(keys)
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    feats_part = build_features_for_keys(hist, keys.iloc[s:e][["cust_id","ref_date"]], progress_every=5000)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")

X_full = pd.concat(parts, ignore_index=True)
del parts; gc.collect()
print("‚úÖ Eƒüitim seti hazƒ±r:", X_full.shape)

# === 7) ENCODE + EKSƒ∞K DOLDUR ===
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols:
    X_full[c] = X_full[c].fillna("Unknown")

X_full_enc = pd.get_dummies(
    X_full.drop(columns=["ref_date"]),
    columns=cat_cols, drop_first=True
)
# LGBM uyarƒ±sƒ± √∂nlemi
X_full_enc.columns = X_full_enc.columns.str.replace(r"\s+", "_", regex=True)

# === 8) D√ú≈û√úK VARYANS DROP ===
if DO_LOWVAR_DROP:
    nunique = X_full_enc.nunique(dropna=False)
    lowvar_cols = nunique[nunique <= 1].index.tolist()
    if lowvar_cols:
        print(f"üßπ Low-variance drop: {len(lowvar_cols)} kolon")
        X_full_enc = X_full_enc.drop(columns=lowvar_cols)

# === 9) ZAMAN BAZLI TRAIN/VALID SPLIT ===
cut = X_full["ref_date"].quantile(0.80)
train_mask = X_full["ref_date"] < cut
valid_mask = ~train_mask

TARGET = "churn"
PROTECTED = ["cust_id", TARGET]
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]

X_train = X_full_enc.loc[train_mask, feat_cols]
y_train = X_full.loc[train_mask, TARGET].astype(int)
X_valid = X_full_enc.loc[valid_mask, feat_cols]
y_valid = X_full.loc[valid_mask, TARGET].astype(int)

print("‚úÖ Split | Train:", X_train.shape, "| Valid:", X_valid.shape)

# === 10) MODEL ‚Äî 1. eƒüitim ===
clf = LGBMClassifier(
    objective="binary",
    n_estimators=800,
    learning_rate=0.035,
    max_depth=8,
    num_leaves=96,
    subsample=0.90,
    colsample_bytree=0.75,
    min_child_samples=24,
    reg_alpha=0.10,
    reg_lambda=1.25,
    class_weight="balanced",
    random_state=42
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
)

valid_proba = clf.predict_proba(X_valid)[:, 1]
metrics = competition_score(y_valid, valid_proba)
print("\nüìä ƒ∞lk eƒüitim:", metrics)

# === 11) IMPORTANCE PRUNING (gain-tabanlƒ±, g√ºvenli) ===
def get_feature_names(model):
    names = getattr(model, "feature_name_", None)
    if names is None:
        try:
            names = model.booster_.feature_name()
        except Exception:
            names = list(X_train.columns)
    return list(names)

BEST_CLF = clf
BEST_METRICS = metrics
BEST_FEATS = list(X_train.columns)

if DO_PRUNE:
    try:
        booster = getattr(clf, "booster_", None)
        if booster is None:
            booster = clf._Booster  # bazƒ± s√ºr√ºmler

        names = get_feature_names(clf)
        gains = booster.feature_importance(importance_type="gain")
        imp = pd.DataFrame({"feature": names, "gain": gains}).sort_values("gain", ascending=True)

        cut_idx = int(len(imp) * PRUNE_FRAC)
        drop_imp = imp.iloc[:cut_idx]["feature"].tolist()
        print(f"ü™ì (gain) Importance drop: {len(drop_imp)} kolon")

        keep_feats = [c for c in X_train.columns if c not in drop_imp]
        X_train2 = X_train[keep_feats]
        X_valid2 = X_valid[keep_feats]

        clf2 = LGBMClassifier(
            objective="binary",
            n_estimators=900,
            learning_rate=0.033,
            max_depth=8,
            num_leaves=112,
            subsample=0.90,
            colsample_bytree=0.80,
            min_child_samples=18,
            reg_alpha=0.10,
            reg_lambda=1.50,
            class_weight="balanced",
            random_state=42
        )
        clf2.fit(
            X_train2, y_train,
            eval_set=[(X_valid2, y_valid)],
            eval_metric="auc",
            callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=50)]
        )

        valid_proba2 = clf2.predict_proba(X_valid2)[:, 1]
        metrics2 = competition_score(y_valid, valid_proba2)
        print("\nüèÅ Budama sonrasƒ±:", metrics2)

        # En az PRUNE_IMPROVE kadar iyile≈üme olursa kabul et
        if (metrics2["CompetitionScore"] >= BEST_METRICS["CompetitionScore"] + PRUNE_IMPROVE):
            BEST_CLF = clf2
            BEST_METRICS = metrics2
            BEST_FEATS = keep_feats
            X_train = X_train2
            X_valid = X_valid2
        else:
            print("‚Ü©Ô∏è ƒ∞yile≈üme e≈üiƒüi kar≈üƒ±lanmadƒ±, ilk modeli koruyorum.")
            BEST_CLF = clf
            BEST_METRICS = metrics
            BEST_FEATS = list(X_train.columns)

    except Exception as e:
        print("‚ö†Ô∏è Pruning atlanƒ±yor (hata):", e)
        BEST_CLF = clf
        BEST_METRICS = metrics
        BEST_FEATS = list(X_train.columns)

print("\n‚úÖ Se√ßilen (en iyi) skor:", BEST_METRICS)

# === 12) FULL MOD SUBMISSION ===
if RUN_MODE == "full":
    print("\nüßæ Submission hazƒ±rlanƒ±yor...")
    test_keys = test[["cust_id","ref_date"]].copy()
    X_hist_test = build_features_for_keys(hist, test_keys, progress_every=5000)

    X_test = test_keys.merge(X_hist_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    num_cols_t = X_test.select_dtypes(include=[np.number]).columns
    cat_cols_t = [c for c in X_test.columns if c not in num_cols_t and c != "ref_date"]

    X_test[num_cols_t] = X_test[num_cols_t].fillna(0)
    for c in cat_cols_t:
        X_test[c] = X_test[c].fillna("Unknown")

    X_test_enc = pd.get_dummies(X_test.drop(columns=["ref_date"]), columns=cat_cols_t, drop_first=True)
    X_test_enc.columns = X_test_enc.columns.str.replace(r"\s+", "_", regex=True)

    # Eƒüitimde SON KULLANILAN feature setine hizala
    TRAIN_FEATS = BEST_FEATS
    X_test_enc = X_test_enc.reindex(columns=TRAIN_FEATS, fill_value=0)

    test_proba = BEST_CLF.predict_proba(X_test_enc)[:, 1]
    out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
    out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
    out.to_csv("submission.csv", index=False, float_format="%.6f")
    print("‚úÖ submission.csv yazƒ±ldƒ±! Satƒ±r sayƒ±sƒ±:", out.shape)

print("\nüéâ Tamamlandƒ±. Dosya: submission.csv")



  from pandas.core import (


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
üéØ √áalƒ±≈üma modu: full | keys: 133287


KeyboardInterrupt: 

In [3]:
# ==============================================================
# üß† ING Datathon ‚Äî MINI v1.0 (Momentum + ActivityRate Boost)
# ==============================================================

import os, glob, warnings
import numpy as np, pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")

# === 1Ô∏è‚É£ DOSYA BULMA OTOMATƒ∞K ===
def find_file(fname, search_roots=(".", "C:\\", "/content")):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                print(f"[info] Bulundu: {fname} -> {path}")
                return os.path.abspath(path)
    raise FileNotFoundError(f"{fname} bulunamadƒ±!")

PATH_CUST = find_file("customers.csv")
PATH_HIST = find_file("customer_history.csv")
PATH_REF  = find_file("referance_data.csv")

# === 2Ô∏è‚É£ Yardƒ±mcƒ± Fonksiyonlar ===
POS = {"1","true","yes","evet","active","churn","var"}
NEG = {"0","false","no","hayƒ±r","pasif","inactive","yok"}

def coerce_binary(s):
    s = pd.Series(s).replace([np.inf,-np.inf],np.nan).fillna(0)
    if pd.api.types.is_numeric_dtype(s):
        s = pd.to_numeric(s, errors="coerce").fillna(0).clip(0,1)
        return s.astype(int)
    s = s.astype(str).str.strip().str.lower()
    return s.map(lambda x:1 if x in POS else (0 if x in NEG else 0)).fillna(0).astype(int)

def gini_from_auc(y_true,y_score): return 2*roc_auc_score(y_true,y_score)-1
def recall_at_k(y_true,y_score,k=0.10):
    y_true=np.asarray(y_true); topk=int(np.ceil(len(y_true)*k))
    idx=np.argsort(-y_score)[:topk]; return y_true[idx].sum()/max(1,y_true.sum())
def lift_at_k(y_true,y_score,k=0.10):
    y_true=np.asarray(y_true); topk=int(np.ceil(len(y_true)*k))
    idx=np.argsort(-y_score)[:topk]; prec=y_true[idx].mean(); prev=y_true.mean()
    return prec/max(prev,1e-9)
def comp(y_true,y_score):
    g=gini_from_auc(y_true,y_score); r=recall_at_k(y_true,y_score); l=lift_at_k(y_true,y_score)
    return round(0.4*g+0.3*r+0.3*l,4), g, r, l

# === 3Ô∏è‚É£ Verileri oku ===
cust=pd.read_csv(PATH_CUST, low_memory=False)
hist=pd.read_csv(PATH_HIST, parse_dates=["date"], low_memory=False)
ref =pd.read_csv(PATH_REF , parse_dates=["ref_date"], low_memory=False)

cust["cust_id"]=pd.to_numeric(cust["cust_id"], errors="coerce").astype("Int64")
ref["cust_id"]=pd.to_numeric(ref["cust_id"], errors="coerce").astype("Int64")
ref["churn"]=coerce_binary(ref["churn"])
print("‚úÖ Dosyalar y√ºklendi:", cust.shape, hist.shape, ref.shape)

# === 4Ô∏è‚É£ MINI √∂rnek ===
SAMPLE_N=12000
RECENT_FRAC=0.55
keys=ref.sort_values("ref_date").tail(int(len(ref)*RECENT_FRAC)).sample(SAMPLE_N, random_state=42)

# === 5Ô∏è‚É£ √ñzellik √ºretimi ===
WINDOWS=[1,3,6,12]
def _agg(s, fn):
    s=pd.to_numeric(s,errors="coerce").replace([np.inf,-np.inf],np.nan).dropna()
    if len(s)==0: return 0
    return getattr(s, fn)() if fn in ["sum","mean","std","max"] else 0

def one_cust(h,rd):
    f={"ref_date":rd}
    h=h[h.date<rd]
    if h.empty: 
        for w in WINDOWS: 
            f[f"months_active_L{w}"]=0; f[f"activity_rate_L{w}"]=0
        f["recency"]=9999
        return f
    f["recency"]=(rd-h.date.max()).days
    for w in WINDOWS:
        hw=h[h.date>=rd-pd.DateOffset(months=w)]
        ma=hw.date.dt.to_period("M").nunique()
        f[f"months_active_L{w}"]=ma
        f[f"activity_rate_L{w}"]=ma/w
        cc=hw["cc_transaction_all_amt"].sum(); eft=hw["mobile_eft_all_amt"].sum()
        tot=cc+eft+1e-6
        f[f"cc_ratio_L{w}"]=cc/tot; f[f"eft_ratio_L{w}"]=eft/tot
        f[f"cc_cv_L{w}"]=hw["cc_transaction_all_amt"].std()/(hw["cc_transaction_all_amt"].mean()+1e-6)
        f[f"eft_cv_L{w}"]=hw["mobile_eft_all_amt"].std()/(hw["mobile_eft_all_amt"].mean()+1e-6)
    for s,l in [(1,3),(3,6),(6,12)]:
        for base in ["cc_ratio","eft_ratio","cc_cv","eft_cv"]:
            f[f"{base}_diff_L{s}vL{l}"]=f.get(f"{base}_L{s}",0)-f.get(f"{base}_L{l}",0)
            f[f"{base}_ratio_L{s}vL{l}"]=f.get(f"{base}_L{s}",0)/(f.get(f"{base}_L{l}",0)+1e-6)
    f["channel_hhi_L12"]=f["cc_ratio_L12"]**2+f["eft_ratio_L12"]**2
    return f

def feats(hist_df, keys_df):
    out=[]; cache={}
    for i,r in enumerate(keys_df.itertuples(index=False),1):
        cid,rd=r.cust_id,r.ref_date
        h=cache.get(cid)
        if h is None: h=hist_df[hist_df.cust_id==cid]; cache[cid]=h
        f=one_cust(h,rd); f["cust_id"]=cid; out.append(f)
        if i%2000==0: print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(out)

feat_df=feats(hist, keys)
X_full=keys.merge(feat_df,on=["cust_id","ref_date"],how="left").merge(cust,on="cust_id",how="left")

# === 6Ô∏è‚É£ Sezonsallƒ±k ===
X_full["ref_month"]=X_full["ref_date"].dt.month
X_full["ref_month_sin"]=np.sin(2*np.pi*X_full["ref_month"]/12)
X_full["ref_month_cos"]=np.cos(2*np.pi*X_full["ref_month"]/12)
X_full["is_year_start_end"]=((X_full["ref_month"]==12)|(X_full["ref_month"]==1)).astype(int)

# === 7Ô∏è‚É£ Encode & Doldurma ===
num=X_full.select_dtypes(include="number").columns
cat=[c for c in X_full.columns if c not in num and c!="ref_date"]
X_full[num]=X_full[num].replace([np.inf,-np.inf],np.nan).fillna(0)
for c in cat: X_full[c]=X_full[c].fillna("Unknown")
X_full=pd.get_dummies(X_full, columns=cat, drop_first=True)

# === 8Ô∏è‚É£ Split ===
cut=X_full["ref_date"].quantile(0.80)
train_mask=X_full["ref_date"]<cut
valid_mask=~train_mask
feat_cols=[c for c in X_full.columns if c not in ["cust_id","ref_date","churn"]]
X_tr,y_tr=X_full.loc[train_mask,feat_cols],X_full.loc[train_mask,"churn"].astype(int)
X_va,y_va=X_full.loc[valid_mask,feat_cols],X_full.loc[valid_mask,"churn"].astype(int)

# === 9Ô∏è‚É£ Ensemble eƒüitim ===
def train_one(seed,cf,colf):
    clf=LGBMClassifier(
        objective="binary",n_estimators=1200,learning_rate=0.03,
        num_leaves=128,max_depth=8,subsample=cf,colsample_bytree=colf,
        min_child_samples=25,class_weight="balanced",random_state=seed)
    clf.fit(X_tr,y_tr,eval_set=[(X_va,y_va)],eval_metric="auc",
            callbacks=[early_stopping(80),log_evaluation(200)])
    proba=clf.predict_proba(X_va)[:,1]
    return clf,proba

seeds=[(42,0.9,0.85),(7,0.85,0.9),(2027,0.8,0.95)]
probs=[]
for sd,cf,colf in seeds:
    clf,p=train_one(sd,cf,colf); probs.append(p)
ens=np.mean(probs,axis=0)

# === üîü Skorlar ===
score,g,r,l=comp(y_va,ens)
print(f"\n‚úÖ VALIDATION ‚Äî Gini={g:.4f} | Recall@10%={r:.4f} | Lift@10%={l:.4f} | CompetitionScore={score:.4f}")

imp=pd.DataFrame({"feature":feat_cols,"importance":clf.feature_importances_}).sort_values("importance",ascending=False)
print("\nüèÖ En √∂nemli 25 deƒüi≈üken:")
print(imp.head(25).to_string(index=False))


[info] Bulundu: customers.csv -> .\customers.csv
[info] Bulundu: customer_history.csv -> .\customer_history.csv
[info] Bulundu: referance_data.csv -> .\referance_data.csv
‚úÖ Dosyalar y√ºklendi: (176293, 8) (5359609, 7) (133287, 3)
[features] 2000/12000
[features] 4000/12000
[features] 6000/12000
[features] 8000/12000
[features] 10000/12000
[features] 12000/12000
[LightGBM] [Info] Number of positive: 1329, number of negative: 8232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9332
[LightGBM] [Info] Number of data points in the train set: 9561, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 80 rounds


Early stopping, best iteration is:
[15]	valid_0's auc: 0.519437	valid_0's binary_logloss: 0.663007
[LightGBM] [Info] Number of positive: 1329, number of negative: 8232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9332
[LightGBM] [Info] Number of data points in the train set: 9561, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[21]	valid_0's auc: 0.518921	valid_0's binary_logloss: 0.652712
[LightGBM] [Info] Number of positive: 1329, number of negative: 8232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tot

Early stopping, best iteration is:
[5]	valid_0's auc: 0.51287	valid_0's binary_logloss: 0.683357

‚úÖ VALIDATION ‚Äî Gini=0.0390 | Recall@10%=0.1127 | Lift@10%=1.1267 | CompetitionScore=0.3874

üèÖ En √∂nemli 25 deƒüi≈üken:
               feature  importance
             eft_cv_L3          19
                tenure          18
              cc_cv_L6          18
            eft_cv_L12          16
   cc_ratio_diff_L1vL3          14
                   age          14
   eft_cv_ratio_L6vL12          14
 cc_ratio_ratio_L6vL12          13
           cc_ratio_L1          13
             eft_cv_L6          12
     cc_cv_ratio_L3vL6          11
    eft_cv_diff_L6vL12          10
     cc_cv_diff_L6vL12          10
             cc_cv_L12          10
      cc_cv_diff_L3vL6          10
  cc_ratio_ratio_L1vL3           9
       channel_hhi_L12           9
              cc_cv_L3           8
eft_ratio_ratio_L6vL12           8
  cc_ratio_diff_L6vL12           8
     eft_cv_diff_L3vL6           7
    e

In [4]:
# ===============================================================
# üß† ING Datathon ‚Äî MINI v1.2 (Trend + LogRecency + Clean Ratio)
# ===============================================================
import os, glob, warnings
import numpy as np, pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
warnings.filterwarnings("ignore")

# === 1Ô∏è‚É£ Dosya bulucu ===
def find_file(fname, roots=(".", "C:\\", "/content")):
    fname=fname.lower()
    for root in roots:
        for p in glob.iglob(os.path.join(root,"**","*"),recursive=True):
            if os.path.isfile(p) and os.path.basename(p).lower()==fname:
                print(f"[info] Bulundu: {fname} -> {p}")
                return os.path.abspath(p)
    raise FileNotFoundError(fname)

PATH_CUST=find_file("customers.csv")
PATH_HIST=find_file("customer_history.csv")
PATH_REF =find_file("referance_data.csv")

# === 2Ô∏è‚É£ Yardƒ±mcƒ±lar ===
def coerce_binary(s):
    s=pd.Series(s).replace([np.inf,-np.inf],np.nan).fillna(0)
    if pd.api.types.is_numeric_dtype(s): return s.clip(0,1).astype(int)
    s=s.astype(str).str.lower().str.strip()
    return s.map(lambda x:1 if x in {"1","true","yes","evet","active","var"} else 0).fillna(0).astype(int)

def gini(y_true,y_score): return 2*roc_auc_score(y_true,y_score)-1
def recall10(y_true,y_score):
    y_true=np.asarray(y_true); top=int(len(y_true)*0.1)
    idx=np.argsort(-y_score)[:top]; return y_true[idx].sum()/max(1,y_true.sum())
def lift10(y_true,y_score):
    y_true=np.asarray(y_true); top=int(len(y_true)*0.1)
    idx=np.argsort(-y_score)[:top]; return y_true[idx].mean()/max(y_true.mean(),1e-9)
def comp(y_true,y_score):
    g=gini(y_true,y_score); r=recall10(y_true,y_score); l=lift10(y_true,y_score)
    return round(0.4*g+0.3*r+0.3*l,4),g,r,l

# === 3Ô∏è‚É£ Verileri oku ===
cust=pd.read_csv(PATH_CUST, low_memory=False)
hist=pd.read_csv(PATH_HIST, parse_dates=["date"], low_memory=False)
ref =pd.read_csv(PATH_REF , parse_dates=["ref_date"], low_memory=False)
for d in [cust,ref]:
    d["cust_id"]=pd.to_numeric(d["cust_id"],errors="coerce").astype("Int64")
ref["churn"]=coerce_binary(ref["churn"])
print("‚úÖ Dosyalar:", cust.shape, hist.shape, ref.shape)

# === 4Ô∏è‚É£ Mini √∂rnek ===
SAMPLE_N=12000
RECENT_FRAC=0.55
keys=ref.sort_values("ref_date").tail(int(len(ref)*RECENT_FRAC)).sample(SAMPLE_N,random_state=42)

# === 5Ô∏è‚É£ Feature √ºretimi ===
WINDOWS=[1,3,6,12]
def one_cust(h,rd):
    f={"ref_date":rd}
    h=h[h.date<rd]
    if h.empty:
        for w in WINDOWS: f[f"months_active_L{w}"]=0
        f["recency"]=9999; return f
    f["recency"]=(rd-h.date.max()).days
    for w in WINDOWS:
        hw=h[h.date>=rd-pd.DateOffset(months=w)]
        ma=hw.date.dt.to_period("M").nunique()
        f[f"activity_rate_L{w}"]=ma/w
        cc=hw["cc_transaction_all_amt"].sum(); eft=hw["mobile_eft_all_amt"].sum()
        tot=cc+eft+1e-6
        f[f"cc_share_L{w}"]=cc/tot; f[f"eft_share_L{w}"]=eft/tot
        f[f"cc_mean_L{w}"]=hw["cc_transaction_all_amt"].mean()
        f[f"eft_mean_L{w}"]=hw["mobile_eft_all_amt"].mean()
    # trend (1v3, 3v6)
    for s,l in [(1,3),(3,6)]:
        for base in ["cc_mean","eft_mean","activity_rate"]:
            f[f"{base}_trend_{s}v{l}"]=f.get(f"{base}_L{s}",0)-f.get(f"{base}_L{l}",0)
    return f

def feats(hist_df, keys_df):
    out=[]; cache={}
    for i,r in enumerate(keys_df.itertuples(index=False),1):
        cid,rd=r.cust_id,r.ref_date
        if cid not in cache:
            cache[cid]=hist_df[hist_df.cust_id==cid]
        f=one_cust(cache[cid],rd); f["cust_id"]=cid; out.append(f)
        if i%2000==0: print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(out)

feat=feats(hist,keys)
X_full=keys.merge(feat,on=["cust_id","ref_date"],how="left").merge(cust,on="cust_id",how="left")

# === 6Ô∏è‚É£ Recency d√∂n√º≈ü√ºmleri ===
X_full["recency_log"]=np.log1p(X_full["recency"])
X_full["recency_score"]=np.exp(-X_full["recency"]/30)

# === 7Ô∏è‚É£ Encode & Split ===
num=X_full.select_dtypes(include="number").columns
X_full[num]=X_full[num].replace([np.inf,-np.inf],np.nan).fillna(0)
X_full=pd.get_dummies(X_full,drop_first=True)

y=X_full["churn"].astype(int)
X=X_full.drop(columns=["churn","cust_id","ref_date"])
sss=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
tr_idx,va_idx=list(sss.split(X,y))[0]
X_tr,X_va=X.iloc[tr_idx],X.iloc[va_idx]
y_tr,y_va=y.iloc[tr_idx],y.iloc[va_idx]

# === 8Ô∏è‚É£ Model ===
def train(seed):
    clf=LGBMClassifier(
        objective="binary",learning_rate=0.035,n_estimators=1000,
        num_leaves=128,max_depth=8,subsample=0.9,colsample_bytree=0.9,
        min_child_samples=25,class_weight="balanced",random_state=seed)
    clf.fit(X_tr,y_tr,eval_set=[(X_va,y_va)],eval_metric="auc",
            callbacks=[early_stopping(60),log_evaluation(200)])
    return clf,clf.predict_proba(X_va)[:,1]

p1=train(42)[1]; p2=train(7)[1]
ens=(p1+p2)/2

# === 9Ô∏è‚É£ Skor ===
score,g,r,l=comp(y_va,ens)
print(f"\n‚úÖ VALIDATION ‚Äî Gini={g:.4f} | Recall@10%={r:.4f} | Lift@10%={l:.4f} | CompetitionScore={score:.4f}")


[info] Bulundu: customers.csv -> .\customers.csv
[info] Bulundu: customer_history.csv -> .\customer_history.csv
[info] Bulundu: referance_data.csv -> .\referance_data.csv
‚úÖ Dosyalar: (176293, 8) (5359609, 7) (133287, 3)
[features] 2000/12000
[features] 4000/12000
[features] 6000/12000
[features] 8000/12000
[features] 10000/12000
[features] 12000/12000
[LightGBM] [Info] Number of positive: 1340, number of negative: 8260
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5489
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 60 rounds


Early stopping, best iteration is:
[122]	valid_0's auc: 0.549677	valid_0's binary_logloss: 0.588982
[LightGBM] [Info] Number of positive: 1340, number of negative: 8260
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5489
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 60 rounds


Early stopping, best iteration is:
[79]	valid_0's auc: 0.549793	valid_0's binary_logloss: 0.612296

‚úÖ VALIDATION ‚Äî Gini=0.1013 | Recall@10%=0.1015 | Lift@10%=1.0149 | CompetitionScore=0.3754


In [1]:
# ===============================================================
# MINI v2 ‚Äî Time-based valid + rich features + safe target
# ===============================================================
import os, glob, warnings
import numpy as np, pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")

# ---------- 0) Yol bulucu ----------
def find_file(fname, roots=(".", "C:\\", "/content")):
    fname=fname.lower()
    for r in roots:
        for p in glob.iglob(os.path.join(r, "**", "*"), recursive=True):
            if os.path.isfile(p) and os.path.basename(p).lower()==fname:
                print(f"[info] Bulundu: {fname} -> {p}")
                return os.path.abspath(p)
    raise FileNotFoundError(fname)

PATH_CUST = find_file("customers.csv")
PATH_HIST = find_file("customer_history.csv")
PATH_REF  = find_file("referance_data.csv")

# ---------- 1) Yardƒ±mcƒ±lar ----------
def coerce_binary(s):
    s = pd.Series(s).replace([np.inf,-np.inf], np.nan)
    # Sayƒ±sal -> 0/1 kliple
    if pd.api.types.is_numeric_dtype(s):
        s = s.fillna(0)
        return s.clip(0,1).astype(int)
    # Metinsel -> harita
    s = s.astype(str).str.lower().str.strip()
    pos = {"1","true","yes","evet","y","var","churned","aktifdegil","left"}
    neg = {"0","false","no","hayir","n","yok","stay","aktif","kaldi"}
    mapped = s.map(lambda x: 1 if x in pos else (0 if x in neg else np.nan))
    return mapped.fillna(0).astype(int)

def gini(y, p): return 2*roc_auc_score(y, p)-1
def recall_at_10(y, p):
    y = np.asarray(y); p = np.asarray(p)
    k = max(1, int(0.10*len(y)))
    idx = np.argsort(-p)[:k]
    return y[idx].sum() / max(1, y.sum())
def lift_at_10(y, p):
    y = np.asarray(y); p = np.asarray(p)
    k = max(1, int(0.10*len(y)))
    idx = np.argsort(-p)[:k]
    return y[idx].mean() / max(y.mean(), 1e-9)
def competition_score(y, p):
    G = gini(y, p)
    R = recall_at_10(y, p)
    L = lift_at_10(y, p)
    return round(0.4*G + 0.3*R + 0.3*L, 4), G, R, L

# ---------- 2) Veriyi oku ----------
cust = pd.read_csv(PATH_CUST, low_memory=False)
hist = pd.read_csv(PATH_HIST, low_memory=False)
ref  = pd.read_csv(PATH_REF , low_memory=False)

# Tarih alanlarƒ±nƒ± d√∂n√º≈üt√ºr
for c in ("date","txn_date"):
    if c in hist.columns:
        hist[c] = pd.to_datetime(hist[c], errors="coerce")
if "date" not in hist.columns and "txn_date" in hist.columns:
    hist["date"] = hist["txn_date"]
ref["ref_date"] = pd.to_datetime(ref["ref_date"], errors="coerce")

# ID uyumla
for d in (cust, hist, ref):
    if "cust_id" not in d.columns:
        # olasƒ± alternatif isimler
        alts = [c for c in d.columns if c.lower() in {"customer_id","id","client_id","musteri_id"}]
        assert alts, "cust_id kolonunu bulamadƒ±m (customers/history/reference)."
        d.rename(columns={alts[0]:"cust_id"}, inplace=True)
    d["cust_id"] = pd.to_numeric(d["cust_id"], errors="coerce").astype("Int64")

# Hedefi g√ºvenli kur
assert "churn" in ref.columns, "referance_data.csv i√ßinde 'churn' bekleniyordu."
ref["churn"] = coerce_binary(ref["churn"]).astype(int)

# K√º√ß√ºk kontrol loglarƒ±
pos_rate = ref["churn"].mean()
print(f"‚úÖ Veri boyutlarƒ± | customers={cust.shape}, history={hist.shape}, ref={ref.shape}")
print(f"‚úÖ Pozitif oranƒ± (ref): {pos_rate:.4f}")

# ---------- 3) Zengin √∂znitelikler ----------
AMT_COLS = []
for c in hist.columns:
    cl = c.lower()
    if "amt" in cl or ("transaction" in cl and "amt" in cl):
        AMT_COLS.append(c)
# Varsayƒ±lan iki ana sinyal (yoksa):
if not AMT_COLS:
    for c in ["cc_transaction_all_amt","mobile_eft_all_amt"]:
        if c in hist.columns: AMT_COLS.append(c)
assert AMT_COLS, "Tutar benzeri kolon bulunamadƒ± (amt)."

CAT_COLS = []
for c in hist.columns:
    if hist[c].dtype == "object" and c not in {"cust_id"}:
        CAT_COLS.append(c)

WINDOWS = [1,3,6,12]

def channel_hhi(s):
    # Her kategorinin payƒ±nƒ±n karelerinin toplamƒ± (HHI)
    if s.empty: return 0.0
    p = s.value_counts(normalize=True)
    return float((p**2).sum())

def features_for_one(h, rd):
    f = {"ref_date": rd}
    h = h[h["date"] < rd]
    if h.empty:
        f["recency"] = 9999
        for w in WINDOWS:
            f[f"months_active_L{w}"] = 0
        return f
    # Recency
    f["recency"] = (rd - h["date"].max()).days
    # Months active
    for w in WINDOWS:
        hw = h[h["date"] >= rd - pd.DateOffset(months=w)]
        f[f"months_active_L{w}"] = hw["date"].dt.to_period("M").nunique()
        # AMT aggregations
        for col in AMT_COLS:
            vals = pd.to_numeric(hw[col], errors="coerce").replace([np.inf,-np.inf], np.nan).fillna(0)
            f[f"{col}_sum_L{w}"]  = float(vals.sum())
            f[f"{col}_mean_L{w}"] = float(vals.mean())
            f[f"{col}_std_L{w}"]  = float(vals.std())
            f[f"{col}_cnt_L{w}"]  = int((vals>0).sum())
        # Kanal HHI (en sƒ±k g√∂r√ºnen object kolonunun daƒüƒ±lƒ±mƒ±)
        if CAT_COLS:
            top_cat = CAT_COLS[0]
            f[f"channel_hhi_L{w}"] = channel_hhi(hw[top_cat].dropna())
    # Trend / Ratio / Accel
    for s,l in [(1,3),(3,6),(6,12),(1,6),(1,12),(3,12)]:
        for col in AMT_COLS:
            m_s = f.get(f"{col}_mean_L{s}", 0.0)
            m_l = f.get(f"{col}_mean_L{l}", 0.0)
            s_s = f.get(f"{col}_std_L{s}", 0.0)
            s_l = f.get(f"{col}_std_L{l}", 0.0)
            f[f"{col}_mean_diff_{s}v{l}"] = m_s - m_l
            f[f"{col}_mean_ratio_{s}v{l}"] = (m_s+1e-6)/(m_l+1e-6)
            f[f"{col}_std_diff_{s}v{l}"]  = s_s - s_l
            f[f"{col}_std_ratio_{s}v{l}"] = (s_s+1e-6)/(s_l+1e-6)
    # Basit accel: L1 vs L3 vs L6
    for col in AMT_COLS:
        m1 = f.get(f"{col}_mean_L1", 0.0)
        m3 = f.get(f"{col}_mean_L3", 0.0)
        m6 = f.get(f"{col}_mean_L6", 0.0)
        f[f"{col}_accel_L1_3_6"] = (m1-m3) - (m3-m6)
    return f

# Sadece gerekli anahtarlarla √ßalƒ±≈üalƒ±m (performans)
ref_ = ref[["cust_id","ref_date","churn"]].dropna().copy()
# Daha g√º√ßl√º sinyal i√ßin son d√∂nemlerden daha fazla √∂rnek
ref_ = ref_.sort_values("ref_date")
# ƒ∞stersen burada downsample/upsample yapma‚Äîsƒ±zƒ±ntƒ±sƒ±z kalsƒ±n

# History index cache
hist_idx = hist.groupby("cust_id").indices

def build_features(ref_keys, hist_df):
    out = []
    for i, row in enumerate(ref_keys.itertuples(index=False), 1):
        cid, rd = row.cust_id, row.ref_date
        hidx = hist_idx.get(cid, None)
        h_c = hist_df.iloc[hidx] if hidx is not None else hist_df[0:0]
        f = features_for_one(h_c, rd)
        f["cust_id"] = cid
        out.append(f)
        if i % 3000 == 0:
            print(f"[features] {i}/{len(ref_keys)}")
    return pd.DataFrame(out)

feat = build_features(ref_[["cust_id","ref_date"]], hist)
df = ref_.merge(feat, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")

# Ek temel sinyaller
df["recency_log"]   = np.log1p(df["recency"].replace([np.inf,-np.inf], np.nan).fillna(9999))
df["recency_score"] = np.exp(-df["recency"].replace([np.inf,-np.inf], np.nan).fillna(9999)/30)

# Numerik temizliƒüi
num_cols = df.select_dtypes(include=["number","Float64","Int64"]).columns
df[num_cols] = df[num_cols].replace([np.inf,-np.inf], np.nan).fillna(0)

# Kategorikleri dummies
cat_cols = df.select_dtypes(include=["object"]).columns
df = pd.get_dummies(df, columns=list(cat_cols), drop_first=True)

# ---------- 4) ZAMAN BAZLI VALIDATION ----------
# ref_date'e g√∂re son %20 valid
cutoff = df["ref_date"].quantile(0.80)
train_mask = df["ref_date"] <  cutoff
valid_mask = df["ref_date"] >= cutoff

y = df["churn"].astype(int)
X = df.drop(columns=["churn","cust_id","ref_date"])

X_train, y_train = X[train_mask], y[train_mask]
X_valid, y_valid = X[valid_mask], y[valid_mask]

print(f"Train: {X_train.shape}, Valid: {X_valid.shape}")
print(f"Pos rate | train={y_train.mean():.4f}, valid={y_valid.mean():.4f}")

# ---------- 5) Model ----------
pos_weight = (len(y_train)-y_train.sum())/max(1,y_train.sum())
clf = LGBMClassifier(
    objective="binary",
    learning_rate=0.03,
    n_estimators=3000,
    num_leaves=192,
    max_depth=8,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_samples=120,       # overfit'i azalt
    min_split_gain=0.0,
    reg_alpha=0.3,
    reg_lambda=2.0,
    scale_pos_weight=float(pos_weight),  # dengeli ama agresif deƒüil
    random_state=42,
    n_jobs=-1
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(200), log_evaluation(200)]
)

p_valid = clf.predict_proba(X_valid)[:,1]
score, G, R, L = competition_score(y_valid, p_valid)
print(f"\n‚úÖ VALIDATION ‚Äî Gini={G:.4f} | Recall@10%={R:.4f} | Lift@10%={L:.4f} | CompetitionScore={score:.4f}")

# √ñnemli 25
imp = pd.DataFrame({"feature": X_train.columns, "importance": clf.booster_.feature_importance("gain")})
imp = imp.sort_values("importance", ascending=False).head(25)
print("\nüèÖ En √∂nemli 25 deƒüi≈üken:")
print(imp.to_string(index=False))


  from pandas.core import (


[info] Bulundu: customers.csv -> .\customers.csv
[info] Bulundu: customer_history.csv -> .\customer_history.csv
[info] Bulundu: referance_data.csv -> .\referance_data.csv
‚úÖ Veri boyutlarƒ± | customers=(176293, 8), history=(5359609, 7), ref=(133287, 3)
‚úÖ Pozitif oranƒ± (ref): 0.1416
[features] 3000/133287
[features] 6000/133287
[features] 9000/133287
[features] 12000/133287
[features] 15000/133287
[features] 18000/133287
[features] 21000/133287
[features] 24000/133287
[features] 27000/133287
[features] 30000/133287
[features] 33000/133287
[features] 36000/133287
[features] 39000/133287
[features] 42000/133287
[features] 45000/133287
[features] 48000/133287
[features] 51000/133287
[features] 54000/133287
[features] 57000/133287
[features] 60000/133287
[features] 63000/133287
[features] 66000/133287
[features] 69000/133287
[features] 72000/133287
[features] 75000/133287
[features] 78000/133287
[features] 81000/133287
[features] 84000/133287
[features] 87000/133287
[features] 90000/133

[200]	valid_0's auc: 0.538786	valid_0's binary_logloss: 0.658823
Early stopping, best iteration is:
[1]	valid_0's auc: 0.519254	valid_0's binary_logloss: 0.381647

‚úÖ VALIDATION ‚Äî Gini=0.0385 | Recall@10%=0.1083 | Lift@10%=1.0836 | CompetitionScore=0.3730

üèÖ En √∂nemli 25 deƒüi≈üken:
                               feature  importance
             mobile_eft_all_amt_sum_L1 1849.029709
                                   age  857.379906
  cc_transaction_all_amt_mean_diff_1v3  570.800114
       cc_transaction_all_amt_mean_L12  373.238895
 cc_transaction_all_amt_std_ratio_3v12  370.245100
 cc_transaction_all_amt_mean_ratio_1v3  308.966698
     mobile_eft_all_amt_mean_ratio_1v3  296.687844
             mobile_eft_all_amt_sum_L6  265.981497
     mobile_eft_all_amt_mean_ratio_3v6  261.354298
     mobile_eft_all_amt_mean_diff_1v12  256.846607
             mobile_eft_all_amt_sum_L3  246.614906
 cc_transaction_all_amt_mean_diff_6v12  210.900398
        cc_transaction_all_amt_cnt_L12  206.31

In [3]:
# ============================================================
# üß† ING Datathon ‚Äî FULL VERSION (Training + Test + Submission)
#   - T√ºm verilerle tam eƒüitim veya sadece submission
#   - "churn" s√ºtunu yoksa otomatik algƒ±lar
# ============================================================

import os, glob, gc
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 220)

SEED = 42

# === 1) DOSYA BULUCU ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY  = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS= resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF      = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST     = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB      = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

# === 2) VERƒ∞LERƒ∞ Y√úKLE ===
hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB)

for df in (hist, cust, ref, test):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")

cust.columns = cust.columns.str.strip()
hist.columns = hist.columns.str.strip()

print("‚úÖ Dosyalar y√ºklendi: ",
      "\n  hist:", hist.shape,
      "| cust:", cust.shape,
      "| ref:", ref.shape,
      "| test:", test.shape)

# === 3) CHURN KOLON KONTROL√ú ===
if "churn" not in ref.columns:
    alt_names = [c for c in ["is_churn", "target", "label"] if c in ref.columns]
    if alt_names:
        ref.rename(columns={alt_names[0]: "churn"}, inplace=True)
        print(f"‚öôÔ∏è  '{alt_names[0]}' s√ºtunu 'churn' olarak yeniden adlandƒ±rƒ±ldƒ±.")
    else:
        print("‚ö†Ô∏è  referance_data.csv i√ßinde 'churn' s√ºtunu bulunamadƒ±!")
        print("üî∏ Model eƒüitimi atlanacak, sadece submission √ºretilecek.")
        ref["churn"] = np.nan  # dummy ekle

# === 4) FEATURE √úRETƒ∞Mƒ∞ (Mini versiyon ile birebir aynƒ±) ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce"); s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def _months_active(frame):
    if frame.empty: return 0
    cond = (pd.to_numeric(frame.get("cc_transaction_all_cnt", 0), errors="coerce").fillna(0) > 0) | \
           (pd.to_numeric(frame.get("mobile_eft_all_cnt", 0), errors="coerce").fillna(0) > 0)
    return int(cond.sum())

def _ewm_last(series, span):
    series = pd.to_numeric(series, errors="coerce").fillna(0.0)
    if series.empty: return 0.0
    return float(series.ewm(span=span, adjust=False).mean().iloc[-1])

def aggregate_for_one(h_cust, ref_date):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]
    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                for fn in ["sum","mean","std","max"]:
                    feat[f"{col}_L{w}M_{fn}"] = 0.0
        feat["recency_days"] = 9999
        return feat

    feat["recency_days"] = int((ref_date - h["date"].max()).days)
    for w in WINDOWS:
        start = ref_date - pd.DateOffset(months=w)
        hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
        for col in BASE_NUM_COLS:
            feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
            feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
            feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
            feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")
    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=5000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd); f["cust_id"]=cid; rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

def add_calendar_feats(df):
    df["ref_month"]   = df["ref_date"].dt.month.astype(int)
    df["ref_quarter"] = df["ref_date"].dt.quarter.astype(int)
    return df

# === 5) MODEL Eƒûƒ∞Tƒ∞Mƒ∞ + SUBMISSION ===
def train_full_and_submit():
    feats_train = build_features_for_keys(hist, ref[["cust_id","ref_date"]], progress_every=5000)
    feats_test  = build_features_for_keys(hist, test[["cust_id","ref_date"]], progress_every=5000)

    X_train_full = ref.merge(feats_train, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    X_test_full  = test.merge(feats_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")

    X_train_full = add_calendar_feats(X_train_full)
    X_test_full  = add_calendar_feats(X_test_full)

    num_cols = X_train_full.select_dtypes(include="number").columns
    cat_cols = [c for c in X_train_full.columns if c not in num_cols and c != "ref_date"]

    for df in (X_train_full, X_test_full):
        df[num_cols] = df[num_cols].fillna(0)
        for c in cat_cols: df[c] = df[c].fillna("Unknown")

    X_train_enc = pd.get_dummies(X_train_full.drop(columns=["ref_date"]), columns=cat_cols, drop_first=True)
    X_test_enc  = pd.get_dummies(X_test_full.drop(columns=["ref_date"]), columns=cat_cols, drop_first=True)
    X_test_enc  = X_test_enc.reindex(columns=X_train_enc.columns, fill_value=0)

    if "churn" not in X_train_full.columns or X_train_full["churn"].isna().all():
        print("‚ö†Ô∏è Etiket (churn) bulunamadƒ±, eƒüitim atlanƒ±yor. Dummy model kullanƒ±lacak.")
        X_train_full["churn"] = np.random.randint(0, 2, size=len(X_train_full))

    y_train = X_train_full["churn"].astype(int)
    feat_cols = [c for c in X_train_enc.columns if c not in ["cust_id","churn"]]

    clf = LGBMClassifier(
        objective="binary",
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=8,
        num_leaves=128,
        subsample=0.9,
        colsample_bytree=0.85,
        min_child_samples=25,
        reg_alpha=0.1,
        reg_lambda=1.0,
        class_weight="balanced",
        random_state=SEED
    )

    clf.fit(X_train_enc[feat_cols], y_train)
    test_pred = clf.predict_proba(X_test_enc[feat_cols])[:,1]

    sub["churn_probability"] = test_pred
    sub.to_csv("submit.csv", index=False)
    print("‚úÖ SUBMISSION dosyasƒ± kaydedildi: submit.csv")

# === 6) √áALI≈ûTIR ===
if __name__ == "__main__":
    train_full_and_submit()


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\sample_submission.csv
‚úÖ Dosyalar y√ºklendi:  
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)
[features] 5000/133287
[features] 10000/133287
[features] 15000/133287
[features] 20000/133287
[features] 25000/133287
[features] 30000/133287
[features] 35000/133287
[features] 40000/133287
[features] 45000/133287
[features] 50000/133287
[features] 55000/133287
[features] 60000/133287
[features] 65000/133287

KeyError: "['churn'] not in index"

In [1]:
# ============================================================
# ‚úÖ ING Datathon ‚Äî FULL TRAIN + SUBMISSION (LightGBM + Native Categorical)
# ============================================================

import os, glob, gc
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 220)

# === PARAMETRELER ===
SEED = 42
DO_TUNE_ON_SAMPLE = True
TUNE_SAMPLE_N     = 12000
TUNE_RECENT_FRAC  = 0.55
RECENT_FRAC_TRAIN_FULL = 1.00
DO_LOWVAR_DROP = True
VALID_SPLIT_Q  = 0.80

BEST_PARAMS = dict(
    objective="binary",
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=8,
    num_leaves=128,
    min_child_samples=25,
    subsample=0.90,
    subsample_freq=1,
    colsample_bytree=0.85,
    reg_alpha=0.10,
    reg_lambda=1.00,
    class_weight="balanced",
    random_state=SEED
)

SUBMISSION_PATH = "submission.csv"


  from pandas.core import (


In [2]:
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path): return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY   = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv",       "customers.csv")
PATH_REF       = resolve_path("/mnt/data/referance_data.csv",  "referance_data.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)

for df in (hist, cust, ref):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")

if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

cust.columns = cust.columns.str.strip()
hist.columns = hist.columns.str.strip()

print("‚úÖ Dosyalar y√ºklendi:",
      "\n  hist:", hist.shape,
      "| cust:", cust.shape,
      "| ref:", ref.shape)


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
‚úÖ Dosyalar y√ºklendi: 
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3)


In [3]:
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce"); s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def _months_active(frame):
    if frame.empty: return 0
    cond = (pd.to_numeric(frame.get("cc_transaction_all_cnt", 0), errors="coerce").fillna(0) > 0) | \
           (pd.to_numeric(frame.get("mobile_eft_all_cnt", 0), errors="coerce").fillna(0) > 0)
    return int(cond.sum())

def _ewm_last(series, span):
    series = pd.to_numeric(series, errors="coerce").fillna(0.0)
    if series.empty: return 0.0
    return float(series.ewm(span=span, adjust=False).mean().iloc[-1])


In [4]:
def aggregate_for_one(h_cust, ref_date):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]

    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                for fn in ["sum","mean","std","max"]:
                    feat[f"{col}_L{w}M_{fn}"] = 0.0
        feat["recency_days"] = 9999
        return feat

    feat["recency_days"] = int((ref_date - h["date"].max()).days)
    for w in WINDOWS:
        start = ref_date - pd.DateOffset(months=w)
        hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
        for col in BASE_NUM_COLS:
            feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
            feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
            feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
            feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")
    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=3000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd); f["cust_id"]=cid; rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)


In [5]:
def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true)*k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true)
    topk = int(np.ceil(len(y_true)*k))
    idx = np.argsort(-y_score)[:topk]
    prec=float(y_true[idx].mean()); prev=float(y_true.mean())
    return prec/max(prev,1e-9)

def competition_score(y_true, y_score):
    g=gini_from_auc(y_true,y_score); r=recall_at_k(y_true,y_score); l=lift_at_k(y_true,y_score)
    return {"Gini":round(g,4),"Recall@10%":round(r,4),"Lift@10%":round(l,4),"CompetitionScore":round(0.4*g+0.3*r+0.3*l,4)}

def prepare_frame_for_model(X_full: pd.DataFrame):
    X = X_full.copy()
    num_cols = X.select_dtypes(include="number").columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols and c != "ref_date"]
    X[num_cols] = X[num_cols].fillna(0)
    for c in cat_cols:
        X[c] = X[c].astype("category")
        if "Unknown" not in list(X[c].cat.categories):
            X[c] = X[c].cat.add_categories("Unknown")
        X[c] = X[c].fillna("Unknown")
    X_enc = X.drop(columns=["ref_date"])
    feat_cols = [c for c in X_enc.columns if c not in ["cust_id", "churn"]]
    cat_feats = [c for c in feat_cols if str(X_enc[c].dtype)=="category"]
    return X_enc, feat_cols, cat_feats


In [6]:
def main():
    # üîπ √ñrnek (mini) tuning
    if DO_TUNE_ON_SAMPLE:
        keys_tune = ref.sample(n=TUNE_SAMPLE_N, random_state=SEED)
        feats_tune = build_features_for_keys(hist, keys_tune[["cust_id","ref_date"]], progress_every=2000)
        X_tune_full = keys_tune.merge(feats_tune, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
        X_tune_full["ref_month"] = X_tune_full["ref_date"].dt.month.astype(int)
        X_tune_enc, feat_cols_tune, cat_feats_tune = prepare_frame_for_model(X_tune_full)

        cut = X_tune_full["ref_date"].quantile(VALID_SPLIT_Q)
        train_mask = X_tune_full["ref_date"] < cut
        valid_mask = ~train_mask

        X_train = X_tune_enc.loc[train_mask, feat_cols_tune]
        y_train = X_tune_full.loc[train_mask, "churn"].astype(int)
        X_valid = X_tune_enc.loc[valid_mask, feat_cols_tune]
        y_valid = X_tune_full.loc[valid_mask, "churn"].astype(int)

        clf = LGBMClassifier(**BEST_PARAMS)
        clf.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric="auc",
            callbacks=[early_stopping(60), log_evaluation(50)],
            categorical_feature=cat_feats_tune
        )
        proba = clf.predict_proba(X_valid)[:,1]
        print("\nüìä TUNE VALID:", competition_score(y_valid, proba))

    # üîπ FULL TRAIN
    feats_full = build_features_for_keys(hist, ref[["cust_id","ref_date"]], progress_every=5000)
    X_full = ref.merge(feats_full, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    X_full["ref_month"] = X_full["ref_date"].dt.month.astype(int)
    X_enc, feat_cols, cat_feats = prepare_frame_for_model(X_full)

    clf_full = LGBMClassifier(**BEST_PARAMS)
    clf_full.fit(
        X_enc[feat_cols], X_full["churn"].astype(int),
        categorical_feature=cat_feats
    )
    print("‚úÖ FULL TRAIN tamam.")

    # üîπ SUBMISSION
    sub_score = clf_full.predict_proba(X_enc[feat_cols])[:,1]
    sub_out = X_full[["cust_id","ref_date"]].copy()
    sub_out["score"] = sub_score
    sub_out["ref_date"] = pd.to_datetime(sub_out["ref_date"]).dt.strftime("%Y-%m-%d")
    sub_out.to_csv(SUBMISSION_PATH, index=False)
    print("üíæ Submission yazƒ±ldƒ±:", SUBMISSION_PATH)
    print(sub_out.head())

if __name__ == "__main__":
    main()


[features] 2000/12000
[features] 4000/12000
[features] 6000/12000
[features] 8000/12000
[features] 10000/12000
[features] 12000/12000
[LightGBM] [Info] Number of positive: 1374, number of negative: 7919
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12931
[LightGBM] [Info] Number of data points in the train set: 9293, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 60 rounds
[50]	valid_0's auc: 0.703793	valid_0's binary_logloss: 0.611403
Early stopping, best iteration is:
[12]	valid_0's auc: 0.706581	valid_0's binary_logloss: 0.65738

üìä TUNE VALID: {'Gini': 0.4132, 'Recall@10%': 0.2232, 'Lift@10%': 2.2294, 'Competition









‚úÖ FULL TRAIN tamam.
üíæ Submission yazƒ±ldƒ±: submission.csv
   cust_id    ref_date     score
0        0  2017-09-01  0.665351
1        3  2018-10-01  0.266400
2        5  2018-03-01  0.744502
3        6  2018-04-01  0.670079
4        7  2018-05-01  0.108861


In [7]:
import pandas as pd

# Mevcut submission.csv dosyasƒ±nƒ± oku
sub = pd.read_csv("submission.csv")

# Sadece cust_id ve score s√ºtunlarƒ±nƒ± se√ß
sub = sub[["cust_id", "score"]]

# Yeniden kaydet
sub.to_csv("submission.csv", index=False)

print("‚úÖ submission.csv sadele≈ütirildi:")
print(sub.head())


‚úÖ submission.csv sadele≈ütirildi:
   cust_id     score
0        0  0.665351
1        3  0.266400
2        5  0.744502
3        6  0.670079
4        7  0.108861


In [11]:
# ============================================================
# ‚úÖ ING Datathon ‚Äî FULL SUBMISSION (T√ºm S√ºtunlar + Score)
# ============================================================

import os, glob, gc
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier

pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 220)

SEED = 42


In [12]:
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path): return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY  = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS= resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF      = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST     = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)

print("‚úÖ Dosyalar y√ºklendi:", 
      "\n  hist:", hist.shape, 
      "| cust:", cust.shape, 
      "| ref:", ref.shape, 
      "| test:", test.shape)


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masa√ºst√º\ing-hubs-turkiye-datathon\referance_data_test.csv
‚úÖ Dosyalar y√ºklendi: 
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)


In [13]:
WINDOWS = [1,3,6,12]
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce"); s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum": return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std": return float(s.std(ddof=0))
    if fn=="max": return float(s.max())
    return 0.0

def aggregate_for_one(h_cust, ref_date):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]
    if h.empty:
        for col in HIST_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"] = feat[f"{col}_L{w}M_mean"] = 0.0
        feat["recency_days"] = 9999
        return feat
    feat["recency_days"] = int((ref_date - h["date"].max()).days)
    for w in WINDOWS:
        start = ref_date - pd.DateOffset(months=w)
        hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
        for col in HIST_COLS:
            if col in hw.columns:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=3000):
    rows, cache = [], {}
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        h_cust = cache.get(cid)
        if h_cust is None:
            h_cust = hist_df[hist_df["cust_id"] == cid]
            cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd); f["cust_id"]=cid; rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)


In [14]:
clf_full = LGBMClassifier(
    objective="binary",
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=8,
    num_leaves=128,
    subsample=0.9,
    colsample_bytree=0.85,
    min_child_samples=25,
    reg_alpha=0.1,
    reg_lambda=1.0,
    class_weight="balanced",
    random_state=SEED
)

# k√º√ß√ºk √∂rnekle (ref) eƒüit
feats_train = build_features_for_keys(hist, ref[["cust_id","ref_date"]], progress_every=5000)
X_train = ref.merge(feats_train, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
X_train = X_train.fillna(0)
y_train = X_train["churn"].astype(int) if "churn" in X_train.columns else np.random.randint(0,2,len(X_train))
clf_full.fit(X_train.select_dtypes(include=[np.number]), y_train)
print("‚úÖ Model eƒüitimi tamamlandƒ±.")


[features] 5000/133287
[features] 10000/133287
[features] 15000/133287
[features] 20000/133287
[features] 25000/133287
[features] 30000/133287
[features] 35000/133287
[features] 40000/133287
[features] 45000/133287
[features] 50000/133287
[features] 55000/133287
[features] 60000/133287
[features] 65000/133287
[features] 70000/133287
[features] 75000/133287
[features] 80000/133287
[features] 85000/133287
[features] 90000/133287
[features] 95000/133287
[features] 100000/133287
[features] 105000/133287
[features] 110000/133287
[features] 115000/133287
[features] 120000/133287
[features] 125000/133287
[features] 130000/133287
[LightGBM] [Info] Number of positive: 18870, number of negative: 114417
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10316
[LightGBM] [Info] Number of data points in the train set: 133287, number of used features: 61
[LightG















‚úÖ Model eƒüitimi tamamlandƒ±.


In [15]:
feats_test = build_features_for_keys(hist, test[["cust_id","ref_date"]], progress_every=5000)
X_test_full = test.merge(feats_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")

X_test_full["ref_month"]   = X_test_full["ref_date"].dt.month.astype(int)
X_test_full["ref_quarter"] = X_test_full["ref_date"].dt.quarter.astype(int)
print("‚úÖ Test verisi tamamlandƒ±:", X_test_full.shape)


[features] 5000/43006
[features] 10000/43006
[features] 15000/43006
[features] 20000/43006
[features] 25000/43006
[features] 30000/43006
[features] 35000/43006
[features] 40000/43006
‚úÖ Test verisi tamamlandƒ±: (43006, 68)


In [18]:
# === MODEL Eƒûƒ∞Tƒ∞Mƒ∞ SONRASINDA EKLE ===
train_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
print("üí° Eƒüitimde kullanƒ±lan feature sayƒ±sƒ±:", len(train_features))


üí° Eƒüitimde kullanƒ±lan feature sayƒ±sƒ±: 61


In [20]:
# === TEST SETƒ∞Nƒ∞ Eƒûƒ∞Tƒ∞M ƒ∞LE Hƒ∞ZALA ===
X_test_full = X_test_full.fillna(0)

# Eƒüer testte ekstra kolon varsa d√º≈ü√ºr
extra_cols = [c for c in X_test_full.columns if c not in train_features]
if extra_cols:
    print(f"‚ö†Ô∏è Testte fazla {len(extra_cols)} kolon bulundu, atƒ±lƒ±yor:", extra_cols[:5])
    X_test_full = X_test_full.drop(columns=extra_cols)

# Eƒüer testte eksik kolon varsa ekle (0 ile)
missing_cols = [c for c in train_features if c not in X_test_full.columns]
if missing_cols:
    print(f"‚ö†Ô∏è Testte eksik {len(missing_cols)} kolon bulundu, ekleniyor:", missing_cols[:5])
    for c in missing_cols:
        X_test_full[c] = 0

# Aynƒ± sƒ±rada olacak ≈üekilde sƒ±ralayalƒ±m
X_test_full = X_test_full[train_features]
print("‚úÖ Feature hizalamasƒ± tamamlandƒ±:", X_test_full.shape)


‚ö†Ô∏è Testte fazla 8 kolon bulundu, atƒ±lƒ±yor: ['ref_date', 'gender', 'province', 'religion', 'work_type']
‚ö†Ô∏è Testte eksik 1 kolon bulundu, ekleniyor: ['churn']
‚úÖ Feature hizalamasƒ± tamamlandƒ±: (43006, 61)


In [22]:
# === REF_DATE KOLONUNU KONTROL ET ===
if "ref_date" not in X_test_full.columns:
    print("‚ö†Ô∏è 'ref_date' kolonunu test setinden yeniden ekliyorum...")
    if 'cust_id' in X_test_full.columns and 'cust_id' in test.columns:
        X_test_full = X_test_full.merge(
            test[["cust_id", "ref_date"]],
            on="cust_id",
            how="left"
        )
        print("‚úÖ 'ref_date' eklendi:", X_test_full.shape)
    else:
        print("‚ùå ref_date eklenemedi: cust_id e≈üle≈ümesi bulunamadƒ±.")
else:
    print("‚úÖ 'ref_date' zaten mevcut.")


‚ö†Ô∏è 'ref_date' kolonunu test setinden yeniden ekliyorum...
‚úÖ 'ref_date' eklendi: (43006, 62)


In [24]:
# === SUBMISSION KAYDET ===
X_test_full["ref_date"] = pd.to_datetime(X_test_full["ref_date"]).dt.strftime("%Y-%m-%d")
X_test_full.to_csv("submission.csv", index=False)
print("üíæ SUBMISSION kaydedildi: submission.csv")
print("üìÑ Boyut:", X_test_full.shape)
print("üìä √ñrnek:")
print(X_test_full.head())


üíæ SUBMISSION kaydedildi: submission.csv
üìÑ Boyut: (43006, 62)
üìä √ñrnek:
   cust_id  churn  recency_days  cust_id_L1M_sum  cust_id_L1M_mean  date_L1M_sum  date_L1M_mean  mobile_eft_all_cnt_L1M_sum  mobile_eft_all_cnt_L1M_mean  \
0        1      0            31              1.0               1.0  1.546301e+18   1.546301e+18                         0.0                          0.0   
1        2      0            31              2.0               2.0  1.543622e+18   1.543622e+18                         4.0                          4.0   
2        9      0            28              9.0               9.0  1.548979e+18   1.548979e+18                         2.0                          2.0   
3       15      0            31             15.0              15.0  1.556669e+18   1.556669e+18                         0.0                          0.0   
4       19      0            31             19.0              19.0  1.543622e+18   1.543622e+18                         1.0                 

In [25]:
# === KAGGLE SUBMISSION (SADE FORMAT: id, score) ===

# 1Ô∏è‚É£ √ñnce s√ºtun adƒ±nƒ± id olarak deƒüi≈ütir
if "cust_id" in X_test_full.columns:
    submission_simple = X_test_full[["cust_id", "score"]].copy()
    submission_simple.rename(columns={"cust_id": "id"}, inplace=True)
else:
    raise KeyError("cust_id kolonu bulunamadƒ± ‚Äî test verisinde olduƒüundan emin ol.")

# 2Ô∏è‚É£ CSV olarak kaydet
submission_simple.to_csv("submission.csv", index=False)

print("üíæ KAGGLE SUBMISSION kaydedildi: submission.csv")
print("üìÑ Boyut:", submission_simple.shape)
print("üìä √ñrnek veri:")
print(submission_simple.head())


KeyError: "['score'] not in index"

In [26]:
print("üîπ Model ile tahmin √ºretiliyor...")
X_test_full = X_test_full.fillna(0)

# Eƒüitimde kullandƒ±ƒüƒ±n feature kolonlarƒ±nƒ± bul (√∂rnek: train_features)
if 'train_features' in locals():
    used_cols = train_features
else:
    used_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Testi aynƒ± feature setine hizala
extra_cols = [c for c in X_test_full.columns if c not in used_cols]
missing_cols = [c for c in used_cols if c not in X_test_full.columns]
if extra_cols:
    print(f"‚ö†Ô∏è Fazla {len(extra_cols)} kolon bulundu, atƒ±lƒ±yor...")
    X_test_full = X_test_full.drop(columns=extra_cols)
for c in missing_cols:
    X_test_full[c] = 0
X_test_full = X_test_full[used_cols]

# Tahmin al
y_pred = clf_full.predict_proba(X_test_full)[:, 1]
X_test_full["score"] = y_pred
print("‚úÖ Tahminler ba≈üarƒ±yla eklendi.")


üîπ Model ile tahmin √ºretiliyor...
‚ö†Ô∏è Fazla 1 kolon bulundu, atƒ±lƒ±yor...
‚úÖ Tahminler ba≈üarƒ±yla eklendi.


In [27]:
# === KAGGLE SUBMISSION ===
if "cust_id" in X_test_full.columns:
    submission_simple = X_test_full[["cust_id", "score"]].copy()
    submission_simple.rename(columns={"cust_id": "id"}, inplace=True)
    submission_simple.to_csv("submission.csv", index=False)
    print("üíæ KAGGLE SUBMISSION olu≈üturuldu: submission.csv")
    print(submission_simple.head())
else:
    print("‚ùå cust_id kolonu bulunamadƒ±. Test setini kontrol et.")


üíæ KAGGLE SUBMISSION olu≈üturuldu: submission.csv
   id     score
0   1  0.000002
1   2  0.000001
2   9  0.000003
3  15  0.000003
4  19  0.000002


In [28]:
# Kaggle sample_submission.csv dosyasƒ±nƒ± oku
sample = pd.read_csv("sample_submission.csv")

# Senin skorlarƒ±nƒ± bu sƒ±raya g√∂re hizala
merged = sample[["id"]].merge(submission_simple, on="id", how="left")

# Bo≈ü deƒüerleri 0 yap
merged["score"] = merged["score"].fillna(0)

# Kaydet
merged.to_csv("submission.csv", index=False)

print("‚úÖ Kaggle formatƒ±na g√∂re hizalanmƒ±≈ü dosya kaydedildi: submission.csv")
print(merged.head())


KeyError: "None of [Index(['id'], dtype='object')] are in the [columns]"

In [29]:
import pandas as pd

# 1Ô∏è‚É£ Dosyalarƒ± oku
sample = pd.read_csv("sample_submission.csv")
submission_simple = pd.read_csv("submission.csv")

# 2Ô∏è‚É£ Sample i√ßindeki ID kolonunu tespit et
id_col = None
for c in sample.columns:
    if "id" in c.lower():
        id_col = c
        break

if id_col is None:
    raise KeyError("‚ùå sample_submission.csv i√ßinde ID kolonu bulunamadƒ±.")

print(f"‚úÖ Sample dosyasƒ±ndaki ID kolonu: '{id_col}'")

# 3Ô∏è‚É£ Submission dosyasƒ±ndaki ID kolonunu da aynƒ± isme getir
if "id" not in submission_simple.columns and "cust_id" in submission_simple.columns:
    submission_simple.rename(columns={"cust_id": id_col}, inplace=True)
else:
    submission_simple.rename(columns={"id": id_col}, inplace=True)

# 4Ô∏è‚É£ Merge (sƒ±ralƒ± hizalama)
merged = sample[[id_col]].merge(submission_simple, on=id_col, how="left")

# 5Ô∏è‚É£ Eksik skorlarƒ± doldur
merged["score"] = merged["score"].fillna(0)

# 6Ô∏è‚É£ Final dosya
merged.to_csv("submission.csv", index=False)

print("üíæ Kaggle formatƒ±na g√∂re hizalanmƒ±≈ü final submission olu≈üturuldu!")
print("üìÑ Boyut:", merged.shape)
print("üìä ƒ∞lk 5 satƒ±r:")
print(merged.head())


‚úÖ Sample dosyasƒ±ndaki ID kolonu: 'cust_id'
üíæ Kaggle formatƒ±na g√∂re hizalanmƒ±≈ü final submission olu≈üturuldu!
üìÑ Boyut: (43006, 2)
üìä ƒ∞lk 5 satƒ±r:
   cust_id     score
0        1  0.000002
1        2  0.000001
2        9  0.000003
3       15  0.000003
4       19  0.000002


In [30]:
import pandas as pd

# Mevcut dosyayƒ± oku
sub = pd.read_csv("submission.csv")

# score s√ºtununu churn olarak yeniden adlandƒ±r
if "score" in sub.columns:
    sub.rename(columns={"score": "churn"}, inplace=True)
else:
    print("‚ö†Ô∏è 'score' kolonu bulunamadƒ±, zaten doƒüru isimde olabilir.")

# ID s√ºtunu sample_submission.csv ile aynƒ± adda mƒ± kontrol et
sample = pd.read_csv("sample_submission.csv")
id_col = [c for c in sample.columns if "id" in c.lower()][0]
if id_col != sub.columns[0]:
    sub.rename(columns={sub.columns[0]: id_col}, inplace=True)

# Final dosyayƒ± kaydet
sub.to_csv("submission.csv", index=False)

print("‚úÖ D√ºzeltildi! Yeni submission.csv olu≈üturuldu.")
print("üìÑ Kolonlar:", list(sub.columns))
print("üìä ƒ∞lk 5 satƒ±r:")
print(sub.head())


‚úÖ D√ºzeltildi! Yeni submission.csv olu≈üturuldu.
üìÑ Kolonlar: ['cust_id', 'churn']
üìä ƒ∞lk 5 satƒ±r:
   cust_id     churn
0        1  0.000002
1        2  0.000001
2        9  0.000003
3       15  0.000003
4       19  0.000002


In [34]:
!pip install optuna --quiet




In [33]:
# ============================================================
# üß† ING Datathon ‚Äî LightGBM Mini Tuning (Optuna)
# ============================================================

import os, glob, gc, warnings
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
import optuna

warnings.filterwarnings("ignore")
pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 220)

SEED = 42
SAMPLE_N = 12000     # K√º√ß√ºk √∂rnek boyutu
RECENT_FRAC = 0.55   # Son d√∂neme vurgu


ModuleNotFoundError: No module named 'optuna'

In [None]:
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path): return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadƒ±: {fallback_name}")

PATH_HISTORY   = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv",       "customers.csv")
PATH_REF       = resolve_path("/mnt/data/referance_data.csv",  "referance_data.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)

for df in (hist, cust, ref):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")

if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

print("‚úÖ Dosyalar y√ºklendi:")
print("  hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape)


In [None]:
# Son d√∂neme odaklƒ± mini √∂rnek (validation uyumu i√ßin)
q = ref["ref_date"].quantile(1.0 - RECENT_FRAC)
recent_ref = ref[ref["ref_date"] >= q].reset_index(drop=True)

if len(recent_ref) > SAMPLE_N:
    idx = np.linspace(0, len(recent_ref)-1, SAMPLE_N).round().astype(int)
    ref_sample = recent_ref.iloc[idx].reset_index(drop=True)
else:
    ref_sample = recent_ref

print("üéØ Mini √∂rnek se√ßildi:", ref_sample.shape)


In [None]:
# Basit toplu √∂zellikler (1,3,6 ay yok - sadece √∂rnek hƒ±z i√ßin)
hist_simple = hist.groupby("cust_id").agg({
    "mobile_eft_all_amt": "sum",
    "mobile_eft_all_cnt": "sum",
    "cc_transaction_all_amt": "sum",
    "cc_transaction_all_cnt": "sum"
}).reset_index()

train = ref_sample.merge(hist_simple, on="cust_id", how="left").merge(cust, on="cust_id", how="left")
train = train.fillna(0)

num_cols = train.select_dtypes(include=[np.number]).columns
X = train[num_cols].drop(columns=["churn", "cust_id"], errors="ignore")
y = train["churn"].astype(int)

print("‚úÖ Feature matrix hazƒ±r:", X.shape)


In [None]:
def objective(trial):
    params = {
        "objective": "binary",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
        "num_leaves": trial.suggest_int("num_leaves", 64, 256),
        "max_depth": trial.suggest_int("max_depth", 6, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 40),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 0.3),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "class_weight": "balanced",
        "random_state": SEED,
        "n_estimators": 600
    }

    cut = train["ref_date"].quantile(0.8)
    mask_train = train["ref_date"] < cut
    mask_valid = ~mask_train

    X_train, X_valid = X.loc[mask_train], X.loc[mask_valid]
    y_train, y_valid = y.loc[mask_train], y.loc[mask_valid]

    model = LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="auc",
        callbacks=[early_stopping(stopping_rounds=50)],
        verbose=-1
    )
    y_pred = model.predict_proba(X_valid)[:,1]
    return roc_auc_score(y_valid, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15, show_progress_bar=True)

print("‚úÖ En iyi AUC:", study.best_value)
print("üèÜ En iyi parametreler:")
print(study.best_params)
