In [1]:
# === 1) KÜTÜPHANELER ===
import os, glob
import pandas as pd
import numpy as np

# === 2) DOSYA BULUCU (aynı klasörde ya da /mnt/data'da arar) ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path):
        return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p: 
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadı: {fallback_name}")

# === 3) DOSYALARI OKU ===
PATH_HISTORY = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")
PATH_TEST = resolve_path("/mnt/data/referance_data_test.csv", "referance_data_test.csv")
PATH_SUB = resolve_path("/mnt/data/sample_submission.csv", "sample_submission.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)
test = pd.read_csv(PATH_TEST, parse_dates=["ref_date"], low_memory=False)
sub  = pd.read_csv(PATH_SUB, low_memory=False)

# === 4) TİPLERİ DÜZELT ===
for df in (hist, cust, ref, test, sub):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")

print("✅ Dosyalar yüklendi:")
print("hist:", hist.shape, "| cust:", cust.shape, "| ref:", ref.shape, "| test:", test.shape)


  from pandas.core import (


[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\referance_data.csv
[info] Bulundu: referance_data_test.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\referance_data_test.csv
[info] Bulundu: sample_submission.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\sample_submission.csv
✅ Dosyalar yüklendi:
hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3) | test: (43006, 2)


In [8]:
# === 2) ÖZELLİK ÜRETME FONKSİYONLARI ===
HIST_COLS = [
    "cust_id","date",
    "mobile_eft_all_cnt","active_product_category_nbr",
    "mobile_eft_all_amt","cc_transaction_all_amt","cc_transaction_all_cnt"
]
hist = hist[[c for c in HIST_COLS if c in hist.columns]].sort_values(["cust_id","date"]).reset_index(drop=True)
WINDOWS = [1,3,6,12]
BASE_NUM_COLS = [c for c in HIST_COLS if c not in ["cust_id","date"]]

def _agg_safe(s, fn):
    s = pd.to_numeric(s, errors="coerce"); s = s[np.isfinite(s)]
    if s.size == 0: return 0.0
    if fn=="sum":  return float(s.sum())
    if fn=="mean": return float(s.mean())
    if fn=="std":  return float(s.std(ddof=0))
    if fn=="max":  return float(s.max())
    return 0.0

def aggregate_for_one(h_cust: pd.DataFrame, ref_date: pd.Timestamp):
    feat = {"ref_date": ref_date}
    h = h_cust[h_cust["date"] < ref_date]
    if h.empty:
        for col in BASE_NUM_COLS:
            for w in WINDOWS:
                feat[f"{col}_L{w}M_sum"]=feat[f"{col}_L{w}M_mean"]=feat[f"{col}_L{w}M_std"]=feat[f"{col}_L{w}M_max"]=0.0
        feat["recency_days"] = 9999
    else:
        feat["recency_days"] = int((ref_date - h["date"].max()).days)
        for w in WINDOWS:
            start = ref_date - pd.DateOffset(months=w)
            hw = h[(h["date"] >= start) & (h["date"] < ref_date)]
            for col in BASE_NUM_COLS:
                feat[f"{col}_L{w}M_sum"]  = _agg_safe(hw[col], "sum")
                feat[f"{col}_L{w}M_mean"] = _agg_safe(hw[col], "mean")
                feat[f"{col}_L{w}M_std"]  = _agg_safe(hw[col], "std")
                feat[f"{col}_L{w}M_max"]  = _agg_safe(hw[col], "max")

    eps = 1e-6
    for w in WINDOWS:
        feat[f"cc_amt_per_txn_L{w}M"] = feat.get(f"cc_transaction_all_amt_L{w}M_sum",0.0)/(feat.get(f"cc_transaction_all_cnt_L{w}M_sum",0.0)+eps)
        feat[f"mobile_eft_amt_per_txn_L{w}M"] = feat.get(f"mobile_eft_all_amt_L{w}M_sum",0.0)/(feat.get(f"mobile_eft_all_cnt_L{w}M_sum",0.0)+eps)

    for short,long in [(1,3),(3,6),(6,12)]:
        for base in ["cc_transaction_all_amt","cc_transaction_all_cnt","mobile_eft_all_amt","mobile_eft_all_cnt","active_product_category_nbr"]:
            ms = feat.get(f"{base}_L{short}M_mean",0.0)
            ml = feat.get(f"{base}_L{long}M_mean",0.0)
            feat[f"{base}_trend_mean_L{short}vL{long}_diff"]  = ms-ml
            feat[f"{base}_trend_mean_L{short}vL{long}_ratio"] = ms/(ml+1e-6)
    return feat

def build_features_for_keys(hist_df, keys_df, n_limit=None, progress_every=5000):
    if n_limit is not None:
        keys_df = keys_df.head(n_limit).copy()
    h = hist_df
    cache = {}
    rows = []
    for i, r in enumerate(keys_df.itertuples(index=False), 1):
        cid, rd = r.cust_id, r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        if cid in cache: h_cust = cache[cid]
        else:
            h_cust = h[h["cust_id"] == cid]; cache[cid] = h_cust
        f = aggregate_for_one(h_cust, rd); f["cust_id"]=cid; rows.append(f)
        if progress_every and (i % progress_every == 0):
            print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)


In [9]:
# === 3) TÜM EĞİTİM VERİSİNİ PARTİ PARTİ ÜRET ===
BATCH = 20000
keys = ref[["cust_id","ref_date","churn"]].sort_values("ref_date").reset_index(drop=True)
parts = []
for s in range(0, len(keys), BATCH):
    e = min(s + BATCH, len(keys))
    keys_part = keys.iloc[s:e][["cust_id","ref_date"]]
    feats_part = build_features_for_keys(hist, keys_part, n_limit=None)
    part = keys.iloc[s:e].merge(feats_part, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
    parts.append(part)
    print(f"[train] done: {e}/{len(keys)}")
X_full = pd.concat(parts, ignore_index=True)

# eksik doldur
num_cols = X_full.select_dtypes(include="number").columns
cat_cols = [c for c in X_full.columns if c not in num_cols and c != "ref_date"]
X_full[num_cols] = X_full[num_cols].fillna(0)
for c in cat_cols: X_full[c] = X_full[c].fillna("Unknown")


[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 20000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 40000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 60000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 80000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 100000/133287
[features] 5000/20000
[features] 10000/20000
[features] 15000/20000
[features] 20000/20000
[train] done: 120000/133287
[features] 5000/13287
[features] 10000/13287
[train] done: 133287/133287


In [21]:
!pip install lightgbm -q


In [29]:
# === 4) MODEL: LIGHTGBM (güncel ve hatasız sürüm) ===
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
import numpy as np

TARGET = "churn"
PROTECTED = ["cust_id","ref_date",TARGET]

# 1️⃣ Encode işlemi (kategorikleri sayıya çevir)
X_full_enc = pd.get_dummies(
    X_full, 
    columns=[c for c in X_full.columns if (X_full[c].dtype=="object" and c not in PROTECTED)],
    drop_first=True
)

# 2️⃣ Zamansal bölme (%80 train, %20 validation)
cut = X_full_enc["ref_date"].quantile(0.80)
train_mask = X_full_enc["ref_date"] < cut
valid_mask = ~train_mask

y_train = X_full_enc.loc[train_mask, TARGET].astype(int).values
y_valid = X_full_enc.loc[valid_mask, TARGET].astype(int).values
feat_cols = [c for c in X_full_enc.columns if c not in PROTECTED]
X_train = X_full_enc.loc[train_mask, feat_cols].fillna(0)
X_valid = X_full_enc.loc[valid_mask,  feat_cols].fillna(0)

# 3️⃣ Model tanımı
clf = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=42
)

# 4️⃣ Eğitim (yeni LightGBM sürümüne uygun callbacks yapısı)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50)
    ]
)

# 5️⃣ Tahmin ve metrikler
valid_proba = clf.predict_proba(X_valid)[:,1]

def gini_from_auc(y_true, y_score):
    return 2 * roc_auc_score(y_true, y_score) - 1

def recall_at_k(y_true, y_score, k=0.10):
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    return float(y_true[idx].sum()) / float(y_true.sum() + 1e-9)

def lift_at_k(y_true, y_score, k=0.10):
    n = len(y_true)
    topk = int(np.ceil(n * k))
    idx = np.argsort(-y_score)[:topk]
    prec = float(y_true[idx].mean())
    prev = float(y_true.mean())
    return prec / max(prev, 1e-9)

gini  = gini_from_auc(y_valid, valid_proba)
rec10 = recall_at_k(y_valid, valid_proba, 0.10)
lift10= lift_at_k(y_valid, valid_proba, 0.10)
score = 0.4*gini + 0.3*rec10 + 0.3*lift10

print({
    "Gini": round(gini, 4),
    "Recall@10%": round(rec10, 4),
    "Lift@10%": round(lift10, 4),
    "CompetitionScore": round(score, 4)
})

# 6️⃣ Eğitimdeki kolonları test pipeline’da hizalamak için kaydet
TRAIN_FEATS = X_train.columns.tolist()
print("✅ LightGBM modeli eğitildi ve TRAIN_FEATS kaydedildi.")


[LightGBM] [Info] Number of positive: 15178, number of negative: 88713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21950
[LightGBM] [Info] Number of data points in the train set: 103891, number of used features: 144
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.703013	valid_0's binary_logloss: 0.676788
Early stopping, best iteration is:
[19]	valid_0's auc: 0.702374	valid_0's binary_logloss: 0.673293
{'Gini': 0.4047, 'Recall@10%': 0.2067, 'Lift@10%': 2.0663, 'CompetitionScore': 0.8438}
✅ LightGBM modeli eğitildi ve TRAIN_FEATS kaydedildi.


In [30]:
import os

BASE_DIR = r"C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon"

for f in os.listdir(BASE_DIR):
    print(f)

    

.ipynb_checkpoints
customers.csv
customer_history.csv
LightGBM.ipynb
referance_data.csv
referance_data_test.csv
sample_submission.csv
started.ipynb
submission.csv


In [31]:
# === 5) TEST ÖZELLİKLERİ VE SUBMISSION ===
test_keys = test[["cust_id","ref_date"]].copy()
X_hist_test = build_features_for_keys(hist, test_keys, n_limit=None, progress_every=5000)

X_test = test_keys.merge(X_hist_test, on=["cust_id","ref_date"], how="left").merge(cust, on="cust_id", how="left")
num_cols = X_test.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_test.columns if c not in num_cols and c not in ["ref_date"]]
X_test[num_cols] = X_test[num_cols].fillna(0)
for c in cat_cols: X_test[c] = X_test[c].fillna("Unknown")

X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)
X_test_enc = X_test_enc.reindex(columns=TRAIN_FEATS, fill_value=0)

test_proba = clf.predict_proba(X_test_enc)[:, 1]
out = sub[["cust_id"]].merge(pd.DataFrame({"cust_id": X_test["cust_id"], "churn": test_proba}), on="cust_id", how="left")
out["churn"] = out["churn"].fillna(float(ref["churn"].mean()))
out.to_csv("submission.csv", index=False, float_format="%.6f")
print("✅ submission.csv oluşturuldu! Satır sayısı:", out.shape)
out.head()


[features] 5000/43006
[features] 10000/43006
[features] 15000/43006
[features] 20000/43006
[features] 25000/43006
[features] 30000/43006
[features] 35000/43006
[features] 40000/43006
✅ submission.csv oluşturuldu! Satır sayısı: (43006, 2)


Unnamed: 0,cust_id,churn
0,1,0.482841
1,2,0.49926
2,9,0.619356
3,15,0.594264
4,19,0.452892


In [2]:
# ============================================================
# 🧠 ING Datathon — MINI (LGBM + CatBoost Regularized Ensemble)
# ============================================================

# === 0) Kurulum (CatBoost yüklü değilse) ===
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    print("📦 CatBoost kurulumu başlatılıyor...")
    !pip install catboost -q
    from catboost import CatBoostClassifier

# === 1) Kütüphaneler ===
import os, glob, warnings, gc
import pandas as pd, numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score

warnings.filterwarnings("ignore")
pd.set_option("display.width", 160)
pd.set_option("display.max_columns", 240)

# === 2) Sabitler ===
SAMPLE_N       = 12000
FOCUS_RECENT   = True
RECENT_FRAC    = 0.6
SEED           = 42
ENSEMBLE_SEEDS = [42, 87, 123, 2025, 7]

# === 3) Dosya bulucu ===
def find_file(fname: str, search_roots=(".",)):
    fname = fname.lower()
    for root in search_roots:
        for path in glob.iglob(os.path.join(root, "**", "*"), recursive=True):
            if os.path.isfile(path) and os.path.basename(path).lower() == fname:
                return os.path.abspath(path)
    return None

def resolve_path(default_path: str, fallback_name: str):
    if os.path.exists(default_path): return default_path
    p = find_file(fallback_name, search_roots=(".", os.getcwd()))
    if p:
        print(f"[info] Bulundu: {fallback_name} -> {p}")
        return p
    raise FileNotFoundError(f"Dosya bulunamadı: {fallback_name}")

PATH_HISTORY   = resolve_path("/mnt/data/customer_history.csv", "customer_history.csv")
PATH_CUSTOMERS = resolve_path("/mnt/data/customers.csv", "customers.csv")
PATH_REF       = resolve_path("/mnt/data/referance_data.csv", "referance_data.csv")

hist = pd.read_csv(PATH_HISTORY, parse_dates=["date"], low_memory=False)
cust = pd.read_csv(PATH_CUSTOMERS, low_memory=False)
ref  = pd.read_csv(PATH_REF, parse_dates=["ref_date"], low_memory=False)

for df in (hist, cust, ref):
    if "cust_id" in df.columns:
        df["cust_id"] = pd.to_numeric(df["cust_id"], errors="coerce").astype("Int64")
if "churn" in ref.columns:
    ref["churn"] = pd.to_numeric(ref["churn"], errors="coerce").fillna(0).astype("Int8")
cust.columns = cust.columns.str.strip()
hist.columns = hist.columns.str.strip()

print("✅ Dosyalar yüklendi:",
      "\n  hist:", hist.shape,
      "| cust:", cust.shape,
      "| ref:", ref.shape)

# === 4) Metrikler ===
def gini_from_auc(y_true, y_score): return 2*roc_auc_score(y_true, y_score)-1
def recall_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true); topk = int(np.ceil(len(y_true)*k))
    idx = np.argsort(-y_score)[:topk]; return float(y_true[idx].sum())/float(y_true.sum()+1e-9)
def lift_at_k(y_true, y_score, k=0.10):
    y_true = np.array(y_true); topk = int(np.ceil(len(y_true)*k))
    idx = np.argsort(-y_score)[:topk]; prec = float(y_true[idx].mean()); prev = float(y_true.mean())
    return prec / max(prev, 1e-9)
def competition_score(y_true, y_score):
    g=gini_from_auc(y_true,y_score); r=recall_at_k(y_true,y_score); l=lift_at_k(y_true,y_score)
    s=0.4*g+0.3*r+0.3*l; return {"Gini":round(g,4),"Recall@10%":round(r,4),"Lift@10%":round(l,4),"CompetitionScore":round(s,4)}

# === 5) Key seçimi (mini örneklem) ===
def pick_keys_for_mini(ref_df, sample_n=SAMPLE_N, focus_recent=FOCUS_RECENT, recent_frac=RECENT_FRAC, seed=SEED):
    keys = ref_df[["cust_id","ref_date","churn"]].copy().sort_values("ref_date").reset_index(drop=True)
    if focus_recent:
        q = keys["ref_date"].quantile(1.0 - recent_frac)
        keys = keys[keys["ref_date"] >= q].reset_index(drop=True)
    if len(keys) <= sample_n: return keys
    keys["ref_month"] = keys["ref_date"].dt.to_period("M")
    month_sizes = keys["ref_month"].value_counts().sort_index()
    month_target = (month_sizes / month_sizes.sum() * sample_n).round().astype(int)
    samples=[]
    for m,n_target in month_target.items():
        km=keys[keys["ref_month"]==m]
        if n_target<=0 or km.empty: continue
        pos=km[km["churn"]==1]; neg=km[km["churn"]==0]
        pos_ratio=len(pos)/max(len(km),1)
        n_pos=int(round(n_target*pos_ratio)); n_neg=max(n_target-n_pos,0)
        take_pos=pos.sample(n=min(n_pos,len(pos)),random_state=seed) if len(pos) else pos
        take_neg=neg.sample(n=min(n_neg,len(neg)),random_state=seed) if len(neg) else neg
        pack=pd.concat([take_pos,take_neg],axis=0)
        if len(pack)<n_target and len(km)>len(pack):
            extra=km.drop(pack.index).sample(n=min(n_target-len(pack),len(km)-len(pack)),random_state=seed)
            pack=pd.concat([pack,extra],axis=0)
        samples.append(pack)
    keys=pd.concat(samples,axis=0).sample(frac=1.0,random_state=seed)
    return keys.drop(columns=["ref_month"]).reset_index(drop=True)

# === 6) Basit Feature Builder ===
def aggregate_for_one(h_cust, ref_date):
    feat={"ref_date":ref_date}
    h=h_cust[h_cust["date"]<ref_date]
    feat["txn_sum"]=h["cc_transaction_all_amt"].sum() if "cc_transaction_all_amt" in h else 0
    feat["eft_sum"]=h["mobile_eft_all_amt"].sum() if "mobile_eft_all_amt" in h else 0
    feat["txn_cnt"]=h["cc_transaction_all_cnt"].sum() if "cc_transaction_all_cnt" in h else 0
    feat["eft_cnt"]=h["mobile_eft_all_cnt"].sum() if "mobile_eft_all_cnt" in h else 0
    feat["recency_days"]=int((ref_date - h["date"].max()).days) if not h.empty else 9999
    feat["cc_to_eft_ratio"]=(feat["txn_sum"]+1)/(feat["eft_sum"]+1)
    return feat

def build_features_for_keys(hist_df, keys_df, progress_every=2000):
    rows, cache=[],{}
    for i,r in enumerate(keys_df.itertuples(index=False),1):
        cid,rd=r.cust_id,r.ref_date
        if pd.isna(cid) or pd.isna(rd): continue
        h_cust=cache.get(cid) or hist_df[hist_df["cust_id"]==cid]
        cache[cid]=h_cust
        f=aggregate_for_one(h_cust,rd); f["cust_id"]=cid; rows.append(f)
        if i%progress_every==0: print(f"[features] {i}/{len(keys_df)}")
    return pd.DataFrame(rows)

# === 7) Calendar features ===
def add_calendar_feats(df):
    df["ref_month"]=df["ref_date"].dt.month.astype(int)
    df["ref_quarter"]=df["ref_date"].dt.quarter.astype(int)
    df["ref_month_sin"]=np.sin(2*np.pi*df["ref_month"]/12.0)
    df["ref_month_cos"]=np.cos(2*np.pi*df["ref_month"]/12.0)
    return df

# === 8) Encoding ===
def encode_categoricals(X):
    X=X.copy()
    cat_cols=X.select_dtypes(exclude="number").columns
    for c in cat_cols:
        X[c]=X[c].astype(str)
    return pd.get_dummies(X,drop_first=True)

# === 9) Eğitim & Ensemble ===
def train_eval_mini():
    keys = pick_keys_for_mini(ref, SAMPLE_N, FOCUS_RECENT, RECENT_FRAC, SEED)
    print(f"🎯 MINI | keys: {len(keys)}")

    feats = build_features_for_keys(hist, keys[["cust_id","ref_date"]], progress_every=2000)
    X_full = keys.merge(feats,on=["cust_id","ref_date"],how="left").merge(cust,on="cust_id",how="left")
    X_full = add_calendar_feats(X_full)
    X_full_enc = encode_categoricals(X_full)

    cut = X_full["ref_date"].quantile(0.8)
    train_mask = X_full["ref_date"] < cut
    valid_mask = ~train_mask
    TARGET = "churn"
    feat_cols = [c for c in X_full_enc.columns if c not in ["cust_id", TARGET, "ref_date"]]

    X_train, y_train = X_full_enc.loc[train_mask, feat_cols], X_full.loc[train_mask, TARGET]
    X_valid, y_valid = X_full_enc.loc[valid_mask, feat_cols], X_full.loc[valid_mask, TARGET]

    print("✅ Eğitim seti:", X_full.shape, "| Train:", X_train.shape, "| Valid:", X_valid.shape)

    neg, pos = (y_train==0).sum(), (y_train==1).sum()
    scale_pos = float(np.sqrt((neg+1e-9)/(pos+1e-9)))

    params_lgbm = dict(
        objective="binary",
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=6,
        num_leaves=48,
        subsample=0.75,
        colsample_bytree=0.7,
        min_child_samples=100,
        reg_alpha=0.3,
        reg_lambda=1.5,
        scale_pos_weight=scale_pos,
        n_jobs=-1
    )

    params_cat = dict(
        iterations=1200,
        learning_rate=0.04,
        depth=6,
        l2_leaf_reg=5,
        subsample=0.8,
        random_strength=1.5,
        loss_function='Logloss',
        eval_metric='AUC',
        early_stopping_rounds=80,
        verbose=False
    )

    valid_probas_lgb, valid_probas_cat = [], []

    for seed in ENSEMBLE_SEEDS:
        clf_lgb = LGBMClassifier(random_state=seed, **params_lgbm)
        clf_lgb.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric=['auc'],
            callbacks=[early_stopping(stopping_rounds=80), log_evaluation(period=50)]
        )
        valid_probas_lgb.append(clf_lgb.predict_proba(X_valid)[:, 1])

        clf_cat = CatBoostClassifier(random_seed=seed, **params_cat)
        clf_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid))
        valid_probas_cat.append(clf_cat.predict_proba(X_valid)[:, 1])

    p_lgb = np.mean(valid_probas_lgb, axis=0)
    p_cat = np.mean(valid_probas_cat, axis=0)
    valid_proba = 0.6*p_lgb + 0.4*p_cat  # ağırlıklı ensemble

    metrics = competition_score(y_valid, valid_proba)
    print("\n📊 VALIDATION SONUÇLARI (LGBM+CatBoost Ensemble):", metrics)
    return metrics

# === 10) Çalıştır ===
if __name__ == "__main__":
    _ = train_eval_mini()


📦 CatBoost kurulumu başlatılıyor...
[info] Bulundu: customer_history.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\customer_history.csv
[info] Bulundu: customers.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\customers.csv
[info] Bulundu: referance_data.csv -> C:\Users\hp\OneDrive\Masaüstü\ing-hubs-turkiye-datathon\referance_data.csv
✅ Dosyalar yüklendi: 
  hist: (5359609, 7) | cust: (176293, 8) | ref: (133287, 3)
🎯 MINI | keys: 12000
[features] 2000/12000
[features] 4000/12000
[features] 6000/12000
[features] 8000/12000
[features] 10000/12000
[features] 12000/12000
✅ Eğitim seti: (12000, 20) | Train: (8706, 50) | Valid: (3294, 50)
[LightGBM] [Info] Number of positive: 1208, number of negative: 7498
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170

[50]	valid_0's auc: 0.528484	valid_0's binary_logloss: 0.416235
Early stopping, best iteration is:
[1]	valid_0's auc: 0.540698	valid_0's binary_logloss: 0.389403
[LightGBM] [Info] Number of positive: 1208, number of negative: 7498
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1701
[LightGBM] [Info] Number of data points in the train set: 8706, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138755 -> initscore=-1.825670
[LightGBM] [Info] Start training from score -1.825670
Training until validation scores don't improve for 80 rounds
[50]	valid_0's auc: 0.52176	valid_0's binary_logloss: 0.412664


Early stopping, best iteration is:
[1]	valid_0's auc: 0.497369	valid_0's binary_logloss: 0.389595
[LightGBM] [Info] Number of positive: 1208, number of negative: 7498
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1701
[LightGBM] [Info] Number of data points in the train set: 8706, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138755 -> initscore=-1.825670
[LightGBM] [Info] Start training from score -1.825670
Training until validation scores don't improve for 80 rounds
[50]	valid_0's auc: 0.528673	valid_0's binary_logloss: 0.414569
Early stopping, best iteration is:
[1]	valid_0's auc: 0.498442	valid_0's binary_logloss: 0.38954


[LightGBM] [Info] Number of positive: 1208, number of negative: 7498
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1701
[LightGBM] [Info] Number of data points in the train set: 8706, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138755 -> initscore=-1.825670
[LightGBM] [Info] Start training from score -1.825670
Training until validation scores don't improve for 80 rounds
[50]	valid_0's auc: 0.525223	valid_0's binary_logloss: 0.413838
Early stopping, best iteration is:
[1]	valid_0's auc: 0.497369	valid_0's binary_logloss: 0.389595

📊 VALIDATION SONUÇLARI (LGBM+CatBoost Ensemble): {'Gini': 0.1065, 'Recall@10%': 0.1386, 'Lift@10%': 1.3832, 'CompetitionScore': 0.4991}
