In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix, classification_report
from catboost import CatBoostClassifier, Pool

# Directory that read_csv() joins every CSV file name onto.
DATA_DIR = "Data"   
# Seed that main() passes to the grid search and to the CatBoost models.
SEED = 66


# I/O 
def must_exist(path: str):
    """Fail fast when ``path`` is not present on disk.

    Args:
        path: File-system path to check.

    Raises:
        FileNotFoundError: If ``os.path.exists(path)`` is false.
    """
    if os.path.exists(path):
        return
    raise FileNotFoundError(f"Missing file: {path}")

def read_csv(name: str) -> pd.DataFrame:
    """Load the CSV file ``name`` located under ``DATA_DIR``.

    The path is validated with ``must_exist`` before reading, so a missing
    file surfaces as ``FileNotFoundError`` from that helper.

    Args:
        name: File name relative to ``DATA_DIR``.

    Returns:
        The parsed file as a DataFrame (read with ``low_memory=False``).
    """
    csv_path = os.path.join(DATA_DIR, name)
    must_exist(csv_path)
    frame = pd.read_csv(csv_path, low_memory=False)
    return frame

def to_str_cols(df: pd.DataFrame, cols):
    """Cast the named columns of ``df`` to the pandas ``"string"`` dtype.

    The frame is modified in place and nothing is returned. Names in
    ``cols`` that are not columns of ``df`` are silently skipped.

    Args:
        df: Frame whose columns are converted.
        cols: Iterable of column names to convert when present.
    """
    present = (name for name in cols if name in df.columns)
    for name in present:
        df[name] = df[name].astype("string")


#  Feature table 
def build_table(people, job, job_sec, pension, sport,
                city_adm, city_pop, departments, regions) -> pd.DataFrame:
    """Assemble one wide feature table from the person-level and geo tables.

    Works on a copy of ``people``; the input frames are not modified.

    Steps, in order:
      1. Add 0/1 membership flags telling whether each ``UID`` appears in
         ``job``, ``job_sec``, ``pension`` and ``sport``.
      2. Left-join ``job_sec``, ``job``, ``pension`` and ``sport`` on ``UID``.
      3. Left-join ``city_adm`` and ``city_pop`` on ``INSEE``, then
         ``departments`` on ``DEP`` and ``regions`` on ``Reg``.

    Returns:
        The merged DataFrame.
    """
    table = people.copy()

    # Presence flags are computed before the joins, straight from the UIDs.
    membership = {
        "has_employee_job": job,
        "has_job_security": job_sec,
        "has_pension": pension,
        "is_sport_member": sport,
    }
    for flag, side in membership.items():
        table[flag] = table["UID"].isin(side["UID"]).astype(int)

    # (right table, join key) pairs; the order fixes the output column order,
    # and the geographic keys (DEP, Reg) are used after earlier joins ran.
    joins = (
        (job_sec, "UID"),
        (job, "UID"),
        (pension, "UID"),
        (sport, "UID"),
        (city_adm, "INSEE"),
        (city_pop, "INSEE"),
        (departments, "DEP"),
        (regions, "Reg"),
    )
    for right, key in joins:
        table = table.merge(right, on=key, how="left")

    return table


# CatBoost helpers
def make_cat_features(X: pd.DataFrame, X_test: pd.DataFrame):
    """Select the categorical columns of ``X`` and prepare them for CatBoost.

    A column is categorical when it holds text (object dtype or any pandas
    string dtype) or when its name is one of the known code columns
    (``INSEE``, ``DEP``, ``Reg``, ``job_dep``, ``JOB_SECURITY``, ``Sports``).
    Each such column is cast to the ``"string"`` dtype with missing values
    replaced by ``"__MISSING__"``.

    Both ``X`` and ``X_test`` are modified in place.

    Args:
        X: Training features; decides which columns are categorical.
        X_test: Test features; must contain every categorical column of ``X``.

    Returns:
        ``(cat_cols, cat_idx)``: the categorical column names and their
        integer positions in ``X``.

    Raises:
        KeyError: If a categorical column of ``X`` is missing from ``X_test``.
    """
    def _is_text(dtype) -> bool:
        # pandas >= 3.0 infers text as a StringDtype whose name is "str", so a
        # check on the "object"/"string" names alone would miss those columns.
        return (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.StringDtype)
            or str(dtype).startswith("string")
        )

    # 1) auto-detect text columns as categorical
    cat_cols = [c for c in X.columns if _is_text(X[c].dtype)]

    # 2) code columns that must be categorical even if parsed as numbers
    force_cat = ["INSEE", "DEP", "Reg", "job_dep", "JOB_SECURITY", "Sports"]
    for c in force_cat:
        if c in X.columns and c not in cat_cols:
            cat_cols.append(c)

    # Categorical values are handed over as strings with no NaN left in them.
    for c in cat_cols:
        X[c] = X[c].astype("string").fillna("__MISSING__")
        X_test[c] = X_test[c].astype("string").fillna("__MISSING__")

    cat_idx = [X.columns.get_loc(c) for c in cat_cols]
    return cat_cols, cat_idx


def grid_search_catboost(X, y, cat_idx, seed=66):
    """Choose CatBoost hyper-parameters with a 3-fold stratified grid search.

    Eight combinations are tried: ``depth`` in (6, 8), ``learning_rate`` in
    (0.05, 0.1) and ``l2_leaf_reg`` in (3, 7). Each combination is fitted on
    every fold, and accuracy, macro F1 and log loss are averaged over the
    folds. Combinations are ranked by macro F1 (descending), then log loss
    (ascending).

    NOTE(review): each fold's validation part is also the ``eval_set`` used
    for early stopping / ``use_best_model``, so the reported scores may be
    somewhat optimistic. Confirm this is acceptable.

    Side effects: prints one progress line per combination plus the winner,
    and writes the ranked table to ``grid_results.csv`` in the working
    directory.

    Args:
        X: Feature frame (indexed positionally via ``.iloc``).
        y: Target series aligned with ``X``.
        cat_idx: Positions of the categorical columns in ``X``.
        seed: Seed for the fold splitter and for CatBoost.

    Returns:
        ``(best_params, res_df)``: the winning ``depth`` / ``learning_rate``
        / ``l2_leaf_reg`` dict and the full ranked results DataFrame.
    """
    # 8 combinations, enumerated depth -> learning_rate -> l2_leaf_reg
    param_grid = [
        {"depth": depth, "learning_rate": lr, "l2_leaf_reg": l2}
        for depth in (6, 8)
        for lr in (0.05, 0.1)
        for l2 in (3, 7)
    ]

    label_set = sorted(y.astype(str).unique().tolist())
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

    rows = []
    for params in param_grid:
        fold_acc = []
        fold_f1 = []
        fold_ll = []

        for train_rows, valid_rows in splitter.split(X, y):
            fit_pool = Pool(X.iloc[train_rows], y.iloc[train_rows], cat_features=cat_idx)
            eval_pool = Pool(X.iloc[valid_rows], y.iloc[valid_rows], cat_features=cat_idx)

            clf = CatBoostClassifier(
                **params,
                loss_function="Logloss",
                iterations=500,
                od_type="Iter",        # early stopping
                od_wait=50,
                random_seed=seed,
                thread_count=-1,
                verbose=False,
            )
            clf.fit(fit_pool, eval_set=eval_pool, use_best_model=True)

            labels_hat = clf.predict(eval_pool).astype(str).ravel()
            probs_hat = clf.predict_proba(eval_pool)
            truth = y.iloc[valid_rows].astype(str).values

            fold_acc.append(accuracy_score(truth, labels_hat))
            fold_f1.append(f1_score(truth, labels_hat, average="macro"))
            fold_ll.append(log_loss(truth, probs_hat, labels=label_set))

        mean_acc = float(np.mean(fold_acc))
        mean_f1 = float(np.mean(fold_f1))
        mean_ll = float(np.mean(fold_ll))

        # Key order here is the column order of grid_results.csv.
        summary = dict(params)
        summary["cv_acc_mean"] = mean_acc
        summary["cv_f1_macro_mean"] = mean_f1
        summary["cv_logloss_mean"] = mean_ll
        rows.append(summary)
        print(f"Done {params} | F1={mean_f1:.4f} ACC={mean_acc:.4f} LogLoss={mean_ll:.4f}")

    ranking = pd.DataFrame(rows)
    ranking = ranking.sort_values(
        by=["cv_f1_macro_mean", "cv_logloss_mean"],
        ascending=[False, True],
    )
    ranking = ranking.reset_index(drop=True)

    ranking.to_csv("grid_results.csv", index=False)
    print("Saved grid_results.csv")

    top = ranking.iloc[0].to_dict()
    best_params = {
        "depth": int(top["depth"]),
        "learning_rate": float(top["learning_rate"]),
        "l2_leaf_reg": float(top["l2_leaf_reg"]),
    }
    print("BEST PARAMS:", best_params)
    print("BEST CV:", {
        key: top[key]
        for key in ("cv_f1_macro_mean", "cv_acc_mean", "cv_logloss_mean")
    })
    return best_params, ranking


def main():
    """Run the whole pipeline: load, build features, tune, evaluate, predict.

    Reads the learn/test CSVs and the geographic reference tables through
    ``read_csv``, builds the train and test feature tables with
    ``build_table``, tunes CatBoost with ``grid_search_catboost``, prints a
    confusion matrix and classification report on a 20% stratified hold-out,
    then refits on all training rows and writes ``predictions.csv`` (columns
    ``UID`` and ``target``) to the working directory.
    """
    # load
    learn = read_csv("learn_dataset.csv")
    test = read_csv("test_dataset.csv")

    learn_job = read_csv("learn_dataset_job.csv")
    test_job = read_csv("test_dataset_job.csv")

    learn_js = read_csv("learn_dataset_JOB_SECURITY.csv")
    test_js = read_csv("test_dataset_JOB_SECURITY.csv")

    learn_pension = read_csv("learn_dataset_retired_pension.csv")
    test_pension = read_csv("test_dataset_retired_pension.csv")

    learn_sport = read_csv("learn_dataset_sport.csv")
    test_sport = read_csv("test_dataset_sport.csv")

    city_adm = read_csv("city_adm.csv")
    city_pop = read_csv("city_pop.csv")
    departments = read_csv("departments.csv")
    regions = read_csv("regions.csv")

    # Code-like columns are cast to string before merging so that join keys
    # share one dtype across tables.
    to_str_cols(learn, ["INSEE"])
    to_str_cols(test, ["INSEE"])
    to_str_cols(city_adm, ["INSEE", "DEP"])
    to_str_cols(city_pop, ["INSEE"])
    to_str_cols(departments, ["DEP", "Reg"])
    to_str_cols(regions, ["Reg"])
    to_str_cols(learn_job, ["job_dep"])
    to_str_cols(test_job, ["job_dep"])
    to_str_cols(learn_js, ["JOB_SECURITY"])
    to_str_cols(test_js, ["JOB_SECURITY"])
    to_str_cols(learn_sport, ["Sports"])
    to_str_cols(test_sport, ["Sports"])

    # merge all
    train_df = build_table(learn, learn_job, learn_js, learn_pension, learn_sport,
                           city_adm, city_pop, departments, regions)
    test_df = build_table(test, test_job, test_js, test_pension, test_sport,
                          city_adm, city_pop, departments, regions)

    # X/y (rows with a missing target become their own "__MISSING_TARGET__" class)
    y = train_df["target"].astype("string").fillna("__MISSING_TARGET__")
    X = train_df.drop(columns=["target", "UID"]).copy()

    test_uid = test_df["UID"].copy()
    # cat_idx below is positional and derived from X, so the test frame must
    # have the same columns in the same order. Selecting by X.columns enforces
    # that and raises KeyError if a training column is missing from the test set.
    X_test = test_df[X.columns].copy()

    # category features processing (modifies X and X_test in place)
    cat_cols, cat_idx = make_cat_features(X, X_test)

    print(f"Detected task: CLASSIFICATION (classes={y.nunique()})")
    print(f"Feature count: {X.shape[1]} | Cat features: {len(cat_cols)}")

    #  grid search (3-fold)
    best_params, _ = grid_search_catboost(X, y, cat_idx, seed=SEED)

    # other fixed training params
    base_params = dict(
        loss_function="Logloss",
        iterations=500,
        od_type="Iter",
        od_wait=50,
        random_seed=SEED,
        thread_count=-1,
        verbose=False,
        **best_params
    )

    #  confusion matrix (holdout 20%), seeded with SEED like everything else
    X_tr, X_va, y_tr, y_va = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )
    tr_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    va_pool = Pool(X_va, y_va, cat_features=cat_idx)

    # NOTE(review): the hold-out is also the eval_set used for best-iteration
    # selection, so this report may be slightly optimistic.
    cm_model = CatBoostClassifier(**base_params)
    cm_model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    pred_va = cm_model.predict(va_pool).astype(str).ravel()
    y_va_str = y_va.astype(str).values

    print("\nConfusion matrix:")
    print(confusion_matrix(y_va_str, pred_va))
    print("\nClassification report:")
    print(classification_report(y_va_str, pred_va))

    # final train + predict (no eval_set is passed for this fit)
    full_pool = Pool(X, y, cat_features=cat_idx)
    test_pool = Pool(X_test, cat_features=cat_idx)

    final_model = CatBoostClassifier(**base_params)
    final_model.fit(full_pool)

    pred_test = final_model.predict(test_pool).astype(str).ravel()
    out = pd.DataFrame({"UID": test_uid, "target": pred_test})
    out.to_csv("predictions.csv", index=False)
    print("\nSaved predictions.csv")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Detected task: CLASSIFICATION (classes=2)
Feature count: 32 | Cat features: 22
Done {'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 3} | F1=0.6874 ACC=0.6919 LogLoss=0.5757
Done {'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 7} | F1=0.6888 ACC=0.6934 LogLoss=0.5750
Done {'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 3} | F1=0.6881 ACC=0.6923 LogLoss=0.5747
Done {'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 7} | F1=0.6896 ACC=0.6939 LogLoss=0.5729
Done {'depth': 8, 'learning_rate': 0.05, 'l2_leaf_reg': 3} | F1=0.6877 ACC=0.6919 LogLoss=0.5741
Done {'depth': 8, 'learning_rate': 0.05, 'l2_leaf_reg': 7} | F1=0.6895 ACC=0.6939 LogLoss=0.5732
Done {'depth': 8, 'learning_rate': 0.1, 'l2_leaf_reg': 3} | F1=0.6892 ACC=0.6932 LogLoss=0.5743
Done {'depth': 8, 'learning_rate': 0.1, 'l2_leaf_reg': 7} | F1=0.6891 ACC=0.6930 LogLoss=0.5736
Saved grid_results.csv
BEST PARAMS: {'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 7.0}
BEST CV: {'cv_f1_macro_mean': 0.689581973476848, 'cv_acc_m