# Ensemble

## Explore data

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score, log_loss, average_precision_score
import lightgbm as lgb
from catboost import CatBoostClassifier
# import xgboost as xgb
import ast

# ── 1. Load ──────────────────────────────────────────────────────────────────
# train_df = pd.read_csv('Train.csv')
# test_df  = pd.read_csv('Test.csv')
train_df = pd.read_csv('/app/digicow/data/Train.csv')
prior_df = pd.read_csv('/app/digicow/data/Prior.csv')
test_df = pd.read_csv('/app/digicow/data/Test.csv')

prior_df['dt'] = pd.to_datetime(prior_df['training_day'])
train_df['dt'] = pd.to_datetime(train_df['training_day'])
test_df['dt']  = pd.to_datetime(test_df['training_day'])

# ── Overlap check ──────────────────────────────────────────
# Check overlap between Prior and Train sessions
overlap = prior_df.merge(
    train_df[['farmer_name', 'training_day']],
    on=['farmer_name', 'training_day']
)
print(f"Prior rows: {len(prior_df)}")
print(f"Train rows: {len(train_df)}")
print(f"Exact session overlaps (farmer + date): {len(overlap)}")
print(f"Overlap as % of Train: {len(overlap)/len(train_df)*100:.1f}%")

# 1. Trainer overlap
# prior_trainers = set(prior_df.trainer.dropna())
# train_trainers = set(train_df.trainer.dropna())
import ast

# def parse_trainer(s):
#     try:
#         parsed = ast.literal_eval(s)
#         if isinstance(parsed, list):
#             return parsed[0] if len(parsed) == 1 else str(parsed)
#         return s
#     except:
#         return s
def parse_trainer(s):
    try:
        parsed = ast.literal_eval(s)
        if isinstance(parsed, list):
            return parsed[0]  # always take first trainer, even for multi-trainer rows
        return s
    except:
        return s
train_df['trainer'] = train_df['trainer'].apply(parse_trainer)
test_df['trainer']  = test_df['trainer'].apply(parse_trainer)

# Recompute AFTER parsing
prior_trainers = set(prior_df.trainer.dropna())
train_trainers = set(train_df.trainer.dropna())

print("=== TRAINERS ===")
print(f"Prior: {prior_trainers}")
print(f"Train: {train_trainers}")
print(f"Overlap: {prior_trainers & train_trainers}")


# 2. Date ranges
print("\n=== DATES ===")
print(f"Prior:  {prior_df.dt.min().date()} → {prior_df.dt.max().date()}")
print(f"Train:  {train_df.dt.min().date()} → {train_df.dt.max().date()}")
print(f"Test:   {test_df.dt.min().date()}  → {test_df.dt.max().date()}")
print(f"Prior rows overlapping test period (>= 2025-05-02): {(prior_df.dt >= '2025-05-02').sum():,}")

# 3. Adoption rates
print("\n=== ADOPTION RATES ===")
for c in ['adopted_within_07_days','adopted_within_90_days','adopted_within_120_days']:
    print(f"  {c}: Prior={prior_df[c].mean():.4f}  Train={train_df[c].mean():.4f}")


Prior rows: 44882
Train rows: 13536
Exact session overlaps (farmer + date): 0
Overlap as % of Train: 0.0%
=== TRAINERS ===
Prior: {'TRA_hyodnntj', 'TRA_ubcgvofe', 'TRA_szrwyfzz', 'TRA_rkvyofbh', 'TRA_gertumxc', 'TRA_kkzpfdtu', 'TRA_dttdgplk', 'TRA_twwcfcum', 'TRA_suiifsur'}
Train: {'TRA_hyodnntj', 'TRA_ubcgvofe', 'TRA_szrwyfzz', 'TRA_rkvyofbh', 'TRA_gertumxc', 'TRA_kkzpfdtu', 'TRA_dttdgplk', 'TRA_twwcfcum', 'TRA_suiifsur'}
Overlap: {'TRA_hyodnntj', 'TRA_ubcgvofe', 'TRA_szrwyfzz', 'TRA_rkvyofbh', 'TRA_gertumxc', 'TRA_kkzpfdtu', 'TRA_dttdgplk', 'TRA_twwcfcum', 'TRA_suiifsur'}

=== DATES ===
Prior:  2024-01-03 → 2025-12-11
Train:  2024-01-03 → 2025-04-12
Test:   2025-05-02  → 2025-12-12
Prior rows overlapping test period (>= 2025-05-02): 6,546

=== ADOPTION RATES ===
  adopted_within_07_days: Prior=0.0148  Train=0.0113
  adopted_within_90_days: Prior=0.0340  Train=0.0158
  adopted_within_120_days: Prior=0.0465  Train=0.0223


In [4]:
# # unique farmers
# 1. Train has no repeat farmers
print("Train unique farmers:", train_df['farmer_name'].nunique(), "/ total rows:", len(train_df))

# # 2. Prior has repeat farmers (multiple sessions)
print("\nPrior unique farmers:", prior_df['farmer_name'].nunique(), "/ total rows:", len(prior_df))

# # 3. topics_list + training_day explain most repeats — show one farmer
top_farmer = prior_df['farmer_name'].value_counts().index[0]
cols = ['training_day', 'topics_list', 'adopted_within_07_days', 'adopted_within_90_days', 'adopted_within_120_days']
display(prior_df[prior_df['farmer_name'] == top_farmer][cols])

prior_df.groupby(['farmer_name', 'training_day', 'topics_list']).size().value_counts()
# So the "4 people in a group session" theory doesn't hold cleanly. It's more likely just noisy data collection — same record entered multiple times with slight variations.

# # 4. True duplicates: same farmer + date + topic, still multiple rows
group_cols = ['farmer_name', 'training_day', 'topics_list']
grp_sizes = prior_df.groupby(group_cols).size()
print("\nGroups with >1 row (true duplicates):", (grp_sizes > 1).sum())

print("Total groups:", len(grp_sizes))
print("Groups with >1 row:", (grp_sizes > 1).sum())
print("% duplicated:", round((grp_sizes > 1).sum() / len(grp_sizes) * 100, 2))

# # 5. How many of those have conflicting labels?
for col in ['adopted_within_07_days', 'adopted_within_90_days', 'adopted_within_120_days']:
    n = (prior_df.groupby(group_cols)[col].nunique() > 1).sum()
    print(f"  Conflicting {col}: {n}")

# # # 6. Test duplicates (ignoring ID)
# feat_cols = [c for c in test_df.columns if c != 'ID']
# print("\nTest duplicate rows (ignoring ID):", test_df.duplicated(subset=feat_cols).sum(), "/ total:", len(test_df))

Train unique farmers: 13536 / total rows: 13536

Prior unique farmers: 6719 / total rows: 44882


Unnamed: 0,training_day,topics_list,adopted_within_07_days,adopted_within_90_days,adopted_within_120_days
4690,2024-02-07,['How To Rear A Calf With Unga Products'],0,0,0
4691,2024-02-07,['How To Rear A Calf With Unga Products'],0,0,0
4692,2024-02-07,['How To Rear A Calf With Unga Products'],0,0,0
4693,2024-02-07,['How To Rear A Calf With Unga Products'],0,0,0
4694,2024-02-07,['Transition Cow Management- Care For Your Cow...,0,0,0
...,...,...,...,...,...
40225,2025-06-05,['Importance Of Vaccinations And Record'],0,1,1
40226,2025-06-05,['Importance Of Vaccinating Against East Coast...,0,0,0
40227,2025-06-05,['Importance Of Vaccinating Against East Coast...,0,0,0
40228,2025-06-05,['Importance Of Vaccinating Against East Coast...,0,0,0



Groups with >1 row (true duplicates): 3330
Total groups: 39674
Groups with >1 row: 3330
% duplicated: 8.39
  Conflicting adopted_within_07_days: 266
  Conflicting adopted_within_90_days: 429
  Conflicting adopted_within_120_days: 560


In [5]:
# PRIOR: 
# 
# 1. Aggregate duplicates by mean (handles both consistent + conflicting)
prior_clean = prior_df.groupby(['farmer_name', 'training_day', 'topics_list']).agg({
    'adopted_within_07_days': 'mean',
    'adopted_within_90_days': 'mean', 
    'adopted_within_120_days': 'mean',
    # keep other cols
    'county': 'first', 'gender': 'first', 'age': 'first'
}).reset_index()



# 2. Parse topics and explode
prior_clean['topics_parsed'] = prior_clean['topics_list'].apply(ast.literal_eval)
prior_clean['training_day'] = pd.to_datetime(prior_clean['training_day'])
prior_exploded = prior_clean.explode('topics_parsed')

# 3a. For TRAIN — only use Prior history before each row's training date (avoid leakage)
train_df['training_day'] = pd.to_datetime(train_df['training_day'])

prior_for_train = prior_exploded.merge(
    train_df[['farmer_name', 'training_day']].rename(columns={'training_day': 'current_date'}),
    on='farmer_name', how='inner'
)
prior_for_train = prior_for_train[prior_for_train['training_day'] < prior_for_train['current_date']]

farmer_history_train = prior_for_train.groupby('farmer_name').agg(
    n_prior_sessions    = ('training_day', 'nunique'),
    n_prior_topics      = ('topics_parsed', 'nunique'),
    hist_adopt_rate_07  = ('adopted_within_07_days', 'mean'),
    hist_adopt_rate_90  = ('adopted_within_90_days', 'mean'),
    hist_adopt_rate_120 = ('adopted_within_120_days', 'mean'),
    last_training_day   = ('training_day', 'max'),
).reset_index()

train_df = train_df.merge(farmer_history_train, on='farmer_name', how='left')


# 3b. For TEST — do same for test
test_df['training_day'] = pd.to_datetime(test_df['training_day'])

prior_for_test = prior_exploded.merge(
    test_df[['farmer_name', 'training_day']].rename(columns={'training_day': 'current_date'}),
    on='farmer_name', how='inner'
)
prior_for_test = prior_for_test[prior_for_test['training_day'] < prior_for_test['current_date']]

farmer_history_test = prior_for_test.groupby('farmer_name').agg(
    n_prior_sessions    = ('training_day', 'nunique'),
    n_prior_topics      = ('topics_parsed', 'nunique'),
    hist_adopt_rate_07  = ('adopted_within_07_days', 'mean'),
    hist_adopt_rate_90  = ('adopted_within_90_days', 'mean'),
    hist_adopt_rate_120 = ('adopted_within_120_days', 'mean'),
    last_training_day   = ('training_day', 'max'),
).reset_index()

test_df = test_df.merge(farmer_history_test, on='farmer_name', how='left')



history_cols = ['n_prior_sessions', 'n_prior_topics', 'hist_adopt_rate_07', 
                'hist_adopt_rate_90', 'hist_adopt_rate_120']

train_df[history_cols] = train_df[history_cols].fillna(0)
test_df[history_cols]  = test_df[history_cols].fillna(0)

# last_training_day NaN means no history — fill with a sentinel or leave for recency feature later
# Recency feature
train_df['days_since_last_training'] = (
    train_df['training_day'] - pd.to_datetime(train_df['last_training_day'])
).dt.days.fillna(9999)

test_df['days_since_last_training'] = (
    test_df['training_day'] - pd.to_datetime(test_df['last_training_day'])
).dt.days.fillna(9999)

train_df.drop(columns=['last_training_day'], inplace=True)
test_df.drop(columns=['last_training_day'], inplace=True)

print("Train farmers with prior history:", train_df['n_prior_sessions'].gt(0).sum(), '/', len(train_df))
print("Test farmers with prior history: ", test_df['n_prior_sessions'].gt(0).sum(), '/', len(test_df))

train_farmers_in_prior = train_df['farmer_name'].isin(prior_df['farmer_name'])
print("Train farmers appearing in Prior:", train_farmers_in_prior.sum(), '/', len(train_df))


Train farmers with prior history: 3193 / 13536
Test farmers with prior history:  3526 / 5621
Train farmers appearing in Prior: 3193 / 13536


In [6]:
# train_df.columns
print(prior_df['training_day'].max(), train_df['training_day'].min())
# Check date overlap between Prior and Train/Test
print("Prior range:", prior_df['training_day'].min(), "→", prior_df['training_day'].max())
print("Train range:", train_df['training_day'].min(), "→", train_df['training_day'].max())
print("Test range:", test_df['training_day'].min(), "→", test_df['training_day'].max())

2025-12-11 2024-01-03 00:00:00
Prior range: 2024-01-03 → 2025-12-11
Train range: 2024-01-03 00:00:00 → 2025-04-12 00:00:00
Test range: 2025-05-02 00:00:00 → 2025-12-12 00:00:00


In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score, log_loss, average_precision_score
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb
import ast


# ── Group-level adoption rates from Prior ─────────────────────────────────────
# for grp in ['trainer', 'county', 'subcounty', 'ward']:
#     for days in ['07', '90', '120']:
#         col = f'adopted_within_{days}_days'
#         rate_map = prior_df.groupby(grp)[col].mean()
#         feat_name = f'{grp}_prior_adopt_{days}d'
#         global_mean = prior_df[col].mean()
#         train_df[feat_name] = train_df[grp].map(rate_map).fillna(global_mean)
#         test_df[feat_name]  = test_df[grp].map(rate_map).fillna(global_mean)

# group_rate_cols = [f'{grp}_prior_adopt_{days}d' 
#                    for grp in ['trainer', 'county', 'subcounty', 'ward']
#                    for days in ['07', '90', '120']]

for grp in ['trainer', 'county', 'subcounty', 'ward', 'group_name']:  # ← add group_name
    for days in ['07', '90', '120']:
        col = f'adopted_within_{days}_days'
        rate_map = prior_df.groupby(grp)[col].mean()
        feat_name = f'{grp}_prior_adopt_{days}d'
        global_mean = prior_df[col].mean()
        train_df[feat_name] = train_df[grp].map(rate_map).fillna(global_mean)
        test_df[feat_name]  = test_df[grp].map(rate_map).fillna(global_mean)

group_rate_cols = [f'{grp}_prior_adopt_{days}d' 
                   for grp in ['trainer', 'county', 'subcounty', 'ward', 'group_name']  # ← here too
                   for days in ['07', '90', '120']]


# ── 2. Parse topics ──────────────────────────────────────────────────────────
def parse_topics(s):
    try:
        parsed = ast.literal_eval(s)
        if isinstance(parsed, list):
            flat = []
            for item in parsed:
                if isinstance(item, list): flat.extend(item)
                else: flat.append(item)
            return flat
    except:
        return []


mlb = MultiLabelBinarizer()
topics_train = pd.DataFrame(
    mlb.fit_transform(train_df['topics_list'].apply(parse_topics)),
    columns=mlb.classes_, index=train_df.index
)
topics_test = pd.DataFrame(
    mlb.transform(test_df['topics_list'].apply(parse_topics)),
    columns=mlb.classes_, index=test_df.index
)
train_df = pd.concat([train_df, topics_train], axis=1).drop(columns=['topics_list'])
test_df  = pd.concat([test_df,  topics_test],  axis=1).drop(columns=['topics_list'])

# ['Importance Of Vaccination'] how common is it in test? as it doesnt appear in train








In [8]:


# ── Temporal split ────────────────────────────────────────────────────────────
train_df['dt'] = pd.to_datetime(train_df['training_day'])
test_df['dt']  = pd.to_datetime(test_df['training_day'])

cutoff = pd.Timestamp('2025-01-01')
df_val = train_df[train_df['dt'] >= cutoff].copy()
df_tr  = train_df[train_df['dt'] <  cutoff].copy()
print(f"Train: {len(df_tr):,} | Val: {len(df_val):,}")

print(f"Train: {len(df_tr):,} | Val: {len(df_val):,}")


Train: 11,318 | Val: 2,218
Train: 11,318 | Val: 2,218


In [9]:
# ── Features ──────────────────────────────────────────────────────────────────
cat_cols     = ['gender', 'registration', 'age', 'trainer', 'county', 'subcounty', 'ward']
topic_cols   = list(mlb.classes_)
# numeric_cols = ['belong_to_cooperative', 'has_topic_trained_on', #'topic_overlap_frac',
#                 'n_prior_sessions', 'n_prior_topics',
#                 'hist_adopt_rate_07', 'hist_adopt_rate_90', 'hist_adopt_rate_120',
#                 'days_since_last_training'] + topic_cols
numeric_cols = ['belong_to_cooperative', 'has_topic_trained_on',
                'n_prior_sessions', 'n_prior_topics',
                'hist_adopt_rate_07', 'hist_adopt_rate_90', 'hist_adopt_rate_120',
                'days_since_last_training'] + group_rate_cols + topic_cols
feature_cols = cat_cols + numeric_cols

In [10]:
# w/tuning
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import json


# LightGBM needs categoricals as 'category' dtype
def prep_lgb(df):
    X = df[feature_cols].copy()
    for c in cat_cols:
        X[c] = X[c].astype('category')
    return X

X_tr_lgb   = prep_lgb(df_tr)
X_val_lgb  = prep_lgb(df_val)
X_test_lgb = prep_lgb(test_df)

# CatBoost takes raw strings — no encoding needed
X_tr_cat   = df_tr[feature_cols].fillna('missing')
X_val_cat  = df_val[feature_cols].fillna('missing')
X_test_cat = test_df[feature_cols].fillna('missing')

cat_feature_indices = [feature_cols.index(c) for c in cat_cols]

targets = ['07', '90', '120']
lgb_preds  = {}
cat_preds  = {}
xgb_preds = {}
ens_preds  = {}

# # # ── Tuning cell (run once, save results) ─────────────────────────────────────
# def optimize_weight(cat_prob, lgb_prob, y_true):
#     def objective(trial):
#         w = trial.suggest_float('cat_weight', 0.0, 1.0)
#         return log_loss(y_true, w * cat_prob + (1-w) * lgb_prob)
#     # study = optuna.create_study(direction='minimize')
#     study = optuna.create_study(
#         direction='minimize',
#         sampler=optuna.samplers.TPESampler(n_startup_trials=20, seed=42)
#     )
#     study.optimize(objective, n_trials=100)
#     return study.best_params['cat_weight']

# def tune_lgb(X_tr, y_tr, X_val, y_val, scale_pos):
#     def objective(trial):
#         params = {
#             'objective': 'binary', 'metric': 'binary_logloss',
#             'scale_pos_weight': scale_pos, 'verbose': -1, 'seed': 42,
#             'learning_rate':     trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
#             'num_leaves':        trial.suggest_int('num_leaves', 16, 128),
#             'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
#             'feature_fraction':  trial.suggest_float('feature_fraction', 0.5, 1.0),
#             'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.3, 1.0),
#             'bagging_freq':      trial.suggest_int('bagging_freq', 1, 10),
#             'reg_alpha':         trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
#             'reg_lambda':        trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
#         }
#         ds_tr  = lgb.Dataset(X_tr, label=y_tr)
#         ds_val = lgb.Dataset(X_val, label=y_val, reference=ds_tr)
#         model  = lgb.train(params, ds_tr, num_boost_round=500, valid_sets=[ds_val],
#                            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)])
#         return log_loss(y_val, model.predict(X_val))
#     # study = optuna.create_study(direction='minimize')
#     study = optuna.create_study(
#         direction='minimize',
#         sampler=optuna.samplers.TPESampler(n_startup_trials=20, seed=42)
#     )
#     study.optimize(objective, n_trials=100, show_progress_bar=True)
#     return study.best_params

# def tune_cat(X_tr, y_tr, X_val, y_val, cat_feature_indices):
#     def objective(trial):
#         model = CatBoostClassifier(
#             iterations=2000, random_seed=42, verbose=0,
#             cat_features=cat_feature_indices, early_stopping_rounds=50,
#             loss_function='Logloss', eval_metric='Logloss',
#             learning_rate=       trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
#             depth=               trial.suggest_int('depth', 4, 8),
#             l2_leaf_reg=         trial.suggest_float('l2_leaf_reg', 1.0, 20.0),
#             bagging_temperature= trial.suggest_float('bagging_temperature', 0.0, 2.0),
#             random_strength=     trial.suggest_float('random_strength', 0.0, 2.0),
#             border_count=        trial.suggest_int('border_count', 32, 255),
#         )
#         model.fit(X_tr, y_tr, eval_set=(X_val, y_val))
#         return log_loss(y_val, model.predict_proba(X_val)[:, 1])
#     # study = optuna.create_study(direction='minimize')
#     study = optuna.create_study(
#         direction='minimize',
#         sampler=optuna.samplers.TPESampler(n_startup_trials=20, seed=42)
#     )
#     study.optimize(objective, n_trials=50, show_progress_bar=True)
#     return study.best_params

# best_params  = {}
# best_weights = {}

# for days in targets:
#     col = f'adopted_within_{days}_days'
#     y_tr  = df_tr[col].values
#     y_val = df_val[col].values
#     prevalence = y_tr.mean()
#     scale_pos  = (1 - prevalence) / prevalence

#     print(f"\n── Tuning {days}d ──")
#     lgb_best = tune_lgb(X_tr_lgb, y_tr, X_val_lgb, y_val, scale_pos)
#     cat_best = tune_cat(X_tr_cat, y_tr, X_val_cat, y_val, cat_feature_indices)
#     best_params[days] = {'lgb': lgb_best, 'cat': cat_best}

#     # Retrain with best params to get val probs for weight tuning
#     ds_tr  = lgb.Dataset(X_tr_lgb, label=y_tr)
#     ds_val = lgb.Dataset(X_val_lgb, label=y_val, reference=ds_tr)
#     lgb_model = lgb.train(
#         {'objective':'binary','metric':'binary_logloss','scale_pos_weight':scale_pos,
#          'verbose':-1,'seed':42, **lgb_best},
#         ds_tr, num_boost_round=500, valid_sets=[ds_val],
#         callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
#     )
#     cat_model = CatBoostClassifier(
#         iterations=2000, loss_function='Logloss', eval_metric='Logloss',
#         cat_features=cat_feature_indices, early_stopping_rounds=50,
#         random_seed=42, verbose=0, **cat_best
#     )
#     cat_model.fit(X_tr_cat, y_tr, eval_set=(X_val_cat, y_val))

#     # best_w = optimize_weight(cat_model.predict_proba(X_val_cat)[:,1], lgb_model.predict(X_val_lgb), y_val)
#     lgb_val_prob_tune = lgb_model.predict(X_val_lgb)
#     cat_val_prob_tune = cat_model.predict_proba(X_val_cat)[:, 1]
#     print(f"  LGB  AUC={roc_auc_score(y_val, lgb_val_prob_tune):.4f}  LL={log_loss(y_val, lgb_val_prob_tune):.4f}")
#     print(f"  CAT  AUC={roc_auc_score(y_val, cat_val_prob_tune):.4f}  LL={log_loss(y_val, cat_val_prob_tune):.4f}")
#     best_w = optimize_weight(cat_val_prob_tune, lgb_val_prob_tune, y_val)
#     best_weights[days] = best_w
#     print(f"{days}d → LGB: {lgb_best} | CAT: {cat_best} | CAT weight: {best_w:.3f}")

# with open('/app/digicow/best_params.json', 'w') as f:
#     json.dump({'params': best_params, 'weights': best_weights}, f, indent=2)
# print("Saved best_params.json")




with open('/app/digicow/best_params.json') as f:
    saved = json.load(f)
best_params  = saved['params']
best_weights = saved['weights']
print(best_params, best_weights)

# ── 5. Train per target (with tuned params) ───────────────────────────────────
for days in targets:
    col = f'adopted_within_{days}_days'
    y_tr  = df_tr[col].values
    y_val = df_val[col].values
    prevalence = y_tr.mean()
    scale_pos  = (1 - prevalence) / prevalence

    # ── LightGBM ──
    lgb_tr  = lgb.Dataset(X_tr_lgb, label=y_tr)
    lgb_val_ds = lgb.Dataset(X_val_lgb, label=y_val, reference=lgb_tr)
    params = {
        'objective': 'binary', 'metric': ['binary_logloss', 'auc'],
        'scale_pos_weight': scale_pos, 'verbose': -1, 'seed': 42,
        **best_params[days]['lgb']
    }
    lgb_model = lgb.train(params, lgb_tr, num_boost_round=500,
        valid_sets=[lgb_val_ds],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)])
    lgb_val_prob  = lgb_model.predict(X_val_lgb)
    lgb_test_prob = lgb_model.predict(X_test_lgb)
    lgb_preds[days] = lgb_test_prob
    print(f"[LGB {days}d]  AUC={roc_auc_score(y_val,lgb_val_prob):.4f}  LogLoss={log_loss(y_val,lgb_val_prob):.4f}  PR-AUC={average_precision_score(y_val,lgb_val_prob):.4f}")

    # ── CatBoost ──
    cat_model = CatBoostClassifier(
        iterations=2000, loss_function='Logloss', eval_metric='Logloss',
        cat_features=cat_feature_indices, early_stopping_rounds=50,
        random_seed=42, verbose=0, **best_params[days]['cat']
    )
    cat_model.fit(X_tr_cat, y_tr, eval_set=(X_val_cat, y_val))
    cat_val_prob  = cat_model.predict_proba(X_val_cat)[:, 1]
    cat_test_prob = cat_model.predict_proba(X_test_cat)[:, 1]
    cat_preds[days] = cat_test_prob
    print(f"[CAT {days}d]  AUC={roc_auc_score(y_val,cat_val_prob):.4f}  LogLoss={log_loss(y_val,cat_val_prob):.4f}  PR-AUC={average_precision_score(y_val,cat_val_prob):.4f}")

    # ── Ensemble (tuned weight) ──
    best_w = best_weights[days]
    ens_val_prob  = best_w * cat_val_prob  + (1-best_w) * lgb_val_prob
    ens_test_prob = best_w * cat_test_prob + (1-best_w) * lgb_test_prob
    ens_preds[days] = ens_test_prob
    print(f"[ENS {days}d]  AUC={roc_auc_score(y_val,ens_val_prob):.4f}  LogLoss={log_loss(y_val,ens_val_prob):.4f}  PR-AUC={average_precision_score(y_val,ens_val_prob):.4f}  CAT_w={best_w:.3f}")
    print()

  from .autonotebook import tqdm as notebook_tqdm


{'07': {'lgb': {'learning_rate': 0.01277588617013218, 'num_leaves': 69, 'min_child_samples': 66, 'feature_fraction': 0.7931799823039588, 'bagging_fraction': 0.38580794509945804, 'bagging_freq': 10, 'reg_alpha': 0.0004814490654559664, 'reg_lambda': 0.1359271205257805}, 'cat': {'learning_rate': 0.05192523263427517, 'depth': 6, 'l2_leaf_reg': 8.210394746047953, 'bagging_temperature': 0.031073459271315294, 'random_strength': 1.3764590239003986, 'border_count': 67}}, '90': {'lgb': {'learning_rate': 0.019998939263562214, 'num_leaves': 54, 'min_child_samples': 12, 'feature_fraction': 0.8686223981720478, 'bagging_fraction': 0.7275940125211089, 'bagging_freq': 7, 'reg_alpha': 0.0014867695769971536, 'reg_lambda': 0.011031682276024198}, 'cat': {'learning_rate': 0.062419052252125025, 'depth': 7, 'l2_leaf_reg': 1.29008959776147, 'bagging_temperature': 0.027956026956829527, 'random_strength': 0.914834670069054, 'border_count': 59}}, '120': {'lgb': {'learning_rate': 0.012744638435231487, 'num_leaves'

In [11]:
# ── Submission ────────────────────────────────────────────────────────────────
ss = pd.read_csv('/app/digicow/data/SampleSubmission.csv')[['ID']]

for days in targets:
    ss[f'Target_{days}_AUC']     = ens_preds[days]
    ss[f'Target_{days}_LogLoss'] = ens_preds[days]

col_order = ['ID',
             'Target_07_AUC', 'Target_90_AUC', 'Target_120_AUC',
             'Target_07_LogLoss', 'Target_90_LogLoss', 'Target_120_LogLoss']

ss = ss[col_order]
ss.to_csv('/app/digicow/submission.csv', index=False)
print(f"Saved submission.csv — {ss.shape[0]} rows")
print(ss.head(3))

Saved submission.csv — 5621 rows
          ID  Target_07_AUC  Target_90_AUC  Target_120_AUC  Target_07_LogLoss  \
0  ID_LEG1GM       0.006235       0.005746        0.013282           0.006235   
1  ID_1UKOKW       0.005378       0.002327        0.011619           0.005378   
2  ID_U5H2YK       0.040670       0.021337        0.031755           0.040670   

   Target_90_LogLoss  Target_120_LogLoss  
0           0.005746            0.013282  
1           0.002327            0.011619  
2           0.021337            0.031755  


In [None]:
## W/O tuning

# LightGBM needs categoricals as 'category' dtype
def prep_lgb(df):
    X = df[feature_cols].copy()
    for c in cat_cols:
        X[c] = X[c].astype('category')
    return X

X_tr_lgb   = prep_lgb(df_tr)
X_val_lgb  = prep_lgb(df_val)
X_test_lgb = prep_lgb(test_df)

# CatBoost takes raw strings — no encoding needed
X_tr_cat   = df_tr[feature_cols].fillna('missing')
X_val_cat  = df_val[feature_cols].fillna('missing')
X_test_cat = test_df[feature_cols].fillna('missing')

cat_feature_indices = [feature_cols.index(c) for c in cat_cols]

targets = ['07', '90', '120']
lgb_preds  = {}
cat_preds  = {}
xgb_preds = {}
ens_preds  = {}


# raise SystemExit

# ── 5. Train per target ──────────────────────────────────────────────────────
for days in targets:
    col = f'adopted_within_{days}_days'
    y_tr  = df_tr[col].values
    y_val = df_val[col].values

    prevalence = y_tr.mean()
    scale_pos  = (1 - prevalence) / prevalence  # for imbalance

    # ── LightGBM ──
    lgb_tr  = lgb.Dataset(X_tr_lgb,  label=y_tr)
    lgb_val = lgb.Dataset(X_val_lgb, label=y_val, reference=lgb_tr)

    params = {
        'objective':        'binary',
        'metric':           ['binary_logloss', 'auc'],
        'scale_pos_weight': scale_pos,
        'learning_rate':    0.05,
        'num_leaves':       31,
        'min_child_samples': 20,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq':     5,
        'verbose':          -1,
        'seed':             42,
    }
    lgb_model = lgb.train(
        params,
        lgb_tr,
        num_boost_round=500,
        valid_sets=[lgb_val],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
    )
    lgb_val_prob  = lgb_model.predict(X_val_lgb)
    lgb_test_prob = lgb_model.predict(X_test_lgb)
    lgb_preds[days] = lgb_test_prob

    auc   = roc_auc_score(y_val, lgb_val_prob)
    ll    = log_loss(y_val, lgb_val_prob)
    prauc = average_precision_score(y_val, lgb_val_prob)
    print(f"[LGB {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}  Naive-PR={prevalence:.4f}")

    # ── CatBoost ──
    cat_model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        loss_function='Logloss',
        eval_metric='Logloss',  # optimise for LogLoss (75% of score)
        l2_leaf_reg=5,
        cat_features=cat_feature_indices,
        early_stopping_rounds=50,
        random_seed=42,
        verbose=0,
    )
    cat_model.fit(
        X_tr_cat, y_tr,
        eval_set=(X_val_cat, y_val),
    )
    cat_val_prob  = cat_model.predict_proba(X_val_cat)[:, 1]
    cat_test_prob = cat_model.predict_proba(X_test_cat)[:, 1]
    cat_preds[days] = cat_test_prob

    auc   = roc_auc_score(y_val, cat_val_prob)
    ll    = log_loss(y_val, cat_val_prob)
    prauc = average_precision_score(y_val, cat_val_prob)
    print(f"[CAT {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}")

    
    # ── Ensemble (simple average) ──
    ens_val_prob  = (lgb_val_prob + cat_val_prob) / 2
    ens_test_prob = (lgb_test_prob + cat_test_prob) / 2
    ens_preds[days] = ens_test_prob

    auc   = roc_auc_score(y_val, ens_val_prob)
    ll    = log_loss(y_val, ens_val_prob)
    prauc = average_precision_score(y_val, ens_val_prob)
    print(f"[ENS {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}")
    print()


# # ── 6. Submission ────────────────────────────────────────────────────────────
# ss = pd.read_csv('SampleSubmission.csv')[['ID']]
# for days in targets:
#     ss[f'Target_{days}_AUC']     = ens_preds[days]
#     ss[f'Target_{days}_LogLoss'] = ens_preds[days]

# col_order = ['ID',
#              'Target_07_AUC', 'Target_90_AUC', 'Target_120_AUC',
#              'Target_07_LogLoss', 'Target_90_LogLoss', 'Target_120_LogLoss']
# ss[col_order].to_csv('submission_ensemble.csv', index=False)
# print("Saved submission_ensemble.csv")

[LGB 07d]  AUC=0.8320  LogLoss=0.0671  PR-AUC=0.1443  Naive-PR=0.0106
[CAT 07d]  AUC=0.9068  LogLoss=0.0644  PR-AUC=0.1757
[ENS 07d]  AUC=0.9134  LogLoss=0.0632  PR-AUC=0.1916

[LGB 90d]  AUC=0.7277  LogLoss=0.0951  PR-AUC=0.0572  Naive-PR=0.0160
[CAT 90d]  AUC=0.9001  LogLoss=0.0644  PR-AUC=0.1650
[ENS 90d]  AUC=0.8500  LogLoss=0.0779  PR-AUC=0.1334

[LGB 120d]  AUC=0.8882  LogLoss=0.0751  PR-AUC=0.0905  Naive-PR=0.0237
[CAT 120d]  AUC=0.9230  LogLoss=0.0629  PR-AUC=0.2002
[ENS 120d]  AUC=0.9007  LogLoss=0.0679  PR-AUC=0.1825



## OLD w/ Prior concatenated

In [None]:
# # OLD ---- adds prior to train
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.metrics import roc_auc_score, log_loss, average_precision_score
# import lightgbm as lgb
# from catboost import CatBoostClassifier
# # import xgboost as xgb
# import ast

# # ── 1. Load ──────────────────────────────────────────────────────────────────
# # train_df = pd.read_csv('Train.csv')
# # test_df  = pd.read_csv('Test.csv')
# train_df = pd.read_csv('/app/digicow/data/Train.csv')
# prior_df = pd.read_csv('/app/digicow/data/Prior.csv')
# test_df = pd.read_csv('/app/digicow/data/Test.csv')

# # train_df = pd.concat([prior_df, train_df], ignore_index=True) # PRIOR

# # raise SystemExit("Stopping here to check data.")

# # ── 2. Parse topics ──────────────────────────────────────────────────────────
# def parse_topics(s):
#     try:
#         parsed = ast.literal_eval(s)
#         if isinstance(parsed, list):
#             flat = []
#             for item in parsed:
#                 if isinstance(item, list): flat.extend(item)
#                 else: flat.append(item)
#             return flat
#     except:
#         return []
        
# full_train = pd.concat([prior_df, train_df], ignore_index=True)

# mlb = MultiLabelBinarizer()
# topics_train = pd.DataFrame(
#     # mlb.fit_transform(train_df['topics_list'].apply(parse_topics)),
#     # columns=mlb.classes_, index=train_df.index
#     mlb.fit_transform(full_train['topics_list'].apply(parse_topics)),
#     columns=mlb.classes_, index=full_train.index
# )
# topics_test = pd.DataFrame(
#     mlb.transform(test_df['topics_list'].apply(parse_topics)),
#     columns=mlb.classes_, index=test_df.index
# )
# # train_df = pd.concat([train_df, topics_train], axis=1).drop(columns=['topics_list'])
# full_train = pd.concat([full_train, topics_train], axis=1).drop(columns=['topics_list'])
# test_df  = pd.concat([test_df,  topics_test],  axis=1).drop(columns=['topics_list'])

# # raise SystemExit("Stopping here to check data.")

# # # ── 3. Temporal split ────────────────────────────────────────────────────────
# # train_df['dt'] = pd.to_datetime(train_df['training_day'])
# # test_df['dt']  = pd.to_datetime(test_df['training_day'])
# # cutoff = pd.Timestamp('2025-03-01')

# # df_tr  = train_df[train_df['dt'] < cutoff].copy()
# # df_val = train_df[train_df['dt'] >= cutoff].copy()
# # print(f"Train: {len(df_tr):,} | Val: {len(df_val):,}")

# # ── 3. Temporal split ────────────────────────────────────────────────────────
# # train_df['dt'] = pd.to_datetime(train_df['training_day'])
# full_train['dt'] = pd.to_datetime(full_train['training_day'])
# test_df['dt']  = pd.to_datetime(test_df['training_day'])
# # prior_df['dt'] = pd.to_datetime(prior_df['training_day'])

# prior_encoded = full_train.iloc[:len(prior_df)].copy()
# train_encoded = full_train.iloc[len(prior_df):].copy()

# # cutoff = pd.Timestamp('2025-03-01')
# # df_val = train_df[train_df['dt'] >= cutoff].copy()
# # df_tr  = pd.concat([prior_df, train_df[train_df['dt'] < cutoff]], ignore_index=True)
# # print(f"Train: {len(df_tr):,} | Val: {len(df_val):,}")

# cutoff = pd.Timestamp('2025-01-01')
# df_val = train_encoded[train_encoded['dt'] >= cutoff].copy()
# df_tr  = pd.concat([prior_encoded, train_encoded[train_encoded['dt'] < cutoff]], ignore_index=True)
# print(f"Train: {len(df_tr):,} | Val: {len(df_val):,}")

# # # ── 3b. Target encoding (computed from df_tr only, no leakage) ────────────
# # targets = ['07', '90', '120']
# # te_cols = []
# # for days in targets:
# #     col = f'adopted_within_{days}_days'
# #     for grp in ['trainer', 'county', 'subcounty', 'ward']:
# #         rate_map    = df_tr.groupby(grp)[col].mean()
# #         global_mean = df_tr[col].mean()
# #         col_name    = f'{grp}_adopt_{days}d'
# #         df_tr[col_name]   = df_tr[grp].map(rate_map).fillna(global_mean)
# #         df_val[col_name]  = df_val[grp].map(rate_map).fillna(global_mean)
# #         test_df[col_name] = test_df[grp].map(rate_map).fillna(global_mean)
# #         te_cols.append(col_name)

# # raise SystemExit("Stopping here to check data.")
# # ── 4. Features ──────────────────────────────────────────────────────────────
# cat_cols     = ['gender', 'registration', 'age', 'trainer', 'county', 'subcounty', 'ward']
# topic_cols   = list(mlb.classes_)
# numeric_cols = ['belong_to_cooperative', 'has_topic_trained_on'] + topic_cols #+ te_cols
# feature_cols = cat_cols + numeric_cols

# # LightGBM needs categoricals as 'category' dtype
# def prep_lgb(df):
#     X = df[feature_cols].copy()
#     for c in cat_cols:
#         X[c] = X[c].astype('category')
#     return X

# X_tr_lgb   = prep_lgb(df_tr)
# X_val_lgb  = prep_lgb(df_val)
# X_test_lgb = prep_lgb(test_df)

# # CatBoost takes raw strings — no encoding needed
# X_tr_cat   = df_tr[feature_cols].fillna('missing')
# X_val_cat  = df_val[feature_cols].fillna('missing')
# X_test_cat = test_df[feature_cols].fillna('missing')

# cat_feature_indices = [feature_cols.index(c) for c in cat_cols]

# targets = ['07', '90', '120']
# lgb_preds  = {}
# cat_preds  = {}
# # xgb_preds = {}
# ens_preds  = {}

# # ── 5. Train per target ──────────────────────────────────────────────────────
# for days in targets:
#     col = f'adopted_within_{days}_days'
#     y_tr  = df_tr[col].values
#     y_val = df_val[col].values

#     prevalence = y_tr.mean()
#     scale_pos  = (1 - prevalence) / prevalence  # for imbalance

#     # ── LightGBM ──
#     lgb_tr  = lgb.Dataset(X_tr_lgb,  label=y_tr)
#     lgb_val = lgb.Dataset(X_val_lgb, label=y_val, reference=lgb_tr)

#     params = {
#         'objective':        'binary',
#         'metric':           ['binary_logloss', 'auc'],
#         'scale_pos_weight': scale_pos,
#         'learning_rate':    0.05,
#         'num_leaves':       31,
#         'min_child_samples': 20,
#         'feature_fraction': 0.8,
#         'bagging_fraction': 0.8,
#         'bagging_freq':     5,
#         'verbose':          -1,
#         'seed':             42,
#     }
#     lgb_model = lgb.train(
#         params,
#         lgb_tr,
#         num_boost_round=500,
#         valid_sets=[lgb_val],
#         callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
#     )
#     lgb_val_prob  = lgb_model.predict(X_val_lgb)
#     lgb_test_prob = lgb_model.predict(X_test_lgb)
#     lgb_preds[days] = lgb_test_prob

#     auc   = roc_auc_score(y_val, lgb_val_prob)
#     ll    = log_loss(y_val, lgb_val_prob)
#     prauc = average_precision_score(y_val, lgb_val_prob)
#     print(f"[LGB {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}  Naive-PR={prevalence:.4f}")

#     # ── CatBoost ──
#     cat_model = CatBoostClassifier(
#         iterations=1000,
#         learning_rate=0.03,
#         depth=6,
#         loss_function='Logloss',
#         eval_metric='Logloss',  # optimise for LogLoss (75% of score)
#         l2_leaf_reg=5,
#         cat_features=cat_feature_indices,
#         early_stopping_rounds=50,
#         random_seed=42,
#         verbose=0,
#     )
#     cat_model.fit(
#         X_tr_cat, y_tr,
#         eval_set=(X_val_cat, y_val),
#     )
#     cat_val_prob  = cat_model.predict_proba(X_val_cat)[:, 1]
#     cat_test_prob = cat_model.predict_proba(X_test_cat)[:, 1]
#     cat_preds[days] = cat_test_prob

#     auc   = roc_auc_score(y_val, cat_val_prob)
#     ll    = log_loss(y_val, cat_val_prob)
#     prauc = average_precision_score(y_val, cat_val_prob)
#     print(f"[CAT {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}")

#     # # ── XGBoost ──
#     # xgb_tr  = xgb.DMatrix(X_tr_lgb,  label=y_tr,  enable_categorical=True)
#     # xgb_val = xgb.DMatrix(X_val_lgb, label=y_val, enable_categorical=True)
#     # xgb_test = xgb.DMatrix(X_test_lgb,             enable_categorical=True)

#     # xgb_params = {
#     #     'objective':        'binary:logistic',
#     #     'eval_metric':      ['logloss', 'auc'],
#     #     'scale_pos_weight': scale_pos,
#     #     'learning_rate':    0.05,
#     #     'max_depth':        4,
#     #     'subsample':        0.8,
#     #     'colsample_bytree': 0.8,
#     #     'min_child_weight': 20,
#     #     'tree_method':      'hist',
#     #     'device':           'cpu',
#     #     'seed':             42,
#     # }
#     # xgb_model = xgb.train(
#     #     xgb_params,
#     #     xgb_tr,
#     #     num_boost_round=500,
#     #     evals=[(xgb_val, 'val')],
#     #     early_stopping_rounds=50,
#     #     verbose_eval=False,
#     # )
#     # xgb_val_prob  = xgb_model.predict(xgb_val)
#     # xgb_test_prob = xgb_model.predict(xgb_test)
#     # xgb_preds[days] = xgb_test_prob

#     # auc   = roc_auc_score(y_val, xgb_val_prob)
#     # ll    = log_loss(y_val, xgb_val_prob)
#     # prauc = average_precision_score(y_val, xgb_val_prob)
#     # print(f"[XGB {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}")


    
#     # ── Ensemble (simple average) ──
#     ens_val_prob  = (lgb_val_prob + cat_val_prob) / 2
#     ens_test_prob = (lgb_test_prob + cat_test_prob) / 2
#     ens_preds[days] = ens_test_prob

#     auc   = roc_auc_score(y_val, ens_val_prob)
#     ll    = log_loss(y_val, ens_val_prob)
#     prauc = average_precision_score(y_val, ens_val_prob)
#     print(f"[ENS {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}")
#     print()

#     # # ── Ensemble (3-way average) ──
#     # ens_val_prob  = (lgb_val_prob + cat_val_prob + xgb_val_prob) / 3
#     # ens_test_prob = (lgb_test_prob + cat_test_prob + xgb_test_prob) / 3
#     # ens_preds[days] = ens_test_prob

#     # auc   = roc_auc_score(y_val, ens_val_prob)
#     # ll    = log_loss(y_val, ens_val_prob)
#     # prauc = average_precision_score(y_val, ens_val_prob)
#     # print(f"[ENS {days}d]  AUC={auc:.4f}  LogLoss={ll:.4f}  PR-AUC={prauc:.4f}")
#     # print()

# # # ── 6. Submission ────────────────────────────────────────────────────────────
# # ss = pd.read_csv('SampleSubmission.csv')[['ID']]
# # for days in targets:
# #     ss[f'Target_{days}_AUC']     = ens_preds[days]
# #     ss[f'Target_{days}_LogLoss'] = ens_preds[days]

# # col_order = ['ID',
# #              'Target_07_AUC', 'Target_90_AUC', 'Target_120_AUC',
# #              'Target_07_LogLoss', 'Target_90_LogLoss', 'Target_120_LogLoss']
# # ss[col_order].to_csv('submission_ensemble.csv', index=False)
# # print("Saved submission_ensemble.csv")

Train: 56,200 | Val: 2,218
[LGB 07d]  AUC=0.9559  LogLoss=0.0614  PR-AUC=0.2416  Naive-PR=0.0140
[CAT 07d]  AUC=0.9427  LogLoss=0.0542  PR-AUC=0.2755
[ENS 07d]  AUC=0.9566  LogLoss=0.0532  PR-AUC=0.2601

[LGB 90d]  AUC=0.9321  LogLoss=0.0779  PR-AUC=0.1605  Naive-PR=0.0304
[CAT 90d]  AUC=0.9518  LogLoss=0.0566  PR-AUC=0.3277
[ENS 90d]  AUC=0.9532  LogLoss=0.0640  PR-AUC=0.2799

[LGB 120d]  AUC=0.9084  LogLoss=0.0883  PR-AUC=0.0823  Naive-PR=0.0419
[CAT 120d]  AUC=0.9167  LogLoss=0.0637  PR-AUC=0.2977
[ENS 120d]  AUC=0.9364  LogLoss=0.0739  PR-AUC=0.2973



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score, log_loss, average_precision_score
import lightgbm as lgb
from catboost import CatBoostClassifier
import optuna
import ast
import json

optuna.logging.set_verbosity(optuna.logging.WARNING)

# ── 1. Load ───────────────────────────────────────────────────────────────────
train_df = pd.read_csv('/app/digicow/data/Train.csv')
prior_df = pd.read_csv('/app/digicow/data/Prior.csv')
test_df  = pd.read_csv('/app/digicow/data/Test.csv')

# ── 2. Parse topics ───────────────────────────────────────────────────────────
def parse_topics(s):
    try:
        parsed = ast.literal_eval(s)
        if isinstance(parsed, list):
            flat = []
            for item in parsed:
                if isinstance(item, list): flat.extend(item)
                else: flat.append(item)
            return flat
    except:
        return []

full_train = pd.concat([prior_df, train_df], ignore_index=True)

mlb = MultiLabelBinarizer()
topics_train = pd.DataFrame(
    mlb.fit_transform(full_train['topics_list'].apply(parse_topics)),
    columns=mlb.classes_, index=full_train.index
)
topics_test = pd.DataFrame(
    mlb.transform(test_df['topics_list'].apply(parse_topics)),
    columns=mlb.classes_, index=test_df.index
)
full_train = pd.concat([full_train, topics_train], axis=1).drop(columns=['topics_list'])
test_df    = pd.concat([test_df, topics_test],     axis=1).drop(columns=['topics_list'])

# ── 3. Temporal split ─────────────────────────────────────────────────────────
full_train['dt'] = pd.to_datetime(full_train['training_day'])
test_df['dt']    = pd.to_datetime(test_df['training_day'])

prior_encoded = full_train.iloc[:len(prior_df)].copy()
train_encoded = full_train.iloc[len(prior_df):].copy()

cutoff = pd.Timestamp('2025-03-01')
df_val = train_encoded[train_encoded['dt'] >= cutoff].copy()
df_tr  = pd.concat([prior_encoded, train_encoded[train_encoded['dt'] < cutoff]], ignore_index=True)
print(f"Train: {len(df_tr):,} | Val: {len(df_val):,}")

# ── 4. Features ───────────────────────────────────────────────────────────────
cat_cols     = ['gender', 'registration', 'age', 'trainer', 'county', 'subcounty', 'ward']
topic_cols   = list(mlb.classes_)
numeric_cols = ['belong_to_cooperative', 'has_topic_trained_on'] + topic_cols
feature_cols = cat_cols + numeric_cols

def prep_lgb(df):
    X = df[feature_cols].copy()
    for c in cat_cols:
        X[c] = X[c].astype('category')
    return X

X_tr_lgb   = prep_lgb(df_tr)
X_val_lgb  = prep_lgb(df_val)
X_test_lgb = prep_lgb(test_df)

X_tr_cat   = df_tr[feature_cols].fillna('missing')
X_val_cat  = df_val[feature_cols].fillna('missing')
X_test_cat = test_df[feature_cols].fillna('missing')

cat_feature_indices = [feature_cols.index(c) for c in cat_cols]
targets = ['07', '90', '120']

# ── 5. Optuna tuning ──────────────────────────────────────────────────────────
def tune_lgb(X_tr, y_tr, X_val, y_val, scale_pos):
    def objective(trial):
        params = {
            'objective':         'binary',
            'metric':            'binary_logloss',
            'scale_pos_weight':  scale_pos,
            'verbose':           -1,
            'seed':              42,
            'learning_rate':     trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'num_leaves':        trial.suggest_int('num_leaves', 16, 128),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
            'feature_fraction':  trial.suggest_float('feature_fraction', 0.5, 1.0),
            'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'bagging_freq':      trial.suggest_int('bagging_freq', 1, 10),
            'reg_alpha':         trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
            'reg_lambda':        trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        }
        ds_tr  = lgb.Dataset(X_tr, label=y_tr)
        ds_val = lgb.Dataset(X_val, label=y_val, reference=ds_tr)
        model = lgb.train(
            params, ds_tr,
            num_boost_round=500,
            valid_sets=[ds_val],
            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
        )
        return log_loss(y_val, model.predict(X_val))
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50, show_progress_bar=True)
    return study.best_params

def tune_cat(X_tr, y_tr, X_val, y_val, cat_feature_indices):
    def objective(trial):
        model = CatBoostClassifier(
            iterations=1000, random_seed=42, verbose=0,
            cat_features=cat_feature_indices,
            early_stopping_rounds=50,
            loss_function='Logloss', eval_metric='Logloss',
            learning_rate=      trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            depth=              trial.suggest_int('depth', 4, 8),
            l2_leaf_reg=        trial.suggest_float('l2_leaf_reg', 1.0, 20.0),
            bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 2.0),
            random_strength=    trial.suggest_float('random_strength', 0.0, 2.0),
            border_count=       trial.suggest_int('border_count', 32, 255),
        )
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val))
        return log_loss(y_val, model.predict_proba(X_val)[:, 1])
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30, show_progress_bar=True)
    return study.best_params

best_params = {}
for days in targets:
    col = f'adopted_within_{days}_days'
    y_tr  = df_tr[col].values
    y_val = df_val[col].values
    prevalence = y_tr.mean()
    scale_pos  = (1 - prevalence) / prevalence

    print(f"\n── Tuning {days}d ──")
    lgb_best = tune_lgb(X_tr_lgb, y_tr, X_val_lgb, y_val, scale_pos)
    cat_best = tune_cat(X_tr_cat, y_tr, X_val_cat, y_val, cat_feature_indices)
    best_params[days] = {'lgb': lgb_best, 'cat': cat_best}
    print(f"LGB best: {lgb_best}")
    print(f"CAT best: {cat_best}")

# Save
with open('/app/digicow/best_params_concat.json', 'w') as f:
    json.dump({'params': best_params, 'weights': best_weights}, f, indent=2)
print("Saved best_params_concat.json")

# Load
with open('/app/digicow/best_params_concat.json') as f:
    saved = json.load(f)
best_params  = saved['params']
best_weights = saved['weights']

# ── 6. Train with tuned params ────────────────────────────────────────────────
lgb_preds = {}
cat_preds = {}
ens_preds = {}

for days in targets:
    col = f'adopted_within_{days}_days'
    y_tr  = df_tr[col].values
    y_val = df_val[col].values
    prevalence = y_tr.mean()
    scale_pos  = (1 - prevalence) / prevalence

    # ── LightGBM ──
    lgb_tr     = lgb.Dataset(X_tr_lgb, label=y_tr)
    lgb_val_ds = lgb.Dataset(X_val_lgb, label=y_val, reference=lgb_tr)
    params = {
        'objective': 'binary', 'metric': ['binary_logloss', 'auc'],
        'scale_pos_weight': scale_pos, 'verbose': -1, 'seed': 42,
        **best_params[days]['lgb']
    }
    lgb_model = lgb.train(
        params, lgb_tr,
        num_boost_round=500,
        valid_sets=[lgb_val_ds],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
    )
    lgb_val_prob  = lgb_model.predict(X_val_lgb)
    lgb_test_prob = lgb_model.predict(X_test_lgb)
    lgb_preds[days] = lgb_test_prob
    print(f"[LGB {days}d]  AUC={roc_auc_score(y_val, lgb_val_prob):.4f}  LogLoss={log_loss(y_val, lgb_val_prob):.4f}")

    # ── CatBoost ──
    cat_model = CatBoostClassifier(
        iterations=1000, loss_function='Logloss', eval_metric='Logloss',
        cat_features=cat_feature_indices, early_stopping_rounds=50,
        random_seed=42, verbose=0,
        **best_params[days]['cat']
    )
    cat_model.fit(X_tr_cat, y_tr, eval_set=(X_val_cat, y_val))
    cat_val_prob  = cat_model.predict_proba(X_val_cat)[:, 1]
    cat_test_prob = cat_model.predict_proba(X_test_cat)[:, 1]
    cat_preds[days] = cat_test_prob
    print(f"[CAT {days}d]  AUC={roc_auc_score(y_val, cat_val_prob):.4f}  LogLoss={log_loss(y_val, cat_val_prob):.4f}")

    # ── Ensemble ──
    ens_val_prob  = (lgb_val_prob + cat_val_prob) / 2
    ens_test_prob = (lgb_test_prob + cat_test_prob) / 2
    ens_preds[days] = ens_test_prob
    print(f"[ENS {days}d]  AUC={roc_auc_score(y_val, ens_val_prob):.4f}  LogLoss={log_loss(y_val, ens_val_prob):.4f}\n")

# # ── 7. Submission ─────────────────────────────────────────────────────────────
# ss = pd.read_csv('/app/digicow/data/SampleSubmission.csv')[['ID']]
# for days in targets:
#     ss[f'Target_{days}_AUC']     = ens_preds[days]
#     ss[f'Target_{days}_LogLoss'] = ens_preds[days]

# col_order = ['ID',
#              'Target_07_AUC', 'Target_90_AUC', 'Target_120_AUC',
#              'Target_07_LogLoss', 'Target_90_LogLoss', 'Target_120_LogLoss']
# ss[col_order].to_csv('/app/digicow/submission_tuned.csv', index=False)
# print("Saved submission_tuned.csv")