# Prueba implementado a nuestros datasets

In [6]:
import pandas as pd

# Cargar el archivo CSV 
clients = pd.read_csv('../data/raw/base_clientes_final.csv')
clients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     1000 non-null   object
 1   fecha_nacimiento       1000 non-null   object
 2   fecha_alta             1000 non-null   object
 3   id_municipio           1000 non-null   int64 
 4   id_estado              1000 non-null   int64 
 5   tipo_persona           1000 non-null   object
 6   genero                 1000 non-null   object
 7   actividad_empresarial  1000 non-null   object
dtypes: int64(2), object(6)
memory usage: 62.6+ KB


In [7]:
# Cargar el archivo CSV usando la ruta absoluta
txn = pd.read_csv('../data/raw/base_transacciones_final.csv')
txn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346011 entries, 0 to 346010
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             346011 non-null  object 
 1   fecha          346011 non-null  object 
 2   comercio       346011 non-null  object 
 3   giro_comercio  340423 non-null  object 
 4   tipo_venta     346011 non-null  object 
 5   monto          346011 non-null  float64
dtypes: float64(1), object(5)
memory usage: 15.8+ MB


In [8]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from fastapi import FastAPI
from datetime import datetime

# --- 1. LOAD AND PREPROCESS DATA --------------------------------------
#txn = pd.read_parquet("TransactionData.parquet")
txn["fecha"] = pd.to_datetime(txn["fecha"])
txn["month"] = txn["fecha"].dt.to_period("M")

agg = (txn.groupby(["id", "month"])
           .agg(spend=("monto", "sum"),
                n_tx=("monto", "size"),
                max_tx=("monto", "max"),
                avg_ticket=("monto", "mean"))
           .reset_index())

In [9]:
# --- 2. COMPLETE CALENDAR --------------------------------------------
full_idx = pd.MultiIndex.from_product(
    [agg["id"].unique(),
     pd.period_range(agg["month"].min(), agg["month"].max(), freq="M")],
    names=["id", "month"]
)
panel = (agg.set_index(["id", "month"])
             .reindex(full_idx, fill_value=0)
             .reset_index())

In [10]:
# --- 3. STATIC CLIENT DATA -------------------------------------------
#clients = pd.read_parquet("ClientData.parquet")
today = pd.Timestamp("2025-05-24")
clients["age"] = ((today - pd.to_datetime(clients["fecha_nacimiento"])).dt.days // 365)
clients["tenure_months"] = ((today - pd.to_datetime(clients["fecha_alta"])).dt.days // 30)

static_cols = ["id", "age", "tenure_months", "id_estado",
               "tipo_persona", "genero", "actividad_empresarial"]
panel = panel.merge(clients[static_cols], on="id", how="left")

In [11]:
# --- 4. LAG FEATURES -------------------------------------------------
panel = panel.sort_values(["id", "month"])
for k in range(1, 7):
    panel[f"spend_lag{k}"] = panel.groupby("id")["spend"].shift(k)
panel["rolling_mean_3"] = panel.groupby("id")["spend"].rolling(3).mean().reset_index(level=0, drop=True)
panel["rolling_std_6"] = panel.groupby("id")["spend"].rolling(6).std().reset_index(level=0, drop=True)


In [12]:
# --- 5. SEASONALITY + HOLIDAYS --------------------------------------
panel["month_idx"] = panel["month"].dt.month
panel["month_sin"] = np.sin(2 * np.pi * panel["month_idx"] / 12)
panel["month_cos"] = np.cos(2 * np.pi * panel["month_idx"] / 12)

In [13]:
# Agrega fechas importantes en México
def is_buen_fin(date): return date.month == 11 and date.day >= 15
def is_navidad(date): return date.month == 12 and date.day >= 20
def is_mothers_day(date): return date.month == 5 and date.day == 10

txn["buen_fin"] = txn["fecha"].apply(is_buen_fin)
txn["navidad"] = txn["fecha"].apply(is_navidad)
txn["dia_madre"] = txn["fecha"].apply(is_mothers_day)

In [14]:
holiday_agg = txn.groupby(["id", txn["fecha"].dt.to_period("M")])[["buen_fin", "navidad", "dia_madre"]].sum().reset_index()
holiday_agg.rename(columns={"fecha": "month"}, inplace=True)
panel = panel.merge(holiday_agg, on=["id", "month"], how="left").fillna(0)

In [15]:
# --- 6. TARGET -------------------------------------------------------
panel["spend_next"] = panel.groupby("id")["spend"].shift(-1)
panel = panel.dropna(subset=["spend_next", "spend_lag1"])

In [16]:
# --- 7. SPLIT --------------------------------------------------------
feature_cols = [
    "spend", "n_tx", "max_tx", "avg_ticket",
    "spend_lag1", "spend_lag2", "spend_lag3", "spend_lag4", "spend_lag5", "spend_lag6",
    "rolling_mean_3", "rolling_std_6",
    "age", "tenure_months", "id_estado", "tipo_persona", "genero", "actividad_empresarial",
    "month_sin", "month_cos",
    "buen_fin", "navidad", "dia_madre"
]

In [17]:
cat_features = ["id_estado", "tipo_persona", "genero", "actividad_empresarial"]
for col in cat_features:
    panel[col] = panel[col].astype("category")

In [18]:
train_mask = panel["month"] <= panel["month"].max() - 3
val_mask = panel["month"] == panel["month"].max() - 2
test_mask = panel["month"] == panel["month"].max() - 1

In [19]:
panel

Unnamed: 0,id,month,spend,n_tx,max_tx,avg_ticket,age,tenure_months,id_estado,tipo_persona,...,spend_lag6,rolling_mean_3,rolling_std_6,month_idx,month_sin,month_cos,buen_fin,navidad,dia_madre,spend_next
0,003d9abe467a91847d566cf455bd2d7d6c8f7e75,2022-01,585.30,62,54.70,9.440323,27,66,61,Persona Fisica Sin Actividad Empresarial,...,0.00,0.000000,0.000000,1,5.000000e-01,8.660254e-01,0.0,0.0,0.0,733.52
1,003d9abe467a91847d566cf455bd2d7d6c8f7e75,2022-02,733.52,62,143.61,11.830968,27,66,61,Persona Fisica Sin Actividad Empresarial,...,0.00,0.000000,0.000000,2,8.660254e-01,5.000000e-01,0.0,0.0,0.0,788.37
2,003d9abe467a91847d566cf455bd2d7d6c8f7e75,2022-03,788.37,62,143.61,12.715645,27,66,61,Persona Fisica Sin Actividad Empresarial,...,0.00,702.396667,0.000000,3,1.000000e+00,6.123234e-17,0.0,0.0,0.0,1329.37
3,003d9abe467a91847d566cf455bd2d7d6c8f7e75,2022-04,1329.37,73,200.48,18.210548,27,66,61,Persona Fisica Sin Actividad Empresarial,...,0.00,950.420000,0.000000,4,8.660254e-01,-5.000000e-01,0.0,0.0,0.0,1654.94
4,003d9abe467a91847d566cf455bd2d7d6c8f7e75,2022-05,1654.94,90,302.04,18.388222,27,66,61,Persona Fisica Sin Actividad Empresarial,...,0.00,1257.560000,0.000000,5,5.000000e-01,-8.660254e-01,0.0,0.0,5.0,1071.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12994,ff67da037fae796809be0e36fb9cdd0e191c38a4,2022-08,1542.14,12,365.43,128.511667,52,80,60,Persona Fisica Sin Actividad Empresarial,...,879.53,2052.326667,930.097937,8,-8.660254e-01,-5.000000e-01,0.0,0.0,0.0,1994.71
12995,ff67da037fae796809be0e36fb9cdd0e191c38a4,2022-09,1994.71,13,930.02,153.439231,52,80,60,Persona Fisica Sin Actividad Empresarial,...,342.24,1918.980000,770.270345,9,-1.000000e+00,-1.836970e-16,0.0,0.0,0.0,1310.88
12996,ff67da037fae796809be0e36fb9cdd0e191c38a4,2022-10,1310.88,14,729.06,93.634286,52,80,60,Persona Fisica Sin Actividad Empresarial,...,1890.11,1615.910000,780.514539,10,-8.660254e-01,5.000000e-01,0.0,0.0,0.0,1037.00
12997,ff67da037fae796809be0e36fb9cdd0e191c38a4,2022-11,1037.00,16,297.79,64.812500,52,80,60,Persona Fisica Sin Actividad Empresarial,...,262.19,1447.530000,536.837271,11,-5.000000e-01,8.660254e-01,7.0,0.0,0.0,2104.34


In [24]:
X_train, y_train = panel.loc[train_mask, feature_cols], panel.loc[train_mask, "spend_next"]
X_val, y_val = panel.loc[val_mask, feature_cols], panel.loc[val_mask, "spend_next"]
X_test, y_test = panel.loc[test_mask, feature_cols], panel.loc[test_mask, "spend_next"]


In [25]:
import lightgbm as lgb

from lightgbm import early_stopping


# --- 8. HYPERPARAMETER OPTIMIZATION ----------------------------------
def objective(trial):
    params = {
        "objective": "regression_l1",
        "n_estimators": 800,
        "learning_rate": 0.05,
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }
    model = lgb.LGBMRegressor(**params)
    model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="l1",
    categorical_feature=["id_estado", "tipo_persona", "genero", "actividad_empresarial"],
    callbacks=[early_stopping(stopping_rounds=50)],
)
    preds = model.predict(X_val)
    return np.mean(np.abs(preds - y_val))

In [26]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=40)
best_params = study.best_params
best_params.update({
    "objective": "regression_l1",
    "n_estimators": 800,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8
})

[I 2025-05-24 15:12:26,982] A new study created in memory with name: no-name-e8b308b0-c52b-4e20-94e8-4652a2e49f75


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:27,850] Trial 0 finished with value: 537.2234148321185 and parameters: {'num_leaves': 114, 'min_child_samples': 100}. Best is trial 0 with value: 537.2234148321185.


Early stopping, best iteration is:
[161]	valid_0's l1: 537.223
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:28,735] Trial 1 finished with value: 536.5753779860544 and parameters: {'num_leaves': 100, 'min_child_samples': 46}. Best is trial 1 with value: 536.5753779860544.


Early stopping, best iteration is:
[135]	valid_0's l1: 536.575
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:29,587] Trial 2 finished with value: 540.1268979413113 and parameters: {'num_leaves': 80, 'min_child_samples': 85}. Best is trial 1 with value: 536.5753779860544.


Early stopping, best iteration is:
[155]	valid_0's l1: 540.127
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:31,557] Trial 3 finished with value: 532.4420190916958 and parameters: {'num_leaves': 89, 'min_child_samples': 27}. Best is trial 3 with value: 532.4420190916958.


Early stopping, best iteration is:
[423]	valid_0's l1: 532.442
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:33,170] Trial 4 finished with value: 533.7639389851067 and parameters: {'num_leaves': 89, 'min_child_samples': 58}. Best is trial 3 with value: 532.4420190916958.


Early stopping, best iteration is:
[357]	valid_0's l1: 533.764
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:33,664] Trial 5 finished with value: 535.3572811668444 and parameters: {'num_leaves': 32, 'min_child_samples': 82}. Best is trial 3 with value: 532.4420190916958.


Early stopping, best iteration is:
[198]	valid_0's l1: 535.357
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001134 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:34,543] Trial 6 finished with value: 539.5111788066713 and parameters: {'num_leaves': 72, 'min_child_samples': 75}. Best is trial 3 with value: 532.4420190916958.


Early stopping, best iteration is:
[179]	valid_0's l1: 539.511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[515]	valid_0's l1: 535.678


[I 2025-05-24 15:12:36,833] Trial 7 finished with value: 535.6782482626596 and parameters: {'num_leaves': 91, 'min_child_samples': 36}. Best is trial 3 with value: 532.4420190916958.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:37,660] Trial 8 finished with value: 528.8747763932231 and parameters: {'num_leaves': 56, 'min_child_samples': 18}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[262]	valid_0's l1: 528.875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:38,674] Trial 9 finished with value: 539.390577599312 and parameters: {'num_leaves': 102, 'min_child_samples': 62}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[199]	valid_0's l1: 539.391
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:39,261] Trial 10 finished with value: 532.7348492341072 and parameters: {'num_leaves': 48, 'min_child_samples': 19}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[170]	valid_0's l1: 532.735
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:40,325] Trial 11 finished with value: 532.901305410106 and parameters: {'num_leaves': 59, 'min_child_samples': 18}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[298]	valid_0's l1: 532.901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:40,973] Trial 12 finished with value: 532.5979864325271 and parameters: {'num_leaves': 23, 'min_child_samples': 31}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[340]	valid_0's l1: 532.598
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:41,827] Trial 13 finished with value: 531.7997926565831 and parameters: {'num_leaves': 54, 'min_child_samples': 10}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[233]	valid_0's l1: 531.8
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:42,586] Trial 14 finished with value: 532.8019621205613 and parameters: {'num_leaves': 50, 'min_child_samples': 12}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[228]	valid_0's l1: 532.802
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:43,564] Trial 15 finished with value: 535.9672100486175 and parameters: {'num_leaves': 65, 'min_child_samples': 40}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[234]	valid_0's l1: 535.967
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:44,325] Trial 16 finished with value: 532.8236170959651 and parameters: {'num_leaves': 38, 'min_child_samples': 12}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[252]	valid_0's l1: 532.824
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:45,730] Trial 17 finished with value: 535.0490833941051 and parameters: {'num_leaves': 52, 'min_child_samples': 48}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[248]	valid_0's l1: 535.049
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002411 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:47,571] Trial 18 finished with value: 531.2620860398835 and parameters: {'num_leaves': 126, 'min_child_samples': 25}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[239]	valid_0's l1: 531.262
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:49,439] Trial 19 finished with value: 533.6000917711386 and parameters: {'num_leaves': 118, 'min_child_samples': 22}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[307]	valid_0's l1: 533.6
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[399]	valid_0's l1: 535.589


[I 2025-05-24 15:12:51,974] Trial 20 finished with value: 535.5891140709217 and parameters: {'num_leaves': 125, 'min_child_samples': 29}. Best is trial 8 with value: 528.8747763932231.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:53,188] Trial 21 finished with value: 530.0744026370642 and parameters: {'num_leaves': 38, 'min_child_samples': 13}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[238]	valid_0's l1: 530.074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004381 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[384]	valid_0's l1: 530.426


[I 2025-05-24 15:12:54,456] Trial 22 finished with value: 530.4263078298657 and parameters: {'num_leaves': 40, 'min_child_samples': 23}. Best is trial 8 with value: 528.8747763932231.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:54,950] Trial 23 finished with value: 531.8829887528436 and parameters: {'num_leaves': 20, 'min_child_samples': 38}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[247]	valid_0's l1: 531.883
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:55,546] Trial 24 finished with value: 532.2766536960356 and parameters: {'num_leaves': 39, 'min_child_samples': 19}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[217]	valid_0's l1: 532.277
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:56,043] Trial 25 finished with value: 534.8175761976814 and parameters: {'num_leaves': 31, 'min_child_samples': 49}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[163]	valid_0's l1: 534.818
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:56,690] Trial 26 finished with value: 530.8645801723285 and parameters: {'num_leaves': 44, 'min_child_samples': 31}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[205]	valid_0's l1: 530.865
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000989 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:57,929] Trial 27 finished with value: 535.5902573567524 and parameters: {'num_leaves': 64, 'min_child_samples': 10}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[282]	valid_0's l1: 535.59
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:58,687] Trial 28 finished with value: 530.9415894542431 and parameters: {'num_leaves': 28, 'min_child_samples': 18}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[332]	valid_0's l1: 530.942
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:59,288] Trial 29 finished with value: 534.8054905950992 and parameters: {'num_leaves': 16, 'min_child_samples': 70}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[155]	valid_0's l1: 534.805
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:12:59,977] Trial 30 finished with value: 533.2606749186734 and parameters: {'num_leaves': 40, 'min_child_samples': 35}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[194]	valid_0's l1: 533.261
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:00,924] Trial 31 finished with value: 533.1634110793759 and parameters: {'num_leaves': 42, 'min_child_samples': 24}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[217]	valid_0's l1: 533.163
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:02,200] Trial 32 finished with value: 531.7773300728567 and parameters: {'num_leaves': 43, 'min_child_samples': 31}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[323]	valid_0's l1: 531.777
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:03,282] Trial 33 finished with value: 537.2083407205322 and parameters: {'num_leaves': 72, 'min_child_samples': 96}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[228]	valid_0's l1: 537.208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:04,602] Trial 34 finished with value: 537.1306650311991 and parameters: {'num_leaves': 57, 'min_child_samples': 43}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[250]	valid_0's l1: 537.131
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:05,366] Trial 35 finished with value: 531.9819787969093 and parameters: {'num_leaves': 35, 'min_child_samples': 15}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[275]	valid_0's l1: 531.982
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:06,147] Trial 36 finished with value: 530.7459527704068 and parameters: {'num_leaves': 27, 'min_child_samples': 25}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[361]	valid_0's l1: 530.746
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:06,823] Trial 37 finished with value: 530.9174873548678 and parameters: {'num_leaves': 28, 'min_child_samples': 24}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[271]	valid_0's l1: 530.917
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:07,510] Trial 38 finished with value: 535.062008549082 and parameters: {'num_leaves': 47, 'min_child_samples': 54}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[181]	valid_0's l1: 535.062
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 21
[LightGBM] [Info] Start training from score 796.234985
Training until validation scores don't improve for 50 rounds


[I 2025-05-24 15:13:08,656] Trial 39 finished with value: 536.3462085430441 and parameters: {'num_leaves': 78, 'min_child_samples': 16}. Best is trial 8 with value: 528.8747763932231.


Early stopping, best iteration is:
[236]	valid_0's l1: 536.346


In [1]:
# --- 9. FINAL MODEL TRAINING -----------------------------------------
model = lgb.LGBMRegressor(**best_params)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="l1",
    categorical_feature=["id_estado", "tipo_persona", "genero", "actividad_empresarial"],
    callbacks=[early_stopping(stopping_rounds=50)]
)

NameError: name 'lgb' is not defined

In [28]:
# --- 10. SCORING FUNCTIONS -------------------------------------------
def predict_client_next_month(model, client_id, monthly_panel):
    features = monthly_panel.loc[monthly_panel["id"] == client_id].sort_values("month").tail(1)
    return float(model.predict(features[feature_cols]))

segment_medians = panel.groupby("id_estado")["spend_next"].median().to_dict()

def safe_predict(client_id):
    hist_len = panel.loc[panel["id"] == client_id, "spend"].count()
    if hist_len < 4:
        segment = panel.loc[panel["id"] == client_id, "id_estado"].iat[0]
        return segment_medians.get(segment, 0)
    return predict_client_next_month(model, client_id, panel)

  segment_medians = panel.groupby("id_estado")["spend_next"].median().to_dict()


In [29]:
# --- 11. FASTAPI SERVER ----------------------------------------------
app = FastAPI()

@app.get("/forecast/{client_id}")
def forecast(client_id: int):
    y_hat = safe_predict(client_id)
    return {"client_id": client_id, "next_month_spend": y_hat}


# Completo de Facundo

In [2]:
import pandas as pd

# Cargar el archivo CSV 
clients = pd.read_csv('../data/raw/base_clientes_final.csv')
clients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     1000 non-null   object
 1   fecha_nacimiento       1000 non-null   object
 2   fecha_alta             1000 non-null   object
 3   id_municipio           1000 non-null   int64 
 4   id_estado              1000 non-null   int64 
 5   tipo_persona           1000 non-null   object
 6   genero                 1000 non-null   object
 7   actividad_empresarial  1000 non-null   object
dtypes: int64(2), object(6)
memory usage: 62.6+ KB


In [3]:
txn = pd.read_csv('../data/raw/base_transacciones_final.csv')
txn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346011 entries, 0 to 346010
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             346011 non-null  object 
 1   fecha          346011 non-null  object 
 2   comercio       346011 non-null  object 
 3   giro_comercio  340423 non-null  object 
 4   tipo_venta     346011 non-null  object 
 5   monto          346011 non-null  float64
dtypes: float64(1), object(5)
memory usage: 15.8+ MB


In [5]:
import pandas as pd
import numpy as np

# --- 1.  TXN → MONTHLY PANEL ----------------------------------------
# txn = pd.read_parquet("TransactionData.parquet")
txn["fecha"]  = pd.to_datetime(txn["fecha"])
txn["month"]  = txn["fecha"].dt.to_period("M")

agg = (txn.groupby(["id", "month"])
           .agg(spend      = ("monto", "sum"),
                n_tx       = ("monto", "size"),
                max_tx     = ("monto", "max"),
                avg_ticket = ("monto", "mean"))
           .reset_index())

# --- 2.  COMPLETE CALENDAR PER CLIENT -------------------------------
full_idx = pd.MultiIndex.from_product(
    [agg["id"].unique(),
     pd.period_range(agg["month"].min(), agg["month"].max(), freq="M")],
    names=["id", "month"]
)
panel = (agg.set_index(["id", "month"])
             .reindex(full_idx, fill_value=0)
             .reset_index())

# --- 3.  STATIC CLIENT DATA -----------------------------------------
# clients = pd.read_parquet("ClientData.parquet")
today   = pd.Timestamp("2025-05-24")         # FIXED date for reproducibility
clients["age"]           = ((today - pd.to_datetime(clients["fecha_nacimiento"]))
                            .dt.days // 365)
clients["tenure_months"] = ((today - pd.to_datetime(clients["fecha_alta"]))
                            .dt.days // 30)

static_cols = ["id", "age", "tenure_months", "id_estado",
               "tipo_persona", "genero", "actividad_empresarial"]
panel = panel.merge(clients[static_cols], on="id", how="left")

# --- 4.  LAG FEATURES -----------------------------------------------
panel = panel.sort_values(["id", "month"])
for k in range(1, 7):
    panel[f"spend_lag{k}"] = panel.groupby("id")["spend"].shift(k)
panel["rolling_mean_3"] = (panel.groupby("id")["spend"]
                               .rolling(3).mean().reset_index(level=0, drop=True))
panel["rolling_std_6"]  = (panel.groupby("id")["spend"]
                               .rolling(6).std().reset_index(level=0, drop=True))

# --- 5.  SEASONALITY -----------------------------------------------
panel["month_idx"]  = panel["month"].dt.month
panel["month_sin"]  = np.sin(2 * np.pi * panel["month_idx"] / 12)
panel["month_cos"]  = np.cos(2 * np.pi * panel["month_idx"] / 12)

# --- 5b. HOLIDAY FEATURES -------------------------------------------
def is_buen_fin(date): return date.month == 11 and date.day >= 15
def is_navidad(date): return date.month == 12 and date.day >= 20
def is_mothers_day(date): return date.month == 5 and date.day == 10

txn["buen_fin"] = txn["fecha"].apply(is_buen_fin)
txn["navidad"] = txn["fecha"].apply(is_navidad)
txn["dia_madre"] = txn["fecha"].apply(is_mothers_day)

holiday_agg = txn.groupby(["id", txn["fecha"].dt.to_period("M")])[["buen_fin", "navidad", "dia_madre"]].sum().reset_index()
holiday_agg.rename(columns={"fecha": "month"}, inplace=True)
panel = panel.merge(holiday_agg, on=["id", "month"], how="left").fillna(0)

# --- 6.  TARGET -----------------------------------------------------
panel["spend_next"] = panel.groupby("id")["spend"].shift(-1)
panel = panel.dropna(subset=["spend_next", "spend_lag1"])  # keep only full-feature rows

# --- DEFINE FEATURE COLS -------------------------------------------
feature_cols = [
    "spend", "n_tx", "max_tx", "avg_ticket",
    "spend_lag1", "spend_lag2", "spend_lag3", "spend_lag4", "spend_lag5", "spend_lag6",
    "rolling_mean_3", "rolling_std_6",
    "age", "tenure_months", "id_estado", "tipo_persona", "genero", "actividad_empresarial",
    "month_sin", "month_cos",
    "buen_fin", "navidad", "dia_madre"
]

# chronological split (same cut for every client)
train_mask = panel["month"] <= panel["month"].max() - 3   # months 1-10
val_mask   = panel["month"] == panel["month"].max() - 2   # month 11
test_mask  = panel["month"] == panel["month"].max() - 1   # month 12

X_train, y_train = panel.loc[train_mask, feature_cols], panel.loc[train_mask, "spend_next"]
X_val,   y_val   = panel.loc[val_mask,   feature_cols], panel.loc[val_mask,   "spend_next"]
X_test,  y_test  = panel.loc[test_mask,  feature_cols], panel.loc[test_mask,  "spend_next"]

import lightgbm as lgb
from lightgbm import early_stopping # ADDED for callbacks

cat_features = ["id_estado", "tipo_persona", "genero", "actividad_empresarial"]
for col in cat_features: # Ensure categorical features are category type
    panel[col] = panel[col].astype("category")

model = lgb.LGBMRegressor(
    objective     = "regression_l1",
    n_estimators  = 800,
    learning_rate = 0.05,
    num_leaves    = 64,
    subsample     = 0.8,
    colsample_bytree = 0.8,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="l1",
    categorical_feature=cat_features,
    callbacks=[early_stopping(stopping_rounds=50)] # Corrected early stopping
)

def predict_client_next_month(model, client_id, monthly_panel):
    features = monthly_panel.loc[
        monthly_panel["id"] == client_id
    ].sort_values("month").tail(1)  # most recent month
    return float(model.predict(features[feature_cols]))

segment_medians = panel.groupby("id_estado")["spend_next"].median().to_dict() # ADDED for safe_predict



ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: tipo_persona: object, genero: object, actividad_empresarial: object

In [None]:
from fastapi import FastAPI # ADDED
app = FastAPI() # ADDED

@app.get("/forecast/{client_id}")
def forecast(client_id: int):
    y_hat = predict_client_next_month(model, client_id, panel)
    return {"client_id": client_id, "next_month_spend": y_hat}

def safe_predict(client_id):
    hist_len = panel.loc[panel["id"] == client_id, "spend"].count()
    if hist_len < 4:
        segment = panel.loc[panel["id"] == client_id, "id_estado"].iat[0]
        return segment_medians.get(segment, 0)   # pre-computed
    return predict_client_next_month(model, client_id, panel)

7 Next steps you should schedule
Optuna sweep (40 trials) to fine-tune num_leaves, min_child_samples, etc.

Holiday calendar dummy for Buen Fin, Navidad, Día de la Madre; they distort Mexican retail spend.

SHAP monitoring—log global and per-segment MAE monthly; trigger alert if drift.

Stateless scoring image—Dockerfile FROM python:3.12-slim, copy artefact, expose /forecast.