In [10]:
# ---------------------------------------------------------
# 03_MODEL_TRAINING.ipynb
# ML Pipeline for AdTech Campaign Optimization
# ---------------------------------------------------------

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
import lightgbm as lgb


In [11]:
df = pd.read_csv("../data/processed/features1.csv")
df.head()


Unnamed: 0,event_id,timestamp,channel,campaign_id,ad_group_id,ad_id,country,industry,device_type,placement,...,ROAS,RPC,budget_utilization,impressions_per_click,performance_index,roll_spend_1h,roll_clicks_1h,roll_conv_1h,roll_ROAS_24h,roll_CTR_24h
0,sdv-id-zYbhFh,2025-09-18 07:03:03,Video,CMP_001,AG_036,AD_0353,BR,Tech,Desktop,Search,...,0.0,0.0,0.0048,15.076923,0.033163,7.98,13.0,0.0,0.0,0.066327
1,sdv-id-CsMark,2025-09-18 07:03:03,Social,CMP_001,AG_094,AD_0284,DE,Tech,Desktop,Search,...,0.0,0.0,0.01544,23.3,0.021459,36.02,33.0,0.0,0.0,0.054622
2,sdv-id-eGkMfi,2025-09-18 07:03:03,Video,CMP_001,AG_034,AD_0231,DE,Retail,Mobile,In-stream,...,0.0,,0.012992,,,57.24,33.0,0.0,0.0,0.036415
3,sdv-id-qQgEnb,2025-09-18 07:03:03,Search,CMP_001,AG_060,AD_0079,BR,Gaming,Desktop,Banner,...,0.0,0.0,0.042943,18.5,0.027027,87.78,41.0,0.0,0.0,0.040825
4,sdv-id-lJJKwl,2025-09-18 07:03:03,Search,CMP_001,AG_083,AD_0548,US,Auto,Desktop,Feed,...,0.0,0.0,0.002043,24.52,0.020392,95.59,66.0,0.0,0.0,0.040816


In [12]:
TARGET = "ROAS"

# Remove invalid values
df = df[df[TARGET].notna() & np.isfinite(df[TARGET])]
df = df[df[TARGET] >= 0]
df.head()




Unnamed: 0,event_id,timestamp,channel,campaign_id,ad_group_id,ad_id,country,industry,device_type,placement,...,ROAS,RPC,budget_utilization,impressions_per_click,performance_index,roll_spend_1h,roll_clicks_1h,roll_conv_1h,roll_ROAS_24h,roll_CTR_24h
0,sdv-id-zYbhFh,2025-09-18 07:03:03,Video,CMP_001,AG_036,AD_0353,BR,Tech,Desktop,Search,...,0.0,0.0,0.0048,15.076923,0.033163,7.98,13.0,0.0,0.0,0.066327
1,sdv-id-CsMark,2025-09-18 07:03:03,Social,CMP_001,AG_094,AD_0284,DE,Tech,Desktop,Search,...,0.0,0.0,0.01544,23.3,0.021459,36.02,33.0,0.0,0.0,0.054622
2,sdv-id-eGkMfi,2025-09-18 07:03:03,Video,CMP_001,AG_034,AD_0231,DE,Retail,Mobile,In-stream,...,0.0,,0.012992,,,57.24,33.0,0.0,0.0,0.036415
3,sdv-id-qQgEnb,2025-09-18 07:03:03,Search,CMP_001,AG_060,AD_0079,BR,Gaming,Desktop,Banner,...,0.0,0.0,0.042943,18.5,0.027027,87.78,41.0,0.0,0.0,0.040825
4,sdv-id-lJJKwl,2025-09-18 07:03:03,Search,CMP_001,AG_083,AD_0548,US,Auto,Desktop,Feed,...,0.0,0.0,0.002043,24.52,0.020392,95.59,66.0,0.0,0.0,0.040816


In [13]:
cat_cols = [
    'channel','campaign_id','ad_group_id','ad_id',
    'country','industry','device_type','placement','audience_segment'
]

# Remove missing categorical columns safely
cat_cols = [c for c in cat_cols if c in df.columns]

num_cols = [c for c in df.columns 
            if c not in cat_cols + ["timestamp", TARGET]]


In [18]:
# Identify all categorical (string/object) columns automatically
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

print("Detected categorical columns:", cat_cols)


Detected categorical columns: ['event_id', 'timestamp']


In [19]:
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

print("Categorical features encoded.")


Categorical features encoded.


In [20]:
joblib.dump(encoders, "encoder1.pkl")
print("Encoders saved as encoder1.pkl")


Encoders saved as encoder1.pkl


In [21]:
X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train:", X_train.shape, "X_test:", X_test.shape)


X_train: (46378, 37) X_test: (11595, 37)


In [22]:
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, pred_rf)
print("RF MAE:", mae_rf)


RF MAE: 0.519231308844877


In [23]:
xgb_model = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=7,
    n_estimators=400,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict(X_test)

mae_xgb = mean_absolute_error(y_test, pred_xgb)
print("XGB MAE:", mae_xgb)


XGB MAE: 0.6829184094390349


In [24]:
lgb_model = lgb.LGBMRegressor(
    learning_rate=0.05,
    n_estimators=500,
    max_depth=-1,
    random_state=42
)

lgb_model.fit(X_train, y_train)
pred_lgb = lgb_model.predict(X_test)

mae_lgb = mean_absolute_error(y_test, pred_lgb)
print("LGBM MAE:", mae_lgb)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5677
[LightGBM] [Info] Number of data points in the train set: 46378, number of used features: 37
[LightGBM] [Info] Start training from score 5.521139
LGBM MAE: 1.5680182471169526


In [25]:
mae_scores = {
    "RandomForest": mae_rf,
    "XGBoost": mae_xgb,
    "LightGBM": mae_lgb
}

best_model_name = min(mae_scores, key=mae_scores.get)
print("Best model:", best_model_name)


Best model: RandomForest


In [26]:
if best_model_name == "RandomForest":
    best_model = rf
elif best_model_name == "XGBoost":
    best_model = xgb_model
else:
    best_model = lgb_model

joblib.dump(best_model, "model.pkl")
print(f"{best_model_name} saved as model.pkl")


RandomForest saved as model.pkl


In [27]:
y_pred = best_model.predict(X_test)

print("Final Model MAE:", mean_absolute_error(y_test, y_pred))
print("Final R2:", r2_score(y_test, y_pred))


Final Model MAE: 0.519231308844877
Final R2: 0.7935008107363736


In [31]:
def predict_realtime(input_dict):
    # Load saved model + encoders
    model = joblib.load("model.pkl")
    enc = joblib.load("encoder.pkl")
    
    # Convert input to DataFrame
    row = pd.DataFrame([input_dict])

    # Handle categorical columns safely
    for col in cat_cols:
        if col in row.columns:
            # Column present in input → encode
            if col in enc:
                row[col] = enc[col].transform(row[col].astype(str))
        else:
            # Column missing → fill with the MOST FREQUENT category
            if col in enc:
                row[col] = enc[col].transform(
                    [enc[col].classes_[0]]
                )[0]

    # Fill missing numeric columns with 0
    for col in num_cols:
        if col not in row.columns:
            row[col] = 0

    # Ensure correct column order
    row = row[X_train.columns]

    # Predict
    return float(model.predict(row)[0])

sample = X_test.iloc[0].to_dict()
predict_realtime(sample)



0.0035967646828757513

In [32]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {
        "model": name,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2
    }

results = []

results.append(evaluate_model("RandomForest", y_test, pred_rf))
results.append(evaluate_model("XGBoost", y_test, pred_xgb))
results.append(evaluate_model("LightGBM", y_test, pred_lgb))

import pandas as pd
comparison_df = pd.DataFrame(results)
comparison_df.sort_values("MAE")


Unnamed: 0,model,MAE,RMSE,R2
0,RandomForest,0.519231,11.979389,0.793501
1,XGBoost,0.682918,13.909417,0.721602
2,LightGBM,1.568018,19.547137,0.450187


In [34]:
# -------------------------------------------------------
# 03_modeling.ipynb (CORRECTED & PRODUCTION-READY)
# Full training pipeline with cleaning + frequency encoding + tuning
# Dataset: ../data/processed/features1.csv
# -------------------------------------------------------

import os
import pandas as pd
import numpy as np
import joblib
from pprint import pprint

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb
import lightgbm as lgb

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# -------------------------
# 1) LOAD DATA
# -------------------------
df = pd.read_csv("../data/processed/features1.csv")
print("Raw shape:", df.shape)
print("Columns:", df.columns.tolist())

# -------------------------
# 2) TARGET & BASIC CLEANING
# -------------------------
TARGET = "ROAS"

df = df[df[TARGET].notna() & np.isfinite(df[TARGET])]

# winsorize ROAS to remove extreme outliers
lower_q, upper_q = df[TARGET].quantile([0.01, 0.99]).values
df[TARGET] = df[TARGET].clip(lower=lower_q, upper=upper_q)

print("After target cleaning:", df.shape)
print(df[TARGET].describe())

# -------------------------
# 3) DROP UNWANTED / PROBLEMATIC COLUMNS
# -------------------------
if "timestamp" in df.columns:
    df = df.drop(columns=["timestamp"])
    print("Dropped timestamp")

rolling_cols = [c for c in df.columns if c.lower().startswith("roll_") or c.lower().startswith("rolling_")]
print("Detected rolling columns:", rolling_cols)

for c in rolling_cols:
    if df[c].isna().all():
        df = df.drop(columns=[c])
        print("Dropped:", c)
    else:
        med = df[c].median()
        df[c] = df[c].fillna(med)
        print(f"Filled NaN in {c} with median {med:.4f}")

# -------------------------
# 4) FIX NUMERIC COLUMNS
# -------------------------
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Try to coerce numeric-like object columns
for c in df.columns:
    if c not in num_cols:
        sample = df[c].dropna().astype(str).head(200)
        parseable = sample.apply(lambda x: x.replace('.', '', 1).replace('-', '', 1).isdigit()).mean()
        if parseable > 0.9:
            df[c] = pd.to_numeric(df[c], errors='coerce')
            print(f"Coerced to numeric:", c)

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric cols:", num_cols)

# Fill NaN numeric values
for c in num_cols:
    if c == TARGET:
        continue
    if df[c].isna().sum() > 0:
        df[c] = df[c].fillna(0)

# -------------------------
# 5) ENCODING
# - Frequency encoding: campaign_id, ad_group_id, ad_id
# - Label encoding: all other categorical columns
# -------------------------
id_like = [c for c in ['campaign_id','ad_group_id','ad_id'] if c in df.columns]

cat_cols_auto = df.select_dtypes(include=['object']).columns.tolist()
cat_cols = [c for c in cat_cols_auto if c not in id_like]

print("Label encodings:", cat_cols)
print("Frequency encodings:", id_like)

# Frequency encoding
freq_maps = {}
for c in id_like:
    mapping = df[c].value_counts(dropna=False) / len(df)
    df[c] = df[c].map(mapping).fillna(0)
    freq_maps[c] = mapping.to_dict()

# Label encoding
from sklearn.preprocessing import LabelEncoder
label_encoders = {}

for c in cat_cols:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c].astype(str))
    label_encoders[c] = le

# Save encoders
encoders = {
    "freq_maps": freq_maps,
    "label_encoders": label_encoders
}
joblib.dump(encoders, "encoders.pkl")

# Final check no string columns remain
non_numeric = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("Non-numeric after encoding:", non_numeric)
if non_numeric:
    raise RuntimeError("ERROR: some non-numeric columns remain")

# -------------------------
# 6) TRAIN/TEST SPLIT
# -------------------------
X = df.drop(columns=[TARGET])
y = df[TARGET]

feature_columns = X.columns.tolist()
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print("Train:", X_train.shape, "Test:", X_test.shape)

# -------------------------
# 7) BASELINE MODELS
# -------------------------
rf = RandomForestRegressor(
    n_estimators=200, max_depth=12, random_state=RANDOM_STATE, n_jobs=-1
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

xgb_model = xgb.XGBRegressor(
    learning_rate=0.05, max_depth=7, n_estimators=300, subsample=0.8,
    colsample_bytree=0.8, random_state=RANDOM_STATE, n_jobs=-1
)
xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict(X_test)

lgb_model = lgb.LGBMRegressor(
    learning_rate=0.05, n_estimators=400, max_depth=-1,
    random_state=RANDOM_STATE, n_jobs=-1
)
lgb_model.fit(X_train, y_train)
pred_lgb = lgb_model.predict(X_test)

# -------------------------
# 8) Evaluation Function
# -------------------------
def evaluate(name, y_true, y_pred):
    return {
        "model": name,
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2": r2_score(y_true, y_pred)
    }

results = []
results.append(evaluate("RandomForest", y_test, pred_rf))
results.append(evaluate("XGBoost", y_test, pred_xgb))
results.append(evaluate("LightGBM", y_test, pred_lgb))

results_df = pd.DataFrame(results).sort_values("MAE")
print("\nBaseline comparison:")
display(results_df)

# -------------------------
# 9) LIGHTGBM TUNING
# -------------------------
lgb_params = {
    'num_leaves': [31, 50, 70, 100, 150],
    'max_depth': [-1, 8, 12, 20],
    'learning_rate': [0.01, 0.03, 0.05, 0.08],
    'n_estimators': [200, 400, 600],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 1.0]
}

tuner = RandomizedSearchCV(
    estimator=lgb.LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1),
    param_distributions=lgb_params,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=3,
    random_state=RANDOM_STATE,
    verbose=2,
    n_jobs=-1
)

print("\nTuning LightGBM...")
tuner.fit(X_train, y_train)

print("Best LightGBM params:")
pprint(tuner.best_params_)

best_lgb_tuned = tuner.best_estimator_
pred_lgb_tuned = best_lgb_tuned.predict(X_test)

results.append(evaluate("LightGBM_Tuned", y_test, pred_lgb_tuned))
results_df = pd.DataFrame(results).sort_values("MAE")
print("\nFinal comparison after tuning:")
display(results_df)

# -------------------------
# 10) SAVE ALL MODELS
# -------------------------
model_registry = {}

joblib.dump(rf, "model_randomforest.pkl")
model_registry["RandomForest"] = "model_randomforest.pkl"

joblib.dump(xgb_model, "model_xgboost.pkl")
model_registry["XGBoost"] = "model_xgboost.pkl"

joblib.dump(lgb_model, "model_lightgbm.pkl")
model_registry["LightGBM"] = "model_lightgbm.pkl"

joblib.dump(best_lgb_tuned, "model_lightgbm_tuned.pkl")
model_registry["LightGBM_Tuned"] = "model_lightgbm_tuned.pkl"

print("\nSaved all models:")
print(model_registry)

# Update encoder metadata
enc_meta = joblib.load("encoders.pkl")
enc_meta.update({
    "feature_columns": feature_columns,
    "num_cols": num_cols,
    "model_registry": model_registry
})
joblib.dump(enc_meta, "encoders.pkl")

print("Updated encoders.pkl with registry & metadata")

# -------------------------
# 11) REAL-TIME INFERENCE FUNCTION
# -------------------------
def predict_realtime(input_dict, model_name="LightGBM_Tuned"):
    enc_meta = joblib.load("encoders.pkl")
    model_registry = enc_meta["model_registry"]

    if model_name not in model_registry:
        raise ValueError(f"Invalid model_name. Choose from: {list(model_registry.keys())}")

    model = joblib.load(model_registry[model_name])

    row = pd.DataFrame([input_dict])

    # Frequency encoding
    for c, mapping in enc_meta["freq_maps"].items():
        if c in row.columns:
            row[c] = row[c].map(mapping).fillna(0)
        else:
            row[c] = 0

    # Label encoding
    for c, le in enc_meta["label_encoders"].items():
        if c in row.columns:
            val = str(row.iloc[0][c])
            if val in le.classes_:
                row[c] = le.transform([val])[0]
            else:
                row[c] = 0
        else:
            row[c] = 0

    # Ensure numeric cols
    for col in enc_meta["num_cols"]:
        if col not in row.columns:
            row[col] = 0

    # Reorder
    row = row[enc_meta["feature_columns"]].astype(float)

    return float(model.predict(row)[0])

# Sanity test
sample = X_test.iloc[0].to_dict()
print("Sample prediction:", predict_realtime(sample))


Raw shape: (60000, 38)
Columns: ['event_id', 'timestamp', 'channel', 'campaign_id', 'ad_group_id', 'ad_id', 'country', 'industry', 'device_type', 'placement', 'audience_segment', 'impressions', 'clicks', 'conversions', 'spend', 'revenue', 'daily_budget_campaign', 'hour_of_day', 'day_of_week', 'week_of_year', 'is_weekend', 'hour_sin', 'hour_cos', 'CTR', 'CVR', 'CPC', 'CPM', 'CPA', 'ROAS', 'RPC', 'budget_utilization', 'impressions_per_click', 'performance_index', 'roll_spend_1h', 'roll_clicks_1h', 'roll_conv_1h', 'roll_ROAS_24h', 'roll_CTR_24h']
After target cleaning: (57973, 38)
count    57973.000000
mean         3.871977
std         10.824509
min          0.000000
25%          0.000000
50%          0.000000
75%          1.930743
max         72.699283
Name: ROAS, dtype: float64
Dropped timestamp
Detected rolling columns: ['roll_spend_1h', 'roll_clicks_1h', 'roll_conv_1h', 'roll_ROAS_24h', 'roll_CTR_24h']
Filled NaN in roll_spend_1h with median 19.3000
Filled NaN in roll_clicks_1h with m

Unnamed: 0,model,MAE,RMSE,R2
0,RandomForest,0.073182,0.912941,0.992457
1,XGBoost,0.108292,0.732452,0.995145
2,LightGBM,0.125121,0.618715,0.996536



Tuning LightGBM...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5255
[LightGBM] [Info] Number of data points in the train set: 46378, number of used features: 36
[LightGBM] [Info] Start training from score 3.897647
Best LightGBM params:
{'colsample_bytree': 0.8,
 'learning_rate': 0.03,
 'max_depth': -1,
 'n_estimators': 200,
 'num_leaves': 70,
 'subsample': 0.7}

Final comparison after tuning:


Unnamed: 0,model,MAE,RMSE,R2
0,RandomForest,0.073182,0.912941,0.992457
1,XGBoost,0.108292,0.732452,0.995145
2,LightGBM,0.125121,0.618715,0.996536
3,LightGBM_Tuned,0.134479,0.704781,0.995505



Saved all models:
{'RandomForest': 'model_randomforest.pkl', 'XGBoost': 'model_xgboost.pkl', 'LightGBM': 'model_lightgbm.pkl', 'LightGBM_Tuned': 'model_lightgbm_tuned.pkl'}
Updated encoders.pkl with registry & metadata
Sample prediction: 0.008572464000500959
