In [None]:
# ============================================================
# Purpose: Train an MLP regression model on Revenue
# ============================================================

import os
import math
import json
import numpy as np
import pandas as pd
import tensorflow as tf
#from google.colab import drive
from tensorflow.keras.optimizers.schedules import LearningRateSchedule

In [None]:
# ============================================================
# 1. Setup & Configurations
# ============================================================
#drive.mount('/content/drive')

# File paths
TRAIN_CSV_PATH = "train.csv"
TEST_CSV_PATH  = "test.csv"
MODEL_DIR      = "mlp_kpi_regression_smoothl1"

# Data settings, using all data available
HAS_HEADER = True

#using only fundamentals
EXCEL_FEATURE_START = "C"
EXCEL_FEATURE_END   = "BZ"
EXCEL_TARGET        = "LO" # This is the column for Revenue

# Model training settings
BATCH_SIZE   = 256
EPOCHS       = 500
SEED         = 42
# NOTE: This delta is in ORIGINAL target units (since we invert-scale at the output).
# Consider setting DELTA using your train-set P75(|y - mean(y)|) ≈ 3.03e8 for stability.
DELTA        = 1.0   # Huber delta (original target units)

# Learning rate schedule
BASE_LR      = 1e-3
MIN_LR_RATIO = 0.01  # final lr = BASE_LR * MIN_LR_RATIO

Mounted at /content/drive


In [3]:
# ============================================================
# 2. Reproducibility
# ============================================================
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:

# ============================================================
# 3. Helper Functions & Custom Classes
# ============================================================

def excel_col_to_zero_index(col: str) -> int:
    col = col.strip().upper()
    val = 0
    for ch in col:
        val = val * 26 + (ord(ch) - 64)
    return val - 1

class SuccessRate(tf.keras.metrics.Metric):
    """
    Success Rate = fraction of predictions where |y_true - y_pred| < threshold
    (Absolute error, in original target units.)
    """
    def __init__(self, threshold=0.05, name="success_rate", **kwargs):
        super().__init__(name=name, **kwargs)
        self.threshold = threshold
        self.total = self.add_weight(name="total", initializer="zeros")
        self.successes = self.add_weight(name="successes", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        error = tf.abs(y_true - y_pred)
        success = tf.cast(error < self.threshold, tf.float32)
        self.successes.assign_add(tf.reduce_sum(success))
        self.total.assign_add(tf.cast(tf.size(success), tf.float32))

    def result(self):
        return self.successes / (self.total + 1e-6)

    def reset_states(self):
        self.total.assign(0.0)
        self.successes.assign(0.0)


class LogLearningRate(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        opt = self.model.optimizer
        lr = opt.learning_rate
        if callable(lr):
            lr = lr(opt.iterations)
        lr = float(tf.keras.backend.get_value(lr))
        print(f" - learning_rate: {lr:.6e}")

@tf.keras.utils.register_keras_serializable(package="custom")
class WarmupCosine(LearningRateSchedule):
    def __init__(self, base_lr, total_steps, warmup_steps=0, min_lr_ratio=0.01, name=None):
        super().__init__()
        self.base_lr = float(base_lr)
        self.total_steps = int(total_steps)
        self.warmup_steps = int(warmup_steps)
        self.min_lr_ratio = float(min_lr_ratio)
        self.name = name

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        base_lr = tf.constant(self.base_lr, tf.float32)
        min_lr  = tf.constant(self.base_lr * self.min_lr_ratio, tf.float32)

        def warmup():
            return base_lr * (step / tf.maximum(1.0, float(self.warmup_steps)))

        def cosine():
            progress = (step - float(self.warmup_steps)) / tf.maximum(
                1.0, float(self.total_steps - self.warmup_steps)
            )
            progress = tf.clip_by_value(progress, 0.0, 1.0)
            cos_term = 0.5 * (1.0 + tf.cos(math.pi * progress))
            return min_lr + (base_lr - min_lr) * cos_term

        return tf.cond(step < self.warmup_steps, warmup, cosine)

    def get_config(self):
        return {
            "base_lr": self.base_lr,
            "total_steps": self.total_steps,
            "warmup_steps": self.warmup_steps,
            "min_lr_ratio": self.min_lr_ratio,
            "name": self.name,
        }

In [5]:
# ============================================================
# 4. Data Loading & Preprocessing - with optional one-hot and column alignment
# ============================================================


def load_xy_from_csv(path, has_header=True):
    if has_header:
        df = pd.read_csv(path)
    else:
        df = pd.read_csv(path, header=None)

    feat_start = excel_col_to_zero_index(EXCEL_FEATURE_START)
    feat_end_inclusive = excel_col_to_zero_index(EXCEL_FEATURE_END)
    target_idx = excel_col_to_zero_index(EXCEL_TARGET)

    X_df = df.iloc[:, feat_start:feat_end_inclusive + 1].copy()
    y_series = df.iloc[:, target_idx].copy()

    # Impute features (median for numeric, mode for categorical) -- missing values already handled upstream
    # for col in X_df.columns:
    #     if pd.api.types.is_numeric_dtype(X_df[col]):
    #         X_df[col] = X_df[col].fillna(X_df[col].median())
    #     else:
    #         mode_val = X_df[col].mode().iloc[0] if not X_df[col].mode().empty else ""
    #         X_df[col] = X_df[col].fillna(mode_val)

    # One-hot encode any non-numeric columns
    non_numeric_cols = [c for c in X_df.columns if not pd.api.types.is_numeric_dtype(X_df[c])]
    if non_numeric_cols:
        X_df = pd.get_dummies(X_df, columns=non_numeric_cols, dummy_na=False)

    # Target numeric + impute
    #y = pd.to_numeric(y_series, errors="coerce")
    #y = y.fillna(y.median())
    y = y_series.astype(np.float32).values.reshape(-1, 1)

    return X_df, y

# Load train/test
X_train_df, y_train = load_xy_from_csv(TRAIN_CSV_PATH, HAS_HEADER)
X_test_df,  y_test  = load_xy_from_csv(TEST_CSV_PATH,  HAS_HEADER)

# Align test columns to train columns if one-hot created different sets (fill missing with 0)
if list(X_train_df.columns) != list(X_test_df.columns):
    X_test_df = X_test_df.reindex(columns=X_train_df.columns, fill_value=0)

X_train = X_train_df.values.astype(np.float32)
X_test  = X_test_df.values.astype(np.float32)

# --------------------
# Target stats (for stable training but original-unit metrics)
# --------------------
y_mean = float(np.mean(y_train))
y_std  = float(np.std(y_train) + 1e-6)
print(f"[Target stats] mean={y_mean:.6f}, std={y_std:.6f}")


[Target stats] mean=2673319936.000000, std=8338769920.000000


In [6]:
# ============================================================
# 5. Datasets & Normalization
# ============================================================

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(X_train)

steps_per_epoch = int(math.ceil(len(X_train) / BATCH_SIZE))
total_epochs    = EPOCHS
warmup_epochs   = 0.005 * EPOCHS  # 0.5% warmup

lr_schedule = WarmupCosine(
    base_lr=BASE_LR,
    total_steps=steps_per_epoch * total_epochs,
    warmup_steps=int(steps_per_epoch * warmup_epochs),
    min_lr_ratio=MIN_LR_RATIO
)

def make_ds(X_np, y_np, training):
    ds = tf.data.Dataset.from_tensor_slices((X_np, y_np))
    if training:
        ds = ds.shuffle(min(len(X_np), 10000), seed=SEED, reshuffle_each_iteration=True)
    ds = ds.batch(BATCH_SIZE).map(lambda x, y: (normalizer(x), y)).prefetch(tf.data.AUTOTUNE)
    return ds

train_ds = make_ds(X_train, y_train, training=True)
test_ds  = make_ds(X_test,  y_test,  training=False)

In [8]:
# ============================================================
# 6. Model Architecture - simple MLP, residuals, no LayerNorm
# ============================================================

def build_mlp(input_dim, y_mean, y_std,
              widths=(256, 256, 128, 64, 32),
              drop=0.05,
              activation="gelu",
              weight_decay=1e-5):

    act = (lambda x: tf.keras.activations.gelu(x, approximate=True)) if activation == "gelu" else activation

    inputs = tf.keras.Input(shape=(input_dim,), name="features")
    x = inputs
    for i, w in enumerate(widths):
        x = tf.keras.layers.Dense(
            w,
            activation=act,
            kernel_regularizer=tf.keras.regularizers.l2(1e-5),
            name=f"dense{i+1}"
        )(x)
        if drop and drop > 0:
            x = tf.keras.layers.Dropout(drop, name=f"drop{i+1}")(x)

    z = tf.keras.layers.Dense(1, activation=None, name="z_scaled")(x)
    y_hat = tf.keras.layers.Lambda(lambda t: t * y_std + y_mean, name="y_hat")(z)

    model = tf.keras.Model(inputs, y_hat, name="mlp_simple_tuned")

    huber = tf.keras.losses.Huber(delta=DELTA)
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=lr_schedule,
        weight_decay=weight_decay,
        clipnorm=1.0
    )

    model.compile(
        optimizer=optimizer,
        loss=huber,
        metrics=[
            tf.keras.metrics.MeanAbsoluteError(name="mae"),
            tf.keras.metrics.RootMeanSquaredError(name="rmse"),
            SuccessRate(threshold=0.05, name="success_rate"),
        ],
    )
    return model


model = build_mlp(
    input_dim=X_train.shape[1],
    y_mean=y_mean, y_std=y_std,
    widths=(256, 256, 128, 64, 32),
    drop=0.05,
    activation="gelu"
)


In [9]:
# ============================================================
# 7. Callbacks
# ============================================================
os.makedirs(MODEL_DIR, exist_ok=True)
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(MODEL_DIR, "best_model.keras"),
        monitor="loss",
        mode="min",
        save_best_only=True
    ),
    LogLearningRate(),
]

In [10]:
# ============================================================
# 8. Training
# ============================================================
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=2
)

Epoch 1/500
 - learning_rate: 4.000000e-04
6/6 - 5s - 903ms/step - loss: 3164151296.0000 - mae: 3164151296.0000 - rmse: 8432394240.0000 - success_rate: 0.0000e+00
Epoch 2/500



  fn_config = serialization_lib.serialize_keras_object(activation)


 - learning_rate: 8.000000e-04
6/6 - 0s - 45ms/step - loss: 2001186176.0000 - mae: 2001186176.0000 - rmse: 6667919872.0000 - success_rate: 0.0000e+00
Epoch 3/500
 - learning_rate: 9.999976e-04
6/6 - 0s - 49ms/step - loss: 1624415488.0000 - mae: 1624415488.0000 - rmse: 4505021440.0000 - success_rate: 0.0000e+00
Epoch 4/500
 - learning_rate: 9.999778e-04
6/6 - 0s - 40ms/step - loss: 959558848.0000 - mae: 959558848.0000 - rmse: 2358845440.0000 - success_rate: 0.0000e+00
Epoch 5/500
 - learning_rate: 9.999383e-04
6/6 - 0s - 48ms/step - loss: 864643840.0000 - mae: 864643840.0000 - rmse: 2569158144.0000 - success_rate: 0.0000e+00
Epoch 6/500
 - learning_rate: 9.998791e-04
6/6 - 0s - 28ms/step - loss: 700395072.0000 - mae: 700395072.0000 - rmse: 2154988032.0000 - success_rate: 0.0000e+00
Epoch 7/500
 - learning_rate: 9.998003e-04
6/6 - 0s - 28ms/step - loss: 608420672.0000 - mae: 608420672.0000 - rmse: 1915117056.0000 - success_rate: 0.0000e+00
Epoch 8/500
 - learning_rate: 9.997016e-04
6/6 -

In [11]:
# ============================================================
# 9. Evaluation
# ============================================================
eval_results = model.evaluate(test_ds, return_dict=True, verbose=2)

print("\nTest metrics (Smooth L1 / Huber + SuccessRate):")
for k, v in eval_results.items():
    print(f"  {k}: {v:.6f}")

2/2 - 0s - 172ms/step - loss: 362688384.0000 - mae: 362688384.0000 - rmse: 1400402688.0000 - success_rate: 0.0000e+00

Test metrics (Smooth L1 / Huber + SuccessRate):
  loss: 362688384.000000
  mae: 362688384.000000
  rmse: 1400402688.000000
  success_rate: 0.000000


In [12]:
import numpy as np
import pandas as pd

# ============================================================
# 1. Collect Predictions from Test Set
# ============================================================
y_true_all, y_pred_all, X_all = [], [], []

for xb, yb in test_ds:
    preds = model.predict(xb, verbose=0)
    X_all.append(xb.numpy())
    y_true_all.append(yb.numpy().reshape(-1))
    y_pred_all.append(preds.reshape(-1))

# Concatenate results
X_all      = np.vstack(X_all)
y_true_all = np.concatenate(y_true_all, axis=0)
y_pred_all = np.concatenate(y_pred_all, axis=0)


# ============================================================
# 2. Compute Percentage Errors
# ============================================================
denom          = np.maximum(np.abs(y_true_all), 1e-12)   # avoid division by zero
pct_error      = (y_pred_all - y_true_all) / denom * 100.0
abs_pct_error  = np.abs(pct_error)


# ============================================================
# 3. Build Results DataFrame
# ============================================================
results_df = pd.DataFrame(
    np.hstack([
        X_all,
        y_true_all[:, None],
        y_pred_all[:, None],
        pct_error[:, None],
        abs_pct_error[:, None]
    ]),
    columns=[f"feature_{i}" for i in range(X_all.shape[1])]
           + ["y_true", "y_pred", "pct_error", "abs_pct_error"]
)


# ============================================================
# 4. Display Sample Predictions
# ============================================================
print("\n=== Sample Predictions (25 rows) ===")
print(results_df.head(25).to_string(index=False))
# Alternative (random sample):
# print(results_df.sample(25, random_state=42).to_string(index=False))


# ============================================================
# 5. Aggregate Statistics
# ============================================================
mape = abs_pct_error.mean()
print(f"\nMAPE over the whole test set: {mape:.4f}%")



=== Sample Predictions (25 rows) ===
 feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  feature_7  feature_8  feature_9  feature_10  feature_11  feature_12  feature_13  feature_14  feature_15  feature_16  feature_17  feature_18  feature_19  feature_20  feature_21  feature_22  feature_23  feature_24  feature_25  feature_26  feature_27  feature_28  feature_29  feature_30  feature_31  feature_32  feature_33  feature_34  feature_35  feature_36  feature_37  feature_38  feature_39  feature_40  feature_41  feature_42  feature_43  feature_44  feature_45  feature_46  feature_47  feature_48  feature_49  feature_50  feature_51  feature_52  feature_53  feature_54  feature_55  feature_56  feature_57  feature_58  feature_59  feature_60  feature_61  feature_62  feature_63  feature_64  feature_65  feature_66  feature_67  feature_68  feature_69  feature_70  feature_71  feature_72  feature_73  feature_74  feature_75  feature_76  feature_77  feature_78  feature_79  feature_80 

In [13]:
import numpy as np
import tensorflow as tf

# ---------- Collect predictions on the entire test set ----------
y_true_all, y_pred_all = [], []
for xb, yb in test_ds:
    preds = model.predict(xb, verbose=0)
    y_true_all.append(yb.numpy().reshape(-1))
    y_pred_all.append(preds.reshape(-1))

y_true_all = np.concatenate(y_true_all, axis=0)
y_pred_all = np.concatenate(y_pred_all, axis=0)

# ---------- Per-sample errors ----------
eps        = 1e-12
errors     = y_pred_all - y_true_all
abs_errors = np.abs(errors)
sq_errors  = errors**2

# Success rates
pct_error     = (y_pred_all - y_true_all) / np.maximum(np.abs(y_true_all), eps) * 100.0
abs_pct_error = np.abs(pct_error)
success_rate_10 = np.mean(abs_pct_error < 10.0) * 100.0
success_rate_20 = np.mean(abs_pct_error < 20.0) * 100.0
success_rate_40 = np.mean(abs_pct_error < 40.0) * 100.0

# ---------- Global aggregate metrics (whole test set) ----------
model_mae_mean   = np.mean(abs_errors)
model_mae_median = np.median(abs_errors)
model_mae_std    = np.std(abs_errors)

model_rmse_mean  = np.sqrt(np.mean(sq_errors))   # proper RMSE on full dataset
model_r2_mean    = 1.0 - np.sum(sq_errors) / np.sum((y_true_all - np.mean(y_true_all))**2)

# ---------- Batch-level metrics ----------
mae_per_batch, rmse_per_batch, r2_per_batch = [], [], []
for xb, yb in test_ds:
    preds = model.predict(xb, verbose=0).reshape(-1)
    yb_np = yb.numpy().reshape(-1)
    errors = preds - yb_np
    abs_errors_b = np.abs(errors)
    sq_errors_b  = errors**2

    mae_batch  = np.mean(abs_errors_b)
    rmse_batch = np.sqrt(np.mean(sq_errors_b))
    r2_batch   = 1 - np.sum(sq_errors_b) / np.sum((yb_np - np.mean(yb_np))**2 + eps)

    mae_per_batch.append(mae_batch)
    rmse_per_batch.append(rmse_batch)
    r2_per_batch.append(r2_batch)

model_mae_std_batch  = np.std(mae_per_batch)
model_rmse_std_batch = np.std(rmse_per_batch)
model_r2_std_batch   = np.std(r2_per_batch)

# ---------- Baseline (predict train mean) ----------
def compute_train_mean_fallback():
    if 'y_train' in globals() and y_train is not None:
        return float(np.mean(y_train))
    acc, n = 0.0, 0
    for _, yb in train_ds:
        yb_np = yb.numpy().reshape(-1)
        acc += float(np.sum(yb_np))
        n   += yb_np.size
    return acc / max(n, 1)

y_train_mean = compute_train_mean_fallback()
baseline_preds = np.full_like(y_true_all, y_train_mean)

baseline_abs_pct_error = np.abs((baseline_preds - y_true_all) /
                                np.maximum(np.abs(y_true_all), eps)) * 100.0
baseline_mae    = np.mean(np.abs(y_true_all - baseline_preds))
baseline_rmse   = np.sqrt(np.mean((y_true_all - baseline_preds) ** 2))
baseline_ss_res = np.sum((y_true_all - baseline_preds) ** 2)
baseline_r2     = 1.0 - baseline_ss_res / np.sum((y_true_all - np.mean(y_true_all))**2)

baseline_sr_10  = np.mean(baseline_abs_pct_error < 10.0) * 100.0
baseline_sr_20  = np.mean(baseline_abs_pct_error < 20.0) * 100.0
baseline_sr_40  = np.mean(baseline_abs_pct_error < 40.0) * 100.0

# ---------- Print Summary ----------
print("=== Full-Test Evaluation ===")
print(f"MAE mean   : {model_mae_mean:.6f}")
print(f"MAE median : {model_mae_median:.6f}")
print(f"MAE std    (per sample): {model_mae_std:.6f}")
print(f"MAE std    (per batch) : {model_mae_std_batch:.6f}")

print(f"RMSE mean    : {model_rmse_mean:.6f}")
print(f"RMSE std   (per batch) : {model_rmse_std_batch:.6f}")

print(f"R² mean       : {model_r2_mean:.6f}")
print(f"R² std     (per batch) : {model_r2_std_batch:.6f}")

print(f"SuccessRate@10%: {success_rate_10:.2f}%")
print(f"SuccessRate@20%: {success_rate_20:.2f}%")
print(f"SuccessRate@40%: {success_rate_40:.2f}%")

print("\n--- Baseline (predict train mean) ---")
print(f"Train mean      : {y_train_mean:.6f}")
print(f"Baseline MAE    : {baseline_mae:.6f}")
print(f"Baseline RMSE   : {baseline_rmse:.6f}")
print(f"Baseline R^2    : {baseline_r2:.6f}")
print(f"Baseline SR@10% : {baseline_sr_10:.2f}%")
print(f"Baseline SR@20% : {baseline_sr_20:.2f}%")
print(f"Baseline SR@40% : {baseline_sr_40:.2f}%")

print("\n--- Target scale (test set) ---")
print(f"y_true mean : {np.mean(y_true_all):.6f}")
print(f"y_true std  : {np.std(y_true_all):.6f}")
print(f"y_true min  : {np.min(y_true_all):.6f}")
print(f"y_true max  : {np.max(y_true_all):.6f}")


=== Full-Test Evaluation ===
MAE mean   : 362688384.000000
MAE median : 47272464.000000
MAE std    (per sample): 1352621440.000000
MAE std    (per batch) : 112655840.000000
RMSE mean    : 1400402688.000000
RMSE std   (per batch) : 521050784.000000
R² mean       : 0.972895
R² std     (per batch) : 0.008689
SuccessRate@10%: 56.73%
SuccessRate@20%: 78.36%
SuccessRate@40%: 89.47%

--- Baseline (predict train mean) ---
Train mean      : 2673319936.000000
Baseline MAE    : 3391386112.000000
Baseline RMSE   : 8507461120.000000
Baseline R^2    : -0.000329
Baseline SR@10% : 3.80%
Baseline SR@20% : 6.14%
Baseline SR@40% : 14.04%

--- Target scale (test set) ---
y_true mean : 2827531264.000000
y_true std  : 8506063360.000000
y_true min  : -72319000.000000
y_true max  : 97826996224.000000
