<a href="https://colab.research.google.com/github/vimesh630/Revenue_Forecasting/blob/main/XGBoost_for_Revenue_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Import Required Libraries & Mount Google Drive

In [29]:
!pip install optuna

import pandas as pd
import xgboost as xgb
import numpy as np
import os
import optuna
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive')

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#2. Define Paths

In [30]:
output_dir = "/content/drive/MyDrive/VERGER/Revenue_Forecasting"
train_path = os.path.join(output_dir, "train_feature_engineered_forecast_data.csv")
test_path = os.path.join(output_dir, "test_feature_engineered_forecast_data.csv")

#3. Load Data

In [31]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

#4. Handle Infinite Values and NaNs

In [32]:
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0)
test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(0)

#5. Prepare Features and Targets

In [33]:
target_col = "Forecast_Revenue"
drop_cols = ["Year", "Month", "Account", "Product", "Type", "Quarter", "date"]

X_train = train_df.drop(columns=drop_cols + [target_col], errors="ignore")
y_train = train_df[target_col]

X_test = test_df.drop(columns=drop_cols + [target_col], errors="ignore")
y_test = test_df[target_col]

#6. Apply Log Transformation to Target

In [34]:
y_train_log = np.log1p(y_train)  # log(1+y)
y_test_log = np.log1p(y_test)

#7. Define Optuna Objective Function

In [37]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "random_state": 42,
        "tree_method": "hist"
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train_log, eval_set=[(X_test, y_test_log)],
              verbose=False) # Removed early_stopping_rounds

    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse

#8. Run Optuna Study

In [44]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100) # Increase n_trials value to get better results
print("✅ Best params:", study.best_params)
print("✅ Best RMSE:", study.best_value)

[I 2025-08-18 05:57:46,450] A new study created in memory with name: no-name-341f09a1-b2fc-4720-aa92-0aea85edf826
[I 2025-08-18 05:57:50,051] Trial 0 finished with value: 12784.292976165185 and parameters: {'n_estimators': 877, 'learning_rate': 0.15980397676122146, 'max_depth': 6, 'subsample': 0.8937807201833878, 'colsample_bytree': 0.7733238122069429, 'gamma': 2.8171722989994885, 'reg_alpha': 3.873895022470373, 'reg_lambda': 0.5818132419964306}. Best is trial 0 with value: 12784.292976165185.
[I 2025-08-18 05:57:50,892] Trial 1 finished with value: 14128.600756911885 and parameters: {'n_estimators': 534, 'learning_rate': 0.0661533448245578, 'max_depth': 4, 'subsample': 0.9606182649189436, 'colsample_bytree': 0.9138157789446654, 'gamma': 4.706014177712648, 'reg_alpha': 0.7507466997788548, 'reg_lambda': 4.424463927745026}. Best is trial 0 with value: 12784.292976165185.
[I 2025-08-18 05:57:51,568] Trial 2 finished with value: 15304.25743991869 and parameters: {'n_estimators': 851, 'lear

✅ Best params: {'n_estimators': 945, 'learning_rate': 0.0683538021526396, 'max_depth': 4, 'subsample': 0.6600530714954989, 'colsample_bytree': 0.8488356029538121, 'gamma': 0.020573175587420042, 'reg_alpha': 0.13934000336080019, 'reg_lambda': 3.9130506640951532}
✅ Best RMSE: 5973.827445053502


#9. Train the Model with Best Parameters

In [45]:
best_params = study.best_params
model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train_log)

#10. Predictions

In [46]:
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)

#11. Model Evaluation

In [47]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("\n📊 Final Model Performance (After Tuning):")
print(f"RMSE: {rmse:,.2f}")
print(f"MAE: {mae:,.2f}")
print(f"MAPE: {mape:.2f}%")
print(f"R²: {r2:.4f}")


📊 Final Model Performance (After Tuning):
RMSE: 6,040.87
MAE: 1,444.55
MAPE: 13.03%
R²: 0.9587


#12. Segmented Performance

In [48]:
test_results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
test_results["Error"] = test_results["Actual"] - test_results["Predicted"]
test_results["APE"] = np.abs(test_results["Error"] / test_results["Actual"]) * 100

segments = {
    "<5K": test_results[test_results["Actual"] < 5000],
    "5K-10K": test_results[(test_results["Actual"] >= 5000) & (test_results["Actual"] < 10000)],
    ">10K": test_results[test_results["Actual"] >= 10000],
}

print("\n📊 Segmented Performance (Tuned):")
for seg, data in segments.items():
    if len(data) > 0:
        seg_mae = data["Error"].abs().mean()
        seg_mape = data["APE"].mean()
        print(f"{seg}: MAE={seg_mae:,.2f}, MAPE={seg_mape:.2f}% (n={len(data)})")
    else:
        print(f"{seg}: No data points")


📊 Segmented Performance (Tuned):
<5K: MAE=189.32, MAPE=19.44% (n=121)
5K-10K: MAE=570.85, MAPE=7.13% (n=48)
>10K: MAE=3,551.31, MAPE=7.66% (n=92)


#13. Optuna Visualization

In [43]:
import optuna.visualization as vis

# Optimization history (RMSE over trials)
fig1 = vis.plot_optimization_history(study)
fig1.show()

# Hyperparameter importance
fig2 = vis.plot_param_importances(study)
fig2.show()

# Parallel coordinate plot (hyperparameters vs RMSE)
fig3 = vis.plot_parallel_coordinate(study)
fig3.show()

# Contour plot (interaction between params)
fig4 = vis.plot_contour(study)
fig4.show()