<a href="https://colab.research.google.com/github/vimesh630/Revenue_Forecasting/blob/main/New_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Import Required LIbraries and Mount Google Drive

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from google.colab import drive
from datetime import datetime
import calendar

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


#2. Load & Preprocess Data

In [2]:
input_file_path = "/content/drive/MyDrive/VERGER/Revenue_Forecasting/forecasting_data.csv"
df = pd.read_csv(input_file_path)

df = df.dropna(subset=['Year', 'Month_No'])
df['Year'] = df['Year'].astype(int)
df['Month_No'] = df['Month_No'].astype(int)
df['Revenue'] = pd.to_numeric(df['Revenue'], errors='coerce').fillna(0)
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce').fillna(0)

df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month_No'].astype(str) + '-01')
df = df.sort_values(['Account','Product','Type','Date']).reset_index(drop=True)

# Encode categorical features
label_encoders = {}
for col in ['Account','Product','Type']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

#3. Feature Engineering

In [3]:
def create_lags(group, lags=[1,2,3]):
    group = group.sort_values('Date')
    for lag in lags:
        group[f'Revenue_lag{lag}'] = group['Revenue'].shift(lag).fillna(0)
        group[f'Quantity_lag{lag}'] = group['Quantity'].shift(lag).fillna(0)
    return group

def add_rolling_features(group):
    group = group.sort_values('Date')
    group['Revenue_MA3'] = group['Revenue'].rolling(3).mean().fillna(0)
    group['Revenue_MA6'] = group['Revenue'].rolling(6).mean().fillna(0)
    group['Quantity_MA3'] = group['Quantity'].rolling(3).mean().fillna(0)
    group['Quantity_MA6'] = group['Quantity'].rolling(6).mean().fillna(0)
    return group

df = df.groupby(['Account','Product','Type']).apply(create_lags).reset_index(drop=True)
df = df.groupby(['Account','Product','Type']).apply(add_rolling_features).reset_index(drop=True)

# Group-level statistics
df['Account_mean_rev'] = df.groupby('Account')['Revenue'].transform('mean')
df['Product_mean_rev'] = df.groupby('Product')['Revenue'].transform('mean')
df['Type_mean_rev'] = df.groupby('Type')['Revenue'].transform('mean')
df['Account_mean_qty'] = df.groupby('Account')['Quantity'].transform('mean')
df['Product_mean_qty'] = df.groupby('Product')['Quantity'].transform('mean')
df['Type_mean_qty'] = df.groupby('Type')['Quantity'].transform('mean')

# Cyclical features
df['Month_Sin'] = np.sin(2*np.pi*df['Month_No']/12)
df['Month_Cos'] = np.cos(2*np.pi*df['Month_No']/12)
df['Year_Since'] = df['Year'] - df['Year'].min()
df['Quarter_No'] = df['Quarter'].str.replace("Q","").astype(int)

# Apply log transformation to Revenue and Quantity for better model stability
df['Revenue_log'] = np.log1p(df['Revenue'])
df['Quantity_log'] = np.log1p(df['Quantity'])

  df = df.groupby(['Account','Product','Type']).apply(create_lags).reset_index(drop=True)
  df = df.groupby(['Account','Product','Type']).apply(add_rolling_features).reset_index(drop=True)


#4. Features and Targets Columns

In [4]:
# Feature Columns
feature_cols = [
    'Account','Product','Type',
    'Month_No','Month_Sin','Month_Cos','Year_Since','Quarter_No',
    'Revenue_lag1','Revenue_lag2','Revenue_lag3',
    'Quantity_lag1','Quantity_lag2','Quantity_lag3',
    'Revenue_MA3','Revenue_MA6','Quantity_MA3','Quantity_MA6',
    'Account_mean_rev','Product_mean_rev','Type_mean_rev',
    'Account_mean_qty','Product_mean_qty','Type_mean_qty'
]

# Target Variables
X = df[feature_cols]
y_rev = df['Revenue_log']
y_qty = df['Quantity_log']

#5.Train/Test Split

In [5]:
X_train, X_test, y_train_rev, y_test_rev, y_train_qty, y_test_qty = train_test_split(
    X, y_rev, y_qty, test_size=0.2, random_state=42
)

#6. Hyperparameter Grid

In [6]:
param_grid = {
    'n_estimators': [500, 800, 1000, 1200],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [4, 6, 8, 10],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [0.5, 1, 1.5]
}

#7. Randomized Search Setup

In [7]:
def tune_xgb_model(X_train, y_train):
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_grid,
        n_iter=50,
        scoring='neg_mean_squared_error',
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    print("✅ Best parameters found:", random_search.best_params_)
    return random_search.best_estimator_

#8. Train Revenue Model

In [8]:
model_rev = tune_xgb_model(X_train, y_train_rev)
model_rev.fit(X_train, y_train_rev, eval_set=[(X_test, y_test_rev)], verbose=False)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
✅ Best parameters found: {'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alpha': 0.5, 'n_estimators': 800, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}


#9. Train Quantity Model

In [9]:
model_qty = tune_xgb_model(X_train, y_train_qty)
model_qty.fit(X_train, y_train_qty, eval_set=[(X_test, y_test_qty)], verbose=False)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
✅ Best parameters found: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.5, 'n_estimators': 1200, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.3, 'colsample_bytree': 0.8}


#10. Model Evaluation

In [10]:
def evaluate_model(model, X_test, y_test, target_name):
    y_pred = model.predict(X_test)
    y_pred_inv = np.expm1(y_pred)
    y_test_inv = np.expm1(y_test)
    rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
    mae = mean_absolute_error(y_test_inv, y_pred_inv)
    r2 = r2_score(y_test_inv, y_pred_inv)
    print(f"{target_name} -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

evaluate_model(model_rev, X_test, y_test_rev, 'Revenue')
evaluate_model(model_qty, X_test, y_test_qty, 'Quantity')

Revenue -> RMSE: 27850.05, MAE: 10213.78, R²: 0.6353
Quantity -> RMSE: 284.96, MAE: 132.93, R²: 0.7234


#11. Forecasting Function

In [14]:
def forecast_all_accounts(end_year, end_month):
    forecast_rows = []

    end_date = datetime(end_year, end_month, 1)

    # Group by account-product-type combos
    for (acc, prod, typ), group in df.groupby(['Account','Product','Type']):
        # Find the last available month for this combo
        last_row = group.iloc[-1]
        start_date = datetime(last_row['Year'], int(last_row['Month_No']), 1)

        # Generate future months just for this combo
        month_range = pd.date_range(
            start=start_date + pd.DateOffset(months=1),
            end=end_date,
            freq='MS'
        )

        # Initialize state from last known row
        state = last_row.copy()

        for single_date in month_range:
            year = single_date.year
            month = single_date.month

            # Build feature row
            feature_row = {col: state[col] for col in feature_cols}
            feature_row['Month_No'] = month
            feature_row['Month_Sin'] = np.sin(2*np.pi*month/12)
            feature_row['Month_Cos'] = np.cos(2*np.pi*month/12)
            feature_row['Year_Since'] = year - df['Year'].min()
            feature_row['Quarter_No'] = ((month-1)//3)+1

            X_new = pd.DataFrame([feature_row])[feature_cols]

            # Predict
            qty_pred = max(1, int(round(np.expm1(model_qty.predict(X_new)[0]))))
            rev_pred = max(1, int(round(np.expm1(model_rev.predict(X_new)[0]))))

            forecast_rows.append({
                'Year': year,
                'Month': f"{month:02d}",
                'Account': label_encoders['Account'].inverse_transform([acc])[0],
                'Product': label_encoders['Product'].inverse_transform([prod])[0],
                'Type': label_encoders['Type'].inverse_transform([typ])[0],
                'Forecast_Quantity': qty_pred,
                'Forecast_Revenue': f"{rev_pred:,}"
            })

            # Update state for next iteration
            state['Revenue'] = np.log1p(rev_pred)
            state['Quantity'] = np.log1p(qty_pred)
            state['Revenue_lag1'] = state['Revenue']
            state['Quantity_lag1'] = state['Quantity']

    return pd.DataFrame(forecast_rows)


#13. Run the Forecasting

In [15]:
forecast_df = forecast_all_accounts(end_year=2027, end_month=3)
print(forecast_df)

       Year Month Account                 Product          Type  \
0      2024    10    AEIN  Pepper Black Oleoresin  Conventional   
1      2024    11    AEIN  Pepper Black Oleoresin  Conventional   
2      2024    12    AEIN  Pepper Black Oleoresin  Conventional   
3      2025    01    AEIN  Pepper Black Oleoresin  Conventional   
4      2025    02    AEIN  Pepper Black Oleoresin  Conventional   
...     ...   ...     ...                     ...           ...   
18894  2026    11    ZLAT  Pepper Black Oleoresin  Conventional   
18895  2026    12    ZLAT  Pepper Black Oleoresin  Conventional   
18896  2027    01    ZLAT  Pepper Black Oleoresin  Conventional   
18897  2027    02    ZLAT  Pepper Black Oleoresin  Conventional   
18898  2027    03    ZLAT  Pepper Black Oleoresin  Conventional   

       Forecast_Quantity Forecast_Revenue  
0                   2771          131,637  
1                   1808           65,576  
2                   1873           65,909  
3                  

#14. Saving the Output

In [13]:
"""output_path_excel = "/content/drive/MyDrive/VERGER/Revenue_Forecasting/RBUS_Results4.xlsx"
forecast_df.to_excel(output_path_excel, index=False)
print(f"✅ Forecast saved as Excel at {output_path_excel}")"""

'output_path_excel = "/content/drive/MyDrive/VERGER/Revenue_Forecasting/RBUS_Results4.xlsx"\nforecast_df.to_excel(output_path_excel, index=False)\nprint(f"✅ Forecast saved as Excel at {output_path_excel}")'