<a href="https://colab.research.google.com/github/vimesh630/Revenue_Forecasting/blob/main/New_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Import Required Libraries and Mount Google Drive

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from google.colab import drive
from datetime import datetime
import calendar

# Mount Google Drive at /content/drive so files stored there can be read by path.
# This call is Colab-specific: `drive` comes from the google.colab package.
drive.mount('/content/drive')

Mounted at /content/drive


#2. Load & Preprocess Data

In [2]:
input_file_path = "/content/drive/MyDrive/VERGER/Revenue_Forecasting/forecasting_data.csv"
df = pd.read_csv(input_file_path)

# Rows without a year or month cannot be dated, so drop them before casting
# the period columns to integers.
df = df.dropna(subset=['Year', 'Month_No'])
df = df.astype({'Year': int, 'Month_No': int})

# Unparseable revenue/quantity entries become NaN and are then treated as 0.
for measure in ('Revenue', 'Quantity'):
    df[measure] = pd.to_numeric(df[measure], errors='coerce').fillna(0)

# Add a first-of-month timestamp built from "<Year>-<Month_No>-01", then order
# the rows by series (Account, Product, Type) and time.
df = (
    df.assign(
        Date=pd.to_datetime(df['Year'].astype(str) + '-' + df['Month_No'].astype(str) + '-01')
    )
    .sort_values(['Account','Product','Type','Date'])
    .reset_index(drop=True)
)

# Integer-encode the categorical columns. Encoders are fitted on the string
# form of each column and kept so labels can be encoded/decoded later.
label_encoders = {
    name: LabelEncoder().fit(df[name].astype(str))
    for name in ['Account','Product','Type']
}
for col, le in label_encoders.items():
    df[col] = le.transform(df[col].astype(str))

#3. Feature Engineering

In [3]:
def create_lags(group, lags=(1, 2, 3)):
    """Add lagged copies of 'Revenue' and 'Quantity' to one block of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain 'Date', 'Revenue' and 'Quantity'.
        The input frame is not modified.
    lags : iterable of int, default (1, 2, 3)
        Offsets to lag by. An offset counts *rows* after sorting by 'Date',
        not calendar months, so gaps in the dates are not accounted for.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by 'Date' with 'Revenue_lag{k}' and
        'Quantity_lag{k}' columns added for every k in ``lags``. Positions
        with no earlier row (and any other NaN) are filled with 0.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    group = group.sort_values('Date')
    for lag in lags:
        for source in ('Revenue', 'Quantity'):
            group[f'{source}_lag{lag}'] = group[source].shift(lag).fillna(0)
    return group

def add_rolling_features(group, windows=(3, 6), exclude_current=False):
    """Add trailing moving averages of 'Revenue' and 'Quantity'.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain 'Date', 'Revenue' and 'Quantity'.
        The input frame is not modified.
    windows : iterable of int, default (3, 6)
        Window lengths, counted in rows after sorting by 'Date'.
    exclude_current : bool, default False
        With the default, each window ends at (and includes) the row itself,
        so a row's average contains that row's own Revenue/Quantity. When
        those columns are also the prediction target this leaks the target
        into the feature. Pass True to average only the rows strictly before
        the current one.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by 'Date' with '{column}_MA{window}'
        columns added. Rows that do not yet have a full window are set to 0.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    group = group.sort_values('Date')
    for source in ('Revenue', 'Quantity'):
        # Shifting by one row first makes every window end at the previous row.
        values = group[source].shift(1) if exclude_current else group[source]
        for window in windows:
            group[f'{source}_MA{window}'] = values.rolling(window).mean().fillna(0)
    return group

# Build lag and rolling features one (Account, Product, Type) series at a time.
# The groups are iterated explicitly instead of going through GroupBy.apply:
# pandas 2.2 deprecates apply() operating on the grouping columns, and the
# helpers must hand back frames that still contain Account/Product/Type.
# Iteration yields complete sub-frames in sorted key order, so the concatenated
# result matches what apply(...).reset_index(drop=True) produced.
group_keys = ['Account','Product','Type']
df = pd.concat(
    [add_rolling_features(create_lags(group)) for _, group in df.groupby(group_keys)],
    ignore_index=True,
)

# Group-level statistics: mean Revenue / Quantity per Account, per Product and
# per Type, broadcast back onto every row.
# NOTE(review): these means are computed over the whole frame, i.e. before any
# train/test split - confirm that is acceptable for evaluation.
for target, suffix in (('Revenue', 'rev'), ('Quantity', 'qty')):
    for key in group_keys:
        df[f'{key}_mean_{suffix}'] = df.groupby(key)[target].transform('mean')

# Cyclical features: place the month on a circle so December and January are neighbours.
month_angle = 2*np.pi*df['Month_No']/12
df['Month_Sin'] = np.sin(month_angle)
df['Month_Cos'] = np.cos(month_angle)
df['Year_Since'] = df['Year'] - df['Year'].min()
# 'Quarter' is expected to hold labels such as "Q1"; strip the letter and keep the number.
df['Quarter_No'] = df['Quarter'].str.replace("Q","").astype(int)

  df = df.groupby(['Account','Product','Type']).apply(create_lags).reset_index(drop=True)
  df = df.groupby(['Account','Product','Type']).apply(add_rolling_features).reset_index(drop=True)


#4. Feature and Target Columns

In [4]:
# Model inputs, listed in the column order the models are trained on.
feature_cols = (
    # encoded identifiers
    ['Account', 'Product', 'Type']
    # calendar features
    + ['Month_No', 'Month_Sin', 'Month_Cos', 'Year_Since', 'Quarter_No']
    # lagged targets
    + ['Revenue_lag1', 'Revenue_lag2', 'Revenue_lag3']
    + ['Quantity_lag1', 'Quantity_lag2', 'Quantity_lag3']
    # moving averages
    + ['Revenue_MA3', 'Revenue_MA6', 'Quantity_MA3', 'Quantity_MA6']
    # group-level means
    + ['Account_mean_rev', 'Product_mean_rev', 'Type_mean_rev']
    + ['Account_mean_qty', 'Product_mean_qty', 'Type_mean_qty']
)

# One shared feature matrix and one target series per model.
X = df.loc[:, feature_cols]
y_rev, y_qty = df['Revenue'], df['Quantity']

#5. Train-Test Split

In [5]:
# Hold out 20% of the rows. Splitting X and both targets in a single call keeps
# the three pieces row-aligned; random_state=42 makes the partition repeatable.
# NOTE(review): the rows are split at random, not by date - confirm this is
# intended given that the features include lags of the targets.
split_parts = train_test_split(X, y_rev, y_qty, test_size=0.2, random_state=42)
X_train, X_test, y_train_rev, y_test_rev, y_train_qty, y_test_qty = split_parts

#6. Hyperparameter Grid

In [6]:
# Candidate values sampled by the randomized hyper-parameter search.
param_grid = dict(
    # ensemble size and step size
    n_estimators=[500, 800, 1000, 1200],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    # tree shape
    max_depth=[4, 6, 8, 10],
    # row / column subsampling
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
    # split constraints
    min_child_weight=[1, 3, 5, 7],
    gamma=[0, 0.1, 0.2, 0.3],
    # L1 / L2 regularisation
    reg_alpha=[0, 0.5, 1],
    reg_lambda=[0.5, 1, 1.5],
)

#7. Randomized Search Setup

In [7]:
def tune_xgb_model(X_train, y_train):
    """Tune an XGBoost regressor with a randomized hyper-parameter search.

    Draws 50 parameter combinations from the module-level ``param_grid``,
    scores each with 3-fold cross-validation on negative mean squared error,
    prints the best combination and returns the search's best estimator.

    Parameters
    ----------
    X_train : feature matrix used for the search.
    y_train : target values aligned with ``X_train``.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``RandomizedSearchCV``.
    """
    search_settings = {
        'estimator': xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
        'param_distributions': param_grid,
        'n_iter': 50,  # number of parameter combinations to try
        'scoring': 'neg_mean_squared_error',
        'cv': 3,
        'verbose': 2,
        'random_state': 42,
        'n_jobs': -1,
    }
    searcher = RandomizedSearchCV(**search_settings)
    searcher.fit(X_train, y_train)
    print("✅ Best parameters found:", searcher.best_params_)
    return searcher.best_estimator_

#8. Train Revenue Model

In [8]:
# tune_xgb_model returns RandomizedSearchCV.best_estimator_, which scikit-learn
# has already refit on the full training set with the winning parameters
# (refit=True is the default). A second .fit() would only repeat that training,
# and passing the test split as eval_set has no effect without early stopping,
# so the tuned estimator is used as returned.
model_rev = tune_xgb_model(X_train, y_train_rev)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
✅ Best parameters found: {'subsample': 0.7, 'reg_lambda': 1.5, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 7, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 1.0}


#9. Train Quantity Model

In [9]:
# tune_xgb_model returns RandomizedSearchCV.best_estimator_, which scikit-learn
# has already refit on the full training set with the winning parameters
# (refit=True is the default). A second .fit() would only repeat that training,
# and passing the test split as eval_set has no effect without early stopping,
# so the tuned estimator is used as returned.
model_qty = tune_xgb_model(X_train, y_train_qty)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
✅ Best parameters found: {'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 7, 'max_depth': 8, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 1.0}


#10. Model Evaluation

In [10]:
def evaluate_model(model, X_test, y_test, target_name):
    """Print RMSE, MAE and R² of ``model`` on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true target values aligned with ``X_test``.
    target_name : label placed at the start of the printed line.

    Returns
    -------
    None. The metrics are printed, not returned.
    """
    predictions = model.predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(squared_error)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"{target_name} -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

# Report held-out metrics for both targets, revenue first.
for fitted_model, y_true, label in (
    (model_rev, y_test_rev, 'Revenue'),
    (model_qty, y_test_qty, 'Quantity'),
):
    evaluate_model(fitted_model, X_test, y_true, label)

Revenue -> RMSE: 25865.93, MAE: 12222.53, R²: 0.6854
Quantity -> RMSE: 251.00, MAE: 140.63, R²: 0.7854


#11. Forecasting Function

In [11]:
def forecast(account, product, typ, year, month):
    """Forecast quantity and revenue for one account/product/type and month.

    The feature row is assembled from the requested ``year``/``month`` and
    from the most recent historical row of the combination in the module-level
    ``df``: that row's Revenue/Quantity become lag 1, its lag-1/lag-2 values
    become lag 2/lag 3, and its moving averages and group means are reused
    unchanged. The same historical row is used no matter how far ahead the
    requested month lies.

    Parameters
    ----------
    account, product, typ : str
        Original (un-encoded) labels.
    year, month : int
        Period to forecast.

    Returns
    -------
    dict
        Keys 'Product', 'Type', 'Forecast_Quantity' (int, at least 1) and
        'Forecast_Revenue' (str with thousands separators, at least "1").
        When a label was never seen by the encoders, or the combination has no
        rows in ``df``, quantity 1 and revenue "1" are returned.
    """
    # Returned whenever there is no usable history for the request.
    fallback = {'Product': product, 'Type': typ, 'Forecast_Quantity': 1, 'Forecast_Revenue': "1"}

    # Encode categorical values. LabelEncoder.transform raises ValueError for a
    # label it was not fitted on; such a label has no history, so it gets the
    # same fallback as an unseen combination instead of crashing the caller.
    # The encoders were fitted on string values, hence the str() coercion.
    try:
        acc = label_encoders['Account'].transform([str(account)])[0]
        prod = label_encoders['Product'].transform([str(product)])[0]
        tp = label_encoders['Type'].transform([str(typ)])[0]
    except ValueError:
        return fallback

    # Find last known row for this combo
    subset = df[(df['Account']==acc) & (df['Product']==prod) & (df['Type']==tp)]
    if subset.empty:
        return fallback

    # Choose the latest row by Date rather than by position. The stable sort
    # keeps the existing order among rows that share a date.
    last_row = subset.sort_values('Date', kind='stable').iloc[-1]

    # Build feature row
    feature_row = {
        'Account': acc, 'Product': prod, 'Type': tp,
        'Month_No': month,
        'Month_Sin': np.sin(2*np.pi*month/12),
        'Month_Cos': np.cos(2*np.pi*month/12),
        'Year_Since': year - df['Year'].min(),
        'Quarter_No': ((month-1)//3)+1,
        # Lags are shifted forward by one step relative to the last known row.
        'Revenue_lag1': last_row['Revenue'],
        'Revenue_lag2': last_row['Revenue_lag1'],
        'Revenue_lag3': last_row['Revenue_lag2'],
        'Quantity_lag1': last_row['Quantity'],
        'Quantity_lag2': last_row['Quantity_lag1'],
        'Quantity_lag3': last_row['Quantity_lag2'],
        'Revenue_MA3': last_row['Revenue_MA3'],
        'Revenue_MA6': last_row['Revenue_MA6'],
        'Quantity_MA3': last_row['Quantity_MA3'],
        'Quantity_MA6': last_row['Quantity_MA6'],
        'Account_mean_rev': last_row['Account_mean_rev'],
        'Product_mean_rev': last_row['Product_mean_rev'],
        'Type_mean_rev': last_row['Type_mean_rev'],
        'Account_mean_qty': last_row['Account_mean_qty'],
        'Product_mean_qty': last_row['Product_mean_qty'],
        'Type_mean_qty': last_row['Type_mean_qty']
    }

    # Re-index by feature_cols so the column order matches the training matrix.
    X_new = pd.DataFrame([feature_row])[feature_cols]
    qty_pred = model_qty.predict(X_new)[0]
    rev_pred = model_rev.predict(X_new)[0]

    # Clip predictions to minimum 1
    qty_pred = max(1, int(round(qty_pred)))
    rev_pred = max(1, int(round(rev_pred)))

    return {
        'Product': product,
        'Type': typ,
        'Forecast_Quantity': qty_pred,
        'Forecast_Revenue': f"{rev_pred:,}"
    }

# Number of rows in df for every (Account, Product, Type) combination,
# stored as a flat table with an 'order_count' column.
combo_sizes = df.groupby(['Account','Product','Type']).size()
order_counts = combo_sizes.reset_index(name='order_count')

#12. Forecast Orders Function

In [12]:
def forecast_orders(account_name, start_year, start_month, end_year, end_month):
    """Forecast per-order quantity and revenue for every product of an account.

    For each month from (start_year, start_month) to (end_year, end_month),
    both inclusive, and each (Product, Type) the account has rows for in the
    module-level ``df``, the monthly forecast from ``forecast`` is divided
    evenly by the combination's ``order_counts`` value and emitted as that
    many identical rows.

    Parameters
    ----------
    account_name : str
        Original (un-encoded) account label.
    start_year, start_month, end_year, end_month : int
        Bounds of the forecast window.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'Month' (zero-padded str), 'Account', 'Product',
        'Type', 'Forecast_Quantity' (int) and 'Forecast_Revenue' (str with
        thousands separators). The frame has these columns even when no rows
        are produced, e.g. when the end month precedes the start month.

    Raises
    ------
    ValueError
        If ``account_name`` is not known to the account label encoder.
    """
    output_columns = [
        'Year', 'Month', 'Account', 'Product', 'Type',
        'Forecast_Quantity', 'Forecast_Revenue',
    ]

    acc = label_encoders['Account'].transform([account_name])[0]
    products_types = df[df['Account']==acc][['Product','Type']].drop_duplicates()

    # Generate month range
    month_range = pd.date_range(
        start=datetime(start_year, start_month, 1),
        end=datetime(end_year, end_month, 1),
        freq='MS'
    )

    # Decoded labels and order counts do not depend on the month, so resolve
    # them once per combination instead of once per (month, combination).
    combos = []
    for _, row in products_types.iterrows():
        prod = label_encoders['Product'].inverse_transform([row['Product']])[0]
        typ = label_encoders['Type'].inverse_transform([row['Type']])[0]

        # Historical order count for this product
        oc = order_counts[
            (order_counts['Account']==acc) &
            (order_counts['Product']==row['Product']) &
            (order_counts['Type']==row['Type'])
        ]['order_count']
        num_orders = int(oc.values[0]) if len(oc) > 0 else 1

        combos.append((prod, typ, num_orders))

    forecast_rows = []
    for single_date in month_range:
        year, month = single_date.year, single_date.month

        for prod, typ, num_orders in combos:
            pred = forecast(account_name, prod, typ, year, month)

            # forecast() returns revenue as a formatted string; parse it back.
            total_rev = int(pred['Forecast_Revenue'].replace(',',''))

            # Split forecast evenly, but ensure min 1
            per_order_qty = max(1, int(round(pred['Forecast_Quantity'] / num_orders)))
            per_order_rev = max(1, int(round(total_rev / num_orders)))

            forecast_rows.extend(
                {
                    'Year': year,
                    'Month': f"{month:02d}",
                    'Account': account_name,
                    'Product': pred['Product'],
                    'Type': pred['Type'],
                    'Forecast_Quantity': per_order_qty,
                    'Forecast_Revenue': f"{per_order_rev:,}"
                }
                for _ in range(num_orders)
            )

    return pd.DataFrame(forecast_rows, columns=output_columns)

# Choose the account to forecast and the inclusive window (start/end year and month).
forecast_request = {
    'account_name': 'RBUS',
    'start_year': 2025,
    'start_month': 8,
    'end_year': 2025,
    'end_month': 12,
}
result_df = forecast_orders(**forecast_request)
print(result_df)


     Year Month Account               Product          Type  \
0    2025    08    RBUS      BLACK PEPPER OIL  Conventional   
1    2025    08    RBUS      BLACK PEPPER OIL       Organic   
2    2025    08    RBUS     CINNAMON BARK OIL  Conventional   
3    2025    08    RBUS     CINNAMON BARK OIL  Conventional   
4    2025    08    RBUS     CINNAMON BARK OIL  Conventional   
..    ...   ...     ...                   ...           ...   
240  2025    12    RBUS  GINGER ESSENTIAL OIL  Conventional   
241  2025    12    RBUS            NUTMEG OIL  Conventional   
242  2025    12    RBUS            NUTMEG OIL       Organic   
243  2025    12    RBUS  Nutmeg Sri Lanka EO   Conventional   
244  2025    12    RBUS  Nutmeg Sri Lanka EO   Conventional   

     Forecast_Quantity Forecast_Revenue  
0                  414           64,195  
1                  384           60,633  
2                   45           10,075  
3                   45           10,075  
4                   45           