<a href="https://colab.research.google.com/github/vimesh630/Revenue_Forecasting/blob/main/New_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Import Required LIbraries and Mount Google Drive

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from google.colab import drive
from datetime import datetime
import calendar

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#2. Load & Preprocess Data

In [21]:
input_file_path = "/content/drive/MyDrive/VERGER/Revenue_Forecasting/forecasting_data.csv"
df = pd.read_csv(input_file_path)

# Clean and ensure correct dtypes
df = df.dropna(subset=['Year', 'Month_No'])
df['Year'] = df['Year'].astype(int)
df['Month_No'] = df['Month_No'].astype(int)
df['Revenue'] = pd.to_numeric(df['Revenue'], errors='coerce').fillna(0)
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce').fillna(0)

# Create date column and sort
df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month_No'].astype(str) + '-01')
df = df.sort_values(['Account','Product','Type','Date']).reset_index(drop=True)

# Encode categorical features
label_encoders = {}
for col in ['Account','Product','Type']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

#3. Feature Engineering

In [22]:
def create_lags(group, lags=[1,2,3]):
    group = group.sort_values('Date')
    for lag in lags:
        group[f'Revenue_lag{lag}'] = group['Revenue'].shift(lag).fillna(0)
        group[f'Quantity_lag{lag}'] = group['Quantity'].shift(lag).fillna(0)
    return group

def add_rolling_features(group):
    group = group.sort_values('Date')
    group['Revenue_MA3'] = group['Revenue'].rolling(3).mean().fillna(0)
    group['Revenue_MA6'] = group['Revenue'].rolling(6).mean().fillna(0)
    group['Quantity_MA3'] = group['Quantity'].rolling(3).mean().fillna(0)
    group['Quantity_MA6'] = group['Quantity'].rolling(6).mean().fillna(0)
    return group

df = df.groupby(['Account','Product','Type']).apply(create_lags).reset_index(drop=True)
df = df.groupby(['Account','Product','Type']).apply(add_rolling_features).reset_index(drop=True)

# Group-level statistics
df['Account_mean_rev'] = df.groupby('Account')['Revenue'].transform('mean')
df['Product_mean_rev'] = df.groupby('Product')['Revenue'].transform('mean')
df['Type_mean_rev'] = df.groupby('Type')['Revenue'].transform('mean')
df['Account_mean_qty'] = df.groupby('Account')['Quantity'].transform('mean')
df['Product_mean_qty'] = df.groupby('Product')['Quantity'].transform('mean')
df['Type_mean_qty'] = df.groupby('Type')['Quantity'].transform('mean')

# Cyclical features
df['Month_Sin'] = np.sin(2*np.pi*df['Month_No']/12)
df['Month_Cos'] = np.cos(2*np.pi*df['Month_No']/12)
df['Year_Since'] = df['Year'] - df['Year'].min()
df['Quarter_No'] = df['Quarter'].str.replace("Q","").astype(int)

  df = df.groupby(['Account','Product','Type']).apply(create_lags).reset_index(drop=True)
  df = df.groupby(['Account','Product','Type']).apply(add_rolling_features).reset_index(drop=True)


#4. Train Models

In [23]:
feature_cols = [
    'Account','Product','Type',
    'Month_No','Month_Sin','Month_Cos','Year_Since','Quarter_No',
    'Revenue_lag1','Revenue_lag2','Revenue_lag3',
    'Quantity_lag1','Quantity_lag2','Quantity_lag3',
    'Revenue_MA3','Revenue_MA6','Quantity_MA3','Quantity_MA6',
    'Account_mean_rev','Product_mean_rev','Type_mean_rev',
    'Account_mean_qty','Product_mean_qty','Type_mean_qty'
]

X = df[feature_cols]
y_rev = df['Revenue']
y_qty = df['Quantity']

X_train, X_test, y_train_rev, y_test_rev, y_train_qty, y_test_qty = train_test_split(
    X, y_rev, y_qty, test_size=0.2, random_state=42
)

xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
    'learning_rate': 0.03,
    'max_depth': 8,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'min_child_weight': 5,
    'gamma': 0.2,
    'reg_lambda': 1.0,
    'reg_alpha': 0.5,
    'random_state': 42
}

model_rev = xgb.XGBRegressor(**xgb_params)
model_rev.fit(X_train, y_train_rev, eval_set=[(X_test, y_test_rev)], verbose=False)

model_qty = xgb.XGBRegressor(**xgb_params)
model_qty.fit(X_train, y_train_qty, eval_set=[(X_test, y_test_qty)], verbose=False)

#5. Model Evaluation

In [24]:
def evaluate_model(model, X_test, y_test, target_name):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{target_name} -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

evaluate_model(model_rev, X_test, y_test_rev, 'Revenue')
evaluate_model(model_qty, X_test, y_test_qty, 'Quantity')

Revenue -> RMSE: 25551.34, MAE: 12102.08, R²: 0.6930
Quantity -> RMSE: 343.21, MAE: 157.39, R²: 0.5987


#6. Forecasting Function

In [25]:
def forecast(account, product, typ, year, month):
    # Encode categorical values
    acc = label_encoders['Account'].transform([account])[0]
    prod = label_encoders['Product'].transform([product])[0]
    tp = label_encoders['Type'].transform([typ])[0]

    # Find last known row for this combo
    subset = df[(df['Account']==acc) & (df['Product']==prod) & (df['Type']==tp)]
    if subset.empty:
        return {'Product': product, 'Type': typ, 'Forecast_Quantity': 0, 'Forecast_Revenue': "0"}
    last_row = subset.iloc[-1]

    # Build feature row
    feature_row = {
        'Account': acc, 'Product': prod, 'Type': tp,
        'Month_No': month,
        'Month_Sin': np.sin(2*np.pi*month/12),
        'Month_Cos': np.cos(2*np.pi*month/12),
        'Year_Since': year - df['Year'].min(),
        'Quarter_No': ((month-1)//3)+1,
        'Revenue_lag1': last_row['Revenue'],
        'Revenue_lag2': last_row['Revenue_lag1'],
        'Revenue_lag3': last_row['Revenue_lag2'],
        'Quantity_lag1': last_row['Quantity'],
        'Quantity_lag2': last_row['Quantity_lag1'],
        'Quantity_lag3': last_row['Quantity_lag2'],
        'Revenue_MA3': last_row['Revenue_MA3'],
        'Revenue_MA6': last_row['Revenue_MA6'],
        'Quantity_MA3': last_row['Quantity_MA3'],
        'Quantity_MA6': last_row['Quantity_MA6'],
        'Account_mean_rev': last_row['Account_mean_rev'],
        'Product_mean_rev': last_row['Product_mean_rev'],
        'Type_mean_rev': last_row['Type_mean_rev'],
        'Account_mean_qty': last_row['Account_mean_qty'],
        'Product_mean_qty': last_row['Product_mean_qty'],
        'Type_mean_qty': last_row['Type_mean_qty']
    }

    X_new = pd.DataFrame([feature_row])[feature_cols]
    qty_pred = model_qty.predict(X_new)[0]
    rev_pred = model_rev.predict(X_new)[0]

    return {
        'Product': product,
        'Type': typ,
        'Forecast_Quantity': int(round(qty_pred)),
        'Forecast_Revenue': f"{int(round(rev_pred)):,}"
    }

#7. Forecast Orders Function

In [28]:
# ==========================
# Forecasting Function
# ==========================
def forecast(account, product, typ, year, month):
    # Encode categorical values
    acc = label_encoders['Account'].transform([account])[0]
    prod = label_encoders['Product'].transform([product])[0]
    tp = label_encoders['Type'].transform([typ])[0]

    # Find last known row for this combo
    subset = df[(df['Account']==acc) & (df['Product']==prod) & (df['Type']==tp)]
    if subset.empty:
        return {'Product': product, 'Type': typ, 'Forecast_Quantity': 0, 'Forecast_Revenue': "0"}
    last_row = subset.iloc[-1]

    # Build feature row
    feature_row = {
        'Account': acc, 'Product': prod, 'Type': tp,
        'Month_No': month,
        'Month_Sin': np.sin(2*np.pi*month/12),
        'Month_Cos': np.cos(2*np.pi*month/12),
        'Year_Since': year - df['Year'].min(),
        'Quarter_No': ((month-1)//3)+1,
        'Revenue_lag1': last_row['Revenue'],
        'Revenue_lag2': last_row['Revenue_lag1'],
        'Revenue_lag3': last_row['Revenue_lag2'],
        'Quantity_lag1': last_row['Quantity'],
        'Quantity_lag2': last_row['Quantity_lag1'],
        'Quantity_lag3': last_row['Quantity_lag2'],
        'Revenue_MA3': last_row['Revenue_MA3'],
        'Revenue_MA6': last_row['Revenue_MA6'],
        'Quantity_MA3': last_row['Quantity_MA3'],
        'Quantity_MA6': last_row['Quantity_MA6'],
        'Account_mean_rev': last_row['Account_mean_rev'],
        'Product_mean_rev': last_row['Product_mean_rev'],
        'Type_mean_rev': last_row['Type_mean_rev'],
        'Account_mean_qty': last_row['Account_mean_qty'],
        'Product_mean_qty': last_row['Product_mean_qty'],
        'Type_mean_qty': last_row['Type_mean_qty']
    }

    X_new = pd.DataFrame([feature_row])[feature_cols]

    # Predict and clip negative values to 0
    qty_pred = max(0, model_qty.predict(X_new)[0])
    rev_pred = max(0, model_rev.predict(X_new)[0])

    return {
        'Product': product,
        'Type': typ,
        'Forecast_Quantity': int(round(qty_pred)),
        'Forecast_Revenue': f"{int(round(rev_pred)):,}"
    }

# ==========================
# Forecast Orders Function
# ==========================
order_counts = df.groupby(['Account','Product','Type','Year','Month_No']).size().reset_index(name='order_count')

def forecast_orders(account_name, year, month):
    acc = label_encoders['Account'].transform([account_name])[0]
    products_types = df[df['Account']==acc][['Product','Type']].drop_duplicates()

    forecast_rows = []

    for _, row in products_types.iterrows():
        prod = label_encoders['Product'].inverse_transform([row['Product']])[0]
        typ = label_encoders['Type'].inverse_transform([row['Type']])[0]

        pred = forecast(account_name, prod, typ, year, month)

        oc = order_counts[
            (order_counts['Account']==acc) &
            (order_counts['Product']==row['Product']) &
            (order_counts['Type']==row['Type']) &
            (order_counts['Month_No']==month)
        ]['order_count']

        num_orders = int(oc.values[0]) if len(oc) > 0 else 1

        per_order_qty = int(round(pred['Forecast_Quantity'] / num_orders))
        per_order_rev = int(round(int(pred['Forecast_Revenue'].replace(',','')) / num_orders))

        for i in range(num_orders):
            forecast_rows.append({
                'Year': year,
                'Month': f"{month:02d}",
                'Account': account_name,
                'Product': pred['Product'],
                'Type': pred['Type'],
                'Forecast_Quantity': per_order_qty,
                'Forecast_Revenue': f"{per_order_rev:,}"
            })

    return pd.DataFrame(forecast_rows)

# ==========================
# Example Forecast
# ==========================
result_df = forecast_orders('RBUS', 2025, 9)
print(result_df)


    Year Month Account                        Product          Type  \
0   2025    09    RBUS               BLACK PEPPER OIL  Conventional   
1   2025    09    RBUS               BLACK PEPPER OIL       Organic   
2   2025    09    RBUS              CINNAMON BARK OIL  Conventional   
3   2025    09    RBUS              CINNAMON BARK OIL  Conventional   
4   2025    09    RBUS              CINNAMON BARK OIL       Organic   
5   2025    09    RBUS  CINNAMON BARK OIL (SRI LANKA)  Conventional   
6   2025    09    RBUS    CINNAMON BARK OIL SRI LANKA  Conventional   
7   2025    09    RBUS   CINNAMON BARK OIL(SRI LANKA)  Conventional   
8   2025    09    RBUS              CINNAMON LEAF OIL  Conventional   
9   2025    09    RBUS              CINNAMON LEAF OIL       Organic   
10  2025    09    RBUS                 CITRONELLA OIL  Conventional   
11  2025    09    RBUS                  CLOVE BUD OIL  Conventional   
12  2025    09    RBUS                  CLOVE BUD OIL       Organic   
13  20

In [29]:
from datetime import datetime

# ==========================
# Multi-Month Forecast Function
# ==========================
def forecast_range(account_name, target_year, target_month):
    # Get today's date
    today = datetime.today()
    current_year, current_month = today.year, today.month

    # Generate list of year-month pairs
    months = []
    y, m = current_year, current_month
    while (y < target_year) or (y == target_year and m <= target_month):
        months.append((y, m))
        # increment month
        if m == 12:
            m = 1
            y += 1
        else:
            m += 1

    # Forecast for each month
    all_forecasts = []
    for y, m in months:
        monthly_forecast = forecast_orders(account_name, y, m)
        all_forecasts.append(monthly_forecast)

    # Combine all months
    return pd.concat(all_forecasts, ignore_index=True)

# ==========================
# Example Usage
# ==========================
multi_month_forecast = forecast_range('RBUS', 2025, 12)
print(multi_month_forecast)


     Year Month Account                        Product          Type  \
0    2025    08    RBUS               BLACK PEPPER OIL  Conventional   
1    2025    08    RBUS               BLACK PEPPER OIL       Organic   
2    2025    08    RBUS              CINNAMON BARK OIL  Conventional   
3    2025    08    RBUS              CINNAMON BARK OIL       Organic   
4    2025    08    RBUS  CINNAMON BARK OIL (SRI LANKA)  Conventional   
..    ...   ...     ...                            ...           ...   
104  2025    12    RBUS     Clove Bud Sri Lanka EO Org       Organic   
105  2025    12    RBUS           GINGER ESSENTIAL OIL  Conventional   
106  2025    12    RBUS                     NUTMEG OIL  Conventional   
107  2025    12    RBUS                     NUTMEG OIL       Organic   
108  2025    12    RBUS           Nutmeg Sri Lanka EO   Conventional   

     Forecast_Quantity Forecast_Revenue  
0                  528           42,748  
1                  443           35,892  
2        