<a href="https://colab.research.google.com/github/vimesh630/Spice_Price_Predction_VERGER/blob/Cinnamon/Preprocessing_for_Cinnamon_Price_Forecasting_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Mount Google Drive and Import Libraries

In [2]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# Colab-only: google.colab is not importable outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#2. Load Dataset from Google Drive

In [3]:
# Location of the raw Excel workbook on the mounted Drive.
file_path = "/content/drive/MyDrive/VERGER/Cinnamon_Price_Prediction/Cinnamon_Dataset_New_0001.xlsx"

# Read the 'Sheet1' worksheet into the working frame and report (rows, columns).
df = pd.read_excel(io=file_path, sheet_name="Sheet1")
print(f"Original dataset shape: {df.shape}")

Original dataset shape: (2772, 16)


#3. Keep Only Active Grade x Region Pairs

In [4]:
# Keep only the rows whose Is_Active_Region flag equals 1.
# .copy() gives an independent frame, so the column assignments in later cells
# do not operate on a slice of the originally loaded data.
is_active = df['Is_Active_Region'].eq(1)
df = df.loc[is_active].copy()
print(f"After filtering active regions: {df.shape}")

After filtering active regions: (2244, 16)


#4. Ensure Month Column is datetime

In [5]:
# Make sure 'Month' is a datetime column before any .dt accessor or date
# comparison is applied to it.
# pandas' own dtype predicate is used instead of np.issubdtype because it also
# understands pandas extension dtypes (tz-aware datetimes, 'string', 'category'),
# which NumPy cannot interpret and rejects with a TypeError.
if not pd.api.types.is_datetime64_any_dtype(df['Month']):
    # errors='coerce' turns unparseable values into NaT instead of raising.
    df['Month'] = pd.to_datetime(df['Month'], errors='coerce')

#5. Fill Missing Regional Prices

In [6]:
def _fill_price_gaps(prices):
    """Linearly interpolate gaps in one pair's prices, then fill what is left with that pair's mean."""
    return prices.interpolate(method='linear').fillna(prices.mean())


# Fill gaps separately for every (Grade, Region) pair.
# NOTE(review): interpolation works on row position, so this presumably relies on
# each pair's rows already being in chronological order — confirm.
df['Regional_Price'] = (
    df.groupby(['Grade', 'Region'])['Regional_Price'].transform(_fill_price_gaps)
)

# Anything still missing (e.g. a pair with no observed price at all, whose mean
# is NaN) falls back to the row's National_Price.
df['Regional_Price'] = df['Regional_Price'].fillna(df['National_Price'])

#6. Fill Missing Numeric Features

In [7]:
# Every numeric column except the target, Regional_Price.
numeric_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != 'Regional_Price'
]

# Within each (Grade, Region) pair, carry the last known value forward, then
# back-fill whatever is still missing at the start of the pair.
for col in numeric_cols:
    per_pair = df.groupby(['Grade', 'Region'])[col]
    df[col] = per_pair.transform(lambda values: values.ffill().bfill())

#6b. Create Date-Based Features

In [8]:
# Calendar parts of the observation month.
month_parts = df['Month'].dt
df['year'] = month_parts.year
df['month'] = month_parts.month

# Cyclical encoding: map the month number onto a circle with a period of 12,
# so that month 12 and month 1 end up next to each other.
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

#7. Create Lag and Rolling Features

In [9]:
# Lag and rolling-window features of Regional_Price, computed separately for
# every (Grade, Region) pair.
# NOTE(review): shift/rolling work on row position, so each pair's rows are
# assumed to already be in chronological order — confirm.
pair_keys = ['Grade', 'Region']

# === Lags ===
# price_lag_k is the pair's price k rows earlier (NaN for the pair's first k rows).
for lag in [1, 2, 3, 6, 12]:
    df[f'price_lag_{lag}'] = df.groupby(pair_keys)['Regional_Price'].shift(lag)

# === Rolling stats ===
# shift(1) keeps the current row's price out of its own window. The shift AND the
# rolling window both run inside transform(), i.e. on one pair at a time:
# groupby(...).shift(1) on its own returns a plain Series, so chaining .rolling()
# onto it would slide the window over the whole frame and could mix prices from
# different pairs.
for window in [3, 6, 12]:
    df[f'price_roll_mean_{window}'] = df.groupby(pair_keys)['Regional_Price']\
        .transform(lambda prices: prices.shift(1).rolling(window).mean())
    df[f'price_roll_std_{window}'] = df.groupby(pair_keys)['Regional_Price']\
        .transform(lambda prices: prices.shift(1).rolling(window).std())

#8. Log Transform of Price

In [10]:
# log(1 + price): log-scaled copy of Regional_Price.
df['log_price'] = df['Regional_Price'].pipe(np.log1p)

#9. Relative % Change

In [11]:
# Relative price change versus `lag` rows earlier within the same
# (Grade, Region) pair, as a fraction (0.10 == +10 %).
for lag in [1, 3, 6, 12]:
    prices_by_pair = df.groupby(['Grade', 'Region'])['Regional_Price']
    df[f'price_pct_change_{lag}'] = prices_by_pair.pct_change(periods=lag)

#10. Price Shock Indicator (large jump/drop)

In [12]:
# 1 when the one-step price change exceeds 15 % in either direction, else 0.
# Rows where price_pct_change_1 is NaN compare as False and therefore get 0.
one_step_move = df['price_pct_change_1'].abs()
df['price_shock_flag'] = one_step_move.gt(0.15).astype(int)

#11. Momentum (sum of the last n one-month % changes)

In [13]:
# Momentum: rolling sum of the last `window` one-step % changes, per (Grade, Region) pair.
for window in [3, 6]:
    pct1_by_pair = df.groupby(['Grade', 'Region'])['price_pct_change_1']
    rolled_sum = pct1_by_pair.rolling(window).sum()
    # groupby-rolling prepends the Grade and Region levels to the index; drop them
    # so the values align back onto df's original row index.
    df[f'momentum_{window}'] = rolled_sum.reset_index(level=[0, 1], drop=True)

#12. Crisis Period Dummy

In [14]:
# Crisis window: June 2021 through December 2022, both months included in full.
# The upper bound is written as "strictly before 2023-01-01" because a bound of
# '2022-12' is parsed as 2022-12-01 00:00 and would exclude any December 2022 row
# dated after the 1st. For month-start dates both forms give the same flag.
CRISIS_START = '2021-06-01'
CRISIS_END_EXCLUSIVE = '2023-01-01'

in_crisis = (df['Month'] >= CRISIS_START) & (df['Month'] < CRISIS_END_EXCLUSIVE)
df['crisis_flag'] = in_crisis.astype(int)  # NaT months compare as False -> 0

#13. Interaction Terms

In [15]:
# String keys that combine two categorical dimensions into one feature:
#   region_month -> "<Region>_<month number>"
#   grade_region -> "<Grade>_<Region>"
region_text = df['Region'].astype(str)
df['region_month'] = region_text.str.cat(df['month'].astype(str), sep="_")
df['grade_region'] = df['Grade'].astype(str).str.cat(region_text, sep="_")

#14. Categorical Encoding

In [16]:
# Convert the label columns to pandas' 'category' dtype.
for cat_col in ('Grade', 'Region', 'region_month', 'grade_region'):
    df[cat_col] = df[cat_col].astype('category')

#15. Final Clean

In [17]:
# Drop every row that still has a NaN in any column (e.g. the first rows of each
# pair, where the lag / rolling / % change features are undefined), then
# renumber the rows 0..n-1.
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)
print(f"Final dataset shape after dropping NaNs: {df.shape}")

Final dataset shape after dropping NaNs: (1836, 42)


# Save

In [18]:
# Destination of the preprocessed dataset on the mounted Drive.
output_path = "/content/drive/MyDrive/VERGER/Cinnamon_Price_Prediction/Cinnamon_Dataset_Preprocessed_CrisisAware.csv"

# Write the frame as CSV without the row index and confirm where it went.
df.to_csv(path_or_buf=output_path, index=False)
print(f"Preprocessed dataset saved to: {output_path}")

Preprocessed dataset saved to: /content/drive/MyDrive/VERGER/Cinnamon_Price_Prediction/Cinnamon_Dataset_Preprocessed_CrisisAware.csv
