<a href="https://colab.research.google.com/github/vimesh630/Spice_Price_Predction_VERGER/blob/Cinnamon/Preprocessing_for_Cinnamon_Price_Forecasting_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Mount Google Drive and Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#2. Load Dataset from Google Drive

In [2]:
file_path = "/content/drive/MyDrive/VERGER/Cinnamon_Price_Prediction/Cinnamon_Dataset_New_0001.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1')

print(f"Original dataset shape: {df.shape}")

Original dataset shape: (2772, 16)


#3. Keep Only Active Grade x Region Pairs

In [3]:
df = df[df['Is_Active_Region'] == 1].copy()
print(f"After filtering active regions: {df.shape}")

After filtering active regions: (2244, 16)


#4. Fill Missing Regional Prices

In [5]:
df['Regional_Price'] = df.groupby(['Grade', 'Region'])['Regional_Price']\
    .transform(lambda x: x.interpolate(method='linear').fillna(df['National_Price']))

#5. Fill Missing Numeric Features

In [6]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove('Regional_Price')  # target handled separately

for col in numeric_cols:
    df[col] = df.groupby(['Grade', 'Region'])[col].apply(lambda x: x.ffill().bfill())

TypeError: incompatible index of inserted column with frame index

#6. Create Time-Based Features

In [None]:
df['year'] = df['Month'].dt.year
df['month'] = df['Month'].dt.month
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

#7. Create lag and rolling Features

In [None]:
for lag in [1, 2, 3, 6]:
    df[f'price_lag_{lag}'] = df.groupby(['Grade', 'Region'])['Regional_Price'].shift(lag)

for window in [3, 6]:
    df[f'price_roll_mean_{window}'] = df.groupby(['Grade', 'Region'])['Regional_Price']\
                                        .shift(1).rolling(window).mean()
    df[f'price_roll_std_{window}'] = df.groupby(['Grade', 'Region'])['Regional_Price']\
                                       .shift(1).rolling(window).std()

#8. Encode Categorical Variables

In [None]:
df['Grade'] = df['Grade'].astype('category')
df['Region'] = df['Region'].astype('category')

#9. Drop Rows with NaNs After Lagging

In [None]:
df = df.dropna().reset_index(drop=True)
print(f"Final dataset shape after dropping NaNs: {df.shape}")

#10. Save Preprocessed Dataset

In [None]:
output_path = "/content/drive/MyDrive/VERGER/Cinnamon_Price_Prediction/Cinnamon_Dataset_Preprocessed.csv"
df.to_csv(output_path, index=False)
print(f"Preprocessed dataset saved to: {output_path}")