<a href="https://colab.research.google.com/github/vidyacheekuri/LogiCast-M5-Forecasting/blob/main/2_featureengineering_and_baselinemodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

# Load from Drive
df = pd.read_csv('/content/drive/MyDrive/M5Forecasting/merged_5k_sample.csv')
print("Merged data loaded:", df.shape)
df['date'] = pd.to_datetime(df['date'])
display(df.head())

  df = pd.read_csv('/content/drive/MyDrive/M5Forecasting/merged_5k_sample.csv')


Merged data loaded: (9565000, 22)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,FOODS_3_180_CA_1_validation,FOODS_3_180,FOODS_3,FOODS,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
1,HOUSEHOLD_2_383_CA_3_validation,HOUSEHOLD_2_383,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,d_1,2,2011-01-29,11101,...,1,2011,,,,,0,0,0,3.97
2,FOODS_3_409_CA_3_validation,FOODS_3_409,FOODS_3,FOODS,CA_3,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
3,FOODS_1_097_CA_2_validation,FOODS_1_097,FOODS_1,FOODS,CA_2,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
4,HOBBIES_1_272_TX_2_validation,HOBBIES_1_272,HOBBIES_1,HOBBIES,TX_2,TX,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,


In [2]:
def downcast_dtypes(df):
    # Select columns that are not objects
    float_cols = [c for c in df.columns if df[c].dtype == 'float64']
    int_cols = [c for c in df.columns if df[c].dtype in ['int64', 'int32']]

    # Downcast float columns
    for col in float_cols:
        df[col] = pd.to_numeric(df[col], downcast='float')

    # Downcast int columns
    for col in int_cols:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    return df

df = downcast_dtypes(df)
print("Data types downcasted")
display(df.info(memory_usage='deep'))

Data types downcasted
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565000 entries, 0 to 9564999
Data columns (total 22 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       object        
 2   dept_id       object        
 3   cat_id        object        
 4   store_id      object        
 5   state_id      object        
 6   d             object        
 7   sales         int16         
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       object        
 11  wday          int8          
 12  month         int8          
 13  year          int16         
 14  event_name_1  object        
 15  event_type_1  object        
 16  event_name_2  object        
 17  event_type_2  object        
 18  snap_CA       int8          
 19  snap_TX       int8          
 20  snap_WI       int8          
 21  sell_price    float32       
dtypes: datetime64[ns](1), float32(1), int16(3), 

None

**Adding Lag & Rolling Features**

In [5]:
# Sort for lag/rolling operations
df = df.sort_values(by=['id', 'date'])

# Create lag features
LAG_DAYS = [7, 28]
for lag in LAG_DAYS:
    df[f'lag_{lag}'] = df.groupby('id')['sales'].shift(lag).astype('float32')

# Create rolling mean features
ROLL_WINDOWS = [7, 28]
for window in ROLL_WINDOWS:
    df[f'rolling_mean_{window}'] = (
        df.groupby('id')['sales']
        .shift(28)
        .rolling(window)
        .mean()
        .reset_index(0, drop=True)
        .astype('float32')
    )

print("Lag and rolling features added and downcasted to float32")

Lag and rolling features added and downcasted to float32


**Encoding Categorical Columns**

In [11]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1']
df[cat_cols] = df[cat_cols].fillna("unknown")

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

print("Categorical columns encoded")

Categorical columns encoded


In [19]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Drop rows with NaNs from lag/rolling
df_model = df.dropna()

# Features to train on
features = ['item_id', 'store_id', 'state_id', 'cat_id', 'dept_id',
            'snap_CA', 'snap_TX', 'snap_WI',
            'lag_7', 'lag_28', 'rolling_mean_7', 'rolling_mean_28']
target = 'sales'

# Train/valid split - Using percentage split
valid_size = int(len(df_model) * 0.1) # 10% for validation
X_train, y_train = df_model[:-valid_size][features], df_model[:-valid_size][target]
X_valid, y_valid = df_model[-valid_size:][features], df_model[-valid_size:][target]


print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_valid: {X_valid.shape}")

# LightGBM training
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

model = lgb.train(
    {'objective': 'regression', 'metric': 'rmse'},
    train_data,
    valid_sets=[train_data, valid_data],
    callbacks=[lgb.log_evaluation(period=100), lgb.early_stopping(stopping_rounds=10)],
    num_boost_round=100,

)

# Evaluate
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"RMSE: {rmse:.4f}")

Shape of X_train: (13590, 12)
Shape of X_valid: (1509, 12)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 472
[LightGBM] [Info] Number of data points in the train set: 13590, number of used features: 12
[LightGBM] [Info] Start training from score 1.714570
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[54]	training's rmse: 2.65374	valid_1's rmse: 0.857628
RMSE: 0.8576


In [6]:
import numpy as np

std_dev_sales = np.std(df['sales'])
print(f"Standard deviation of sales: {std_dev_sales:.4f}")

Standard deviation of sales: 3.8520
