In [1]:
%load_ext kedro

In [None]:
df = catalog.load('clean_data')
df.info()

In [57]:
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [55]:
def get_seasonal_stats(item_name):
    item_data = df[df['item'] == item_name].copy().sort_values('date')
    item_data.set_index('date', inplace=True)
    price_series = item_data['high_price']
    result = seasonal_decompose(price_series, model='additive', period=7)
    return {
        'item': item_name,
        'trend_strength': result.trend.std(),
        'seasonality_strength': result.seasonal.std(),
        'residual_strength': result.resid.std()
    }

stats_list = [get_seasonal_stats(item) for item in df['item'].unique()]
stats_df = pd.DataFrame(stats_list)

scaler = RobustScaler()
X_scaled = scaler.fit_transform(stats_df[['trend_strength', 'seasonality_strength']])

kmeans = KMeans(n_clusters=6, random_state=42)
stats_df['cluster'] = kmeans.fit_predict(X_scaled)

stats_df = stats_df[['item', 'cluster']]
stats_df[['trend_scaled', 'seasonality_scaled']] = X_scaled

In [56]:
stats_df

Unnamed: 0,item,cluster,trend_scaled,seasonality_scaled
0,আটা (প্যাকেট),0,-0.497942,-0.403584
1,আটা সাদা (খোলা),0,-0.525549,-0.393064
2,আদা (আমদানি),0,0.331129,0.677489
3,আদা (দেশী),0,1.764233,0.930259
4,আলু (মানভেদে),0,-0.38865,-0.287071
5,ইলিশ,4,5.587418,15.890082
6,"এম,এস রড (৬০ গ্রেড)",2,124.873066,59.381514
7,"এম,এস রড( ৪০ গ্রেড)",1,140.043756,2156.82363
8,এলাচ(ছোট),5,13.564054,6.622221
9,এ্যাংকর ডাল,0,-0.434376,-0.399732


In [58]:
featured_df = df.merge(stats_df[['item', 'cluster', 'trend_scaled', 'seasonality_scaled']], on='item', how='left')

featured_df['date_ordinal'] = featured_df['date'].map(pd.Timestamp.toordinal)
X = featured_df[['date_ordinal', 'cluster', 'trend_scaled', 'seasonality_scaled']]
y = featured_df['high_price']

model_with_features = RandomForestRegressor(n_estimators=100, random_state=42)
model_with_features.fit(X, y)
predictions_with = model_with_features.predict(X)

model_baseline = RandomForestRegressor(n_estimators=100, random_state=42)
model_baseline.fit(X[['date_ordinal']], y)
predictions_baseline = model_baseline.predict(X[['date_ordinal']])

mse_with = mean_squared_error(y, predictions_with)
mse_baseline = mean_squared_error(y, predictions_baseline)

print(f"MSE with new features: {mse_with:.2f}")
print(f"MSE baseline: {mse_baseline:.2f}")
print(f"Improvement: {((mse_baseline - mse_with) / mse_baseline * 100):.1f}%")

importances = model_with_features.feature_importances_
feature_names = X.columns
for name, importance in zip(feature_names, importances):
    print(f"{name}: {importance:.3f}")

MSE with new features: 1117879.98
MSE baseline: 297429301.46
Improvement: 99.6%
date_ordinal: 0.025
cluster: 0.004
trend_scaled: 0.474
seasonality_scaled: 0.497
