In [28]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

train_df  = pd.read_csv("C:\\Users\\supri\\Desktop\\Demand forecast\\train.csv")
test_df   = pd.read_csv("C:\\Users\\supri\\Desktop\\Demand forecast\\test.csv")
meal_df   = pd.read_csv("C:\\Users\\supri\\Desktop\\Demand forecast\\meal_info.csv")
center_df = pd.read_csv("C:\\Users\\supri\\Desktop\\Demand forecast\\fulfilment_center_info.csv")


In [2]:
train_df.head()
train_df.info()
train_df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     456548 non-null  int64  
 1   week                   456548 non-null  int64  
 2   center_id              456548 non-null  int64  
 3   meal_id                456548 non-null  int64  
 4   checkout_price         456548 non-null  float64
 5   base_price             456548 non-null  float64
 6   emailer_for_promotion  456548 non-null  int64  
 7   homepage_featured      456548 non-null  int64  
 8   num_orders             456548 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 31.3 MB


id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
dtype: int64

In [3]:
test_df.head()
test_df.info()
test_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32573 entries, 0 to 32572
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     32573 non-null  int64  
 1   week                   32573 non-null  int64  
 2   center_id              32573 non-null  int64  
 3   meal_id                32573 non-null  int64  
 4   checkout_price         32573 non-null  float64
 5   base_price             32573 non-null  float64
 6   emailer_for_promotion  32573 non-null  int64  
 7   homepage_featured      32573 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 2.0 MB


id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
dtype: int64

In [4]:
meal_df.head()
meal_df.info()
meal_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   meal_id   51 non-null     int64 
 1   category  51 non-null     object
 2   cuisine   51 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.3+ KB


meal_id     0
category    0
cuisine     0
dtype: int64

In [5]:
center_df.head()
center_df.info()
center_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   center_id    77 non-null     int64  
 1   city_code    77 non-null     int64  
 2   region_code  77 non-null     int64  
 3   center_type  77 non-null     object 
 4   op_area      77 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 3.1+ KB


center_id      0
city_code      0
region_code    0
center_type    0
op_area        0
dtype: int64

In [6]:
weekly_center_df = (
    train_df
    .groupby(["week", "center_id"], as_index=False)["num_orders"]
    .sum()
    .rename(columns={"num_orders": "total_demand"})
)


In [18]:
# ensure time continuity and handle nulls
# columns used: week, center_id, total_demand


weekly_center_df = weekly_center_df.sort_values(["center_id", "week"])

def fix_time_continuity(df):
    # create full continuous week range
    full_weeks = pd.DataFrame(
        {"week": range(df["week"].min(), df["week"].max() + 1)}
    )

    # merge to introduce missing weeks
    df = full_weeks.merge(df, on="week", how="left")

    # fill center_id for missing rows
    df["center_id"] = df["center_id"].ffill()

    # fill missing demand with 0
    df["total_demand"] = df["total_demand"].fillna(0)

    return df

weekly_center_df = (
    weekly_center_df
    .groupby("center_id", group_keys=False)
    .apply(fix_time_continuity)
)

weekly_center_df.head()



  .apply(fix_time_continuity)


Unnamed: 0,week,center_id,total_demand,lag_1,lag_4,rolling_mean_4
0,1,10.0,25327.0,,,
1,2,10.0,21561.0,25327.0,,
2,3,10.0,22411.0,21561.0,,
3,4,10.0,22794.0,22411.0,,
4,5,10.0,35934.0,22794.0,25327.0,23023.25


In [13]:
# FEATURE ENGINEERING (lags + rolling)

weekly_center_df = weekly_center_df.sort_values(["center_id", "week"])

weekly_center_df["lag_1"] = (
    weekly_center_df.groupby("center_id")["total_demand"].shift(1)
)

weekly_center_df["lag_4"] = (
    weekly_center_df.groupby("center_id")["total_demand"].shift(4)
)

weekly_center_df["rolling_mean_4"] = (
    weekly_center_df
    .groupby("center_id")["total_demand"]
    .shift(1)
    .rolling(4)
    .mean()
)

features_df = weekly_center_df.dropna().reset_index(drop=True)

features_df.head()


Unnamed: 0,week,center_id,total_demand,lag_1,lag_4,rolling_mean_4
0,5,10.0,35934.0,22794.0,25327.0,23023.25
1,6,10.0,29761.0,35934.0,21561.0,25675.0
2,7,10.0,25618.0,29761.0,22411.0,27725.0
3,8,10.0,18614.0,25618.0,22794.0,28526.75
4,9,10.0,21667.0,18614.0,35934.0,27481.75


In [14]:
# STEP 5: train / validation split (time-based, per center)
feature_df = weekly_center_df.dropna().reset_index(drop=True)
VAL_WEEKS = 8

# sort first
feature_df = feature_df.sort_values(["center_id", "week"])

# split by time per center
train_df = (
    feature_df
    .groupby("center_id", group_keys=False)
    .apply(lambda x: x.iloc[:-VAL_WEEKS])
    .reset_index(drop=True)
)

val_df = (
    feature_df
    .groupby("center_id", group_keys=False)
    .apply(lambda x: x.iloc[-VAL_WEEKS:])
    .reset_index(drop=True)
)

# separate inputs (X) and target (y)
X_train = train_df.drop(columns=["total_demand"])
y_train = train_df["total_demand"]

X_val = val_df.drop(columns=["total_demand"])
y_val = val_df["total_demand"]

X_train.head(), y_train.head(), X_val.head(), y_val.head()


  .apply(lambda x: x.iloc[:-VAL_WEEKS])
  .apply(lambda x: x.iloc[-VAL_WEEKS:])


(   week  center_id    lag_1    lag_4  rolling_mean_4
 0     5       10.0  22794.0  25327.0        23023.25
 1     6       10.0  35934.0  21561.0        25675.00
 2     7       10.0  29761.0  22411.0        27725.00
 3     8       10.0  25618.0  22794.0        28526.75
 4     9       10.0  18614.0  35934.0        27481.75,
 0    35934.0
 1    29761.0
 2    25618.0
 3    18614.0
 4    21667.0
 Name: total_demand, dtype: float64,
    week  center_id    lag_1    lag_4  rolling_mean_4
 0   138       10.0  23889.0  26087.0        24252.00
 1   139       10.0  21203.0  23238.0        23031.00
 2   140       10.0  22309.0  23794.0        22798.75
 3   141       10.0  27272.0  23889.0        23668.25
 4   142       10.0  23878.0  21203.0        23665.50,
 0    21203.0
 1    22309.0
 2    27272.0
 3    23878.0
 4    24365.0
 Name: total_demand, dtype: float64)

In [15]:

# baseline prediction = last week's demand
baseline_preds = val_df["lag_1"]

y_true = val_df["total_demand"]

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae_baseline = mean_absolute_error(y_true, baseline_preds)
rmse_baseline = np.sqrt(mean_squared_error(y_true, baseline_preds))

mae_baseline, rmse_baseline



(971.3344155844156, np.float64(1324.5175377442943))

In [16]:
weekly_center_df.columns


Index(['week', 'center_id', 'total_demand', 'lag_1', 'lag_4',
       'rolling_mean_4'],
      dtype='object')

In [17]:
print(val_df.columns)


Index(['week', 'center_id', 'total_demand', 'lag_1', 'lag_4',
       'rolling_mean_4'],
      dtype='object')


In [21]:
#train model

# initialize model
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

# train
rf_model.fit(X_train, y_train)

# predict on validation
rf_preds = rf_model.predict(X_val)

# evaluate
mae_rf = mean_absolute_error(y_val, rf_preds)
rmse_rf = np.sqrt(mean_squared_error(y_val, rf_preds))

mae_rf, rmse_rf


(994.2727840909091, np.float64(1370.241114801779))

In [30]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# initialize model
gbr_model = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

# train
gbr_model.fit(X_train, y_train)

# predict
gbr_preds = gbr_model.predict(X_val)

# evaluate
mae_gbr = mean_absolute_error(y_val, gbr_preds)
rmse_gbr = np.sqrt(mean_squared_error(y_val, gbr_preds))

mae_gbr, rmse_gbr


(1240.8701152191088, np.float64(1573.9794945333167))

In [31]:
import json

model_info = {
    "model_type": "baseline_last_week",
    "description": "Predicts demand using last week's demand (lag_1)",
    "features_required": ["lag_1"],
    "reason_selected": "Outperformed RandomForest and Boosting on validation"
}

with open("model_info.json", "w") as f:
    json.dump(model_info, f, indent=4)


In [32]:
feature_schema = {
    "input_features": [
        "center_id",
        "week",
        "lag_1"
    ],
    "target": "total_demand"
}

with open("feature_schema.json", "w") as f:
    json.dump(feature_schema, f, indent=4)


In [33]:
metrics = {
    "baseline_mae": float(mae_baseline),
    "baseline_rmse": float(rmse_baseline),
    "random_forest_mae": float(mae_rf),
    "gradient_boosting_mae": float(mae_gbr)
}

with open("metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)


In [34]:
#production step
#Prediction logic
def predict_demand(lag_1: float) -> float:
    """
    Baseline demand forecast:
    Predict next week's demand using last week's demand
    """
    return float(lag_1)


In [36]:
from predictor import predict_demand

predict_demand(25000)


ModuleNotFoundError: No module named 'predictor'

In [37]:
import os
print(os.listdir())


['.anaconda', '.conda', '.condarc', '.continuum', '.gitconfig', '.ipynb_checkpoints', '.ipython', '.jupyter', '.keras', '.matplotlib', 'anaconda3', 'anaconda_projects', 'AppData', 'Application Data', 'Contacts', 'Cookies', 'Desktop', 'Documents', 'Downloads', 'Favorites', 'feature_schema.json', 'Links', 'Local Settings', 'metrics.json', 'model_features.pkl', 'model_info.json', 'model_info.pkl', 'Music', 'My Documents', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{e07fb1a8-5f7e-11f0-b011-50bbb534fa29}.TxR.0.regtrans-ms', 'NTUSER.DAT{e07fb1a8-5f7e-11f0-b011-50bbb534fa29}.TxR.1.regtrans-ms', 'NTUSER.DAT{e07fb1a8-5f7e-11f0-b011-50bbb534fa29}.TxR.2.regtrans-ms', 'NTUSER.DAT{e07fb1a8-5f7e-11f0-b011-50bbb534fa29}.TxR.blf', 'NTUSER.DAT{e07fb1a9-5f7e-11f0-b011-50bbb534fa29}.TM.blf', 'NTUSER.DAT{e07fb1a9-5f7e-11f0-b011-50bbb534fa29}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{e07fb1a9-5f7e-11f0-b011-50bbb534fa29}.TMContainer00000000000000000002.regtran