In [116]:
ON_COLAB = False
if ON_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [117]:
import pandas as pd
import numpy as np

In [118]:
sales_train = pd.read_csv("../data/sales_train_validation_afcs2023.csv")
prices = pd.read_csv("../data/sell_prices_afcs2023.csv")
calendar = pd.read_csv("../data/calendar_afcs2023.csv", parse_dates=["date"])

In [119]:
date_range = pd.Series(pd.date_range(start="2011-01-29", end="2016-06-19", freq='D')).dt.strftime('%Y-%m-%d').to_list()
date_range.insert(0, "id")
sales_train.columns = date_range[:len(sales_train.columns)]
sales_train[['item_id', "Loc1", "Loc2", "Type"]] = sales_train.id.str.rsplit("_", n=3, expand=True)
sales_train = sales_train.drop("id", axis=1)
sales_train = sales_train.melt(id_vars=['item_id', "Loc1", "Loc2", "Type"], var_name="date", value_name="sales_amount")
sales_train["date"] = pd.to_datetime(sales_train["date"])

In [120]:
sales_train = pd.merge(sales_train, calendar, on='date')
sales_train = sales_train.drop(columns=['Loc1', 'Loc2'])
sales_train = pd.merge(sales_train, prices, on=['wm_yr_wk', 'item_id'],how ='left')
sales_train = sales_train.drop('store_id',axis=1)

### Items were added at different times

In [121]:
date_added = sales_train[sales_train.sell_price.notna()].groupby("item_id")["date"].min().reset_index()
date_added

Unnamed: 0,item_id,date
0,FOODS_3_001,2011-01-29
1,FOODS_3_002,2013-10-19
2,FOODS_3_003,2015-01-17
3,FOODS_3_004,2013-06-01
4,FOODS_3_005,2011-01-29
...,...,...
818,FOODS_3_823,2011-02-19
819,FOODS_3_824,2011-01-29
820,FOODS_3_825,2011-01-29
821,FOODS_3_826,2013-04-13


In [122]:
date_added["week_added"] = 1
sales_train = sales_train.merge(date_added, on=["item_id", "date"], how='left')
sales_train["week_added"] = sales_train["week_added"].fillna(0)

### Cleaning categorical values to ease one-hot encoding later on

In [123]:
sales_train[['event_name_1', 'event_name_2']] = sales_train[['event_name_1', 'event_name_2']].fillna("No event")

In [124]:
sales_train[['event_type_1', 'event_type_2']] = sales_train[['event_type_1', 'event_type_2']].fillna("No event type")

In [125]:
sales_train = sales_train.sort_values(["item_id", "date"])

### Feature engineering on sales price

#### Filling Null values for sales price if no items not added yet (do not have a price).

In [126]:
sales_train['sell_price'] = sales_train['sell_price'].fillna(0)

#### Creating weekly shifted/ lagged values

The assumption is the following:
Since we are transposing $price_t$ to $price_{t+1}$, for $price_0$ for each product we need to backfill values for the first week, since these were shifted into the future.

In [127]:
sales_train

Unnamed: 0,item_id,Type,date,sales_amount,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_TX,sell_price,week_added
0,FOODS_3_001,validation,2011-01-29,0,11101,Saturday,1,1,2011,No event,No event type,No event,No event type,0,2.28,1.0
823,FOODS_3_001,validation,2011-01-30,2,11101,Sunday,2,1,2011,No event,No event type,No event,No event type,0,2.28,0.0
1646,FOODS_3_001,validation,2011-01-31,1,11101,Monday,3,1,2011,No event,No event type,No event,No event type,0,2.28,0.0
2469,FOODS_3_001,validation,2011-02-01,3,11101,Tuesday,4,2,2011,No event,No event type,No event,No event type,1,2.28,0.0
3292,FOODS_3_001,validation,2011-02-02,0,11101,Wednesday,5,2,2011,No event,No event type,No event,No event type,0,2.28,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571106,FOODS_3_827,validation,2016-04-20,0,11612,Wednesday,5,4,2016,No event,No event type,No event,No event type,0,1.00,0.0
1571929,FOODS_3_827,validation,2016-04-21,1,11612,Thursday,6,4,2016,No event,No event type,No event,No event type,0,1.00,0.0
1572752,FOODS_3_827,validation,2016-04-22,0,11612,Friday,7,4,2016,No event,No event type,No event,No event type,0,1.00,0.0
1573575,FOODS_3_827,validation,2016-04-23,0,11613,Saturday,1,4,2016,No event,No event type,No event,No event type,0,1.00,0.0


In [128]:
lagged_8_days = (
    sales_train
    .set_index("date")
    .groupby("item_id", as_index=False)[["sell_price", "item_id", "sales_amount", "event_name_1", "event_name_2", "event_type_1", "event_type_2"]]
    .shift(periods=7)
    .reset_index()
    .bfill()
    .rename(columns={
        "sell_price": "7dl_sell_price",
        "sales_amount": "7dl_sales_amount",
        "event_name_1": "7dl_event_name_1",
        "event_name_2": "7dl_event_name_2",
        "event_type_1": "7dl_event_type_1",
        "event_type_2": "7dl_event_type_2"
    })
)

sales_train = sales_train.merge(lagged_8_days, on=["date", "item_id"])

### Adding Weekly, Monthly, Quaterly and Yearly Percent change in prices

In [159]:
weekly_changes_in_price = (
    sales_train
    .groupby(['item_id', "wm_yr_wk"])['sell_price']
    .mean()
    .rolling(window=2)
    .apply(lambda x: (x.iloc[0] - x.iloc[1])/ 100)
    .reset_index()
)

In [162]:
weekly_changes_in_price.sort_values("sell_price", ascending=False)

Unnamed: 0,item_id,wm_yr_wk,sell_price
22468,FOODS_3_084,11101,0.1873
81378,FOODS_3_299,11101,0.1873
29866,FOODS_3_111,11101,0.1198
108230,FOODS_3_397,11101,0.1148
188786,FOODS_3_691,11101,0.0998
...,...,...,...
108048,FOODS_3_396,11241,-0.1098
46941,FOODS_3_173,11236,-0.1098
81104,FOODS_3_298,11101,-0.1610
22194,FOODS_3_083,11101,-0.1700


In [165]:
wm_yr

Unnamed: 0,item_id,wm_yr_wk,sell_price
0,FOODS_3_001,11101,
1,FOODS_3_001,11102,0.0
2,FOODS_3_001,11103,0.0
3,FOODS_3_001,11104,0.0
4,FOODS_3_001,11105,0.0
...,...,...,...
225497,FOODS_3_827,11609,0.0
225498,FOODS_3_827,11610,0.0
225499,FOODS_3_827,11611,0.0
225500,FOODS_3_827,11612,0.0


In [167]:
sales_train[(sales_train["item_id"] == "FOODS_3_110") & (sales_train["wm_yr_wk"].isin([11101, 11100, 11102]))]

Unnamed: 0,item_id,Type,date,sales_amount,wm_yr_wk,weekday,wday,month,year,event_name_1,...,snap_TX,sell_price,week_added,7dl_sell_price,7dl_sales_amount,7dl_event_name_1,7dl_event_name_2,7dl_event_type_1,7dl_event_type_2,weekly_pct_change_price
206604,FOODS_3_110,validation,2011-01-29,0,11101,Saturday,1,1,2011,No event,...,0,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,-1.0
206605,FOODS_3_110,validation,2011-01-30,0,11101,Sunday,2,1,2011,No event,...,0,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,-1.0
206606,FOODS_3_110,validation,2011-01-31,0,11101,Monday,3,1,2011,No event,...,0,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,-1.0
206607,FOODS_3_110,validation,2011-02-01,0,11101,Tuesday,4,2,2011,No event,...,1,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,-1.0
206608,FOODS_3_110,validation,2011-02-02,0,11101,Wednesday,5,2,2011,No event,...,0,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,-1.0
206609,FOODS_3_110,validation,2011-02-03,0,11101,Thursday,6,2,2011,No event,...,1,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,-1.0
206610,FOODS_3_110,validation,2011-02-04,0,11101,Friday,7,2,2011,No event,...,0,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,-1.0
206611,FOODS_3_110,validation,2011-02-05,0,11102,Saturday,1,2,2011,No event,...,1,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,
206612,FOODS_3_110,validation,2011-02-06,0,11102,Sunday,2,2,2011,SuperBowl,...,1,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,
206613,FOODS_3_110,validation,2011-02-07,0,11102,Monday,3,2,2011,No event,...,1,0.0,0.0,0.0,0.0,No event,No event,No event type,No event type,
