In [80]:
# Switch for running on Google Colab: when True, Google Drive is mounted at /content/drive.
ON_COLAB = False
if ON_COLAB:
    # The import stays inside the branch so it is only attempted when ON_COLAB is True.
    from google.colab import drive
    drive.mount('/content/drive')

In [81]:
import pandas as pd
import numpy as np

In [82]:
# Load the three raw inputs from ../data.
# sales_train: one row per id with one column per day (reshaped to long format in the next cell).
sales_train = pd.read_csv("../data/sales_train_validation_afcs2023.csv")
# prices: later joined on wm_yr_wk and item_id.
prices = pd.read_csv("../data/sell_prices_afcs2023.csv")
# calendar: "date" is parsed to datetime so it can be joined with the sales dates.
calendar = pd.read_csv("../data/calendar_afcs2023.csv", parse_dates=["date"])

In [83]:
# New header: "id" followed by one ISO date string per calendar day of the range.
date_range = ["id"] + (
    pd.date_range(start="2011-01-29", end="2016-06-19", freq='D')
    .strftime('%Y-%m-%d')
    .tolist()
)
# Only as many labels as the frame has columns are applied (positional relabelling).
sales_train.columns = date_range[:len(sales_train.columns)]

# Split the id from the right into the item id and its three trailing parts.
sales_train[['item_id', "Loc1", "Loc2", "Type"]] = sales_train["id"].str.rsplit("_", n=3, expand=True)
sales_train = sales_train.drop(columns="id")

# Wide -> long: one row per (item, date) holding the sales amount for that day.
sales_train = pd.melt(
    sales_train,
    id_vars=['item_id', "Loc1", "Loc2", "Type"],
    var_name="date",
    value_name="sales_amount",
)
sales_train["date"] = pd.to_datetime(sales_train["date"])

In [84]:
# Enrich the daily sales with calendar attributes, then with the sell price per item and week.
sales_train = (
    sales_train
    .merge(calendar, on='date')
    # The id-derived location/type parts are not used after this point.
    .drop(columns=['Loc1', 'Loc2', "Type"])
    # Left join: rows with no matching price keep a NaN sell_price.
    .merge(prices, on=['wm_yr_wk', 'item_id'], how='left')
    .drop(columns='store_id')
)

### Items were added at different times

In [85]:
# First date on which each item has a (non-null) sell price.
date_added = (
    sales_train
    .loc[sales_train["sell_price"].notna(), ["item_id", "date"]]
    .groupby("item_id", as_index=False)["date"]
    .min()
)
date_added

Unnamed: 0,item_id,date
0,FOODS_3_001,2011-01-29
1,FOODS_3_002,2013-10-19
2,FOODS_3_003,2015-01-17
3,FOODS_3_004,2013-06-01
4,FOODS_3_005,2011-01-29
...,...,...
818,FOODS_3_823,2011-02-19
819,FOODS_3_824,2011-01-29
820,FOODS_3_825,2011-01-29
821,FOODS_3_826,2013-04-13


In [86]:
# Flag the row of each item's first priced date with 1; every other row gets 0.
date_added = date_added.assign(week_added=1)
sales_train = sales_train.merge(date_added, how='left', on=["item_id", "date"])
# Rows without a match in date_added come out of the left join as NaN.
sales_train = sales_train.fillna({"week_added": 0})

### Cleaning categorical values to ease one-hot encoding later on

In [87]:
# Give days without an event an explicit label instead of NaN.
sales_train = sales_train.fillna({"event_name_1": "No event", "event_name_2": "No event"})

In [88]:
# Give days without an event type an explicit label instead of NaN.
sales_train = sales_train.fillna({"event_type_1": "No event type", "event_type_2": "No event type"})

In [89]:
# Display the merged frame to check the result of the joins and fills above.
sales_train

Unnamed: 0,item_id,date,sales_amount,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_TX,sell_price,week_added
0,FOODS_3_001,2011-01-29,0,11101,Saturday,1,1,2011,No event,No event type,No event,No event type,0,2.28,1.0
1,FOODS_3_002,2011-01-29,0,11101,Saturday,1,1,2011,No event,No event type,No event,No event type,0,,0.0
2,FOODS_3_003,2011-01-29,0,11101,Saturday,1,1,2011,No event,No event type,No event,No event type,0,,0.0
3,FOODS_3_004,2011-01-29,0,11101,Saturday,1,1,2011,No event,No event type,No event,No event type,0,,0.0
4,FOODS_3_005,2011-01-29,0,11101,Saturday,1,1,2011,No event,No event type,No event,No event type,0,1.68,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1574394,FOODS_3_823,2016-04-24,2,11613,Sunday,2,4,2016,No event,No event type,No event,No event type,0,2.98,0.0
1574395,FOODS_3_824,2016-04-24,0,11613,Sunday,2,4,2016,No event,No event type,No event,No event type,0,2.48,0.0
1574396,FOODS_3_825,2016-04-24,0,11613,Sunday,2,4,2016,No event,No event type,No event,No event type,0,3.98,0.0
1574397,FOODS_3_826,2016-04-24,2,11613,Sunday,2,4,2016,No event,No event type,No event,No event type,0,1.28,0.0


In [90]:
# Order rows by item, then date: the per-item 7-row shift further down is row based,
# so each item's rows must be in chronological order.
sales_train = sales_train.sort_values(["item_id", "date"])

### Feature engineering on sales price

### Creating Classification Of Price

In [91]:
# Rows that found no price in the left join with `prices` get a price of 0.
sales_train = sales_train.assign(sell_price=sales_train["sell_price"].fillna(0))

In [92]:
# Min, quartiles and max of the sell price (0-filled rows included); used as bin edges below.
quantiles = sales_train["sell_price"].quantile(q=[0, 0.25, 0.5, 0.75, 1]).tolist()

In [93]:
# Bucket every price into one of four quartile-based categories.
# NOTE(review): with pd.cut's defaults the lowest bin edge is exclusive, so rows whose price
# equals quantiles[0] (the 0-filled rows, if 0 is the minimum) get NaN instead of "low".
# Confirm this is intended; include_lowest=True would place them in "low".
sales_train["price_category"] = pd.cut(
    x=sales_train["sell_price"],
    bins=quantiles,
    labels=["low", "mid-low", "mid-high", "high"],
)

#### Filling null sell prices for items that have not been added yet (and so do not have a price).

#### Creating weekly shifted/lagged values

The assumption is the following:
Since we are shifting each value by one week, i.e. moving $price_t$ to $price_{t+7}$, the first 7 days of each product have no lagged value (they were shifted into the future), so we backfill them with the first available lagged value of that product.

In [94]:
# 7-day lagged copies ("7dl_" prefix) of price, price category, sales and event columns.
# The lag is row based (7 rows back within each item), so it relies on sales_train being
# sorted by item_id/date; assumes one row per item and day -- TODO confirm.
# The variable keeps its historical name `lagged_8_days`, although the lag is 7 days.
lagged_columns = [
    "sell_price",
    "price_category",
    "sales_amount",
    "event_name_1",
    "event_name_2",
    "event_type_1",
    "event_type_2",
]

lagged_8_days = (
    sales_train
    # Shift only the value columns; item_id is the grouping key and must not be shifted.
    .groupby("item_id")[lagged_columns]
    .shift(periods=7)
    # Back-fill inside each item only, so the leading gap created by the shift (and any
    # other gap) is never filled with values that belong to the next item.
    .groupby(sales_train["item_id"])
    .bfill()
    .add_prefix("7dl_")
    # Re-attach the merge keys; rows are aligned on the (unchanged) row index.
    .assign(date=sales_train["date"], item_id=sales_train["item_id"])
)

sales_train = sales_train.merge(lagged_8_days, on=["date", "item_id"])

### Adding Weekly, Monthly, Quarterly and Yearly Percent change in prices

In [95]:
# Week-over-week percent change of the mean sell price, per item.
weekly_changes_in_price = (
    sales_train
    .groupby(['item_id', "wm_yr_wk"])['sell_price']
    .mean()
    # Compute the change inside each item; a plain .pct_change() on the stacked series
    # would compare an item's first week with the previous item's last week.
    .groupby(level="item_id")
    .pct_change()
    # A change starting from a price of 0 is infinite; treat it as "no change".
    .replace([np.inf, -np.inf], 0)
    # First week of each item (no previous week) and 0 -> 0 transitions are NaN.
    .fillna(0)
    .mul(100)
    .rename("weekly_pct_change")
    .reset_index()
)

In [96]:
# Month-over-month percent change of the mean sell price, per item.
monthly_changes_in_price = (
    sales_train
    .set_index("date")
    .groupby(['item_id', pd.Grouper(freq="M")])['sell_price']
    .mean()
    # Compute the change inside each item; a plain .pct_change() on the stacked series
    # would compare an item's first month with the previous item's last month.
    .groupby(level="item_id")
    .pct_change()
    # A change starting from a price of 0 is infinite; treat it as "no change".
    .replace([np.inf, -np.inf], 0)
    # First month of each item (no previous month) and 0 -> 0 transitions are NaN.
    .fillna(0)
    .mul(100)
    .rename("monthly_pct_change")
    .reset_index()
)

In [97]:
# Quarter-over-quarter percent change of the mean sell price, per item.
quaterly_changes_in_price = (
    sales_train
    .set_index("date")
    .groupby(['item_id', pd.Grouper(freq="Q")])['sell_price']
    .mean()
    # Compute the change inside each item; a plain .pct_change() on the stacked series
    # would compare an item's first quarter with the previous item's last quarter.
    .groupby(level="item_id")
    .pct_change()
    # A change starting from a price of 0 is infinite; treat it as "no change".
    .replace([np.inf, -np.inf], 0)
    # First quarter of each item (no previous quarter) and 0 -> 0 transitions are NaN.
    .fillna(0)
    .mul(100)
    .rename("qrt_pct_change")
    .reset_index()
)

In [98]:
# Year-over-year percent change of the mean sell price, per item.
yearly_changes_in_price = (
    sales_train
    .set_index("date")
    .groupby(['item_id', pd.Grouper(freq="Y")])['sell_price']
    .mean()
    # Compute the change inside each item; a plain .pct_change() on the stacked series
    # would compare an item's first year with the previous item's last year.
    .groupby(level="item_id")
    .pct_change()
    # A change starting from a price of 0 is infinite; treat it as "no change".
    .replace([np.inf, -np.inf], 0)
    # First year of each item (no previous year) and 0 -> 0 transitions are NaN.
    .fillna(0)
    .mul(100)
    .rename("yearly_pct_change")
    .reset_index()
)

In [99]:
# Attach the weekly percent change to every daily row of the same item and week.
sales_train = pd.merge(
    sales_train,
    weekly_changes_in_price,
    how="inner",
    on=["item_id", "wm_yr_wk"],
)

In [100]:
# Attach the monthly percent change to every daily row of the same item and calendar month.
# Both sides get a temporary monthly-period key, which is removed again after the join.
sales_train = (
    sales_train
    .assign(_month_key=sales_train['date'].dt.to_period('M'))
    .merge(
        monthly_changes_in_price
        .assign(_month_key=monthly_changes_in_price['date'].dt.to_period('M'))
        # Drop the right-hand date so the daily "date" column is kept as is.
        .drop(columns="date"),
        on=["item_id", "_month_key"],
    )
    .drop(columns="_month_key")
)

In [101]:
# Attach the quarterly percent change to every daily row of the same item and quarter.
# Both sides get a temporary quarterly-period key, which is removed again after the join.
sales_train = (
    sales_train
    .assign(_quarter_key=sales_train['date'].dt.to_period('Q'))
    .merge(
        quaterly_changes_in_price
        .assign(_quarter_key=quaterly_changes_in_price['date'].dt.to_period('Q'))
        # Drop the right-hand date so the daily "date" column is kept as is.
        .drop(columns="date"),
        on=["item_id", "_quarter_key"],
    )
    .drop(columns="_quarter_key")
)

In [102]:
# Attach the yearly percent change to every daily row of the same item and calendar year.
# Both sides get a temporary yearly-period key, which is removed again after the join.
sales_train = (
    sales_train
    .assign(_year_key=sales_train['date'].dt.to_period('Y'))
    .merge(
        yearly_changes_in_price
        .assign(_year_key=yearly_changes_in_price['date'].dt.to_period('Y'))
        # Drop the right-hand date so the daily "date" column is kept as is.
        .drop(columns="date"),
        on=["item_id", "_year_key"],
    )
    .drop(columns="_year_key")
)

### Adding Weekday, Weekend and Holiday Dummy Vars

In [103]:
# True for every day whose weekday name is neither Saturday nor Sunday.
sales_train["is_weekday"] = ~sales_train["weekday"].isin(["Saturday", "Sunday"])

In [104]:
# True for Saturdays and Sundays.
sales_train["is_weekend"] = sales_train["weekday"].isin(["Saturday", "Sunday"])

A day can have 0, 1 or 2 events at the same time, so `is_holiday` holds the number of events on that day rather than a plain yes/no flag.

In [105]:
# Number of events on the day (0, 1 or 2): counts how many of the two event-name columns
# hold something other than the "No event" placeholder.
# Computed column-wise instead of with a row-wise apply(axis=1), which calls a Python
# function once per row and is very slow on a frame of this size.
sales_train["is_holiday"] = (
    sales_train[["event_name_1", "event_name_2"]]
    .ne("No event")
    .sum(axis=1)
)


In [106]:
# Persist the feature-engineered training frame as CSV for use outside this notebook.
sales_train.to_csv("../data/fulling_connected_feature_eng_train_data.csv")

In [107]:
# Also persist the same frame as a pickle next to the CSV export.
sales_train.to_pickle("../data/fulling_connected_feature_eng_train_data.pkl")