In [None]:
import warnings; warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import datetime
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
import pmdarima
import pickle
import time
import os
from fbprophet import Prophet

# Data locations used by every read_csv / pickle.dump cell in this notebook.
# Paths are built as f'{input_path}<file>', so both values must end with a
# path separator. Resolution order: a value already present in the kernel
# (e.g. injected by a notebook runner), then the M5_INPUT_PATH /
# M5_OUTPUT_PATH environment variables, then the current directory.
input_path = globals().get('input_path', os.environ.get('M5_INPUT_PATH', './'))
output_path = globals().get('output_path', os.environ.get('M5_OUTPUT_PATH', './'))

# Show up to 50 columns / 100 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

# Register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()

#  Calendar

In [None]:
# Read the calendar table, parsing the 'date' column into datetimes on load,
# then preview the first rows.
calendar = pd.read_csv(
    f'{input_path}calendar.csv',
    parse_dates=['date'],
)
calendar.head()

In [None]:
# Target dtype for each calendar column, grouped by dtype. The event-dummy
# cell below adds one more entry per dummy column it creates.
downcast_dict = {
    **dict.fromkeys(('wm_yr_wk', 'wday'), np.int16),
    'd': 'category',
    **dict.fromkeys(('snap_CA', 'snap_TX', 'snap_WI', 'event'), np.uint8),
}

Dummy variables are created for each event type and each event name by testing for its presence in $event\_name/type\_1$ and $event\_name/type\_2$. Exploring the data showed that whenever a second event is present, a first event is always present as well, so checking $event\_name\_1$ is enough to tell whether a day has any event.

In [None]:
def _add_event_dummies(frame, kind, dtypes):
    """Add one 0/1 dummy column per distinct event label of the given kind.

    Parameters
    ----------
    frame : DataFrame holding the columns ``event_<kind>_1`` and
        ``event_<kind>_2``; it is modified in place.
    kind : ``'type'`` or ``'name'``, selecting which pair of columns to encode.
    dtypes : dict mapping column name -> dtype; an ``np.uint8`` entry is added
        for every dummy column created.

    Returns
    -------
    Array of the distinct non-null labels, in order of first appearance
    (first-event column first, then labels seen only in the second-event column).
    """
    first = frame[f'event_{kind}_1']
    second = frame[f'event_{kind}_2']
    # Missing values mark "no event". Drop them explicitly rather than assuming
    # NaN happens to be the first unique value, and look at both columns so a
    # label that only ever occurs as a second event still gets a dummy.
    labels = pd.concat([first, second]).dropna().unique()
    for label in labels:
        column = 'event_' + label.lower()
        frame[column] = ((first == label) | (second == label)).astype(np.uint8)
        dtypes[column] = np.uint8
    return labels


event_types = _add_event_dummies(calendar, 'type', downcast_dict)
event_names = _add_event_dummies(calendar, 'name', downcast_dict)

# Dummy variable flagging the presence of any event on that day.
calendar['event'] = (calendar.event_name_1.notna() | calendar.event_name_2.notna()).astype(np.uint8)

Unnecessary fields are dropped and the remaining fields are downcast to reduce RAM usage.

In [None]:
# Drop the columns that are no longer needed (the raw event columns have
# already been encoded as dummy columns), then apply the collected dtypes.
calendar = (
    calendar
    .drop(columns=['weekday', 'month', 'year',
                   'event_name_1', 'event_type_1',
                   'event_name_2', 'event_type_2'])
    .astype(downcast_dict)
)

Output as a pickled object to retain all the downcast types. (Feather could be a better option because it performs better.)

In [None]:
# Pickle the processed calendar so its dtypes survive the round trip.
with open(f'{output_path}processed_calendar', 'wb+') as calendar_file:
    pickle.dump(calendar, calendar_file)

# Prices

In [None]:
# Read the sell-price table and preview the first rows.
prices = pd.read_csv(
    f'{input_path}sell_prices.csv',
)
prices.head()

Concatenate the item and store fields into a single $id$ that matches the $id$ in submission.csv

In [None]:
# Build id as "<item_id>_<store_id>_validation", then drop the two source
# columns it was built from.
prices = (
    prices
    .assign(id=prices.item_id + '_' + prices.store_id + '_validation')
    .drop(columns=['store_id', 'item_id'])
)

In [None]:
# Downcast the price table to save memory.
# sell_price uses float32 rather than float16: half precision has only an
# 11-bit significand, so the gap between representable values is already
# 1/64 (~0.016) for values in [16, 32) and grows from there. Any price of 16
# or more could therefore not be stored to the cent. float32 keeps cents exact
# to well under 1e-4 across that range.
prices = prices.astype({
    'id': 'category',
    'wm_yr_wk': np.int16,
    'sell_price': np.float32,
})

In [None]:
# Pickle the processed price table so its dtypes survive the round trip.
with open(f'{output_path}processed_prices', 'wb+') as prices_file:
    pickle.dump(prices, prices_file)

# Sales

In [None]:
# Read the wide-format sales table and preview the first rows.
sales = pd.read_csv(
    f'{input_path}sales_train_validation.csv',
)
sales.head()

In [None]:
# Remove the item/department/category/store/state columns; 'id' and the
# per-day columns are kept.
sales = sales.drop(
    columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
)

In [None]:
# Downcast the sales table: 'id' becomes categorical, every day column int16.
# The day columns are taken from the frame itself instead of a hard-coded
# d_1..d_1913 range, so a file with a different number of days neither raises
# KeyError on a missing column nor leaves extra day columns uncast.
d = {'id': 'category'}
d.update({column: np.int16 for column in sales.columns if column.startswith('d_')})

sales = sales.astype(d)

Melt the sales dataframe so that the day columns become a single field that can be used to merge with the other dataframes.

In [None]:
# Reshape wide -> long: one row per (id, day column), with the day column's
# name in 'd' and its value in 'demand'. The reshape is timed and reported.
t_start = time.time()
melted_sales = sales.melt(id_vars='id', var_name='d', value_name='demand')
print("sales melted {}".format(time.time() - t_start))

# Store the day labels as a categorical column.
melted_sales['d'] = melted_sales.d.astype('category')

# Pickle the long-format sales frame; the final expression shows how many
# seconds the write took.
t_start = time.time()
with open(f'{output_path}melted_sales', 'wb+') as sales_file:
    pickle.dump(melted_sales, sales_file)
time.time() - t_start

## Merging all data

In [None]:
# Attach the calendar columns to every price row by matching on wm_yr_wk
# (left join: every price row is kept). The merge is timed and reported.
t_start = time.time()
prices_calendar = prices.merge(calendar, how='left', on='wm_yr_wk')
print('prices merged with calendar {}'.format(time.time() - t_start))

In [None]:
# Attach demand to each (id, d) row of the price/calendar frame. With a left
# join, rows without a matching sales record are kept.
t_start = time.time()
full_df = prices_calendar.merge(melted_sales, how='left', on=['id', 'd'])
print('merged with melted sales {}'.format(time.time() - t_start))

In [None]:
# Index the merged frame by date and cast 'd' to categorical.
full_df = (
    full_df
    .set_index('date')
    .astype({'d': 'category'})
)

In [None]:
# Pickle the fully merged frame so its dtypes and index survive the round trip.
with open(f'{output_path}full_df', 'wb+') as full_df_file:
    pickle.dump(full_df, full_df_file)