In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from itertools import cycle
# Show up to 50 columns when displaying wide frames.
# The fully qualified key is required: option names are matched as patterns,
# and a bare 'max_columns' is ambiguous on newer pandas releases (it also
# matches 'styler.render.max_columns'), which raises OptionError.
pd.set_option('display.max_columns', 50)
# Switch to the 'bmh' style sheet and keep its colour sequence in two forms:
# `color_pal` is an indexable list, `color_cycle` an endless iterator that
# hands out the next colour on every `next(color_cycle)` call.
plt.style.use('bmh')
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle = cycle(list(color_pal))

# Dataset Files
- `calendar.csv` - Contains information about the dates on which the products are sold.
- `sales_train_validation.csv` - Contains the historical daily unit sales data per product and store [d_1 - d_1913]
- `sell_prices.csv` - Contains information about the price of the products sold per store and date.
- `sales_train_evaluation.csv` - Available one month before competition deadline. Will include sales [d_1 - d_1941]
- `sample_submission.csv` - Submission for the 28 days ahead forecast

In [2]:
# List the entries of the "dataset/" directory; the path is relative to the
# kernel's working directory.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, so the
# notebook was last executed from a directory without "dataset/" - confirm the
# working directory before running the loading cells below.
os.listdir("dataset/")

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'dataset/'

In [None]:
# loading the dataset files
# calendar.csv -> `cal`; its 'd' and 'date' columns are used further down to
# map day labels onto calendar dates.
cal = pd.read_csv('dataset/calendar.csv')

In [None]:
# Print the (rows, columns) shape, then display the first 10 calendar rows.
print(cal.shape)
cal.head(10)

In [None]:
# sales_train_validation.csv -> `stv`; later cells look rows up by the 'id'
# column and read the sales from the 'd_*' day columns.
stv = pd.read_csv('dataset/sales_train_validation.csv')

In [None]:
# Print the (rows, columns) shape, then display the first 10 rows of `stv`.
print(stv.shape)
stv.head(10)

In [None]:
# sales_train_evaluation.csv -> `ste`.
ste = pd.read_csv('dataset/sales_train_evaluation.csv')

In [None]:
# Print the (rows, columns) shape, then display the first rows of `ste`
# (head() with no argument uses the pandas default row count).
print(ste.shape)
ste.head()

We are given historic sales data in the `sales_train_validation` dataset.
- Columns exist in this dataset for days d_1 to d_1913 (one row per item). We are also given the department, category, state, and store id of each item.
- d_1914 - d_1941 represents the `validation` rows which we will predict in stage 1
- d_1942 - d_1969 represents the `evaluation` rows which we will predict for the final competition standings.

In [None]:
# sample_submission.csv -> `ss`.
ss = pd.read_csv('dataset/sample_submission.csv')

In [None]:
# Print the (rows, columns) shape, then display the first 10 rows of `ss`.
print(ss.shape)
ss.head(10)

# Our Aim
We are trying to forecast sales for the next 28 days. The sample submission has the following format:
- The columns represent 28 forecast days. We will fill these forecast days with our predictions.
- The rows each represent a specific item. This id tells us the item type, state, and store.

In [None]:
# sell_prices.csv -> `sellp`.
sellp = pd.read_csv('dataset/sell_prices.csv')

In [None]:
# Print the (rows, columns) shape, then display the first 10 rows of `sellp`.
print(sellp.shape)
sellp.head(10)

In [None]:
# Collect the day columns (d_1, d_2, ...) of the sales table; they are reused
# by every single-item plot that follows.
# A prefix test is used instead of a substring test so that a column which
# merely contains 'd_' somewhere in its name is never mistaken for a day.
day_cols = [col for col in stv.columns if col.startswith('d_')]
print("Total number of days for training ", len(day_cols))
print("-----------------------------------------")
print("Starting 10 days \n", day_cols[:10])
print("-----------------------------------------")
print("Last 10 days \n", day_cols[-10:])

In [None]:
# Item 'HOBBIES_1_001_CA_1_validation': index its row by id, keep only the
# day columns, and transpose so the days run along the x-axis.
hobbies_row = stv.loc[stv['id'] == 'HOBBIES_1_001_CA_1_validation']
hobbies_by_day = hobbies_row.set_index('id')[day_cols].T
hobbies_ax = hobbies_by_day.plot(
    figsize=(20, 7),
    title="HOBBIES_1_001_CA_1 sales",
    color=next(color_cycle),  # take the next colour from the shared cycle
)
hobbies_ax.set_ylabel('unit sales')
hobbies_ax.set_xlabel('day')
plt.show()

In [None]:
# Item 'FOODS_3_090_CA_3_validation': same single-item view as above - one
# line of unit sales over the day columns.
foods_row = stv.loc[stv['id'] == 'FOODS_3_090_CA_3_validation']
foods_by_day = foods_row.set_index('id')[day_cols].T
foods_ax = foods_by_day.plot(
    figsize=(20, 7),
    title="FOODS_3_090_CA_3 sales",
    color=next(color_cycle),  # take the next colour from the shared cycle
)
foods_ax.set_ylabel('unit sales')
foods_ax.set_xlabel('day')
plt.show()

## Merging the data with real dates
- We are given a calendar with additional information about past and future dates.
- The calendar data can be merged with our days data
- From this we can find weekly and annual trends

In [None]:
# Calendar columns to preview: the day label, its date, and the event / SNAP
# fields named below.
cols = [
    'd', 'date',
    'event_name_1', 'event_name_2',
    'event_type_1', 'event_type_2',
    'snap_CA',
]
cal.loc[:, cols].head()

In [None]:
def item_sales_by_date(sales, calendar, item_name, day_columns,
                       id_suffix='_validation'):
    """Return one item's daily sales joined to the calendar.

    Parameters
    ----------
    sales : DataFrame with an 'id' column and one column per day label.
    calendar : DataFrame with a 'd' column holding the same day labels.
    item_name : item identifier without the suffix, e.g. 'FOODS_3_090_CA_3';
        it also becomes the name of the sales column in the result.
    day_columns : list of day-label columns to extract from `sales`.
    id_suffix : text appended to `item_name` to form the value in 'id'.

    Returns
    -------
    DataFrame with columns 'd', `item_name`, and every other calendar
    column; one row per entry of `day_columns`, in that order.

    Raises
    ------
    KeyError if `sales` does not contain exactly one row for the item.
    """
    item_id = f'{item_name}{id_suffix}'
    item_rows = sales.loc[sales['id'] == item_id, day_columns]
    if len(item_rows) != 1:
        raise KeyError(
            f'expected exactly one row with id {item_id!r}, found {len(item_rows)}'
        )
    # Name the sales column after the item itself rather than after the row's
    # positional label, so the result does not depend on the row order of `sales`.
    daily = (
        item_rows.iloc[0]
        .rename(item_name)
        .rename_axis('d')
        .reset_index()
    )
    # 'd' is the only shared column; naming it keeps the join key explicit.
    return daily.merge(calendar, on='d', how='left', validate='1:1')


# Merge calendar on our items' data
example = item_sales_by_date(stv, cal, 'FOODS_3_090_CA_3', day_cols)
example.set_index('date')['FOODS_3_090_CA_3'] \
    .plot(figsize=(15, 5),
          color=next(color_cycle),
          title='FOODS_3_090_CA_3 sales by actual sale dates')
plt.ylabel('unit sales')
plt.show()

# Select more top selling examples
example2 = item_sales_by_date(stv, cal, 'HOBBIES_1_234_CA_3', day_cols)
example3 = item_sales_by_date(stv, cal, 'HOUSEHOLD_1_118_CA_3', day_cols)

# Getting narrower results
- Day of the week
- Month 
- Year

In [None]:
# Average unit sales per weekday, month and year for each example item,
# drawn as three side-by-side panels per item.
examples = ['FOODS_3_090_CA_3','HOBBIES_1_234_CA_3','HOUSEHOLD_1_118_CA_3']
example_df = [example, example2, example3]
# (calendar column to group by, panel title, line colour) for each panel
trend_panels = [
    ('wday', 'average sale: day of week', color_pal[0]),
    ('month', 'average sale: month', color_pal[4]),
    ('year', 'average sale: year', color_pal[2]),
]
for item_name, item_df in zip(examples, example_df):
    fig, axes = plt.subplots(1, 3, figsize=(15, 3))
    for ax, (period, panel_title, colour) in zip(axes, trend_panels):
        # Select the item's column *before* aggregating. Averaging the whole
        # frame would also try to average the string-valued calendar columns,
        # which raises TypeError on pandas >= 2.0.
        item_df.groupby(period)[item_name].mean() \
            .plot(kind='line',
                  title=panel_title,
                  lw=5,
                  color=colour,
                  ax=ax)
    fig.suptitle(f'Trends for item: {item_name}',
                 size=20,
                 y=1.1)
    plt.tight_layout()
    plt.show()

In [None]:
# plotting 10 items to notice some trends
# Fixed-seed sample of ten items, reshaped to one column per item and one row
# per day label, then re-indexed by the calendar date of each day label.
sampled_by_day = stv.sample(10, random_state=529).set_index('id')[day_cols].T
day_to_date = cal.set_index('d')['date']
ten_examples = (
    sampled_by_day
    .merge(day_to_date, left_index=True, right_index=True, validate='1:1')
    .set_index('date')
)

In [None]:
# One panel per sampled item on a 5 x 2 grid; each panel takes the next
# colour from the shared cycle.
fig, axs = plt.subplots(5, 2, figsize=(15, 20))
axs = axs.flatten()
for ax_idx, item in enumerate(ten_examples.columns):
    ten_examples[item].plot(title=item, color=next(color_cycle), ax=axs[ax_idx])
plt.tight_layout()
plt.show()

# Some Observations
- Some items are unavailable for certain durations of time
- There is a huge range difference in the sales of items
- for some items max sales is 6 while for some it is 800
- there are occasional spikes in the sales
- this could be due to weekends or some special events.
- A drop in the sale price could also account for these spikes.

# Combined Sales over Time by Category