In [None]:
# Set the working directory to the project root so that the relative package
# imports (`src.*`) and relative data paths (`data/raw/...`) used below resolve.

import os

# The project root can be overridden through the WALMART_PROJECT_DIR environment
# variable, so the notebook is not tied to a single machine. When the variable is
# unset, the original location is used.
PROJECT_DIR = os.environ.get(
    "WALMART_PROJECT_DIR",
    "/home/tales/ds/walmart-recruiting-store-sales-forecasting/",
)
os.chdir(PROJECT_DIR)
print("working dir:", "'" + os.getcwd() + "'")

In [None]:
import pandas as pd

from src.utils import pretties, time_utils, stats
from src.visualization import plot
from bokeh.plotting import show, output_notebook

In [None]:
# Notebook-wide display setup; runs once before any table or plot is rendered.
# NOTE(review): the two `pretties` helpers are project-local; judging by their names they
# adjust how pandas shows columns and floats — confirm in src/utils/pretties.
pretties.max_data_frame_columns()
pretties.decimal_notation()
# Bokeh: direct `show(...)` output into the notebook instead of a separate HTML file.
output_notebook()

# walmart-recruiting-store-sales-forecasting
https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting

# Train

### train.csv

<p>This is the historical training data, which covers the period from <b>2010-02-05</b> to <b>2012-11-01</b>. <br> Within this file you will find the following fields:</p>

<ul>
<li>Store - the store number</li>
<li>Dept - the department number</li>
<li>Date - the week</li>
<li>Weekly_Sales - &nbsp;sales for the given department in the given store</li>
<li>IsHoliday - whether the week is a special holiday week</li>
</ul>

Note: the anonymized information about the 45 stores (the type and size of each store) is not in this file; it is provided separately in the competition's stores.csv.

### Loading

In [None]:
# Column types used when reading train.csv. Identifiers and the date are kept
# as strings; the date is converted separately after loading.
dtype = dict(
    Store=str,
    Dept=str,
    Date=str,
    Weekly_Sales=float,
    IsHoliday=bool,
)

In [None]:
def _date_to_timestamp(date_str):
    """Convert one "%Y-%m-%d" date string with the project's time helper."""
    return time_utils.str_datetime_to_timestamp(date_str, "%Y-%m-%d")


# Read the raw training file, add a "timestamp" column derived from "Date",
# and order the rows by that timestamp.
train = (
    pd.read_csv("data/raw/train.csv", dtype=dtype)
    .assign(timestamp=lambda frame: frame["Date"].apply(_date_to_timestamp))
    .sort_values("timestamp")
)

In [None]:
# Report how many rows were loaded.
row_count = len(train)
print("Total train size: " + str(row_count))

In [None]:
# Preview the first and last three rows of the time-ordered frame.
# `pd.concat` is used because `DataFrame.append` was removed in pandas 2.0;
# the original row index is preserved, as it was with `append`.
pd.concat([train.head(3), train.tail(3)])

# <font color="navy">Overview</font>

#### Weekly Sales

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the weekly sales figures.
train["Weekly_Sales"].describe()

<b>Negative Sales</b> <br>
https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting/discussion/7152

In [None]:
# Boolean mask of rows with negative sales, summarised by the project-local `stats.freq`
# helper (presumably a True/False frequency table — confirm in src/utils/stats).
stats.freq(train["Weekly_Sales"] < 0)

<b>Zero Sales</b>

In [None]:
# Boolean mask of rows with exactly zero sales, summarised by the project-local `stats.freq`
# helper (presumably a True/False frequency table — confirm in src/utils/stats).
stats.freq(train["Weekly_Sales"] == 0)

<b>No Outliers</b>

In [None]:
# Histogram of the unfiltered weekly sales, extreme values included.
train["Weekly_Sales"].plot.hist(title="Weekly Sales Overview", figsize=(6,3))

In [None]:
# Keep only rows at or below the chosen quantile of weekly sales, so the upper
# tail no longer dominates the histogram.
outliers_thresh_quantile = 0.90
weekly_sales_cap = train["Weekly_Sales"].quantile(outliers_thresh_quantile)
no_outliers = train.loc[train["Weekly_Sales"] <= weekly_sales_cap]

hist_title = "Weekly Sales Overview - No Outliers ({}%)".format(100 * outliers_thresh_quantile)
no_outliers["Weekly_Sales"].plot.hist(title=hist_title, figsize=(6,3))

# <font color="navy">Stores</font>

In [None]:
# Total sales per store, drawn as bars ordered from the smallest to the largest total.
(
    train.groupby("Store")["Weekly_Sales"]
    .sum()
    .sort_values()
    .plot.bar(title="Sales grouped by store", figsize=(10,3))
)

# <font color="navy">Dates</font>

In [None]:
# Total sales across all stores and departments for each week.
# "Date" holds "%Y-%m-%d" strings, so the group keys sort chronologically.
grouped_sales = train.groupby("Date")["Weekly_Sales"].sum()
# NOTE(review): `plot_time_series_count` is project-local; it is assumed to return a Bokeh
# figure since its result is passed to `show` — confirm it accepts string dates on the x axis.
p = plot.plot_time_series_count(grouped_sales.index, grouped_sales, color="navy")
show(p)

<b>Questions</b> <br>
Does each store follow the same sales variance behavior as all stores together?

# <font color="navy">Holidays</font>

In [None]:
display("Store Dates With Holidays")
# Keep one row per (Store, Date) pair so each store-week is counted once,
# regardless of how many departments reported that week.
# NOTE(review): `stats.freq` is project-local — presumably a frequency table; confirm.
stats.freq(train.drop_duplicates(["Store", "Date"])["IsHoliday"])

In [None]:
# Average weekly sales for holiday vs. non-holiday rows.
# The aggregation is a mean, and the chart title states that explicitly;
# `IsHoliday` flags whole weeks, so the title refers to weeks rather than days.
grouped_sales = train.groupby("IsHoliday")["Weekly_Sales"].mean()
grouped_sales.plot.bar(title="Mean Sales Comparison Between Holiday and Ordinary Weeks")

<b>Notes</b> <br>
It seems that not all holidays have a large effect on increasing sales across all stores and departments.

# <font color="navy">Departments</font>

In [None]:
# Composite key "<Store>_<Dept>"; both columns are loaded as strings, so `+` concatenates.
train["StoreDept"] = train["Store"] + "_" + train["Dept"]

In [None]:
# Rows ordered from the highest to the lowest weekly sales.
train.sort_values(["Weekly_Sales"], ascending=False)

<b>Questions</b> <br>
Can we assume that the same Dept number corresponds to the same department across all stores?
https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting/discussion/7159
(hypothesis testing)