#  STEP 1: INITIAL DATA EXPLORATION
#### Tools: Jupyter Notebooks, pandas, Matplotlib

### Use Case: 
### Store Item Demand Forecasting
### Given 5 years of store-item sales data,
### the goal is to predict 3 months of sales for 50 different items at 10 different stores.

In [None]:
# imports
import warnings
import numpy as np
import pandas as pd
from matplotlib import pyplot
%matplotlib inline

## Data Set
#### Data Source: https://www.kaggle.com/c/demand-forecasting-kernels-only

In [None]:
import types
import pandas as pd

# Load the training data. The 'date' column is parsed into datetimes up front
# so that the date-based min/max, grouping and resampling further down operate
# on real timestamps rather than strings.
# (The file name previously read "train.csvc", which cannot be found.)
dataset = pd.read_csv("train.csv", parse_dates=['date'])
dataset.head()


## Data Quality Assessment

In [None]:
# view data set information
# info() prints its summary itself and returns None, so nothing else is
# displayed as the cell result.
dataset.info()

##### Number of Observations

In [None]:
# training data set size
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the
# number of observations. The original figure is still printed for continuity,
# and the row / column counts are reported explicitly alongside it.
n_rows, n_cols = dataset.shape
print("Training data set size: %s" %dataset.size)
print("Number of observations (rows): %s" % n_rows)
print("Number of columns: %s" % n_cols)

#### Exploratory Data Analysis

##### Summary statistics

In [None]:
# summary statistics of the training data, shown with one row per column
dataset_summary = dataset.describe()
dataset_summary.transpose()

##### Column types

In [None]:
# explore train data types
# (shows the dtype of every column of `dataset` as the cell result)

dataset.dtypes

##### Time period of data set

In [None]:
# first and last calendar day covered by the training data
date_column = dataset['date']
earliest_day = date_column.min().date()
latest_day = date_column.max().date()
print('Min date of training data set: %s' % earliest_day)
print('Max date of training data set: %s' % latest_day)

### Initial data exploration

#### Daily sales

In [None]:
def _sum_sales_by(keys):
    """Return `dataset` grouped by `keys` with 'sales' summed, keys kept as columns."""
    grouped = dataset.groupby(keys, as_index=False)
    return grouped['sales'].sum()


# sales per day across all stores and items
total_daily_sales = _sum_sales_by('date')

# sales per store per day
store_daily_sales = _sum_sales_by(['store', 'date'])

# sales per item per day
item_daily_sales = _sum_sales_by(['item', 'date'])

In [None]:
# preview the first five rows of the total daily sales table
print('total daily sales :')
total_daily_sales.head(n=5)

In [None]:
# preview the first five rows of the per-store daily sales table
print('store daily sales :')
store_daily_sales.head(n=5)

In [None]:
# preview the first five rows of the per-item daily sales table
print('item daily sales :')
item_daily_sales.head(n=5)

In [None]:
# descriptive statistics for total daily sales, one row per column
print('Total daily sales summary:')
total_sales_stats = total_daily_sales.describe()
total_sales_stats.transpose()

In [None]:
# descriptive statistics for per-store daily sales, one row per column
print('store daily sales summary:')
store_sales_stats = store_daily_sales.describe()
store_sales_stats.transpose()

In [None]:
# descriptive statistics for per-item daily sales, one row per column
print('item daily sales summary:')
item_sales_stats = item_daily_sales.describe()
item_sales_stats.transpose()

### Data Visualisation

##### Visualise total daily sales

In [None]:
# line plot of total sales against date
total_sales_by_date = total_daily_sales[['date', 'sales']]
total_sales_by_date.plot(x='date', figsize=(15, 8))

##### Visualise store daily sales

In [None]:
# overlay one daily-sales line per store on a single shared axes
fig, axs = pyplot.subplots(figsize=(15, 8))
store_ids = store_daily_sales['store'].unique()
for store_id in store_ids:
    belongs_to_store = store_daily_sales['store'] == store_id
    store_series = store_daily_sales.loc[belongs_to_store, ['date', 'sales']]
    store_series.plot(x='date', ax=axs, legend=False)
axs.set_ylabel("sales")
pyplot.show()

##### Visualise item daily sales

In [None]:
# overlay one daily-sales line per item on a single shared axes
fig, axs = pyplot.subplots(figsize=(15, 15))
item_ids = item_daily_sales['item'].unique()
for item_id in item_ids:
    belongs_to_item = item_daily_sales['item'] == item_id
    item_series = item_daily_sales.loc[belongs_to_item, ['date', 'sales']]
    item_series.plot(x='date', ax=axs, legend=False)
axs.set_ylabel("sales")
pyplot.show()

#### Visualise total monthly sales


In [None]:
# aggregate total daily sales into calendar-month totals, binned on 'date'
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
# 'ME'; it is kept here so the cell still runs on older versions - confirm
# against the pandas version in use.
monthly_bins = total_daily_sales.resample(rule='M', on='date')
monthly_sales = monthly_bins.sum()
monthly_sales.head(n=5)

In [None]:
# line plot of the monthly sales totals
monthly_axes = monthly_sales.plot(figsize=(15, 8))
monthly_axes.set_ylabel("monthly sales")
pyplot.show()

## Initial data exploration summary


#### data set size: 3,652,000 values (913,000 rows × 4 columns)

#### data fields:
##### date - Date of the sale data. There are no holiday effects or store closures.
##### store - Store ID
##### item - Item ID
##### sales - Number of units of an Item sold at a particular store on a particular date.

##### data set time period: 2013-01-01 to 2017-12-31 (5 years)
