In [1]:
%%javascript
// Replace the output area's _should_scroll hook with one that always answers false.
// Presumably this keeps long outputs (e.g. large plots) fully expanded instead of
// being placed in a scrolling box -- confirm against the notebook frontend in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib 
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
from tqdm import tqdm
import  plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from src import utils
import datetime
import warnings
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import colorlover  as cl
from IPython.display import HTML
import sys


# Hide deprecation-style warnings only. A blanket filterwarnings("ignore")
# would also silence warnings that point at real problems (runtime or
# data-handling warnings), so the filter is restricted to these categories.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Enable plotly's offline notebook rendering
init_notebook_mode(connected=True)


# Environment settings
data_path_in = 'Data/input/'
# NOTE(review): 'ouput' looks like a typo for 'output'. Left untouched because it
# may match the directory name on disk -- confirm before changing.
data_path_out= 'Data/ouput/'

In [3]:
# Load the cleaned train/test tables saved by the "preprocessing" step
# and report their dimensions (train first, then test).
train_df = pd.read_csv(data_path_in + 'train_clean.csv')
test_df = pd.read_csv(data_path_in + 'test_clean.csv')
for loaded_frame in (train_df, test_df):
    print(loaded_frame.shape)

(523021, 36)
(45689, 34)


# Changing time format

In [4]:
# Parse the 'YYYY-MM-DD' strings of the Date column into datetime64 values.
# pd.to_datetime is vectorised (no per-row Python strptime call) and also accepts
# a column that is already datetime, so re-running this cell does not raise the
# TypeError that strptime would raise on non-string input.
train_df['Date'] = pd.to_datetime(train_df['Date'], format='%Y-%m-%d')
test_df['Date'] = pd.to_datetime(test_df['Date'], format='%Y-%m-%d')
print("Date column converted to Datetime type")

Date column converted to Datetime type


# Combining test & train set

In [5]:
# The two target columns are absent from the test set; remove them from a
# new frame (train_df itself is left untouched) so both tables share a schema.
train_stripped = train_df.drop(columns=['NumberOfCustomers', 'NumberOfSales'])

# Stack train on top of test with a fresh 0..n-1 index, then put the columns
# back in the order used by the test set.
test_column_order = test_df.columns.tolist()
stacked = pd.concat([train_stripped, test_df], axis=0, ignore_index=True)
all_data = stacked[test_column_order]
print("Train and test combined. Shape {}".format(all_data.shape))

Train and test combined. Shape (568710, 34)


# 1. Store information

## 1.1 Number of stores and sales per store type

In [6]:
# Store types in order of first appearance; both pies use this label order.
store_names = all_data['StoreType'].unique()
store_num = all_data['StoreID'].nunique()

# Pie chart 1: number of distinct stores of each type, aligned to store_names
store_type_count = (all_data.groupby('StoreType')['StoreID']
                    .nunique()
                    .reindex(store_names, fill_value=0)
                    .tolist())

# Pie chart 2: total sales of each type (sales exist only in the train set).
# groupby returns the types in sorted order, which need not match store_names,
# so the totals are aligned by label. Swapping hard-coded positions would only
# be correct for one specific ordering of the data.
sales_per_type = (train_df.groupby('StoreType')['NumberOfSales']
                  .sum()
                  .reindex(store_names, fill_value=0)
                  .tolist())

# Two pies side by side: left = store count per type, right = sales per type
fig = {
  "data": [
    {
      "values": store_type_count,
      "labels": store_names,
      "domain": {"x": [0, .48]},
      "name": "Store count/type",
      "hoverinfo":"label+value+name",
      "type": "pie"
    },
    {
      "values": sales_per_type,
      "labels": store_names,
      "domain": {"x": [.52, 1]},
      "name": "Sales count / Store type",
      "hoverinfo":"label+value+name",
      "type": "pie"
    }],
  "layout": {
        "title":"Store count & Sales per Store Type"
    }
}
iplot(fig, filename='store informations')

## 1.2 Number of stores per store type for each region

In [80]:
# One stacked-bar trace per store type:
# x = region id, y = number of distinct stores of that type in the region.
data = []
for type_label, type_rows in all_data.groupby('StoreType'):
    per_region = [(region_label, len(region_rows['StoreID'].unique()))
                  for region_label, region_rows in type_rows.groupby('Region')]
    trace = go.Bar(x=[region_label for region_label, _ in per_region],
                   y=[n_stores for _, n_stores in per_region],
                   name=type_label)
    data.append(trace)

layout = go.Layout(barmode='stack',
                   title='Store types count per Region',
                   xaxis=dict(title='Region'),
                   yaxis=dict(title='Store'))

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')

## 1.3 Store density per Region

In [71]:
# Store density per region = distinct stores / region area.
# The region ids are collected alongside the values so that the x axis and the
# hover text always have exactly one entry per region; no hard-coded region
# count is needed.
region_ids = []
density = []
store_num = []
region_size = []
for regionID, region in all_data.groupby('Region'):
    # The area is taken from the first value seen for the region
    area = region['Region_AreaKM2'].unique()[0]
    stores = len(region['StoreID'].unique())
    region_ids.append(regionID)
    density.append(stores / area)
    store_num.append(stores)
    region_size.append(area)

trace0 = go.Bar(
    x=region_ids,
    y=density,
    # Hover text: area and store count of the matching region
    text=["Area: "+str(size)+"\t Stores: "+str(count)
          for size, count in zip(region_size, store_num)],
    opacity=1
)

data = [trace0]
layout = go.Layout(
    title='Store per Area for each region',
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='store-area')

# 2. Time Series

In [8]:
# Aggregation applied to each column when rows are collapsed per period:
# the two sales targets are summed, every other column is averaged.
dic_train = {'IsHoliday':'mean',
       'IsOpen':'mean',
       'HasPromotions':'mean',
       'NearestCompetitor':'mean',
       'NumberOfCustomers':'sum',
       'NumberOfSales':'sum',
       'Region_AreaKM2':'mean',
       'Region_GDP':'mean',
       'Region_PopulationK':'mean',
       #'Max_Gust_SpeedKm_h':'mean', dropped in preprocessing
       'Max_Humidity':'mean',
       'Max_Sea_Level_PressurehPa':'mean',
       'Max_TemperatureC':'mean',
       'Max_VisibilityKm':'mean',
       'Max_Wind_SpeedKm_h':'mean',
       'Mean_Dew_PointC':'mean',
       'Mean_Humidity':'mean',
       'Mean_Sea_Level_PressurehPa':'mean',
       'Mean_TemperatureC':'mean',
       'Mean_VisibilityKm':'mean',
       'Mean_Wind_SpeedKm_h':'mean',
       'Min_Dew_PointC':'mean',
       'Min_Humidity':'mean',
       'Min_Sea_Level_PressurehPa':'mean',
       'Min_TemperatureC':'mean',
       'Min_VisibilitykM':'mean',
       'Precipitationmm':'mean',
       'WindDirDegrees':'mean'}

# Same mapping without the two target columns, which the combined
# train+test frame does not contain.
dic_all = {column: how for column, how in dic_train.items()
           if column not in ('NumberOfSales', 'NumberOfCustomers')}


def _aggregate_per_store(frame, freq, agg_map):
    """Collapse `frame` to one row per (Region, StoreID, period).

    frame   -- DataFrame with 'Region', 'StoreID', a datetime 'Date' column
               and every column named in `agg_map`
    freq    -- pandas offset alias for the period length ('M', 'W', ...)
    agg_map -- dict mapping column name to aggregation name

    Returns the aggregated DataFrame with the group keys as ordinary columns.
    """
    period_keys = ["Region", "StoreID", pd.Grouper(key='Date', freq=freq)]
    return frame.groupby(period_keys).agg(agg_map).reset_index()


monthly_sales_train = _aggregate_per_store(train_df, 'M', dic_train)
monthly_sales_all = _aggregate_per_store(all_data, 'M', dic_all)

weekly_sales_train = _aggregate_per_store(train_df, 'W', dic_train)
# This cell previously computed weekly_sales_train a second time (freq 'w').
# The redundant computation is replaced by the weekly frame for the combined
# data, mirroring the monthly train/all pair above.
weekly_sales_all = _aggregate_per_store(all_data, 'W', dic_all)

In [39]:
#Specify plot colors: the 11-colour qualitative 'Paired' scale from colorlover
spectral_c=cl.scales['11']['qual']['Paired']

def create_plot_data(dataset,interest,method):
    """Build one Scattergl trace per region for the column `interest`.

    dataset  -- DataFrame with 'Region' and 'Date' columns plus `interest`
    interest -- name of the column to aggregate per date
    method   -- 'sum' totals the column per date; any other value averages it

    Returns a list of go.Scattergl traces, one per region, in groupby order.
    """
    data_=[]
    for regionID, region in dataset.groupby('Region'):
        per_date = region.groupby('Date')[interest]
        regional_data = per_date.sum() if method=='sum' else per_date.mean()

        data_.append(go.Scattergl(
            # x must come from the aggregated series: region['Date'] holds one
            # entry per original row, so it matches neither the length nor the
            # order of the per-date y values.
            x=regional_data.index,
            y=regional_data,
            name = regionID,
            # Wrap around the palette so more regions than colours cannot
            # raise an IndexError.
            line = dict(color = spectral_c[len(data_) % len(spectral_c)]),
            opacity = 1))
    return data_

# Plotly `updatemenus` layout entry shared by the time-series figures in this
# notebook. It is produced by the project helper utils.get_region_updatemenu,
# whose implementation is not part of this notebook.
# NOTE(review): presumably a per-region trace selector -- confirm in src/utils.
updatemenus = utils.get_region_updatemenu(default_active=0)

In [37]:
# Monthly figure: total NumberOfSales per region and month
monthly_sales_traces = create_plot_data(monthly_sales_train, 'NumberOfSales', "sum")
layout_m = {
    'title': 'Sales/Month per Region',
    'showlegend': False,
    'updatemenus': updatemenus,
}
fig_m = {'data': monthly_sales_traces, 'layout': layout_m}
iplot(fig_m, filename='Sales per Month for each Region')

In [38]:
# Weekly figure: total NumberOfSales per region and week
weekly_sales_traces = create_plot_data(weekly_sales_train, 'NumberOfSales', "sum")
layout_w = {
    'title': 'Sales/Week per Region',
    'showlegend': False,
    'updatemenus': updatemenus,
}
fig_w = {'data': weekly_sales_traces, 'layout': layout_w}
iplot(fig_w, filename='Sales per Week for each Region')

In [41]:
# Weekly figure: mean of the HasPromotions column per region and week
weekly_promo_traces = create_plot_data(weekly_sales_train, 'HasPromotions', "mean")
layout_w_promo = {
    'title': 'Promo mean /Week per Region',
    'showlegend': False,
    'updatemenus': updatemenus,
}
fig_w_promo = {'data': weekly_promo_traces, 'layout': layout_w_promo}
iplot(fig_w_promo, filename='Promo mean per Week for each Region')

In [135]:
def _min_max_scale(values):
    """Rescale `values` linearly to the [0, 1] range and return a numpy array.

    An empty input is returned as an empty array; when all values are equal
    the result is all zeros instead of a division by zero.
    """
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        return arr
    span = arr.max() - arr.min()
    if span == 0:
        return np.zeros_like(arr)
    return (arr - arr.min()) / span


# Population, area and population density of every region, read from the first
# row of each region group.
region_ids = []
pop = []
area = []
density = []
for regionID, region in monthly_sales_train.groupby('Region'):
    # Region_PopulationK is multiplied by 1000 here (assumed to be in
    # thousands of inhabitants -- TODO confirm)
    region_pop = region['Region_PopulationK'].iloc[0] * 1000
    region_area = region['Region_AreaKM2'].iloc[0]
    region_ids.append(regionID)
    pop.append(region_pop)
    area.append(region_area)
    density.append(region_pop / region_area)

# Explicit array arithmetic: subtracting a scalar from a plain list only works
# when the elements happen to be numpy scalars.
normalized_pop = _min_max_scale(pop)
normalized_area = _min_max_scale(area)
normalized_density = _min_max_scale(density)

# x uses the collected region ids so every trace has one position per region
# (a fixed np.arange(0,12) does not match the number of regions).
data_pop=go.Bar(x=region_ids,
                y=normalized_pop,
                name='Population',
                text=pop)

data_area=go.Bar(x=region_ids,
                 y=normalized_area,
                 text=area,
                 name='Area kmq')

data_density=go.Bar(x=region_ids,
                    y=normalized_density,
                    text=density,
                    name='Density')



In [136]:
# Grouped bars: the three normalised metrics side by side for every region
layout = go.Layout(barmode='group',
                   title='Population-Area-Density (Normalized)',
                   xaxis=dict(title='Region'),
                   yaxis=dict(title='Normalized value'))

data = [data_pop, data_area, data_density]
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='Population info')

In [None]:
# Population-only bar chart, reusing the normalised population trace.
layout = dict(title='Population region', showlegend=False)

# A figure's 'data' entry is a list of traces, so the single trace is wrapped
# in a list rather than passed bare.
fig = dict(data=[data_pop], layout=layout)
iplot(fig, filename='Population per Region')

In [13]:
# For every region, count the rows flagged both as holiday and as open.
# The count is over rows of all_data (presumably one row per store and day),
# not over distinct stores.
for regionID, region in all_data.groupby('Region'):
    holiday_and_open = (region['IsHoliday'] == 1) & (region['IsOpen'] == 1)
    open_hol_stores = int(holiday_and_open.sum())
    print("Region {}\t Open stores on holiday {} ".format(regionID,open_hol_stores))

Region 0	 Open stores on holiday 110 
Region 1	 Open stores on holiday 2 
Region 2	 Open stores on holiday 22 
Region 3	 Open stores on holiday 95 
Region 4	 Open stores on holiday 20 
Region 5	 Open stores on holiday 11 
Region 6	 Open stores on holiday 3 
Region 7	 Open stores on holiday 23 
Region 8	 Open stores on holiday 24 
Region 9	 Open stores on holiday 29 
Region 10	 Open stores on holiday 100 
