# DATA VISUALIZATION
## Load Data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib 
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
import datetime
import pickle

pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed (no truncation)

In [None]:
# Environment settings: folders the CSVs are read from and the pickles are written to.
data_path_in = 'Data/input/'
data_path_out = 'Data/output/'

# Column -> dtype mapping handed to pd.read_csv so columns are loaded with
# compact types instead of pandas' defaults.
# NOTE(review): the columns typed as plain `float` carry a trailing 'uint8'
# comment, i.e. 'uint8' was the intended type. Presumably they contain missing
# values, which an unsigned integer dtype cannot hold -- TODO confirm.
data_type = {
    'StoreID': 'uint16',
    'Date': str,
    'IsHoliday': bool,
    'IsOpen': bool,
    'HasPromotions': bool,
    'StoreType': 'category',
    'AssortmentType': 'category',
    'NearestCompetitor': 'uint32',
    'Region': 'category',
    'NumberOfCustomers': 'uint16',
    'NumberOfSales': 'uint16',
    'Region_AreaKM2': 'uint16',
    'Region_GDP': 'uint16',
    'Region_PopulationK': 'uint16',
    'CloudCover': 'float32',
    'Events': 'category',
    'Max_Dew_PointC': 'int8',
    'Max_Gust_SpeedKm_h': float, # intended: 'uint8'
    'Max_Humidity': 'uint8',
    'Max_Sea_Level_PressurehPa': 'uint16',
    'Max_TemperatureC': 'int8',
    'Max_VisibilityKm': float, # intended: 'uint8'
    'Max_Wind_SpeedKm_h': 'uint8',
    'Mean_Dew_PointC': 'int8',
    'Mean_Humidity': 'uint8',
    'Mean_Sea_Level_PressurehPa': 'uint16',
    'Mean_TemperatureC': 'int8',
    'Mean_VisibilityKm': float, # intended: 'uint8'
    'Mean_Wind_SpeedKm_h': 'uint8',
    'Min_Dew_PointC': 'int8',
    'Min_Humidity': 'uint8',
    'Min_Sea_Level_PressurehPa': 'uint16',
    'Min_TemperatureC': 'int8',
    'Min_VisibilitykM': float, # intended: 'uint8'
    'Precipitationmm': 'float16',
    'WindDirDegrees': 'int16'
}

# Load the train and test CSVs with the dtypes declared in `data_type`
# (dates are parsed day-first) and pickle each DataFrame for later use in the
# preprocessing notebook.
df = {}
for ds_name in ['train', 'test']:
    df[ds_name] = pd.read_csv(data_path_in+ds_name+'.csv', dtype=data_type, parse_dates=["Date"], dayfirst=True)
    # Serialize only this dataset's frame into its own file. Dumping the whole
    # `df` dict at this point wrote a half-built dict: 'train_dv.obj' held
    # {'train': ...} and 'test_dv.obj' held both frames.
    # NOTE(review): consumers of '*_dv.obj' now receive a DataFrame, not a dict.
    with open(data_path_out+ds_name+'_dv.obj', 'wb') as file:
        pickle.dump(df[ds_name], file)

# Convenience handles used throughout the rest of the notebook.
train = df['train']
test = df['test']

In [None]:
# Report the (rows, columns) shape of both datasets.
shape_report = "Train set has shape {}.\nTest set has shape {}"
print(shape_report.format(train.shape, test.shape))

## Combining train & test data

In [None]:
# Split the training frame into features (X) and the two target columns (y).
target_columns = ['NumberOfCustomers', 'NumberOfSales']
X = train.drop(columns=target_columns)
y = train.loc[:, 'NumberOfCustomers':'NumberOfSales']

# Stack train features on top of the test set under a fresh 0..n-1 index.
all_data = pd.concat([X, test], axis=0, ignore_index=True)

# Restore the original column order (the one used by the test set).
all_data = all_data[list(test.columns)]
all_data.head(10)


## 1. Numerical features

In [None]:
# Keep only the numeric columns and report how large a share of all columns they are.
numeric_features = all_data.select_dtypes(include=[np.number])

n_numeric = numeric_features.shape[1]
numeric_share = (n_numeric / all_data.shape[1]) * 100
print("There are {} numeric features ({:.2f}% of total) ".format(n_numeric, numeric_share))

In [None]:
# Summary statistics of the combined train+test frame (pandas' default column selection).
all_data.describe()

### Date manipulation

In [None]:
# Peek at the first parsed dates.
all_data["Date"].head()

In [None]:
# Missing values: horizontal bar chart of the NaN count per numeric column,
# restricted to columns that actually have gaps.
plt.figure(figsize=(15,6))
rows_length = numeric_features.shape[0]

nan_counts = numeric_features.isna().sum()
# Mask out the zero counts, drop them, then order the rest ascending.
numeric_nan_length = nan_counts.where(lambda x : x > 0).dropna().sort_values()

ax = numeric_nan_length.plot(kind='barh', alpha=0.9, title='Missing values count', table=True)
ax.xaxis.set_visible(False) # hide x axis labels

### Max_Gust_SpeedKm_h

In [None]:
# Remove the Max_Gust_SpeedKm_h column and show the resulting shape.
all_data = all_data.drop(columns='Max_Gust_SpeedKm_h')
all_data.shape

### CloudCover & Visibility

In [None]:
# Index labels of the rows whose CloudCover is missing.
cloud_missing = all_data['CloudCover'].isnull()
nan_index = all_data.index[cloud_missing].tolist()
# Draft (disabled): look up the CloudCover mode among stores of the same region and date.
# for i in nan_index:
#     curr_date = all_data.loc[i,'Date']
#     curr_region = all_data.loc[i,'Region']
#     regional_stores = all_data[(all_data.Date == curr_date) & (all_data.Region == curr_region)]
#     curr_mode = regional_stores['CloudCover'].mode()


# 

### Time - sales visualization

In [None]:
# Build one Date-indexed frame per store, keyed by StoreID.
stores = {}
for storeid in train["StoreID"].unique():
    store_mask = train["StoreID"] == storeid
    stores[storeid] = train.loc[store_mask].set_index("Date")

In [None]:
#ax = stores[1000].plot(y="NumberOfSales",figsize=(20,10))
#for key,store in stores.items():
#    store.plot(ax=ax, y="NumberOfSales")

In [None]:
# Monthly sales per store, drawn as one subplot per region on a single figure.
train_byRegion = train.groupby('Region')

# Each region label is converted with int() and used as the subplot row, so
# the grid needs one row per label (0..10).
n_region_rows = 11

# Create the figure once, at its final size. Previously a 200x100 figure was
# created and then resized to 20x100 by every single plot call.
plt.figure(figsize=(20,100))
for region,regStores in train_byRegion:
    print(region)

    ax = plt.subplot2grid((n_region_rows,1),(int(region), 0))
    ax.set_title("Region {}".format(region))
    ax.set_ylabel("NumberOfSales (monthly sum)")

    for key, store in regStores.groupby('StoreID'):
        # Resampling by month requires a datetime index.
        store = store.set_index("Date")
        store.NumberOfSales.resample('M').sum().plot(ax = ax, legend=False)

# Render once, after every region has been drawn. Calling plt.show() inside
# the loop flushed the figure after the first region, so each later region
# ended up alone on a new figure and the grid was never filled.
plt.show()
    

In [None]:
# `store` is the loop variable left over from the per-region plotting cell:
# the last per-store frame assigned there (indexed by Date).
store