In [None]:
%pip install --q vectice -U
%pip install --q squarify -U
%pip install --q plotly -U
%pip install --q seaborn -U
%pip install --q nbformat -U

In [None]:
# Identifier of the Vectice phase this notebook documents; it is passed to vec.phase() further down
phs_id = "PHA-1596"

In [None]:
# Open the Vectice connection used by the rest of the notebook.
# The connection settings are read from the local file "token_i.json"
# (presumably the API token downloaded from the Vectice app - confirm).
import vectice as vct
vec = vct.connect(config="token_i.json")

## Import libraries

In [None]:
# Third-party libraries used throughout the notebook (grouped and sorted alphabetically)
import matplotlib as mpl
import matplotlib.pyplot as plt  # essential graphical output
import numpy as np  # mathematical essentials
import pandas as pd  # data science essentials
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
import seaborn as sns  # enhanced graphical output
import squarify
from plotly.subplots import make_subplots

# Notebook-wide plotting configuration
mpl.rcParams['agg.path.chunksize'] = 10000
py.init_notebook_mode(connected=True)

In [None]:
# Download the files locally.
# Pure-Python download instead of `!wget ... --no-check-certificate`:
#   * TLS certificates are verified (the wget flag disabled verification),
#   * a failed download raises immediately instead of being hidden by `-q`,
#   * re-running the cell overwrites the file rather than creating `items.csv.1`, ...
#   * no dependency on a `wget` binary being installed.
from urllib.request import urlretrieve

DATA_BASE_URL = "https://vectice-examples.s3.us-west-1.amazonaws.com/Tutorial/ForecastTutorial"
DATA_FILES = [
    "items.csv",
    "holidays_events.csv",
    "stores.csv",
    "oil.csv",
    "transactions.csv",
    "train_reduced.csv",
]

for file_name in DATA_FILES:
    # Each file is saved in the working directory under its original name
    urlretrieve(f"{DATA_BASE_URL}/{file_name}", file_name)

#### Great! Let's build dataframes from the file for later use

In [None]:
# Load every downloaded CSV into its own dataframe.
# Files that carry a 'date' column have it parsed into datetimes at read time.
date_columns = ['date']

# Plain reads (no date parsing requested)
items = pd.read_csv("items.csv")
stores = pd.read_csv("stores.csv")
df = pd.read_csv("train_reduced.csv")

# Reads with the 'date' column parsed
holiday_events = pd.read_csv("holidays_events.csv", parse_dates=date_columns)
oil = pd.read_csv("oil.csv", parse_dates=date_columns)
transactions = pd.read_csv("transactions.csv", parse_dates=date_columns)

### Create Vectice dataset assets

#### First let's navigate your way to your personal workspace, get the tutorial project and start an iteration of the 'Data Understanding' phase. Go ahead and execute the cell below to navigate to your workspace. 

In [None]:
# Start an iteration of the phase
# Looks up the phase by `phs_id` on the `vec` connection and creates a new iteration;
# the returned object (`active_iter`) is what every later cell documents against.
active_iter = vec.phase(phs_id).create_iteration()

#### Let's document the datasets we created for our project

In [None]:
# Provide context into the origin datasets by attaching them to the step.
#
# NOTE: the PNG attachments are produced by the visualization cells further down.
# On a fresh kernel (Restart & Run All) they do not exist yet when this cell runs,
# so each attachment is only passed along when the file is actually present;
# otherwise the dataset is still documented and a warning is printed.
from pathlib import Path

# (dataset name, source CSV, dataframe, optional image attachment)
origin_datasets = [
    ("Items origin", "items.csv", items, "Item1.png"),
    ("Holiday origin", "holidays_events.csv", holiday_events, "holiday.png"),
    ("Stores origin", "stores.csv", stores, "Store1.png"),
    ("Oil origin", "oil.csv", oil, None),
    ("Transactions origin", "transactions.csv", transactions, "transaction1317.png"),
]

for dataset_name, csv_path, frame, image_path in origin_datasets:
    origin_kwargs = {}
    if image_path is not None:
        if Path(image_path).is_file():
            origin_kwargs["attachments"] = image_path
        else:
            print(f"Attachment '{image_path}' not found - '{dataset_name}' is documented without it. "
                  "Run the visualization cells first, then re-run this cell to attach it.")
    active_iter.step_collect_initial_data += vct.Dataset.origin(
        name=dataset_name,
        resource=vct.FileResource(paths=csv_path, dataframes=frame),
        **origin_kwargs,
    )

active_iter.step_collect_initial_data += "5 datasets have been identified as origin dataset that will be combined to create the main dataset used for this project, once the exploration and data quality are completed."

#### Great, now we have our datasets and metadata documented in Vectice... pretty straightforward!

# Describe data

#### The following few cells are boilerplate code and not specific to Vectice
#### Obviously your strategy to describe your datasets might be more fleshed out than this

### Collect basic data properties

### Document the "Describe Data" step and close it

#### Let's push all that we have learned from our datasets to Vectice; keeping the context inline makes it simple

In [None]:
# Document the "Describe Data" step
# Report the dimensions (rows = observations, columns = features) of each origin dataset.
# Every line is built from the same template so the spacing is uniform
# (the Holiday line previously rendered as "N- Features" with a missing space).
dataset_sizes = [
    ("Items dataset", items),
    ("Holiday dataset", holiday_events),
    ("Stores dataset", stores),
    ("Oil", oil),
    ("Transactions", transactions),
]

msg = "\nSize of Original Dataset:\n" + "\n".join(
    f"{label}: Observations: {frame.shape[0]} - Features: {frame.shape[1]}"
    for label, frame in dataset_sizes
)

# Plain assignment (not +=), exactly as before: this sets the step's content to the summary
active_iter.step_describe_data = "The data properties have been reviewed for the datasets identified\n" + msg

### Visualizations

#### Stores Visualizations

In [None]:
# Treemap of store counts across different cities
# Count stores per city and flatten the result into a two-column frame
marrimeko = stores.city.value_counts().to_frame().reset_index()
marrimeko.columns = ["city", "count"]

# Square axes on a fresh figure; squarify draws on the current axes
fig = plt.figure(figsize=(25, 21))
ax = fig.add_subplot(111, aspect="equal")
ax = squarify.plot(
    sizes=marrimeko['count'].values,
    label=marrimeko['city'].values,
    color=sns.color_palette('cubehelix_r', 28),
    alpha=1,
)

# A treemap has no meaningful axis ticks
ax.set_xticks([])
ax.set_yticks([])

# Enlarge the figure before saving and displaying it
fig.set_size_inches(40, 25)
plt.title("Treemap of store counts across different cities", fontsize=18)
fig.savefig('Store1.png', dpi=300)
plt.show()

In [None]:
# Treemap of store counts across different States
# Count stores per *state*. The previous version counted `stores.city` and merely
# relabelled the column as "state", so this chart duplicated the city treemap.
fig = plt.figure(figsize=(25, 21))
marrimeko = stores.state.value_counts().to_frame().reset_index()
marrimeko.columns = ["state", "count"]

# Square axes; squarify draws on the current axes
ax = fig.add_subplot(111, aspect="equal")
ax = squarify.plot(sizes=marrimeko['count'].values, label=marrimeko['state'].values,
              color=sns.color_palette('viridis_r', 28), alpha=1)

# A treemap has no meaningful axis ticks
ax.set_xticks([])
ax.set_yticks([])

# Enlarge the figure before saving and displaying it
fig.set_size_inches(40, 25)
plt.title("Treemap of store counts across different States", fontsize=18)
fig.savefig('Store2.png', dpi=300)
plt.show()

##### Inspecting the allocation of clusters to store numbers - Visualizations

In [None]:
# Stacked barplot of store types and their cluster distribution
# Note: switching the style here affects every matplotlib figure drawn afterwards
plt.style.use('dark_background')

# Number of stores for each (type, cluster) pair; clusters become the stacked segments
type_cluster = stores.groupby(['type', 'cluster']).size()
cluster_by_type = type_cluster.unstack()
cluster_by_type.plot(
    kind='bar',
    stacked=True,
    colormap='PuBu',
    figsize=(13, 11),
    grid=False,
)

plt.title('Stacked Barplot of Store types and their cluster distribution', fontsize=18)
plt.xlabel('Store type', fontsize=16)
plt.ylabel('Count of clusters in a particular store type', fontsize=16)
plt.savefig('Store4.png', dpi=300)
plt.show()

#### Holidays Visualization

In [None]:
# Stacked barplot of locale name against event type
# Number of holiday/event rows for each (locale_name, type) pair; types become the stacked segments
holiday_local_type = holiday_events.groupby(['locale_name', 'type']).size()
type_by_locale = holiday_local_type.unstack()
type_by_locale.plot(
    kind='bar',
    stacked=True,
    colormap='magma_r',
    figsize=(12, 10),
    grid=False,
)

plt.title('Stacked Barplot of locale name against event type')
plt.ylabel('Count of entries')
plt.savefig('holiday.png')
plt.show()

#### Transactions Visualization

In [None]:
# Distribution of transactions per day from 2013 till 2017
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed
# the old names, so plt.style.use('seaborn-white') raises on current installs.
# Pick whichever name this Matplotlib version provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

plt.figure(figsize=(13, 11))
plt.plot(transactions.date.values, transactions.transactions.values, color='darkblue')
# Fixed y-range, as in the original chart
plt.ylim(-50, 10000)
plt.title("Distribution of transactions per day from 2013 till 2017")
plt.ylabel('transactions per day', fontsize=16)
plt.xlabel('Date', fontsize=16)
plt.savefig('transaction1317.png')
plt.show()

#### Items Visualizations

In [None]:

# Counts of items per family category
# Compute the counts once and reuse them for both charts
family_counts = items.family.value_counts()

# Interactive plotly bar chart, displayed in the notebook
trace2 = go.Bar(
    x=family_counts.index,
    y=family_counts.values,
    marker=dict(
        color=family_counts.values,
        colorscale='Portland',
        reversescale=False,
    ),
    orientation='v',
)

layout = dict(
    title='Counts of items per family category',
    width=800, height=800,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
    ))

fig1 = go.Figure(data=[trace2])
fig1.update_layout(layout)
py.iplot(fig1, filename='plots')

# Static copy for 'Item1.png'.
# plt.savefig() only knows about matplotlib figures, so calling it after a plotly chart
# wrote an empty image. Plotly's own static export needs an extra package that this
# notebook does not install, so the same counts are drawn with matplotlib for the file.
static_fig, static_ax = plt.subplots(figsize=(12, 10))
family_counts.plot(kind='bar', ax=static_ax, color='steelblue')
static_ax.set_title('Counts of items per family category')
static_ax.set_xlabel('Item family')
static_ax.set_ylabel('Number of items')
static_fig.tight_layout()
static_fig.savefig('Item1.png')
# The interactive chart above is the one shown inline; release the static figure
plt.close(static_fig)

In [None]:
# Number of items attributed to a particular item class
# Compute the counts once and reuse them for both charts
class_counts = items['class'].value_counts()

# Interactive plotly bar chart, displayed in the notebook
trace2 = go.Bar(
    x=class_counts.index,
    y=class_counts.values,
    marker=dict(
        color=class_counts.values,
        colorscale='Portland',
        reversescale=True,
    ),
    orientation='v',
)

layout = dict(
    title='Number of items attributed to a particular item class',
    width=800, height=1400,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
    ))

fig1 = go.Figure(data=[trace2])
fig1.update_layout(layout)
py.iplot(fig1, filename='plots')

# Static copy for 'Item2.png'.
# plt.savefig() only knows about matplotlib figures, so calling it after a plotly chart
# wrote an empty image. Plotly's own static export needs an extra package that this
# notebook does not install, so the same counts are drawn with matplotlib for the file.
static_fig, static_ax = plt.subplots(figsize=(16, 10))
bar_positions = np.arange(len(class_counts))
static_ax.bar(bar_positions, class_counts.values, width=1.0, color='steelblue')
# Label only a subset of bars (about 30) so the class labels stay readable
label_step = max(1, len(class_counts) // 30)
static_ax.set_xticks(bar_positions[::label_step])
static_ax.set_xticklabels(class_counts.index[::label_step], rotation=90)
static_ax.set_title('Number of items attributed to a particular item class')
static_ax.set_xlabel('Item class (ordered by number of items)')
static_ax.set_ylabel('Number of items')
static_fig.tight_layout()
static_fig.savefig('Item2.png')
# The interactive chart above is the one shown inline; release the static figure
plt.close(static_fig)

In [None]:
# Stacked barplot of item family against perishable flag
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed
# the old names, so plt.style.use('seaborn-white') raises on current installs.
# Pick whichever name this Matplotlib version provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# Number of items for each (family, perishable) pair; the perishable flag becomes the stacked segments
fam_perishable = items.groupby(['family', 'perishable']).size()
fam_perishable.unstack().plot(kind='bar', stacked=True, colormap='coolwarm', figsize=(12, 10), grid=False)
# The title was previously copy-pasted from the holiday chart ("locale name against event type")
plt.title('Stacked Barplot of item family against perishable flag')
plt.ylabel('Count of entries')
plt.savefig('Item3.png')

### Document our findings in Vectice by attaching the visualizations we just created

In [None]:
# Append every saved chart to the "Describe Data" step, in the order the charts were produced
saved_charts = (
    'Store1.png',
    'Store2.png',
    'Store4.png',
    'holiday.png',
    'transaction1317.png',
    'Item1.png',
    'Item2.png',
    'Item3.png',
)
for chart_file in saved_charts:
    active_iter.step_describe_data += chart_file

In [None]:
def _describe_excerpt(frame, end):
    """Return a truncated JSON rendering of ``frame.describe()``.

    Every cell of the summary is formatted with ``'{:,.2f}'``, the table is
    serialized with ``to_json(indent=2)``, and characters ``[1:end]`` of that
    text are returned (index 0, the opening brace, is dropped).

    ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    ``DataFrame.map``; the new name is used when available and the old one
    otherwise, so the cell runs without warnings on both old and new pandas.
    """
    summary = frame.describe()
    elementwise = summary.map if hasattr(summary, "map") else summary.applymap
    return elementwise('{:,.2f}'.format).to_json(indent=2)[1:end]


# Append one summary excerpt per dataset to the "Explore Data" step.
# Cut-off lengths and the trailing "..." are kept exactly as they were.
active_iter.step_explore_data += f"Items dataset:\n{_describe_excerpt(items, 480)}..."
active_iter.step_explore_data += f"Stores dataset:\n{_describe_excerpt(stores, 480)}"
active_iter.step_explore_data += f"Holidays dataset:\n{_describe_excerpt(holiday_events, 480)}"
active_iter.step_explore_data += f"Oil dataset:\n{_describe_excerpt(oil, 480)}"
active_iter.step_explore_data += f"Transaction dataset:\n{_describe_excerpt(transactions, 470)}..."

# Verify Data Quality

### Basic EDA

In [None]:

# For each dataset, append its row count and per-column missing-value counts
# to the "Verify Data Quality" step.
datasets = dict(
    items=items,
    holiday_events=holiday_events,
    stores=stores,
    oil=oil,
    transaction=transactions,
)
for label, frame in datasets.items():
    row_count = len(frame)
    # isnull() and isna() are both reported, as in the original message
    null_counts = frame.isnull().sum()
    na_counts = frame.isna().sum()
    quality_report = (
        f"Dataset: {label}\n"
        f"There are {row_count} rows in the dataset\n"
        f"Isnull report:\n{null_counts}\n"
        f"Missing values report:\n{na_counts}"
    )
    active_iter.step_verify_data_quality += quality_report

### Document our findings in Vectice

In [None]:
# Append the data-quality conclusion to the "Verify Data Quality" step.
# NOTE(review): the earlier comment said this closes the step and marks it completed in the
# webapp; the code here only appends a message with `+=` - confirm against the Vectice API.
active_iter.step_verify_data_quality += "The information comprise in this dataset is accurate and comprehensive.\nAs the information aligns with other trusted resources, the dataset was considered as reliable and also relevant to the business problem we are trying to solve.\nHowever, this data can not be used for real time reporting as the data does not update itself.\nFurther data preparation is required."

In [None]:
# Final step of the notebook: call complete() on the iteration created above
# (presumably marks the iteration as completed in Vectice - confirm against the Vectice API).
active_iter.complete()