# Before you start with this tutorial

**Tutorial Intention:** Provide an example of an iteration and its related steps in the Data Preparation phase, so that you can:

*   Experience the data science lifecycle using Vectice
*   See how simple it is to connect your notebook to Vectice
*   Learn how to structure and log your work using Vectice

**Resources needed:**
*   Forecast Unit Sales Tutorial Project: you can find it in your personal workspace, which is named after you
*   Vectice Webapp Documentation: 
*   Vectice API documentation: 

# Install packages with pip

In [1]:
!pip install -q python-dotenv gql Deprecated requests_toolbelt
!pip install --q squarify
!pip install --q s3fs
!pip install --upgrade boto3
!pip install --q plotly

[K     |████████████████████████████████| 65 kB 3.1 MB/s 
[K     |████████████████████████████████| 54 kB 2.8 MB/s 
[K     |████████████████████████████████| 202 kB 66.1 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.1.5 requires typing-extensions<4.2.0,>=3.7.4.1; python_version < "3.8", but you have typing-extensions 4.4.0 which is incompatible.
spacy 3.4.3 requires typing-extensions<4.2.0,>=3.7.4; python_version < "3.8", but you have typing-extensions 4.4.0 which is incompatible.
confection 0.0.3 requires typing-extensions<4.2.0,>=3.7.4.1; python_version < "3.8", but you have typing-extensions 4.4.0 which is incompatible.[0m
[?25h

In [2]:
# Install the Vectice client, pinned to version 22.4.7.1, from the TestPyPI index.
!pip install --index-url https://test.pypi.org/simple/ vectice==22.4.7.1

Looking in indexes: https://test.pypi.org/simple/, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vectice==22.4.7.1
  Downloading https://test-files.pythonhosted.org/packages/39/13/1664ea8ae0eedb9d9c93587069553247520487843462a8499765b50b811d/vectice-22.4.7.1-py2.py3-none-any.whl (116 kB)
[K     |████████████████████████████████| 116 kB 14.2 MB/s 
Installing collected packages: vectice
Successfully installed vectice-22.4.7.1


# Import libraries

In [4]:
# importing mathematical and ds libraries
import pandas as pd  # data science essentials
import matplotlib.pyplot as plt  # essential graphical output
import numpy as np   # mathematical essentials
%matplotlib inline

# import Visual libraries
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from matplotlib import pyplot as plt
import seaborn as sns  # enhanced graphical output

#importing other libraries
import IPython.display #this is for our data pipeline
import logging
import json
logging.basicConfig(level=logging.INFO)

# D3 modules
from IPython.core.display import display, HTML, Javascript
from string import Template
import datetime
from datetime import timedelta

In [5]:
#import the Vectice Library
import vectice
from vectice import FileDataWrapper, DatasetSourceUsage


urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!



# Connect to Vectice, Project and Phase

In [6]:
# Connect to Vectice through its API. The connection settings are read from the
# local API_token.json file, so no token is written in the notebook itself.
retail_ws = vectice.connect(config=r"API_token.json")

In [7]:
# Look up the tutorial project by its name on the object returned by vectice.connect()
retail_project = retail_ws.project(project="Tutorial Project: Forecast in store unit sales")

In [8]:
# Get the "Data Preparation" phase of the project; the iteration and steps used below come from it
DP = retail_project.phase("Data Preparation")

In [9]:
# Get the currently active iteration of the Data Preparation phase.
# NOTE(review): the name `iter` shadows Python's built-in iter(); it is kept
# because the following cells call iter.step(...).
iter = DP.iteration

## Select Data 

In [10]:
# Get the "Select Data" step from the active iteration; it is closed further
# down, after the source files have been attached to the project.
step = iter.step("Select Data")

In [None]:
# Load the raw CSV files. Every table whose `date` column is used as a merge key
# is parsed to datetime64 on read, so the date-based joins compare like with like.
items = pd.read_csv("items.csv")
holiday_events = pd.read_csv("holidays_events.csv", parse_dates=['date'])
stores = pd.read_csv("stores.csv")
oil = pd.read_csv("oil.csv", parse_dates=['date'])
transactions = pd.read_csv("transactions.csv", parse_dates=['date'])
# The training extract is merged with holiday_events and oil on `date` later on.
# Without parse_dates its dates stay plain strings, and pandas refuses to merge
# an object column with a datetime64 column.
df = pd.read_csv("train_trimmed.csv", parse_dates=['date'])

In [None]:
# Describe each raw CSV with a FileDataWrapper (display name + file path);
# these wrappers are what gets attached to the Vectice project afterwards.
items_file_wrapped = FileDataWrapper(name="Items origin", path="items.csv")
holiday_file_wrapped = FileDataWrapper(name="Holiday origin", path="holidays_events.csv")
stores_file_wrapped = FileDataWrapper(name="Stores origin", path="stores.csv")
oil_file_wrapped = FileDataWrapper(name="Oil origin", path="oil.csv")
transactions_file_wrapped = FileDataWrapper(name="Transactions origin", path="transactions.csv")
df_file_wrapped = FileDataWrapper(name="Training origin", path="train_trimmed.csv")

In [None]:
# Attach every wrapped source file to the project through `origin_dataset`,
# one assignment per file, in the order the wrappers were created.
# stores_file_wrapped is included here: it was wrapped together with the other
# files but never attached, so the stores table was missing from the project.
# The training file stays last, exactly as before.
for wrapped_file in (
    items_file_wrapped,
    holiday_file_wrapped,
    stores_file_wrapped,
    oil_file_wrapped,
    transactions_file_wrapped,
    df_file_wrapped,
):
    retail_project.origin_dataset = wrapped_file

In [11]:
# Close the "Select Data" step, passing the summary message for this step
step.close(message="We selected the main dataset: Corporation Favorita Dataset")

# Clean Data

In [13]:
# Get the Clean Data step from the active iteration.
# NOTE(review): the name is spelled "Clean data" (lowercase "data"), unlike the
# other step names in this notebook — confirm it matches the step name in Vectice.
step = iter.step("Clean data")

In [None]:
df.head()  # preview the first rows of the training data loaded from train_trimmed.csv

NameError: ignored

In [None]:
# Add the missing dates to the oil table: build a gap-free daily calendar over
# the training period and left-join the oil prices onto it, so that days
# without a quote become NaN rows that can be filled afterwards.
#
# The period is taken from `df`, the training data. The previous version read it
# from an undefined name (`train`) and stopped with a NameError.
train_dates = pd.to_datetime(df['date'])
min_oil_date = train_dates.min().normalize()
max_oil_date = train_dates.max().normalize()

# One row per calendar day, both end points included. The column stays
# datetime64 so it can be joined with oil['date']; DataFrame.merge raises when
# one key is made of strings and the other of datetimes.
calendar = pd.DataFrame({'date': pd.date_range(min_oil_date, max_oil_date, freq='D')})

# Convert oil['date'] defensively, so the join works whether or not the column
# was already parsed when the file was read.
oil = calendar.merge(oil.assign(date=pd.to_datetime(oil['date'])), on='date', how='left')

In [None]:
# Number of missing values in each column
print(oil.isna().sum(), '\n')

# Data type of each column
print('Type : ', '\n', oil.dtypes)

# Show the first five rows
oil.head()

In [None]:
# Fill the missing oil prices from their neighbouring rows.
# Rows are addressed with .loc and integer arithmetic on the labels, so this
# relies on `oil` having a default 0..n-1 index (labels == positions).
price_col = 'dcoilwtico'
last_row = len(oil) - 1

# Labels of the rows whose price is missing
na_index_oil = oil[oil[price_col].isnull()].index.values

# Neighbours used by the formula: the row just before each missing row, and the
# first row after the run of consecutive missing rows it belongs to (clamped to
# the last row of the table).
na_index_oil_minus = np.maximum(0, na_index_oil - 1)
na_index_oil_plus = na_index_oil.copy()

for i in range(len(na_index_oil)):
    k = 1
    # Walk forward while the following missing rows are contiguous with row i
    while i + k < len(na_index_oil) and na_index_oil[i + k] == na_index_oil[i] + k:
        k += 1
    na_index_oil_plus[i] = min(last_row, na_index_oil[i] + k)

# Apply the formula in ascending row order: inside a run of missing rows the
# "previous" price is therefore the value that has just been filled in.
for i in range(len(na_index_oil)):
    row = na_index_oil[i]
    previous_price = oil.loc[na_index_oil_minus[i], price_col]
    next_price = oil.loc[na_index_oil_plus[i], price_col]

    if row == 0 or pd.isna(previous_price):
        # Nothing usable before the gap: copy the next known price
        oil.loc[row, price_col] = next_price
    elif pd.isna(next_price):
        # The gap runs to the end of the table, so there is no later price.
        # The previous check compared the label with len(oil), which never
        # matches an existing row, and trailing gaps were left as NaN.
        oil.loc[row, price_col] = previous_price
    else:
        # Interior gap: midpoint between the next known and the previous price
        oil.loc[row, price_col] = (next_price + previous_price) / 2

In [None]:
# Bring the `date` column of both tables to the same datetime type
# (year-month-day), then display the resulting dtypes of holiday_events.
for frame in (holiday_events, oil):
    frame['date'] = pd.to_datetime(frame['date'], format="%Y-%m-%d")

holiday_events.dtypes

In [None]:
# The `id` column is not used in the analysis: remove it from the dataset
df = df.drop(columns=['id'])

# Save the cleaned dataset as a CSV file
df.to_csv("Corpo_fav_cleaned.csv")

In [None]:
# Wrap the cleaned CSV so its metadata is exported to Vectice and can be accessed from the webapp
clean_file_wrapped = FileDataWrapper(path="Corpo_fav_cleaned.csv", name="Corpo_fav_cleaned")
# NOTE(review): the cleaned file is attached through `origin_dataset`, the same
# attribute used for the raw source files — confirm this is the intended dataset type.
retail_project.origin_dataset = clean_file_wrapped

In [14]:
# Close step, mark it as completed in the webapp and publish message.
# NOTE(review): the message mentions converting booleans to integers and renaming
# columns; in this notebook those operations are written in prepare_data.transform
# (Format Data section), not in the cells of this step — confirm the wording.
step.close(message="I replaced NaN and missing values by their mean value. I changed boolean variable into Integers. I Renamed columns to be consistent. I dropped IDs as they were not making sense for our analysis. I transformed dates into date format as some were delaying weird results")

# Construct Data

In [15]:
# Get the "Construct Data" step from the active iteration; no code runs in this
# step, the feature selection is described in the markdown below.
step = iter.step("Construct Data")

**Here we analyze the data and select the features for our model to be trained on.**


* **Train**: id, date, store_nbr, item_nbr, unit_sales, onpromotion
* **Items**: item_nbr, family, class, perishable
* **Holidays_events**: date, type, locale, locale_name, description, transferred
* **Stores**: store_nbr, city, state, type, cluster
* **Oil**: date, dcoilwtico
* **Transactions**: date, store_nbr, transactions

**Selected features as inputs to the model**

* date, holiday.type, holiday.locale, holiday.locale_name, holiday.transferred, 
* store_nbr, store.city, store.state, store.type, store.cluster, transactions, 
* item_nbr, item.family, item.class, onpromotion, perishable, dcoilwtico.

**Selected features as outputs of the model**
* transactions per store, unit_sales per item

In [16]:
# Close the "Construct Data" step, mark it as completed in the webapp and publish
# the message listing the selected model output and input features.
step.close(message="For our model output, we selected unit sales. For our model input we selected: date, holiday.type, holidaye.locale, holiday.locale_name, holiday_transfered, store_nbr, store.city, store.state, store.type, store.cluster, transactions, item_nbr, item.family, item.class, on_promotion, perishable, dcoilwtico.")

# Integrate Data

In [17]:
# Get the "Integrate Data" step from the active iteration; the merges below belong to it
step = iter.step("Integrate Data")

In [None]:
# Enrich the training rows with the store, item, holiday and oil columns.
# Left joins are used, so every training row is kept even without a match.
df = (
    df
    .merge(stores, on='store_nbr', how='left')
    .merge(items, on='item_nbr', how='left')
    .merge(holiday_events, on='date', how='left')
    .merge(oil, on='date', how='left')
)

# Remove the two columns that are not needed afterwards
df = df.drop(columns=['description', 'locale_name'])

In [None]:
# Write the merged dataset to CSV: this is the ready-for-modeling file
df.to_csv("r4modeling.csv")

# Wrap that CSV and attach it to the Vectice project.
# NOTE(review): it is attached through `origin_dataset`, like the raw source
# files — confirm this is the intended dataset type for a derived file.
r4m_file_wrapped = FileDataWrapper(path="r4modeling.csv", name="Ready4modeling")
retail_project.origin_dataset = r4m_file_wrapped

In [18]:
# Close the "Integrate Data" step, mark it as completed in the webapp and publish
# the message summarising which tables were merged into the main dataset.
step.close(message="I merged the stores, items, holiday events and oil dataset to my main dataset to bring additional informations")

# Format Data

In [19]:
# Get the "Format Data" step from the active iteration; the data pipeline class below belongs to it
step = iter.step("Format Data")

In [None]:
# create the datapipeline to clean data and automate transformation
import datetime as dt
from sklearn.base import BaseEstimator, TransformerMixin

class prepare_data(BaseEstimator, TransformerMixin):
    """Transformer that joins the raw tables and formats the result.

    ``transform`` expects ``X`` to be an indexable collection of six
    DataFrames, read by position:

    * ``X[0]`` joined with ``X[1]`` on ``store_nbr``
    * then with ``X[2]`` on ``date``
    * then with ``X[3]`` on ``item_nbr``
    * then with ``X[4]`` on ``date`` and ``store_nbr``
    * then with ``X[5]`` on ``date``

    All joins use the default (inner) merge, so rows without a match in any
    of the tables are dropped.
    """

    def __init__(self):
        # Trace message showing that the transformer was instantiated
        print("prepare_data -> init")

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Join the six tables of ``X`` and return the formatted DataFrame.

        After the joins: missing ``onpromotion`` values become 0,
        ``onpromotion`` and ``transferred`` are cast to int, the suffixed
        ``type_x`` / ``type_y`` columns are renamed ``st_type`` / ``hol_type``,
        ``id`` is dropped and ``date`` is converted to its proleptic Gregorian
        ordinal. The first rows are printed before the date conversion.
        """
        joined = (
            X[0]
            .merge(X[1], on='store_nbr')
            .merge(X[2], on='date')
            .merge(X[3], on='item_nbr')
            .merge(X[4], on=['date', 'store_nbr'])
            .merge(X[5], on='date')
        )

        data_df = joined.copy(deep=True)

        # Empty promotion flags count as "not on promotion"; flags become 0/1
        data_df['onpromotion'] = data_df['onpromotion'].fillna(0).astype(int)
        data_df['transferred'] = data_df['transferred'].astype(int)

        # Give the two suffixed `type` columns explicit names, then remove the id
        data_df = data_df.rename(columns={'type_x': 'st_type', 'type_y': 'hol_type'})
        data_df = data_df.drop(columns=['id'])

        print(data_df.head())

        # Dates become integers (day ordinals)
        data_df['date'] = pd.to_datetime(data_df['date']).map(dt.datetime.toordinal)

        return data_df

In [20]:
# Close step, mark it as completed in the webapp and publish message.
# NOTE(review): the message mentions standard scaling and categorical encoding;
# the prepare_data class defined in this notebook does not perform them — confirm
# that the wording ("would include") is meant as future work.
step.close(message="I created a data pipeline to be able to reproduce and streamline the data preparation for future data. The process would include inputing nulls in numerical attributes, applying standard scalar and encoding categorical data.")