In [None]:
# Install the Vectice client, upgrading it if already present (-U), with pip output quietened
%pip install --q vectice -U

In [None]:
# Settings for demo runs:
#   prv_phs_id - ID of the previous phase, from which the input dataset is read
#   phs_id     - ID of the phase in which the new iteration is created
#   test_value - fraction of the rows set aside for testing
prv_phs_id, phs_id = "PHA-1594", "PHA-1597"
test_value = 0.25

In [None]:
import vectice as vct

# Connect to Vectice using the local "token_i.json" configuration file
# (presumably holds the API token — confirm). `vec` is the handle used
# afterwards to look up phases.
vec = vct.connect(config="token_i.json")

In [None]:
# Fetch iteration 1 of the previous phase, then read the dataset version ID
# from the first artifact of its "integrate data" step
prev_iteration = vec.phase(prv_phs_id).iteration(1)
ds_id = prev_iteration.step_integrate_data.artifacts[0].dataset_version_id

# Switch to the current phase and start a new iteration in it
active_iter = vec.phase(phs_id).create_iteration()

# 3. A Simple Modeling Exercise

### 3.1 Logging a Simple Text-Only Message
#### The first step, as described above, calls for describing the modeling technique we will use in this iteration of the model. Execute

In [None]:
# Describe the chosen modeling technique on the
# "select modeling techniques" step of the active iteration
active_iter.step_select_modeling_techniques = (
    "For this first iteration we are going to use a Linear Regression model "
    "to get a base model."
)

#### to log a short description of our work. This completes the step

### 3.2 Logging a Text Message with Embedded Variables

#### For our next step, it looks like our modeling overlords would like us to split our dataset into training, testing and validation datasets, and log some basic information about the split.

#### Let's get the dataset we need to split. Lucky for us, the Vectice elves have left us a clean dataset, created as part of the “Data Preparation” phase.
#### Execute the cell below to download it locally

In [None]:
# NOTE: the download command is commented out, so running this cell as-is
# fetches nothing. Uncomment it if original_clean.csv (read further below)
# is not already present in the working directory.
#!wget https://vectice-examples.s3.us-west-1.amazonaws.com/Tutorial/ForecastTutorial/original_clean.csv -q --no-check-certificate

#### Alright - it’s time to split this baby up.

#### Since we’re about to do some modeling work, we need to load a few analytics libraries and packages. 
#### Execute the following boilerplate code, which is completely independent of Vectice.
#### Don’t worry about understanding the following cell in great detail - suffice to say we are simply retrieving the dataset we saved above, splitting it into two (for training and testing) and saving the split datasets locally.


In [None]:
# NOTE: this cell is boilerplate data science code
# there is no Vectice code below

# here, we import essential analytics libraries and load our dataset,
# before splitting it into 2 files (for training and testing)
# that we then save locally

# import some essential math libraries
import pandas as pd; import matplotlib.pyplot as plt; import numpy as np
import plotly.offline as py; from matplotlib import pyplot as plt
import IPython.display
%matplotlib inline
py.init_notebook_mode(connected=True)


# load scikit-learn modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error

# load the cleaned dataset from the local CSV file
df_model = pd.read_csv("original_clean.csv")

# test_size: share of the rows held out for testing
# random_state: fixed seed so the same split is generated on every run
test_size, random_state = test_value, 42

# split into df_train and df_test, which we will need for modeling
df_train, df_test = train_test_split(
    df_model,
    test_size=test_size,
    random_state=random_state,
)

# save the 2 split datasets locally
for frame, csv_path in (
    (df_train, "traindataset.csv"),
    (df_test, "testdataset.csv"),
):
    frame.to_csv(csv_path)

#### The next thing we should do is log the datasets we used, so we know where these numbers came from. Go ahead and execute

In [None]:
# Wrap each saved CSV, together with its in-memory dataframe,
# as a Vectice file resource
train_ds = vct.FileResource(
    paths="traindataset.csv",
    dataframes=df_train,
)
test_ds = vct.FileResource(
    paths="testdataset.csv",
    dataframes=df_test,
)

# Combine both resources into a single modeling dataset, linked to the
# dataset version retrieved from the previous phase (ds_id)
dataset = vct.Dataset.modeling(
    name="my modeling dataset",
    training_resource=train_ds,
    testing_resource=test_ds,
    derived_from=ds_id,
)

# Add the dataset to the "generate test design" step of the active iteration
active_iter.step_generate_test_design += dataset

#### to package our 2 datasets with their essential metadata, and log them in Vectice.

#### As before, let’s log our work as a message in Vectice, so we keep a trace of the work we did.

In [None]:
# First let's build our message.
# Only a training and a testing dataset are produced by the split, so the
# message names exactly those two (no validation dataset is created).
msg = (
    "We split the dataset into training and testing datasets. "
    f"{test_size * 100}% of the data is set aside for testing.\n "
    f"- Training dataset size: {df_train.shape[0]}\n "
    f"- Testing dataset size: {df_test.shape[0]}\n "
    f"Our seed to generate repeatable datasets is {random_state}"
)

# Then add it to the "generate test design" step of the active iteration
active_iter.step_generate_test_design += msg

### 3.3 Logging a Model and Associated Datasets

#### We’re on a roll!!
#### As before, we’ve provided you with the following boilerplate code, which is completely independent of Vectice.
#### Don’t worry about understanding the following cell in great detail - all we’re doing is running a linear regression, and outputting summary statistics as well as a nice plot.


In [None]:
# NOTE: this cell is boilerplate data science code
# there is no Vectice code below

# `random` is used for the metric offsets further down but is not imported
# by the cells above, so import it here to avoid a NameError
import random

# separate the feature columns from the target column ("unit_sales")
X_train, y_train = df_train.drop(['unit_sales'], axis=1), df_train["unit_sales"]
X_test, y_test = df_test.drop(['unit_sales'], axis=1), df_test["unit_sales"]

# here, we are running a linear regression, before outputting some summary
# statistics and a nice plot

# create and fit a linear regression model
model_linreg = LinearRegression()
model_linreg.fit(X_train, y_train)

# predict on the test set, then compute the RMSE and MAE summary statistics
pred = model_linreg.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, pred))
MAE = mean_absolute_error(y_test, pred)

# the metrics object holds our two key summary statistics
# NOTE(review): each metric is shifted by a small random offset before being
# stored — presumably so repeated demo runs log different values; remove the
# offsets to report the true metrics
summary_stats = {
    "RMSE": RMSE - round(random.uniform(-0.020, 0.123), 3),
    "MAE": MAE - round(random.uniform(-0.020, 0.005), 3),
}

# finally, generate and save a plot: training points (first feature column
# against the target) in green, test-set predictions in black
plt.scatter(X_train.iloc[:,0].values, y_train ,color='g')
plt.plot(X_test, pred,color='k')
plt.savefig("regression_graph.png")

#### As before, let’s log our work in Vectice, so we keep a trace of what we did.
#### Let's document the model we just generated. Run the following cell

In [None]:
# As with the datasets earlier, wrap the fitted estimator in a 'Model'
# object together with its metadata: metrics, the saved plot as an
# attachment, and the dataset version it was derived from
model = vct.Model(
    name="Unit Sales Predictor",
    library="scikit-learn",
    technique="linear regression",
    metrics=summary_stats,
    attachments="regression_graph.png",
    predictor=model_linreg,
    derived_from=[dataset.latest_version_id],
)

# Add the model to the "build model" step of the active iteration
active_iter.step_build_model += model

#### Now let's log our summary statistics, execute the following cell

In [None]:
# Pull the two logged metrics out of summary_stats and format them
# into a short message
rmse_value, mae_value = summary_stats["RMSE"], summary_stats["MAE"]
msg = (
    "The model generated the following metrics: \n"
    f"RMSE = {rmse_value} and MAE = {mae_value}"
)

# Add the message to the "build model" step of the active iteration
active_iter.step_build_model += msg

#### to log our summary statistics as a simple message.



### 3.4 The Final Step

#### The very last step of the Modeling phase calls for assessing the performance of our model, and reflecting on next steps. 
#### But it’s been a long journey, so feel free to simply execute the code below (which should be familiar to you by now!) and call it a day.


In [None]:
# Record our assessment of the model on the "assess model" step
assessment = (
    "As expected the model performs better however this is not good enough "
    "and we should try a different method. "
    "We recommend doing a Random Forest as a new iteration to get a base model."
)
active_iter.step_assess_model = assessment

# Every step of the phase now has content, so mark the iteration as completed
active_iter.complete()