In [None]:
# Install Vectice and other packages
%pip install -q vectice -U
%pip install boto3
%pip install botocore

### Instructions

Paste your API token below and execute the block. (Your token can be generated [here](https://app.vectice.com/account/api-keys).)

The dataset used in this notebook can be found here: https://vectice-examples.s3.us-west-1.amazonaws.com/Tutorial/ForecastTutorial/items.csv

In [None]:
!wget https://vectice-examples.s3.us-west-1.amazonaws.com/Tutorial/ForecastTutorial/items.csv -q --no-check-certificate

In [None]:
# Standard library: used to read the API token from the environment
import os

# Import vectice package
import vectice


# Connect using your API token - your token can be found here: https://app.vectice.com/account/api-keys
# To keep the secret out of the saved notebook, export it as the VECTICE_API_TOKEN
# environment variable before starting Jupyter. If the variable is not set, the
# placeholder below is used: replace it with your token (and do not commit it).
api_token = os.environ.get('VECTICE_API_TOKEN', 'YOUR API TOKEN')

conn = vectice.connect(
    api_token=api_token,
    host='https://app.vectice.com',
    workspace='Samples'
)
# Alternate methods of connecting
# project = vectice.connect(config='~/.config/vectice-config.json')
# provided the json file contains the "WORKSPACE" and "PROJECT" entries,
# OR
# project = vectice.connect(config="<API_key_config_name>.json", workspace="ws_name", project="project_name")
# both will return a project reference

# Open the project
project = conn.project("How To: Reporting your Milestones")

#### Capture your datasets and their usage

This sample uses data from our Vectice S3 bucket. 
     
We will use boto3 as a client.   

In [None]:
from boto3 import client  # Used to create a client and read from S3
from botocore import UNSIGNED
from botocore.client import Config
from vectice import FileDataWrapper, S3DataWrapper, GcsDataWrapper, DatasetSourceUsage

# Build an S3 client for the us-west-1 region whose requests are left unsigned
# (signature_version=UNSIGNED).
unsigned_config = Config(signature_version=UNSIGNED)
s3_client = client(
    's3',
    region_name='us-west-1',
    config=unsigned_config,
)


The first cell illustrates how to add datasets and tag them as `origin_dataset`.

The second cell shows how to tag/attach a clean dataset ready for modeling to your project   

The third one captures the definition of your modeling dataset (a compound dataset: training, testing, validation).

In [None]:
# Data Scientist code to build data frames with data
# ...

# Start an iteration of the 'Document Dataset' phase and select its
# 'Identify source datasets' step; `step` is the handle used by the next cells.
document_dataset_phase = project.phase("Document Dataset")
current_iteration = document_dataset_phase.iteration()
step = current_iteration.step("Identify source datasets")

# Retrieve a list of steps to complete
# project.phase("Document Dataset").iteration().steps

In [None]:
# Document the original datasets used for this project.
# Each assignment to `step.origin_dataset` below registers one more dataset on the step.

# Using a S3DataWrapper for files on AWS S3:
step.origin_dataset = S3DataWrapper(
    name="Stores",
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/stores.csv",
)
step.origin_dataset = S3DataWrapper(
    name="Transactions",
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/transactions.csv",
)

# Using a FileDataWrapper for a local file
step.origin_dataset = FileDataWrapper(path="items.csv", name="Items")

# More examples here: https://docs.vectice.com/python-api-docs/how-to-register-datasets

# Document the step and automatically attach the datasets to it. Move on to the next step.
# The message names the file and bucket exactly as registered above
# ('transactions.csv', 'vectice-examples') so the documented milestone matches the data.
step = step.next_step(
    message="The datasets for the project have been identified as 'stores.csv' and 'transactions.csv'.\n"
            "Both files are located under the 'vectice-examples' S3 bucket."
)

In [None]:
# Data Scientist code for data preparation, normalization, etc...
# ...

# Register the cleaned dataset on the current step; it lives in the same S3 bucket
step.clean_dataset = S3DataWrapper(
    name="CleanDataset",
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/train_clean.csv",
)

# Text recorded with the step: the preparation applied, then the outcome
clean_step_message = (
    "As part of our standard Data Pipeline process we applied the following preparation to our datasets:\n"
    " - Handling of missing data\n"
    " - Applied standard scaler to numerical attributes\n"
    " - Converted categorical data into numerical\n"
    " - Split values in numerical values, categorical values, and dates."
    "\n\n"
    "We processed our origin datasets through our data pipeline to generate a dataset ready for modeling.\n"
    "The dataset is ready for modeling."
)

# Document the step (the dataset is attached automatically) and advance to the next one
step = step.next_step(message=clean_step_message)

In [None]:
# Data Scientist code to generate training, testing, and validation dataframes
# ...


def _modeling_source(resource_path, usage):
    """Wrap one S3 file of the 'Modeling Dataset', tagged with the given usage."""
    return S3DataWrapper(
        name="Modeling Dataset",
        s3_client=s3_client,
        bucket_name='vectice-examples',
        resource_path=resource_path,
        usage=usage,
    )


# One wrapper per split; all three share the "Modeling Dataset" name
train_ds = _modeling_source("Tutorial/ForecastTutorial/traindataset.csv", DatasetSourceUsage.TRAINING)
test_ds = _modeling_source("Tutorial/ForecastTutorial/testdataset.csv", DatasetSourceUsage.TESTING)
validate_ds = _modeling_source("Tutorial/ForecastTutorial/validatedataset.csv", DatasetSourceUsage.VALIDATION)

# Document the modeling dataset (training + testing + validation) in Vectice - using the same S3 bucket
step.modeling_dataset = [train_ds, test_ds, validate_ds]

# Document the step (the dataset is attached automatically) and close it
step.close(message="We split the dataset in a training, testing and validation datasets. 40% of the data is set aside for testing and our seed to generate repeatable datasets is 42")