In [None]:
# Install Vectice
%pip install -q vectice -U

### Instructions

Paste your API token in the cell below and execute it. (Your token can be generated [here](https://app.vectice.com/account/api-keys).)   



In [None]:
# Imports: standard library first, then the Vectice client
import logging

import vectice

# Limit Vectice log output to warnings and above
logging.getLogger("vectice").setLevel(logging.WARNING)

# Connect with your API token - it can be found here: https://app.vectice.com/account/api-keys
conn = vectice.connect(
    host='https://app.vectice.com',
    workspace='Samples',
    api_token='YOUR API TOKEN',
)

# Open the project that the following cells document
project = conn.project("How To: Capture your Datasets")

##### Capture your datasets and their usage

This sample uses data from our Vectice S3 bucket.      
We will use boto3 as a client.   

The first cell illustrates how to add datasets and tag them as `origin_dataset`.   
The second cell shows how to tag/attach a clean dataset, ready for modeling, to your project.   
The third cell defines a modeling dataset that is a compound dataset (training, testing, validation).   

In [None]:
from boto3 import client  # Used to create a client and read from S3
from botocore import UNSIGNED
from botocore.client import Config
from vectice import FileDataWrapper, S3DataWrapper, GcsDataWrapper, DatasetSourceUsage

# S3 client configured to send unsigned requests (signature_version=UNSIGNED)
s3_client = client('s3', config=Config(signature_version=UNSIGNED), region_name='us-west-1')

# Data Scientist code to build data frames with data
# ...

# Get the 'Identify datasets' step of the 'Data Understanding' phase
step = project.phase("Data Understanding").iteration().step("Identify datasets")

# Document the original datasets used for this project
step.origin_dataset = S3DataWrapper(
    name="Stores",
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/stores.csv",
)
step.origin_dataset = S3DataWrapper(
    name="Transactions",
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/transactions.csv",
)

# Document the step and automatically attach the datasets to it, then move on to the next step.
# The message must name the same files and bucket as the wrappers above
# ('transactions.csv' in the 'vectice-examples' bucket).
step = step.next_step(
    message="The datasets for the project have been identified as 'stores.csv' and 'transactions.csv'.\n"
            "Both files are located under the 'vectice-examples' S3 bucket."
)

In [None]:
# Data Scientist code for data preparation, normalization, etc...
# ...

# `step` is the value returned by next_step() in the previous cell.
# To run this cell on its own, fetch the 'Clean dataset' step of the 'Data Understanding' phase explicitly:
# step = project.phase("Data Understanding").iteration().step("Clean dataset")

# Document the cleaned dataset in Vectice - using the same S3 bucket
step.clean_dataset = S3DataWrapper(
    name="CleanDataset",
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/train_clean.csv",
)

# Document the step, automatically attach the dataset to it and move on to the next step
step = step.next_step(
    message="A new dataset has been created by combining both origin datasets, "
            "removing non-essential features and normalizing the data for modeling."
)

In [None]:
# Data Scientist code producing the training, testing, and validation dataframes
# ...

# `step` is the value returned by next_step() in the previous cell.
# To run this cell on its own, fetch the 'Build Model' step explicitly:
# step = project.phase("Data Understanding").iteration().step("Build Model")

# One data wrapper per split; all three share the name "Modeling Dataset"
# and differ only in their file and declared usage
train_ds = S3DataWrapper(
    name="Modeling Dataset",
    usage=DatasetSourceUsage.TRAINING,
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/traindataset.csv",
)
test_ds = S3DataWrapper(
    name="Modeling Dataset",
    usage=DatasetSourceUsage.TESTING,
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/testdataset.csv",
)
validate_ds = S3DataWrapper(
    name="Modeling Dataset",
    usage=DatasetSourceUsage.VALIDATION,
    s3_client=s3_client,
    bucket_name='vectice-examples',
    resource_path="Tutorial/ForecastTutorial/validatedataset.csv",
)

# Document the modeling dataset (training, testing, validation) in Vectice - using the same S3 bucket
step.modeling_dataset = [train_ds, test_ds, validate_ds]

# Close the step with a message describing the attached dataset
step.close(message="This model iteration uses the attached modeling dataset")