In [None]:
# Prepare variables
import os
from featurestore import *
import datetime
# Name of the project this notebook creates and deletes; a fixed demo name is used when the variable is unset
PROJECT_NAME = os.environ.get("PROJECT_NAME", "demo_timetravelapi")
# Token handed to the client's auth API in the login cell (None when the variable is unset)
MAIN_USER_TOKEN = os.environ.get("MAIN_USER_TOKEN")
# Value given to Spark's "spark.jars" setting below; note that the environment variable is SPARK_DEPS_JAR
SPARK_DEPS_AZURE = os.environ.get("SPARK_DEPS_JAR")

In [None]:
# S3 settings for reading the user's data, each taken from the environment (None when unset)
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_REGION = os.environ.get("S3_REGION")

# Bundle them positionally (access key, secret key, region); this object is passed to
# schema extraction and to both ingests further down
credentials = S3Credentials(
    S3_ACCESS_KEY,
    S3_SECRET_KEY,
    S3_REGION,
)

In [None]:
! pip install pyspark==3.4.1 h2o-featurestore

In [None]:
from pyspark.sql import SparkSession
# Packages listed under "spark.jars.packages", joined into a single comma-separated string
spark_packages = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.3.1",
        "io.delta:delta-core_2.12:2.4.0",
        "org.apache.hadoop:hadoop-azure:3.3.1",
    ]
)

# Local-mode session with the Delta SQL extension and the Delta catalog configured;
# the extra jar path comes from SPARK_DEPS_AZURE defined in the first cell
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.jars.packages", spark_packages)
    .config("spark.jars", SPARK_DEPS_AZURE)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Restrict Spark logging to errors for the rest of the notebook
spark.sparkContext.setLogLevel("ERROR")

In [None]:
# Login and create client
# `API` (presumably the Feature Store endpoint address) is not assigned in any earlier cell
# of this notebook, so a fresh kernel would raise NameError here. Keep a value that is
# already bound (e.g. injected by a notebook runner); otherwise fall back to the environment.
if "API" not in globals():
    API = os.getenv("API")  # NOTE(review): environment variable name is a guess — confirm
client = Client(API, secure=True)
# Authenticate with the token read from MAIN_USER_TOKEN in the first cell
client.auth.set_auth_token(MAIN_USER_TOKEN)

In [None]:
# Delete the project if it already exists, so that the create call below starts clean.
# This is best effort: a failure here must not stop the notebook.
# NOTE(review): the most likely failure is "project not found" on a first run — confirm the
# exact exception type raised by the client and narrow the except clause to it.
try:
    client.projects.get(PROJECT_NAME).delete()
except Exception as exc:  # not a bare except, so KeyboardInterrupt/SystemExit still propagate
    print(f"Skipping pre-run cleanup of project {PROJECT_NAME!r}: {exc}")

## Timetravel API

In [None]:
# Create the demo project named by PROJECT_NAME; both feature sets below are registered under it
project = client.projects.create(PROJECT_NAME)

In [None]:
# Define the CSV source (an s3a:// path) used for schema extraction and for both ingests below
source = CSVFile("s3a://feature-store-test-data/customer_churn_data_based_on_dates.csv")

In [None]:
# Extract the schema from the source, passing the S3 credentials defined above;
# the same schema is used to register both feature sets below
schema = client.extract_schema_from_source(source, credentials)

In [None]:
# Register the first feature set from the extracted schema; no time travel column is specified
first_fs = project.feature_sets.register(
    schema,
    "fs_without_time_travel",
    "description",
)

In [None]:
# Ingest the source into the first feature set; the returned object is kept because
# the next cell reads the ingestion timestamp from it
ingest = first_fs.ingest(source, credentials)

In [None]:
# This feature set was registered without a time travel column, so the time scope of the
# ingested data is computed from the current ingestion; the start and end of that scope are
# the same date time. Filtering during retrieve is still possible, but it is based on the
# time of ingests rather than on data stored in a time travel column.

# Retrieve with both boundaries set to the ingestion time.
ingested_at = ingest._meta.ingestion_timestamp.ToDatetime()
ingestion_time = ingested_at.strftime("%Y-%m-%d %H:%M:%S")
ref = first_fs.retrieve(ingestion_time, ingestion_time)
ingestion_scope_frame = ref.as_spark_frame(spark)
ingestion_scope_frame.show()

# Boundaries taken from the dates inside the data have no effect here: without a time travel
# column the filter operates on ingestion times, so this retrieve is expected to be empty.
ref = first_fs.retrieve("2021-04-02 00:00:00", "2021-04-03 00:00:00")
data_scope_frame = ref.as_spark_frame(spark)
# Shown to confirm that no rows come back
data_scope_frame.show()

### Feature set with time travel column

In [None]:
# Register a second feature set from the same schema, this time naming "Date" as the
# time travel column together with the format of its values
second_fs = project.feature_sets.register(
    schema,
    "fs_with_time_travel",
    "description",
    time_travel_column="Date",
    time_travel_column_format="yyyy-MM-dd HH:mm:ss",
)

In [None]:
# Ingest the same source into the second feature set. This rebinds `ingest`, which until now
# referred to the ingest into the first feature set; the next cell reads its timestamp
ingest = second_fs.ingest(source, credentials)

In [None]:
# This feature set was registered with a time travel column, so retrieve can filter on
# boundaries expressed in the values of that column.
ref = second_fs.retrieve("2021-04-02 00:00:00", "2021-04-03 00:00:00")
data_scope_frame = ref.as_spark_frame(spark)
data_scope_frame.show()

# Boundaries based on the ingestion time lead to no data, because filtering now operates
# on the time travel column instead of on the time of ingests.
ingested_at = ingest._meta.ingestion_timestamp.ToDatetime()
ingestion_time = ingested_at.strftime("%Y-%m-%d %H:%M:%S")
ref = second_fs.retrieve(ingestion_time, ingestion_time)
ingestion_scope_frame = ref.as_spark_frame(spark)
# Shown to confirm that the result is empty, as expected
ingestion_scope_frame.show()

## Cleanups

In [None]:
# Look the demo project up again by name and delete it, removing what this notebook created
client.projects.get(PROJECT_NAME).delete()
