# Derived Join Feature Sets

In [1]:
# Preparation - define variables for automation.
# Every setting is read from the environment so the notebook can run unattended.
import os

# Feature Store endpoint handed to Client() in the "Connect and authenticate" cell.
# That cell uses `API`, which no cell in this notebook defined before.
# NOTE(review): the environment variable name "API" is an assumption - confirm.
API = os.getenv("API")
PROJECT_NAME = os.getenv("PROJECT_NAME", "demo_join_featureset")  # demo name when unset
REFRESH_TOKEN = os.getenv("REFRESH_TOKEN")  # passed to client.auth.set_auth_token
SPARK_DEPS_AZURE = os.getenv("SPARK_DEPS_JAR")  # used as the "spark.jars" Spark setting

## Set up Spark session

In [None]:
! pip install pyspark==3.4.1 h2o-featurestore

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session with Delta Lake enabled.
spark = (
    SparkSession.builder
    .master("local")
    # Maven packages Spark resolves at start-up: Hadoop AWS, Delta Lake core, Hadoop Azure.
    .config(
        "spark.jars.packages",
        ",".join(
            [
                "org.apache.hadoop:hadoop-aws:3.3.1",
                "io.delta:delta-core_2.12:2.4.0",
                "org.apache.hadoop:hadoop-azure:3.3.1",
            ]
        ),
    )
    # Extra jar(s) taken from the SPARK_DEPS_JAR environment variable.
    .config("spark.jars", SPARK_DEPS_AZURE)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

# Only show errors from Spark to keep cell output readable.
spark.sparkContext.setLogLevel("ERROR")

In [None]:
# Connect and authenticate
# NOTE(review): the wildcard import presumably supplies Client, S3Credentials and
# CSVFile used in this and later cells - consider importing them explicitly.
from featurestore import *
# `API` must already be defined when this cell runs; it is passed as the
# first argument to Client.
client = Client(API, secure=True)
# Authenticate with the token read from the REFRESH_TOKEN environment variable.
client.auth.set_auth_token(REFRESH_TOKEN)

In [None]:
# Read the S3 access settings from the environment and wrap them in the
# credentials object that is passed to schema extraction and ingestion below.
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_REGION = os.environ.get("S3_REGION")
credentials = S3Credentials(
    S3_ACCESS_KEY,
    S3_SECRET_KEY,
    S3_REGION,
)

In [None]:
# Delete the project if it already exists so the notebook starts from a clean slate.
try:
    client.projects.get(PROJECT_NAME).delete()
except Exception as exc:
    # Best effort: nothing to delete (or deletion failed) must not stop the demo.
    # Catching Exception rather than using a bare `except` keeps
    # KeyboardInterrupt and SystemExit working, and the reason is reported
    # instead of being silently discarded.
    print(f"No existing project '{PROJECT_NAME}' was removed: {exc}")

## Create a project

In [None]:
# Create a project; the feature sets below are registered through
# `project.feature_sets`.
project = client.projects.create(PROJECT_NAME)

In [None]:
# Define input data sources: two files in the same S3 bucket, both wrapped as
# CSVFile sources - one with the virus data, one with the weather data.
westnile_virus_source = CSVFile("s3a://feature-store-test-data/west-nile/west_nile_virus.txt")
westnile_weather_source = CSVFile("s3a://feature-store-test-data/west-nile/west_nile_weather.txt")

## Extract schema

In [None]:
# Extract the schema of the virus data from its source, using the S3 credentials.
westnile_virus_schema = client.extract_schema_from_source(westnile_virus_source, credentials)

In [None]:
# Display the extracted virus schema as the cell output.
westnile_virus_schema

In [None]:
# Extract the schema of the weather data from its source, using the S3 credentials.
westnile_weather_schema = client.extract_schema_from_source(westnile_weather_source, credentials)

In [None]:
# Display the extracted weather schema as the cell output.
westnile_weather_schema

## Register feature sets

In [None]:
# Register the virus feature set from its extracted schema, keyed on "Date".
westnile_virus_fs = project.feature_sets.register(
    westnile_virus_schema,
    "west_nile_virus",
    primary_key="Date",
)

In [None]:
# Register the weather feature set from its extracted schema, keyed on "Date".
westnile_weather_fs = project.feature_sets.register(
    westnile_weather_schema,
    "west_nile_weather",
    primary_key="Date",
)

## Ingest data

In [None]:
# Ingest the virus source data into its registered feature set.
westnile_virus_fs.ingest(westnile_virus_source, credentials)

In [None]:
# Ingest the weather source data into its registered feature set.
westnile_weather_fs.ingest(westnile_weather_source, credentials)

## Define joined feature set transformation

In [None]:
# Create joined feature set transformation
import featurestore.core.transformations as t
# Both sides are joined on "Date", the primary key both input feature sets
# were registered with.
join_transformation = t.JoinFeatureSets(left_key="Date", right_key="Date")

## Extract schema based on the joined feature sets

In [None]:
# Derive the schema of the joined feature set from the two input feature sets
# and the join transformation.
derived_schema = client.extract_derived_schema(
    [
        westnile_virus_fs,
        westnile_weather_fs,
    ],
    join_transformation,
)

## Register the joined feature set

In [None]:
# Register the derived feature set under the name "joined_fs". Unlike the
# input feature sets, no primary key is passed here.
derived_fs = project.feature_sets.register(derived_schema, "joined_fs")

In [None]:
# Wait for data from the input feature sets to be propagated into the derived one.
from featurestore.core.job_types import INGEST

# Block on every active ingest job rather than only the first, so the retrieval
# that follows cannot race with a job that is still running. When no job is
# active the loop body never executes.
jobs = derived_fs.get_active_jobs(INGEST)
for job in jobs:
    job.wait_for_result()

## Retrieve data

In [None]:
# Fetch the derived feature set again by name, then load its data into a
# Spark DataFrame and print the first rows.
latest_derived = project.feature_sets.get(derived_fs.feature_set_name)
retrieved = latest_derived.retrieve()
df = retrieved.as_spark_frame(spark)
df.show()

## Cleanup

In [None]:
# Remove the demo project created by this notebook.
demo_project = client.projects.get(PROJECT_NAME)
demo_project.delete()
