## Derived feature set based on a Spark pipeline with a windowed aggregation

In [None]:
# Retrieve environment variables used throughout the notebook
import os

# Value passed as the first argument to Client(...) in the login cell.
# NOTE(review): the original notebook used API without ever assigning it
# (NameError at login). The environment variable name "API" is an assumption;
# confirm the name used by your deployment.
API = os.getenv("API")
# Project name; falls back to a demo default when PROJECT_NAME is unset.
PROJECT_NAME = os.getenv("PROJECT_NAME", "demo_spark_pipeline_with_aggregates")
# Token handed to client.auth.set_auth_token(...); None when unset.
REFRESH_TOKEN = os.getenv("REFRESH_TOKEN")
# Value for the "spark.jars" Spark option, read from SPARK_DEPS_JAR.
SPARK_DEPS_AZURE = os.getenv("SPARK_DEPS_JAR")

In [None]:
# Install featurestore and other dependencies
! pip install h2o-featurestore pyspark==3.4.1 

### Set up featurestore client

In [None]:
# Login and create a client
# The star import brings Client into scope (and presumably S3Credentials and
# CSVFile, which later cells use without importing them explicitly).
from featurestore import *
# API and REFRESH_TOKEN must already be defined when this cell runs.
client = Client(API, secure=True)
# Authenticate the client with the token read from the environment.
client.auth.set_auth_token(REFRESH_TOKEN)

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running with master "local".
spark = (
    SparkSession.builder
    .master("local")
    # Maven packages: Hadoop AWS and Azure connectors plus Delta Lake core.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-azure:3.3.1")
    # Extra jar(s) taken from the SPARK_DEPS_JAR environment variable.
    .config("spark.jars", SPARK_DEPS_AZURE)
    # Enable the Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Set the Spark log level to ERROR.
spark.sparkContext.setLogLevel("ERROR")

In [None]:
# Display the Spark session as the cell output
spark

In [None]:
# Credentials for the external S3 data source, read from the environment
S3_ACCESS_KEY, S3_SECRET_KEY, S3_REGION = map(
    os.getenv, ("S3_ACCESS_KEY", "S3_SECRET_KEY", "S3_REGION")
)
credentials = S3Credentials(S3_ACCESS_KEY, S3_SECRET_KEY, S3_REGION)

In [None]:
# Delete project if it already exists (best-effort: failures are ignored so
# the notebook can start from a clean state either way)
try:
    client.projects.get(PROJECT_NAME).delete()
except Exception:
    # Catch Exception instead of using a bare "except:" so that
    # KeyboardInterrupt / SystemExit still propagate.
    # NOTE(review): narrow this to the client's "not found" error if known.
    pass

### Create a project

In [None]:
# Create the project in which the feature sets below are registered
project = client.projects.create(PROJECT_NAME)

### Extract schema from the external data source

In [None]:
# CSV file (s3a:// URL) used as the external data source
source = CSVFile("s3a://feature-store-test-data/allyears2k_headers_with_date.csv")

In [None]:
# Extract the schema from the CSV source, passing the S3 credentials
schema = client.extract_schema_from_source(source, credentials)

In [None]:
# Display the extracted schema as the cell output
schema

### Create a feature set 

In [None]:
# Register a feature set named "fs_spark_pipeline_agg" from the extracted schema
fs = project.feature_sets.register(schema, "fs_spark_pipeline_agg")

### Ingest data into a feature set

In [None]:
# Ingest data from the CSV source into the feature set
fs.ingest(source, credentials)

### Create a spark pipeline

In [None]:
# Create a Spark pipeline on the airlines dataset that computes the average
# arrival delay (ArrDelay) for each 30-day window of the Date column
from featurestore import SparkPipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer

# __THIS__ is the SQLTransformer placeholder for the input dataset.
query = "select window.start AS start_time, window.end AS end_time, avg(ArrDelay) AS ave_arr_delay from __THIS__ group by window(Date, '30 days')"
transformer = SQLTransformer(statement=query)
spark_pipeline = Pipeline(stages=[transformer])
# Wrap the Spark ML pipeline in the featurestore SparkPipeline type, which is
# what extract_derived_schema receives as the transformation.
pipeline_transformation = SparkPipeline(spark_pipeline)

### Extract schema from the spark pipeline

In [None]:
# Extract the derived schema. Inputs: the list of input feature sets and the
# Spark pipeline transformation applied to them
spark_pipeline_schema = client.extract_derived_schema([fs], pipeline_transformation)

In [None]:
# Display the derived schema as the cell output
spark_pipeline_schema

### Create a derived feature set using spark pipeline schema

In [None]:
# Register the derived feature set; "end_time" (the window end produced by
# the pipeline query) is used as the time travel column
fs_spark_pipeline = project.feature_sets.register(
    spark_pipeline_schema,
    "derived_fs_spark_pipeline",
    time_travel_column="end_time",
)

In [None]:
# Wait for data from input feature set to be propagated into derived
from featurestore.core.job_types import INGEST

jobs = fs_spark_pipeline.get_active_jobs(INGEST)
# Block on every active ingest job (not only the first one) so that all
# pending data has landed before the feature set is retrieved. An empty
# result simply skips the loop.
for job in jobs:
    job.wait_for_result()

In [None]:
# Display the derived feature set as the cell output
fs_spark_pipeline

### Retrieve feature set as a spark dataframe

In [None]:
# Fetch the derived feature set again by name (presumably to pick up its
# latest state after ingestion; verify against the client docs)
latest_fs = project.feature_sets.get(fs_spark_pipeline.feature_set_name)
# Retrieve its data as a Spark DataFrame using the local Spark session
df = latest_fs.retrieve().as_spark_frame(spark)
# Print the first rows of the DataFrame
df.show()

### Cleanup

In [None]:
# Delete the project created by this notebook
client.projects.get(PROJECT_NAME).delete()