# Feature-store ingestion using spark engine
Make sure the required environment variables are set: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `GOOGLE_APPLICATION_CREDENTIALS`.

### PySpark dataframe source & V3IO KV target

In [None]:
import os
import mlrun
import mlrun.feature_store as fs
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Initialize the MLRun project object from the current directory ('./').
# `project.artifact_path` is used further down to build the source and target paths.
project = mlrun.load_project('./')

# Required credentials:
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and GOOGLE_APPLICATION_CREDENTIALS,
# set both as Jupyter env vars and as project secrets (k8s provider).

In [2]:
# Fetch the transactions dataset from the server on the first run, then reuse the
# local 'data.csv' copy on later runs.
# The 'timestamp' column is parsed as datetime in BOTH branches, so the resulting
# frame has the same dtypes whether or not the local copy already exists.
if not os.path.exists('data.csv'):
    dataset_path = 'https://s3.wasabisys.com/iguazio/data/fraud-demo-mlrun-fs-docs/data.csv'
    # Sort by 'source' and keep only the first 500 rows
    transactions_data = pd.read_csv(dataset_path, parse_dates=['timestamp']).sort_values(by='source', axis=0)[:500]
    # index=False: do not write the pandas index as an extra column
    transactions_data.to_csv('data.csv', index=False)
else:
    transactions_data = pd.read_csv('data.csv', parse_dates=['timestamp'])

In [3]:
# Define the Spark-engine feature set for transactions, keyed by the 'source' entity
transaction_set = fs.FeatureSet(
    'transactions',
    description="transactions feature set",
    engine='spark',
    timestamp_key='timestamp',
    entities=[fs.Entity('source')],
)

# Use only the 'nosql' target (with_defaults=False: no default targets are added)
transaction_set.set_targets(with_defaults=False, targets=['nosql'])

In [None]:
from pyspark.sql import SparkSession

# Get (or create) a Spark session on a "local[1]" master and convert the pandas
# transactions frame into a Spark DataFrame
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)
spark_dataframe = spark.createDataFrame(transactions_data)

# Ingest the Spark DataFrame into the transactions feature set
fs.ingest(transaction_set, spark_dataframe, overwrite=True)

### S3 CSV source & target

In [5]:
from mlrun.datastore.sources import CSVSource
from mlrun.datastore import CSVTarget

# Spark-engine feature set for user events, keyed by the 'source' entity
user_events_set = fs.FeatureSet(
    "events",
    engine='spark',
    description="user events feature set",
    timestamp_key='timestamp',
    entities=[fs.Entity("source")],
)

# The CSV source file and the CSV target both live under the project's artifact path
s3_target = project.artifact_path
s3_filepath = os.path.join(project.artifact_path, 'transactions_cut.csv')

user_events_source_s3 = CSVSource("user_events", path=s3_filepath)
user_events_target_s3 = CSVTarget("user_events", path=s3_target)

# Use only the CSV target (with_defaults=False: no default targets are added)
user_events_set.set_targets(with_defaults=False, targets=[user_events_target_s3])

In [None]:
# Ingest the CSV source into the "events" feature set (overwrite=True)
ingestion_df = fs.ingest(
    featureset=user_events_set,
    source=user_events_source_s3,
    overwrite=True,
)

### GCS CSV source & target

In [7]:
from mlrun.datastore.sources import CSVSource
from mlrun.datastore import CSVTarget

# Spark-engine feature set for user events, keyed by the 'source' entity
user_events_set = fs.FeatureSet(
    "events2",
    engine='spark',
    description="user events feature set",
    timestamp_key='timestamp',
    entities=[fs.Entity("source")],
)

# Build the GCS location by replacing the first two characters of the artifact path with 'gs'
# (assumes the artifact path starts with a two-letter scheme such as 's3' — TODO confirm)
gs_target = 'gs' + project.artifact_path[2:]
gs_filepath = os.path.join(gs_target, 'transactions_cut.csv')

user_events_source_gs = CSVSource("user_events", path=gs_filepath)
user_events_target_gs = CSVTarget("user_events", path=gs_target)

# Use only the CSV target (with_defaults=False: no default targets are added)
user_events_set.set_targets(with_defaults=False, targets=[user_events_target_gs])

In [8]:
# ingestion_df = fs.ingest(featureset=user_events_set, source=user_events_source_gs, overwrite=True)

# Ingesting Remotely (spark operator)

In [9]:
# For Spark operator
# NOTE(review): presumably this prepares the default Spark job image that the remote
# function in a later cell references as '.spark-job-default-image' — confirm against the MLRun docs.
from mlrun.runtimes import Spark3Runtime
Spark3Runtime.deploy_default_image()

In [10]:
#mlrun: start-code

from pyspark import SparkConf
from pyspark.sql import SparkSession
import json


from mlrun.feature_store.api import ingest
def ingest_handler(context):
    """Entry point of the remote Spark job: build an S3-enabled Spark session and ingest.

    :param context: MLRun run context. The AWS and GCS secrets are read from it via
                    ``get_secret`` and it is passed on to ``ingest`` as ``mlrun_context``.
    """
    # Parse the GCS service-account key file. The parsed content is not used (the GCS
    # Spark settings below are commented out); loading it only makes a missing or
    # malformed key file raise here, before the Spark session is built.
    # The `with` block closes the file handle, which the previous one-liner leaked.
    with open(context.get_secret('GOOGLE_APPLICATION_CREDENTIALS'), 'rb') as gcs_keyfile:
        json.load(gcs_keyfile)

    # S3A (Hadoop S3 connector) settings, with credentials taken from the run's secrets
    conf = (SparkConf()
            .set("spark.hadoop.fs.s3a.path.style.access", True)
            .set("spark.hadoop.fs.s3a.access.key", context.get_secret('AWS_ACCESS_KEY_ID'))
            .set("spark.hadoop.fs.s3a.secret.key", context.get_secret('AWS_SECRET_ACCESS_KEY'))
            .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .set("com.amazonaws.services.s3.enableV4", True)
#             .set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
#             .set('spark.hadoop.fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
#             .set('spark.hadoop.fs.gs.auth.service.account.enable', 'true')
#             .set('spark.hadoop.google.cloud.auth.service.account.json.keyfile', context.get_secret('GOOGLE_APPLICATION_CREDENTIALS'))
           )
    spark = (
        SparkSession.builder.config(conf=conf).appName("S3 app").getOrCreate()
    )

    # Hand the run context and the configured session over to the feature-store ingest
    ingest(mlrun_context=context, spark_context=spark)
    
#mlrun: end-code

In [11]:
# Create the 'remote_spark_fs' function of kind 'spark' on the default Spark job image
fn = mlrun.code_to_function(
    name='remote_spark_fs',
    kind='spark',
    image='.spark-job-default-image',
)

# Spark driver resources (gpu_type & gpus=<number_of_gpus> are supported too)
fn.with_driver_limits(cpu="1300m")
fn.with_driver_requests(cpu=1, mem="512m")

# Spark executor resources (gpu_type & gpus=<number_of_gpus> are supported too)
fn.with_executor_limits(cpu="1400m")
fn.with_executor_requests(cpu=1, mem="512m")
fn.with_igz_spark()

# Run configuration: run the function non-locally with `ingest_handler` as the handler
config = fs.RunConfig(
    function=fn,
    local=False,
    handler="ingest_handler",
)

### S3

In [None]:
# Spark-engine feature set for the remote S3 ingestion, keyed by the 'source' entity
user_events_set = fs.FeatureSet(
    "events3",
    engine='spark',
    description="user events feature set",
    timestamp_key='timestamp',
    entities=[fs.Entity("source")],
)

# Use only the S3 CSV target (with_defaults=False: no default targets are added)
user_events_set.set_targets(with_defaults=False, targets=[user_events_target_s3])

# Ingest from the S3 CSV source using the run configuration `config`
ingestion_df = fs.ingest(
    featureset=user_events_set,
    source=user_events_source_s3,
    run_config=config,
    overwrite=True,
)

### GCS

In [13]:
# Spark-engine feature set for the remote GCS ingestion, keyed by the 'source' entity
user_events_set = fs.FeatureSet(
    "events4",
    engine='spark',
    description="user events feature set",
    timestamp_key='timestamp',
    entities=[fs.Entity("source")],
)

# Use only the GCS CSV target (with_defaults=False: no default targets are added)
user_events_set.set_targets(with_defaults=False, targets=[user_events_target_gs])

# ingestion_df = fs.ingest(featureset=user_events_set,
#                          source=user_events_source_gs,
#                          overwrite=True,
#                          run_config=config)