In [None]:
%%writefile scenario2.py
import pandas as pd
import numpy as np
import mlrun.feature_store as fstore
from mlrun.datastore import ParquetTarget,NoSqlTarget
import mlrun
from datetime import datetime, timedelta
from mlrun.execution import MLClientCtx
from sys import getsizeof

# Number of rows generated per ingest_data call (also the number of entity ids).
data_size = 100

def ingest_data(context : MLClientCtx, date:str, prefix:int):
    """Generate one random batch and ingest it into feature sets test1..test5.

    The batch has ``data_size`` rows and 35 float columns (``col0``..``col34``)
    drawn from ``np.random.random_sample``, plus a constant ``timestamp``
    column and an ``entity`` column. The very same DataFrame is ingested into
    each of the five feature sets with ``overwrite=False``.

    :param context: MLRun execution context; only its logger is used here.
    :param date:    value handed to ``pd.Timestamp`` and stamped on every row.
    :param prefix:  integer offset; entity ids are
                    ``prefix + 0 .. prefix + data_size - 1``.
    """
    cols = ["col" + str(x) for x in range(35)]
    df = pd.DataFrame(np.random.random_sample((data_size, 35)), columns=cols)
    df['timestamp'] = pd.Timestamp(date)
    df['entity'] = [prefix + x for x in range(data_size)]

    # The five feature sets differ only by their numeric suffix, so fetch,
    # ingest and log in a single loop instead of five copy-pasted stanzas.
    for idx in range(1, 6):
        feature_set = fstore.get_feature_set(uri=f"store://feature-sets/ml2802/test{idx}:latest",
                                             project="ml2802")
        fstore.ingest(feature_set, df, overwrite=False)
        context.logger.info(f'fs{idx} ingested')

In [None]:
import mlrun
# Name of the MLRun project used throughout this notebook
project_name = "ml2802"

# Initialize the MLRun project object (context directory is the current folder)
project = mlrun.get_or_create_project(
    project_name,
    context="./",
    user_project=False,
)

In [None]:
import mlrun.feature_store as fstore
from mlrun.datastore import ParquetTarget

In [None]:
# Define 5 feature sets (test1..test5); they share the same entity,
# timestamp key and description and differ only by name.
test_set_1, test_set_2, test_set_3, test_set_4, test_set_5 = [
    fstore.FeatureSet(
        f"test{idx}",
        entities=[fstore.Entity("entity")],
        timestamp_key="timestamp",
        description="Test FS",
    )
    for idx in range(1, 6)
]

In [None]:
# Partitioned parquet target named "part" with day-level time partitioning
parq = ParquetTarget(
    name="part",
    partitioned=True,
    time_partitioning_granularity="day",
)

In [None]:
# Attach the parquet target (and no default targets) to every feature set.
# NOTE(review): the same `parq` instance is passed to all five sets - confirm
# set_targets does not keep per-feature-set state on the shared object.
for feature_set in (test_set_1, test_set_2, test_set_3, test_set_4, test_set_5):
    feature_set.set_targets(targets=[parq], with_defaults=False)


In [None]:
# Display the graph of test_set_1 with its targets; the other four feature
# sets are defined with the same arguments, so only one is plotted.
test_set_1.plot(rankdir="LR", with_targets=True)

In [None]:
# Save all five feature-set definitions
for feature_set in (test_set_1, test_set_2, test_set_3, test_set_4, test_set_5):
    feature_set.save()

In [None]:
# Wrap scenario2.py (written by the %%writefile cell above) as an MLRun job
test_func = mlrun.code_to_function(
    name="test_func",
    kind="job",
    image="mlrun/mlrun",
    filename="scenario2.py",
)

In [None]:
# Ingestion date -> entity-id prefix; every date gets its own prefix, so the
# entity ids generated for different dates do not overlap.
dates = {
    '2022-10-16': 10_000_000,
    '2022-09-16': 20_000_000,
    '2022-08-16': 30_000_000,
    '2022-07-16': 40_000_000,
    '2022-06-16': 50_000_000,
    '2022-05-16': 60_000_000,
    '2022-04-16': 70_000_000,
    '2022-03-16': 80_000_000,
    '2022-02-16': 90_000_000,
    '2022-01-16': 100_000_000,
    '2021-12-16': 110_000_000,
}

In [None]:
# Launch one 'ingest' run per entry of `dates`, passing the date and the
# prefix mapped to it as parameters of ingest_data.
for date, prefix in dates.items():
    test_func.run(
        name="ingest",
        handler="ingest_data",
        params={"date": date, "prefix": prefix},
    )
    print(date)

In [None]:
# Features col0..col34 of test1..test4, each aliased as t<set>col<n>
# (e.g. "test1.col0 as t1col0"), followed by every feature of test5
# through the "test5.*" wildcard.
features = [
    f"test{set_idx}.col{col_idx} as t{set_idx}col{col_idx}"
    for set_idx in range(1, 5)
    for col_idx in range(35)
]
features.append("test5.*")

# Build the feature vector from the feature list above and save it
fv = fstore.FeatureVector(
    "test-vector-big",
    features=features,
    description="this is my vector",
)
fv.save()

In [None]:
# Second ingestion round: same dates as before, but every date maps to the
# same prefix (3330000), so the generated entity ids are identical per date.
dates = dict.fromkeys(
    [
        '2022-10-16',
        '2022-09-16',
        '2022-08-16',
        '2022-07-16',
        '2022-06-16',
        '2022-05-16',
        '2022-04-16',
        '2022-03-16',
        '2022-02-16',
        '2022-01-16',
        '2021-12-16',
    ],
    3330000,
)

In [None]:
# Launch one 'ingest' run per entry of `dates`, passing the date and the
# prefix mapped to it as parameters of ingest_data.
for date, prefix in dates.items():
    test_func.run(
        name="ingest",
        handler="ingest_data",
        params={"date": date, "prefix": prefix},
    )
    print(date)

In [None]:
from mlrun import feature_store as fstore
from mlrun.runtimes import Spark3Runtime
from datetime import datetime, timedelta
from mlrun.feature_store import RunConfig
from mlrun.datastore import ParquetTarget
from mlrun import auto_mount
import mlrun

In [None]:
# Deploy the default Spark image. The spark function defined further down
# uses image ".spark-job-default-image".
# NOTE(review): presumably that is the image produced by this call - confirm.
Spark3Runtime.deploy_default_image()

In [None]:
%%writefile spark-read.py
import mlrun
from mlrun.feature_store.retrieval import SparkFeatureMerger
from mlrun.datastore.targets import get_target_driver

def spark_handler(context, vector_uri, target, entity_rows=None, 
                  timestamp_column=None, drop_columns=None, with_indexes=None, query=None):
    """Run a Spark feature-vector merge and log the resulting target.

    :param context:          MLRun execution context; used to resolve the
                             vector, to log progress and to log results.
    :param vector_uri:       store URI resolved with ``context.get_store_resource``.
    :param target:           target spec handed to ``get_target_driver``.
    :param entity_rows:      optional object exposing ``as_df()``; converted
                             to a DataFrame when truthy.
    :param timestamp_column: entity timestamp column; falls back to
                             ``vector.spec.timestamp_field`` when falsy.
    :param drop_columns:     forwarded positionally to ``merger.start``.
    :param with_indexes:     forwarded to ``merger.start``.
    :param query:            forwarded to ``merger.start``.

    Logs two results on the context: ``feature_vector`` (the vector URI) and
    ``target`` (the dict form of the vector's status entry for the target).
    """
    vector = context.get_store_resource(vector_uri)
    store_target = get_target_driver(target, vector)
    entity_timestamp_column = timestamp_column or vector.spec.timestamp_field
    if entity_rows:
        entity_rows = entity_rows.as_df()

    context.logger.info(f"starting vector merge task to {vector.uri}")
    merger = SparkFeatureMerger(vector)
    # The return value of start() was never used, so it is no longer bound.
    merger.start(entity_rows, entity_timestamp_column, store_target, drop_columns,
                 with_indexes=with_indexes, query=query)

    # Use a dedicated name instead of rebinding the `target` parameter.
    target_status = vector.status.targets[store_target.name].to_dict()
    context.log_result('feature_vector', vector.uri)
    context.log_result('target', target_status)

In [None]:
# Build the Spark function from spark-read.py and apply auto_mount() to it
spark_func = mlrun.code_to_function(
    name="spark-read",
    kind="spark",
    handler="spark_handler",
    filename="spark-read.py",
    image=".spark-job-default-image",
).apply(auto_mount())

# Resource requests and CPU limits for executors and driver
spark_func.with_executor_requests(cpu="1", mem="1G")
spark_func.with_driver_requests(cpu="1", mem="1G")
spark_func.with_driver_limits(cpu="1")
spark_func.with_executor_limits(cpu="1")

# Iguazio Spark settings, pull policy and replica count
spark_func.with_igz_spark()
spark_func.spec.image_pull_policy = "Always"
spark_func.spec.replicas = 3

# Run configuration handed to get_offline_features (remote execution)
rc = RunConfig(spark_func, local=False)

In [None]:
# Show the spark function as a dict to inspect the configuration set above.
spark_func.to_dict()

In [None]:
# Retrieve the vector's offline features with the Spark engine into a parquet
# target, filtering on 'timestamp' from 2021-01-15 up to 2022-10-16.
resp = fv.get_offline_features(
    engine='spark',
    run_config=rc,
    target=ParquetTarget(path="v3io:///bigdata/test_parq"),
    with_indexes=True,
    timestamp_for_filtering='timestamp',
    start_time=datetime.strptime("2021-01-16", '%Y-%m-%d') - timedelta(days=1),
    end_time=datetime.strptime("2022-10-16", '%Y-%m-%d'),
)

In [None]:
import pandas as pd

In [None]:
# Read the parquet data under /v3io/bigdata/test_parq with pandas.
# NOTE(review): presumably the same location as the "v3io:///bigdata/test_parq"
# target written above - confirm. `df` is rebound to a Spark DataFrame in a
# later cell, so this pandas frame is only available until then.
df = pd.read_parquet("/v3io/bigdata/test_parq")

In [None]:
import pyspark
from pyspark.sql import SparkSession
# Get (or create) a Spark session named "parquetFile"
spark = (
    SparkSession.builder
    .appName("parquetFile")
    .getOrCreate()
)

In [None]:
# Read the same dataset with Spark; this rebinds `df`, replacing the pandas
# DataFrame loaded in the earlier cell.
df = spark.read.parquet("v3io://bigdata/test_parq")

In [None]:
# Register the Spark DataFrame as temp view "ParquetTable" so it can be
# queried through spark.sql below.
df.createOrReplaceTempView("ParquetTable")

In [None]:
# Count all rows of the temp view.
# The previous query text was "... from ParquetTable where limit 1": a WHERE
# keyword with no predicate. That is a parse error when WHERE is a reserved
# keyword and otherwise is only accepted by treating `where` as a table
# alias. An un-grouped count(*) already yields one row, so LIMIT 1 was
# redundant and is dropped together with the dangling WHERE.
sparkSQL = spark.sql("select count(*) from ParquetTable")

In [None]:
# Execute the query and print its result.
sparkSQL.show()