### Import General Python libs and modules

In [1]:
# Standard library imports
import os
import sys
from datetime import datetime, timedelta, timezone
from pprint import pprint

# Make modules in the parent directory importable
sys.path.insert(0, "../")

# 3rd party imports
import pandas as pd
from feast import (FeatureStore,
                    FileSource,
                    FeatureService,
                    FeatureView,
                    Field,
                    Entity,
                    ValueType)
from feast.types import  Float32, Int32

In [2]:
# Location of the Feast feature repository. Set the FEAST_REPO environment
# variable to point at a different checkout; the original local path is kept
# as the fallback so existing runs behave exactly as before.
FEAST_REPO = os.environ.get(
    "FEAST_REPO",
    "/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_1/feature_repo",
)
# Feature store handle used by the registration, training and materialization cells.
fs = FeatureStore(repo_path=FEAST_REPO)

In [3]:
# Show the repository configuration of the feature store as a plain dict.
store_config = fs.config.dict()
pprint(store_config)

{'_offline_config': 'file',
 '_offline_store': None,
 '_online_config': {'path': 'data/online_store.db', 'type': 'sqlite'},
 '_online_store': None,
 'feature_server': None,
 'flags': None,
 'go_feature_retrieval': False,
 'offline_store': {'type': 'file'},
 'online_store': {'path': 'data/online_store.db', 'type': 'sqlite'},
 'project': 'feature_repo',
 'provider': 'local',
 'registry': 'data/registry.db',
 'repo_path': PosixPath('/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_1/feature_repo')}


### Step 1. Register the data source, entity, features in the FeatureView, and the FeatureService with the Feast Registry

In [4]:
# Raw driver statistics used as the data source. Here they are read from a
# local Parquet file; they could equally be provided by Postgres or the Taurus DWH.
driver_stats_path = f"{FEAST_REPO}/data/driver_stats.parquet"
driver_stats = pd.read_parquet(path=driver_stats_path)
display(driver_stats)

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-13 11:00:00+00:00,1005,0.373837,0.154890,498,2021-07-28 11:08:04.802
1,2021-07-13 12:00:00+00:00,1005,0.571627,0.643958,656,2021-07-28 11:08:04.802
2,2021-07-13 13:00:00+00:00,1005,0.399909,0.993888,722,2021-07-28 11:08:04.802
3,2021-07-13 14:00:00+00:00,1005,0.967468,0.788458,424,2021-07-28 11:08:04.802
4,2021-07-13 15:00:00+00:00,1005,0.024679,0.956064,569,2021-07-28 11:08:04.802
...,...,...,...,...,...,...
1802,2021-07-28 09:00:00+00:00,1001,0.089418,0.311234,485,2021-07-28 11:08:04.802
1803,2021-07-28 10:00:00+00:00,1001,0.222534,0.927691,114,2021-07-28 11:08:04.802
1804,2021-04-12 07:00:00+00:00,1001,0.175219,0.761434,385,2021-07-28 11:08:04.802
902,2021-07-20 23:00:00+00:00,1003,0.025968,0.109748,55,2021-07-28 11:08:04.802


In [5]:
# Create a FileSource from a file containing feature data. Only Parquet format supported.
# - timestamp_field: event timestamp field, used for point-in-time joins of feature values.
# - created_timestamp_column: timestamp column of when the row was created,
#   used for deduplicating rows.
driver_hourly_stats = FileSource(path=f"{FEAST_REPO}/data/driver_stats.parquet",
                                 timestamp_field="datetime",
                                 created_timestamp_column="created")

# Entity.value_type expects a feast.ValueType member. The feast.types classes
# (Int32, Float32, ...) are Field dtypes and are only used in the schema below.
driver = Entity(name="driver_id",
                value_type=ValueType.INT32,
                description="driver id",)

# Feature view: the three per-driver features read from the file source above.
driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=timedelta(days=365),  # 1 year
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int32),
    ],
    online=True,
    source=driver_hourly_stats,
    tags={},
)

# Feature service: the named bundle of features later fetched by name
# ("driver_ranking_fv_svc") for both training and online inference.
driver_feature_svc = FeatureService(name="driver_ranking_fv_svc",
                           features=[driver_hourly_stats_view],
                           tags={"description": "Used for training a MLP model"})

# Register the entity, feature view and feature service with the feature store.
fs.apply([driver, driver_feature_svc, driver_hourly_stats_view])

#### Get the historical training data

In [6]:
# Toy entity dataframe: driver ids paired with event timestamps.
# It carries no feature values of its own.
driver_ids = [1001, 1002, 1003, 1004]
event_timestamps = [
    datetime(2021, 4, 12, 10, 59, 42),
    datetime(2021, 4, 12, 8, 12, 10),
    datetime(2021, 4, 12, 16, 40, 26),
    datetime(2021, 4, 12, 15, 1, 12),
]
entity_df = pd.DataFrame(
    {"driver_id": driver_ids, "event_timestamp": event_timestamps}
)
entity_df

Unnamed: 0,driver_id,event_timestamp
0,1001,2021-04-12 10:59:42
1,1002,2021-04-12 08:12:10
2,1003,2021-04-12 16:40:26
3,1004,2021-04-12 15:01:12


In [7]:
# This step merges the historical features in the data store with the rows of
# the entity table. The resulting table is typically used for training ML models.
ranking_feature_service = fs.get_feature_service("driver_ranking_fv_svc")
retrieval_job = fs.get_historical_features(
    entity_df=entity_df,
    features=ranking_feature_service,
)
training_df = retrieval_job.to_df()
display(training_df)

Unnamed: 0,driver_id,event_timestamp,conv_rate,acc_rate,avg_daily_trips
360,1001,2021-04-12 10:59:42+00:00,0.175219,0.761434,385
721,1002,2021-04-12 08:12:10+00:00,0.312347,0.481786,810
1082,1003,2021-04-12 16:40:26+00:00,0.736727,0.936667,939
1445,1004,2021-04-12 15:01:12+00:00,0.094609,0.151163,166


### Step 2. Materialize: load data from the offline store into the online store

In [8]:
# This step is complementary to the previous one: it is normally executed to
# extract the latest features from the offline store into the online store
# for inference purposes.
# A timezone-aware "now" is used instead of the naive, deprecated
# datetime.utcnow(); the former `- timedelta(minutes=0)` offset subtracted
# nothing and has been dropped.
fs.materialize_incremental(end_date=datetime.now(timezone.utc))

Materializing [1m[32m1[0m feature views to [1m[32m2022-06-22 14:06:07-05:00[0m into the [1m[32msqlite[0m online store.

[1m[32mdriver_hourly_stats[0m from [1m[32m2022-06-22 08:54:11-05:00[0m to [1m[32m2022-06-22 14:06:07-05:00[0m:


0it [00:00, ?it/s]


### Step 3. Get the feature vector for inference from the online store

In [9]:
# Get the feature vector for inference from the online store, using a
# separate FeatureStore handle opened on the same repository.
inf_fs = FeatureStore(repo_path=FEAST_REPO)
inf_feature_service = inf_fs.get_feature_service("driver_ranking_fv_svc")

# The loop variable is named driver_id (not driver) so that it does not
# overwrite the `driver` Entity object created in the registration cell.
for driver_id in [1001, 1002, 1003]:
    # One online lookup per driver; the result is converted to a DataFrame.
    feature_vector = inf_fs.get_online_features(
        entity_rows=[{"driver_id": driver_id}],
        features=inf_feature_service
    ).to_df()
    print("--" * 5)
    pprint(feature_vector)

----------
   driver_id  conv_rate  avg_daily_trips  acc_rate
0       1001   0.222534              114  0.927691
----------
   driver_id  conv_rate  avg_daily_trips  acc_rate
0       1002   0.913528              573  0.357142
----------
   driver_id  conv_rate  avg_daily_trips  acc_rate
0       1003   0.581626              451  0.558202
