# Introduction to Feature Stores with Feast
## The easy-reading booklet to master the key concepts of feature stores and learn how to use Feast

### Import libraries

In [None]:
# Root folder (a relative path) under which this notebook keeps the Feast
# registry and online store ('fs' subfolder) and the parquet files ('data').
COMPUTE_LOCAL_WORKING_FOLDER = "work/fullstackml/experiments/feast-credit-scoring"

from feast import (
    FeatureStore, 
    Entity, 
    Field, 
    FeatureService, 
    FeatureView, 
    FileSource, 
    RepoConfig,
    types,
    ValueType,
    PushSource
    )

from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig
from feast.infra.offline_stores.file import FileOfflineStoreConfig
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
from feast.repo_config import RegistryConfig
from feast.data_source import PushMode

import pandas as pd
from datetime import datetime, timedelta
import os

### Setup registry, repository, initialize feature store

In [None]:
# Both database files live side by side under <working folder>/fs.
registry_db_path = os.path.join(COMPUTE_LOCAL_WORKING_FOLDER, 'fs', 'registry.db')
online_store_db_path = os.path.join(COMPUTE_LOCAL_WORKING_FOLDER, 'fs', 'online_store.db')

# Build each store configuration on its own so the RepoConfig call stays flat.
# NOTE(review): confirm that the installed Feast version accepts
# registry_type='sqlite' for a local registry file.
registry_config = RegistryConfig(registry_type='sqlite', path=registry_db_path)
online_store_config = SqliteOnlineStoreConfig(type='sqlite', path=online_store_db_path)
offline_store_config = FileOfflineStoreConfig(type='file')

# Repository configuration is defined in code and handed directly to the
# FeatureStore below.
repo_config = RepoConfig(
    project="credit_scoring",
    provider="local",
    entity_key_serialization_version=2,
    registry=registry_config,
    online_store=online_store_config,
    offline_store=offline_store_config,
)

# Feature store handle used by every following cell.
fs = FeatureStore(config=repo_config)

### Define entities, feature views and register them

In [None]:
# Entity joined on the "zipcode" column; no explicit value type is given.
zipcode = Entity(name="zipcode", join_keys=["zipcode"])

# Entity joined on the "dob_ssn" column, declared as a string key.
dob_ssn = Entity(
    name="dob_ssn",
    join_keys=["dob_ssn"],
    value_type=ValueType.STRING,
    description="Date of birth and last four digits of social security number",
)

# Parquet file backing the zipcode feature view; rows are timestamped by the
# "event_timestamp" column (created_timestamp_column is deliberately not set).
zipcode_batch_source = FileSource(
    path=os.path.join(COMPUTE_LOCAL_WORKING_FOLDER, 'data', 'zipcode_table_2.parquet'),
    timestamp_field="event_timestamp",
)

# Feature columns of the view, in declaration order, paired with their types.
zipcode_schema = [
    Field(name=column, dtype=dtype)
    for column, dtype in (
        ("city", types.String),
        ("state", types.String),
        ("location_type", types.String),
        ("tax_returns_filed", types.Int64),
        ("population", types.Int64),
        ("total_wages", types.Int64),
    )
]

# Feature view keyed by the zipcode entity, with a TTL of 3650 days.
zipcode_features = FeatureView(
    name="zipcode_features",
    entities=[zipcode],
    ttl=timedelta(days=3650),
    schema=zipcode_schema,
    source=zipcode_batch_source,
)

# Parquet file backing the credit history view; rows are timestamped by the
# "event_timestamp" column (created_timestamp_column is deliberately not set).
credit_history_batch_source = FileSource(
    path=os.path.join(COMPUTE_LOCAL_WORKING_FOLDER, 'data', 'credit_history_2.parquet'),
    timestamp_field="event_timestamp",
)

# Feature view keyed by the dob_ssn entity, with a TTL of 90 days.
# No schema is declared here.
# NOTE(review): presumably Feast infers the fields from the source - confirm.
credit_history = FeatureView(
    name="credit_history",
    entities=[dob_ssn],
    ttl=timedelta(days=90),
    source=credit_history_batch_source,
)

# Register the entities and feature views with the feature store in one call.
objects_to_register = [credit_history, dob_ssn, zipcode, zipcode_features]
fs.apply(objects_to_register)

### Generate training dataset

In [None]:
# Loan records; used as the entity dataframe for the historical lookup below.
loan_table_path = os.path.join(COMPUTE_LOCAL_WORKING_FOLDER, 'data', 'loan_table.parquet')
loan_data = pd.read_parquet(loan_table_path)

# Feature references in "<feature view>:<feature>" form, grouped by view.
feast_features = [
    *(
        f"zipcode_features:{feature_name}"
        for feature_name in (
            "city",
            "state",
            "location_type",
            "tax_returns_filed",
            "population",
            "total_wages",
        )
    ),
    *(
        f"credit_history:{feature_name}"
        for feature_name in (
            "credit_card_due",
            "mortgage_due",
            "student_loan_due",
            "vehicle_loan_due",
            "hard_pulls",
            "missed_payments_2y",
            "missed_payments_1y",
            "missed_payments_6m",
            "bankruptcies",
        )
    ),
]

# Request the historical features for every row of the loan table ...
training_data = fs.get_historical_features(
    features=feast_features,
    entity_df=loan_data,
)

# ... and convert the retrieval result to a DataFrame.
training_df = training_data.to_df()

### Save the generated dataset for use in the moderation phase.

In [None]:
# Store the retrieved training data as a local parquet file and register it
# under the name "credit_scoring_dataset".
saved_dataset_path = os.path.join(
    COMPUTE_LOCAL_WORKING_FOLDER, 'data', 'credit_scoring_dataset.parquet'
)
dataset = fs.create_saved_dataset(
    from_=training_data,
    name="credit_scoring_dataset",
    storage=SavedDatasetFileStorage(saved_dataset_path),
)

### How to retrieve a stored dataset

In [None]:
# Look the saved dataset up by name, then convert it to a DataFrame.
saved_dataset = fs.get_saved_dataset(name="credit_scoring_dataset")
training_df = saved_dataset.to_df()

### Materialization

In [None]:
# Materialize feature values up to the current moment.
# The end of the window is passed as an explicit UTC-aware timestamp: a naive
# datetime.now() carries local wall-clock time without a timezone, so the
# window boundary would depend on the timezone of the machine running this.
from datetime import timezone

fs.materialize_incremental(end_date=datetime.now(tz=timezone.utc))

In [None]:
# Example loan application; every value is wrapped in a one-element list so
# the dict can be turned into a single-row DataFrame at the end of this cell.
dummy_loan_request = {
   "zipcode": [76104],
   "dob_ssn": ["19500806_6783"],
   "person_age": [133],
   "person_income": [59000],
   "person_home_ownership": ["RENT"],
   "person_emp_length": [123.0],
   "loan_intent": ["PERSONAL"],
   "loan_amnt": [35000],
   "loan_int_rate": [16.02],
}

# Next we fetch our online features.
# The lookup keys get their own names so that the `dob_ssn` Entity object
# defined earlier in the notebook is not overwritten by a plain string.
customer_zipcode = dummy_loan_request["zipcode"][0]
customer_dob_ssn = dummy_loan_request["dob_ssn"][0]

feature_vector = fs.get_online_features(
   entity_rows=[{"zipcode": customer_zipcode, "dob_ssn": customer_dob_ssn}],
   features=feast_features,
).to_dict()

# Converting the features to a DataFrame: start from a copy of the request
# and merge the retrieved feature values into it.
features = dummy_loan_request.copy()
features.update(feature_vector)
features_df = pd.DataFrame.from_dict(data=features)

### Feature Service

In [None]:
# A feature service definition consists of references to features from one or
# more feature views; here it selects two zipcode features and one credit
# history feature.
mixed_view_projections = [
    zipcode_features[["city", "state"]],
    credit_history[["mortgage_due"]],
]
mixedviews_fs = FeatureService(name="mixed_views", features=mixed_view_projections)

# Register the feature service with the feature store.
fs.apply([mixedviews_fs])

# Now a call can be made to this feature service to retrieve the required
# data, which may come from one or more feature views.
features_to_fetch = fs.get_feature_service("mixed_views")

# The dob_ssn key must be a quoted string: the entity is declared with
# ValueType.STRING, and an unquoted 19500806_6783 is parsed by Python as the
# integer 195008066783 (underscores are digit separators in numeric literals).
entity_rows = [
        {
            "zipcode": 76104,
            "dob_ssn": "19500806_6783",
        },
    ]

# From online store
returned_features = fs.get_online_features(
        features=features_to_fetch,
        entity_rows=entity_rows
    ).to_dict()

# or from offline store
returned_features_off = fs.get_historical_features(
    features=features_to_fetch,
    entity_df=loan_data).to_df()

### Push source

In [None]:
# Batch source attached to the push source: the same zipcode parquet file,
# timestamped by the "event_timestamp" column.
zipcode_push_batch_source = FileSource(
    path=os.path.join(COMPUTE_LOCAL_WORKING_FOLDER, 'data', 'zipcode_table_2.parquet'),
    timestamp_field="event_timestamp",
)

# Push source named "zipcode_push_source"; this name is what fs.push() targets.
zipcode_push_source = PushSource(
    name="zipcode_push_source",
    batch_source=zipcode_push_batch_source,
)

# Feature view fed by the push source, keyed by the zipcode entity, with a TTL
# of 3650 days. No schema is declared here.
# NOTE(review): presumably Feast infers the fields from the batch source - confirm.
zipcode_features_push = FeatureView(
    name="zipcode_features_push",
    source=zipcode_push_source,
    entities=[zipcode],
    ttl=timedelta(days=3650),
)

# Register the new feature view with the feature store.
fs.apply([zipcode_features_push])

In [None]:
# New dummy data to push: one row for zipcode 1111 with a UTC event timestamp.
pushed_data = [
    {
        'zipcode': 1111,
        'city': 'NEW ROME',
        'state': 'NJ',
        'location_type': 'PRIMARY',
        'tax_returns_filed': 13245,
        'population': 24083,
        'total_wages': 1089095041,
        'event_timestamp': pd.Timestamp('2017-01-01 12:00:00+0000', tz='UTC'),
    },
]

# Execute the push: the row goes to both the online and the offline store.
pushed_df = pd.DataFrame.from_dict(data=pushed_data)
fs.push("zipcode_push_source", pushed_df, to=PushMode.ONLINE_AND_OFFLINE)

In [None]:
# Read back the data that was just pushed for zipcode 1111.
pushed_feature_refs = [
    f"zipcode_features_push:{feature_name}"
    for feature_name in (
        "city",
        "state",
        "location_type",
        "tax_returns_filed",
        "population",
        "total_wages",
    )
]

# Kept as the final bare expression so the notebook displays the result.
fs.get_online_features(
    features=pushed_feature_refs,
    entity_rows=[{"zipcode": 1111}],
).to_dict()

### Erase all configuration, logs, and files generated at the time of initialization.

In [None]:
# Clean up at the end of the walkthrough by tearing down the feature store.
# NOTE(review): run this last - presumably the earlier cells stop working
# until the store is set up and applied again; confirm.
fs.teardown()