# Credit scoring model training and inference using Feast feature store.

In [None]:
# System imports
from datetime import timedelta

# 3rd party imports
import category_encoders as ce
import pandas as pd
from datetime import datetime
from pathlib import Path
from pprint import pprint
from feast import  (Entity,
                    FeatureService,
                    FeatureStore,
                    FeatureView,
                    Field,
                    FileSource,
                    ValueType)
from feast.types import Int64, String
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [None]:
# Functions definitions
def get_data_from_file(file: str) -> pd.DataFrame:
    """Load a Parquet file into a pandas DataFrame.

    Args:
        file: Path of the Parquet file to read.

    Returns:
        The DataFrame produced by ``pd.read_parquet`` for that file.
    """
    return pd.read_parquet(file)

def show_df_size(df: pd.DataFrame, df_name: str) -> None:
    """Print the number of rows and columns of ``df``, labelled with ``df_name``.

    Both counts are rendered with thousands separators.

    Args:
        df: DataFrame whose shape is reported.
        df_name: Label placed at the start of the printed line.
    """
    n_rows, n_cols = df.shape
    print(f"{df_name} df size: {n_rows:,d} rows, {n_cols:,d} columns")

In [None]:
# Relative paths of the source Parquet tables used throughout the notebook
ZIPCODE_TABLE = "../data/zipcode_table.parquet"  # zipcode-level table
CREDIT_HISTORY_TABLE = "../data/credit_history.parquet"  # credit history table
LOANS_TABLE = "../data/loan_table.parquet"  # loans table

## Section 1: Data exploration (only using local files)

In [None]:
# Load the zipcode table (economy-wise geographical info), report its size
# and render it in the notebook.
zip_df = get_data_from_file(file=ZIPCODE_TABLE)
show_df_size(df=zip_df, df_name="Zip")
display(zip_df)

In [None]:
# Load the credit history table and report its size.
credit_df = get_data_from_file(file=CREDIT_HISTORY_TABLE)
show_df_size(df=credit_df, df_name="Credit")

# Show the rows ordered from the most recent event to the oldest one.
display(credit_df.sort_values(by="event_timestamp", ascending=False))

## Section 2: Creating the feature repo and the online store

In [None]:
# Location of the Feast feature repository, relative to this notebook
FEAST_REPO = "../../feature_repo/"
repo_path = Path(FEAST_REPO)

# The FeatureStore object is the handle used below to define, create and
# retrieve features.
fs = FeatureStore(repo_path=repo_path)

#### Define the 'zipcode' and 'dob_ssn' (date-of-birth_social-sec-number) entities.

An entity is a collection of semantically related features. Users define entities to map to the domain of their use case. In this case, the zip code and the dob_ssn will identify the requestor of a loan. We want to build a model that helps decide whether the loan should be granted or denied.

In [None]:
# Entities identify the requestor of a loan and act as join keys for the
# feature views defined later.
# `Entity.value_type` takes a member of feast.ValueType; `Int64`/`String` from
# feast.types are the dtypes meant for `Field` schemas.
zipcode = Entity(
    name="zipcode",
    value_type=ValueType.INT64,
    description="Zipcode for the loan origin",
)

dob_ssn = Entity(
    name="dob_ssn",
    value_type=ValueType.STRING,
    description="Date of birth and last four digits of social security number",
)

#### Define the FeatureViews and FeatureService

A feature view is an object that represents a logical group of time-series feature data as it is found in a data source. Feature views consist of zero or more entities, one or more features, and a data source. Feature views allow Feast to model your existing feature data in a consistent way in both an offline (training) and online (serving) environment. Feature views generally contain features that are properties of a specific object, in which case that object is defined as an entity and included in the feature view. If the features are not related to a specific object, the feature view might not have entities.

In [None]:
# Create Feast's FileSource objects, one per source table.
# Note (from the original author): only Parquet files are supported by FileSource.
# Both sources use "event_timestamp" as the event time column and
# "created_timestamp" as the row creation time column.
zipcode_batch_source = FileSource(
    path=ZIPCODE_TABLE,
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

credit_history_source = FileSource(
    path=CREDIT_HISTORY_TABLE,
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Create the FeatureView objects, one per FileSource.
# NOTE(review): entities are referenced by name here; some Feast releases
# expect Entity objects instead -- confirm against the installed version.
zipcode_features = FeatureView(
    name="zipcode_features",
    entities=["zipcode"],  # entity defined with feast.Entity
    ttl=timedelta(days=3650),  # time to live
    schema=[
        Field(name="city", dtype=String),
        Field(name="state", dtype=String),
        Field(name="location_type", dtype=String),
        Field(name="tax_returns_filed", dtype=Int64),
        Field(name="population", dtype=Int64),
        Field(name="total_wages", dtype=Int64),
    ],
    source=zipcode_batch_source,
    online=True,
)

credit_history = FeatureView(
    name="credit_history",
    entities=["dob_ssn"],  # entity defined with feast.Entity
    ttl=timedelta(days=800),  # time to live
    schema=[
        Field(name="credit_card_due", dtype=Int64),
        Field(name="mortgage_due", dtype=Int64),
        Field(name="student_loan_due", dtype=Int64),
        Field(name="vehicle_loan_due", dtype=Int64),
        Field(name="hard_pulls", dtype=Int64),
        Field(name="missed_payments_2y", dtype=Int64),
        Field(name="missed_payments_1y", dtype=Int64),
        Field(name="missed_payments_6m", dtype=Int64),
        Field(name="bankruptcies", dtype=Int64),
    ],
    source=credit_history_source,
    online=True,
)

# Create the feature service: a feature service defines a logical group of
# features from one or more feature views. This group of features can be
# retrieved together during training or serving.
model_features_svc = FeatureService(
    name="model_features_svc",
    features=[zipcode_features, credit_history],
    tags={"Description": "Used for training a XGBoost Logistic Regression model"},
)

## Register objects to metadata store and update related infrastructure.

The apply method registers one or more definitions (e.g., Entity, FeatureView) and registers or updates these
objects in the Feast registry. Once the apply method has updated the infrastructure (e.g., create tables in
an online store), it will commit the updated registry. All operations are idempotent, meaning they can safely
be rerun.


In [None]:
# Register every definition created above with the feature store
feast_objects = [
    zipcode,             # Entity
    dob_ssn,             # Entity
    zipcode_features,    # FeatureView
    credit_history,      # FeatureView
    model_features_svc,  # FeatureService
]
fs.apply(feast_objects)

# Walk the registered feature services and print, for each one, its feature
# views and the name and type of every feature they expose
for svc in fs.list_feature_services():
    print(f"Feature service name: {svc.name}")
    for view_projection in svc.feature_view_projections:
        print(f"\tFeature view: {view_projection.name}")
        for field in view_projection.features:
            print(f"\t\tFeature: {field.name}, type: {field.dtype}")

## Load (materialize) data from the offline store into the online store.

This method loads feature data in the specified interval from either
the specified feature views, or all feature views if none are specified,
into the online store where it is available for online serving.

In [None]:
# Interval of feature data to load from the offline store into the online store
start_date = datetime(2017, 1, 1)
end_date = datetime(2021, 9, 1)

# No feature views are listed, so all of them are materialized
fs.materialize(start_date=start_date, end_date=end_date)

## Section 3: Use feature definitions augmented with a "loans" data table

In [None]:
# Recover the feature references from the feature service.
# The service is looked up by its registered name rather than by its position
# in list_feature_services(), so other services in the registry cannot be
# picked up by accident.
feat_svc = fs.get_feature_service("model_features_svc")

# Build "<feature view>:<feature>" references, the format passed to the
# online and historical retrieval calls below.
feast_features = []
for view_proj in feat_svc.feature_view_projections:
    feast_features.extend(
        f"{view_proj.name}:{feature.name}" for feature in view_proj.features
    )
pprint(feast_features)

In [None]:
# Get a couple of feature vectors from the online store.
# The loop variables are deliberately NOT named `zipcode` / `dob_ssn`: those
# names hold the Entity objects, and overwriting them would break a re-run of
# the cell that applies the definitions to the feature store.
zipcodes_dob_ssns = [(8089, "19600724_9887"), (69033, "19960703_3449")]
for zipcode_value, dob_ssn_value in zipcodes_dob_ssns:
    print(f"Feature vector for zipcode {zipcode_value} and dob_ssn {dob_ssn_value}:")
    data = fs.get_online_features(
        entity_rows=[{"zipcode": zipcode_value, "dob_ssn": dob_ssn_value}],
        features=feast_features,
    ).to_dict()
    display(pd.DataFrame.from_dict(data))

In [None]:
# Get the Loans table to be used to train the model.
# The notebook's own helpers are reused so the loans table is loaded and
# reported the same way as the zipcode and credit tables.
loans_df = get_data_from_file(LOANS_TABLE)
show_df_size(loans_df, "Loans")
display(loans_df.head(3))

In [None]:
# Loans requested by the first dob_ssn looked up in the online store above
display(loans_df.loc[loans_df["dob_ssn"] == '19600724_9887'])

In [None]:
# Loans requested by the second dob_ssn looked up in the online store above
display(loans_df.loc[loans_df["dob_ssn"] == '19960703_3449'])

#### get_historical_features(). Enrich an entity dataframe with historical feature values for either training or batch scoring.

This method joins historical feature data from one or more feature views to an entity dataframe by using a time
travel join. Each feature view is joined to the entity dataframe using all entities configured for the respective feature
view. All configured entities must be available in the entity dataframe. Therefore, the entity dataframe must
contain all entities found in all feature views, but the individual feature views can have different entities.

Time travel is based on the configured TTL for each feature view. A shorter TTL will limit the
amount of scanning that will be done in order to find feature data for a specific entity key. 

->> Setting a short TTL may result in null values being returned.


In [None]:
%%time
# Enrich the loans table (the entity dataframe) with historical feature values
retrieval_job = fs.get_historical_features(entity_df=loans_df, features=feast_features)
train_df = retrieval_job.to_df()

train_rows, train_cols = train_df.shape
print(f"Training df size: {train_rows} rows, {train_cols} cols")
display(train_df)

## Section 4. Create Model

### Data preparation

In [None]:
%%time
# Categorical columns, one-hot encoded before training
categorical_features = ["person_home_ownership", "loan_intent"]

# Columns removed from the model inputs (this includes the target, "loan_status")
columns_to_drop = [
    "city",
    "state",
    "event_timestamp",
    "created_timestamp__",
    "loan_id",
    "loan_status",
    "dob_ssn",
    "zipcode",
    "location_type",
]

# Target vector
train_y = train_df["loan_status"]

# Feature matrix: drop the non-modeling columns, then order the remaining
# columns alphabetically so that the layout is reproducible
train_X = train_df.drop(columns=columns_to_drop)
train_X = train_X.reindex(columns=sorted(train_X.columns))

# Cast the categorical columns to the "category" dtype
train_X = train_X.astype({col: "category" for col in categorical_features})

# Fit the one-hot encoder on the training data and apply it
one_hot_encoder = ce.OneHotEncoder(cols=categorical_features)
train_X = one_hot_encoder.fit_transform(train_X)

In [None]:
# Report the shape of the encoded feature matrix and preview its first rows
x_rows, x_cols = train_X.shape
print(f"train_X df size: {x_rows} rows, {x_cols} cols")
train_X.head(3)

In [None]:
%%time
# Split the dataset into training and test partitions (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    random_state=42,
)

# Wrap each partition in a DMatrix, the input container used by XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

### Train an XGBoost model

In [None]:
%%time
# Set xgboost params
param = {
    'booster': 'dart',
    'learning_rate': 0.1,
    'verbosity': 1,
    'max_depth': 7,  # the maximum depth of each tree
    # Hinge loss: predict() returns hard 0/1 labels, not probabilities.
    # NOTE(review): the feature service tag mentions a logistic model; moving to
    # 'binary:logistic' would also require thresholding in the inference cell.
    'objective': 'binary:hinge',
    # NOTE(review): presumably only reported when `evals` is passed to
    # xgb.train, which this cell does not do -- confirm.
    'eval_metric': ['logloss', 'error'],
    'rate_drop': 0.1,
    'skip_drop': 0.5,
}
num_rounds = 100  # Set the number of training iterations
model = xgb.train(param, dtrain, num_rounds)  # Train model

preds = model.predict(dtest)  # Hard 0/1 class predictions on the test set

# ROC-AUC is a ranking metric and needs non-thresholded scores, so it is
# computed from the raw margins instead of the 0/1 labels in `preds`.
scores = model.predict(dtest, output_margin=True)
print(f"ROC-AUC score: {roc_auc_score(y_test, scores)}")  # Evaluate model's performance

## Section 5. Make predictions using the online feature store

In [None]:
# Collect the loan records (one single-row selection per loan id) whose
# zipcode and dob_ssn entities will be used to make a prediction
infer_vectors = []
for loan_id in (28821, 38637, 10000):
    loan_record = loans_df.loc[loans_df["loan_id"] == loan_id]
    display(loan_record)
    infer_vectors.append(loan_record)

In [None]:
# For each selected loan: enrich the loan record with features from the online
# store, apply the same transformations used for training, and predict.
for loan_row in infer_vectors:
    if loan_row.empty:
        # No loan matched the requested id; there is nothing to score.
        print("\n>>>> Skipping an empty loan record (no matching loan_id)")
        continue

    vec = loan_row.iloc[0].to_dict()
    print(f"\n>>>> Build feature vector for loan_id {vec['loan_id']}")

    # Get the feature vector from the online store to enrich the inference vector.
    # The entity values are read straight from `vec`; they are not bound to
    # `zipcode` / `dob_ssn`, which name the Entity objects defined earlier.
    entity_row = {"zipcode": vec['zipcode'], "dob_ssn": vec['dob_ssn']}
    feat_vec = fs.get_online_features(entity_rows=[entity_row],
                                      features=feast_features).to_dict()
    vec.update(feat_vec)
    inf_vec = pd.DataFrame.from_dict(vec)
    display(inf_vec)  # show the complete inference vector

    # Apply transformations to inference vector
    for col in categorical_features:  # "category" dtype of categorical columns
        inf_vec[col] = inf_vec[col].astype("category")
    # The training frame carries 'created_timestamp__' (a column rename made by
    # Feast), while the inference vector has the plain 'created_timestamp'.
    to_drop = [col.replace('created_timestamp__', 'created_timestamp') for col in columns_to_drop]
    inf_vec = inf_vec.drop(columns=to_drop)  # Drop columns with no modeling role
    inf_vec = inf_vec.reindex(sorted(inf_vec.columns), axis=1)  # Sort columns alphabetically for consistent ordering
    inf_vec = one_hot_encoder.transform(inf_vec)  # Apply the fitted encoder
    data = xgb.DMatrix(inf_vec)  # convert Pandas to DMatrix

    prediction = model.predict(data)  # one value per row; a single row here
    # Decide on the scalar explicitly instead of relying on array truthiness.
    # NOTE(review): this maps a positive prediction to 'approved', i.e. it
    # assumes loan_status == 1 means the loan is granted -- confirm with the data.
    is_approved = bool(prediction[0] >= 0.5)
    print(f"\nPrediction for loan_id {vec['loan_id']}: {'approved' if is_approved else 'denied'} <<<< \n")