# Credit scoring model training and inference using Feast feature store.
### Putting most of the code pieces together.

In [121]:
# System imports
from datetime import timedelta
import sys
sys.path.insert(0, "../")

# 3rd party imports
import category_encoders as ce
import pandas as pd
from datetime import datetime
from pathlib import Path
from pprint import pprint
from feast import   Entity,\
                    Feature, \
                    FeatureService,\
                    FeatureStore,\
                    FeatureView,\
                    Field,\
                    FileSource,\
                    ValueType

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# Local imports
from entities.entity import zipcode, dob_ssn
from features.feature_views import zipcode_features, credit_history
from feature_service.feature_svc import zipcode_features_svc
from utils.data_fetcher import DataFetcher

In [2]:
# Functions and options definitions
pd.set_option('display.max_columns', 50)

def get_data_from_file(file: str) -> pd.DataFrame:
    df = pd.read_parquet(file)
    return df

def show_df_size(df: pd.DataFrame, df_name: str) -> None:
    print(f"{df_name} df size: {df.shape[0]:,d} rows, {df.shape[1]:,d} columns")

In [3]:
# Define source data files paths
ZIPCODE_TABLE = "/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_3/feature_repo/data/zipcode_table.parquet"
CREDIT_HISTORY_TABLE="/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_3/feature_repo/data/credit_history.parquet"
LOANS_TABLE = "/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_3/feature_repo/data/loan_table.parquet"

## Section 1: Data exploration (only using local files)

In [4]:
# Get economy-wise geographical info.
zip_df = get_data_from_file(ZIPCODE_TABLE)
show_df_size(zip_df, "Zip")
zip_df.head(3)

Zip df size: 28,844 rows, 9 columns


Unnamed: 0,zipcode,city,state,location_type,tax_returns_filed,population,total_wages,event_timestamp,created_timestamp
0,7675,WESTWOOD,NJ,PRIMARY,13245,24083,1089095041,2017-01-01 12:00:00+00:00,2017-01-01 12:00:00+00:00
1,7677,WOODCLIFF LAKE,NJ,PRIMARY,2945,5471,325436960,2017-01-01 12:00:00+00:00,2017-01-01 12:00:00+00:00
2,7885,WHARTON,NJ,PRIMARY,5273,8999,240827990,2017-01-01 12:00:00+00:00,2017-01-01 12:00:00+00:00


In [6]:
# Get credit-related data
credit_df = get_data_from_file(CREDIT_HISTORY_TABLE)
show_df_size(credit_df, "Credit")
credit_df.head(3)

Credit df size: 2,033,298 rows, 12 columns


Unnamed: 0,event_timestamp,dob_ssn,credit_card_due,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies,created_timestamp
0,2020-04-26 18:01:04.746575,19530219_5179,8419,91803,22328,15078,0,1,0,0,0,2020-04-26 18:01:04.746575
1,2020-04-26 18:01:04.746575,19781116_7723,2944,741165,2515,28605,0,3,3,1,0,2020-04-26 18:01:04.746575
2,2020-04-26 18:01:04.746575,19931128_5771,833,976522,33000,21733,9,7,0,0,0,2020-04-26 18:01:04.746575


In [7]:
credit_df.event_timestamp.sort_values(ignore_index=True)

0         2020-04-26 18:01:04.746575
1         2020-04-26 18:01:04.746575
2         2020-04-26 18:01:04.746575
3         2020-04-26 18:01:04.746575
4         2020-04-26 18:01:04.746575
                     ...            
2033293   2021-08-29 18:01:04.746575
2033294   2021-08-29 18:01:04.746575
2033295   2021-08-29 18:01:04.746575
2033296   2021-08-29 18:01:04.746575
2033297   2021-08-29 18:01:04.746575
Name: event_timestamp, Length: 2033298, dtype: datetime64[ns]

## Section 2: Creating the feature repo and the online store

In [8]:
# Set the feature store repo path
FEAST_REPO = "/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_3/feature_repo"
repo_path = Path(FEAST_REPO)
fs = FeatureStore(repo_path=repo_path) # a FeatureStore object is used to define, create, and retrieve features

#### Define the 'zipcode'and 'dob_ssn' (date-of-birth_social-sec-number) entities. 

An entity is a collection of semantically related features. Users define entities to map to the domain of their use case. In this case. the zip code and the dob_ssn wiil identify the requestor of a loan. We want to build a model that helps decide whether the loan should be granted or denied.

In [9]:
zipcode = Entity(
    name="zipcode",
    value_type=ValueType.INT64,
    description="Zipcode for the loan origin"
)

dob_ssn = Entity(
    name="dob_ssn",
    value_type=ValueType.STRING,
    description="Date of birth and last four digits of social security number"
)

#### Define the FeatureViews and FeatureService

A feature view is an object that represents a logical group of time-series feature data as it is found in a data source. Feature views consist of zero or more entities, one or more features, and a data source. Feature views allow Feast to model your existing feature data in a consistent way in both an offline (training) and online (serving) environment. Feature views generally contain features that are properties of a specific object, in which case that object is defined as an entity and included in the feature view. If the features are not related to a specific object, the feature view might not have entities.

In [10]:
'''
Create Feat's FileSource objects:
- Notice that only Parquet files are supported by FileSource
'''

zipcode_batch_source = FileSource(
    path="/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_3/feature_repo/data/zipcode_table.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp" 
)


credit_history_source = FileSource(
    path="/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_3/feature_repo/data/credit_history.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp"
)

# Create the FeatureView objects

zipcode_features = FeatureView(
    name="zipcode_features",
    entities=["zipcode"], # entity defined by Feast.Entity
    ttl=timedelta(days=3650), # time to live
    features=[
        Feature(name="city", dtype=ValueType.STRING),
        Feature(name="state", dtype=ValueType.STRING),
        Feature(name="location_type", dtype=ValueType.STRING),
        Feature(name="tax_returns_filed", dtype=ValueType.INT64),
        Feature(name="population", dtype=ValueType.INT64),
        Feature(name="total_wages", dtype=ValueType.INT64),
    ],
    batch_source=zipcode_batch_source,
    online=True,
)

credit_history = FeatureView(
    name="credit_history",
    entities=["dob_ssn"], # entity defined by Feast.Entity
    ttl=timedelta(days=90), # time to live
    features=[
        Feature(name="credit_card_due", dtype=ValueType.INT64),
        Feature(name="mortgage_due", dtype=ValueType.INT64),
        Feature(name="student_loan_due", dtype=ValueType.INT64),
        Feature(name="vehicle_loan_due", dtype=ValueType.INT64),
        Feature(name="hard_pulls", dtype=ValueType.INT64),
        Feature(name="missed_payments_2y", dtype=ValueType.INT64),
        Feature(name="missed_payments_1y", dtype=ValueType.INT64),
        Feature(name="missed_payments_6m", dtype=ValueType.INT64),
        Feature(name="bankruptcies", dtype=ValueType.INT64),
    ],
    batch_source=credit_history_source,
    online=True
)

'''
Create the feature service: A feature service defines a logical group of features from one or more feature views.
This group of features can be retrieved together during training or serving.
'''

model_features_svc = FeatureService(
    name="model_features_svc",
    features=[zipcode_features, credit_history],
    tags={"Description": "Used for training a XGBoost Logistic Regression model"}
)



#### Register objects to metadata store and update related infrastructure.

The apply method registers one or more definitions (e.g., Entity, FeatureView) and registers or updates these
objects in the Feast registry. Once the apply method has updated the infrastructure (e.g., create tables in
an online store), it will commit the updated registry. All operations are idempotent, meaning they can safely
be rerun.


In [11]:
fs.apply([zipcode, dob_ssn, # Entities
          zipcode_features, credit_history, # FeatureViews
          model_features_svc # FeatureService
         ])

#### Load (materialize) data from the offline store into the online store.

This method loads feature data in the specified interval from either
the specified feature views, or all feature views if none are specified,
into the online store where it is available for online serving.

In [12]:
start_date = datetime(2020, 3, 1, 0, 0, 0)
end_date = datetime(2020, 6, 1,  0, 0, 0)
fs.materialize(start_date, end_date)

Materializing [1m[32m2[0m feature views from [1m[32m2020-03-01 00:00:00-06:00[0m to [1m[32m2020-06-01 00:00:00-05:00[0m into the [1m[32msqlite[0m online store.

[1m[32mzipcode_features[0m:


0it [00:00, ?it/s]


[1m[32mcredit_history[0m:


100%|███████████████████████████████████████████████████████| 28633/28633 [00:05<00:00, 4791.89it/s]


## Section 3: Use feature defintions augmented with a "loans" data table

In [13]:
# Recovering features from the feature service
feat_svc = fs.list_feature_services()[-1] # get the last feature service in the list
feast_features = []
for view_proj in feat_svc.feature_view_projections: 
    #print(view_proj.name)
    for feature in view_proj.features:
        #print(f"\t{feature.name}")
        feast_features.append(f"{view_proj.name}:{feature.name}")
pprint(feast_features)

['zipcode_features:city',
 'zipcode_features:state',
 'zipcode_features:location_type',
 'zipcode_features:tax_returns_filed',
 'zipcode_features:population',
 'zipcode_features:total_wages',
 'credit_history:credit_card_due',
 'credit_history:mortgage_due',
 'credit_history:student_loan_due',
 'credit_history:vehicle_loan_due',
 'credit_history:hard_pulls',
 'credit_history:missed_payments_2y',
 'credit_history:missed_payments_1y',
 'credit_history:missed_payments_6m',
 'credit_history:bankruptcies']


In [14]:
# Get a couple of vector features from the online store
zipcodes_dob_ssns = [(70460, "19721022_2735"), (74337, "19550130_7970")]
for zipcode, dob_ssn in zipcodes_dob_ssns:
    print(f"Feature vector for zipcode {zipcode} and dob_ssn {dob_ssn}:")
    #data = fetcher.get_online_data(zipcode, dob_ssn)
    data = fs.get_online_features(
            entity_rows=[{"zipcode": zipcode, "dob_ssn": dob_ssn}],
            features=feast_features).to_dict()
    display(pd.DataFrame.from_dict({key:data[key] for key in data}))

Feature vector for zipcode 70460 and dob_ssn 19721022_2735:


Unnamed: 0,zipcode,dob_ssn,population,state,tax_returns_filed,city,total_wages,location_type,missed_payments_2y,bankruptcies,missed_payments_1y,mortgage_due,credit_card_due,hard_pulls,student_loan_due,vehicle_loan_due,missed_payments_6m
0,70460,19721022_2735,16334,LA,8986,SLIDELL,315061217,PRIMARY,1,0,2,690650,1777,5,46372,10439,1


Feature vector for zipcode 74337 and dob_ssn 19550130_7970:


Unnamed: 0,zipcode,dob_ssn,population,state,tax_returns_filed,city,total_wages,location_type,missed_payments_2y,bankruptcies,missed_payments_1y,mortgage_due,credit_card_due,hard_pulls,student_loan_due,vehicle_loan_due,missed_payments_6m
0,74337,19550130_7970,3716,OK,1969,CHOUTEAU,59412230,PRIMARY,7,2,1,462670,1791,8,19421,3583,0


In [15]:
# Get the Loans table to be used to train the model
loans_df = pd.read_parquet(LOANS_TABLE)
print(f"Loans df size: {loans_df.shape[0]} rows, {loans_df.shape[1]} cols")
loans_df.head(3)

Loans df size: 28638 rows, 13 cols


Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp
0,10000,19530219_5179,76104,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,2021-08-25 20:34:41.361000+00:00,2021-08-25 20:34:41.361000+00:00
1,10001,19520816_8737,70380,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,2021-08-25 20:16:20.128000+00:00,2021-08-25 20:16:20.128000+00:00
2,10002,19860413_2537,97039,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,2021-08-25 19:57:58.896000+00:00,2021-08-25 19:57:58.896000+00:00


In [17]:
# Recall the entity names defined in the feature store
for entity in fs.list_entities():
    print(f"Entity: {entity.name}")

Entity: zipcode
Entity: dob_ssn


#### get_historical_features(). Enrich an entity dataframe with historical feature values for either training or batch scoring.

This method joins historical feature data from one or more feature views to an entity dataframe by using a time
travel join. Each feature view is joined to the entity dataframe using all entities configured for the respective feature
view. All configured entities must be available in the entity dataframe. Therefore, the entity dataframe must
contain all entities found in all feature views, but the individual feature views can have different entities.

Time travel is based on the configured TTL for each feature view. A shorter TTL will limit the
amount of scanning that will be done in order to find feature data for a specific entity key. 

->> Setting a short TTL may result in null values being returned.


In [130]:
%%time
train_df = fs.get_historical_features(entity_df=loans_df, features=feast_features).to_df()
print(f"Training df size: {train_df.shape[0]} rows, {train_df.shape[1]} cols")
train_df.head(3)

Training df size: 28638 rows, 28 cols
CPU times: user 22.9 s, sys: 1.59 s, total: 24.5 s
Wall time: 24.8 s


Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp__,city,state,location_type,tax_returns_filed,population,total_wages,credit_card_due,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies
1358886,38450,19721022_2735,70460,55,24543,RENT,3.0,VENTURE,4000,13.92,0,2020-08-28 05:46:51.871000+00:00,2017-01-01 12:00:00+00:00,SLIDELL,LA,PRIMARY,8986,16334,315061217,1777,690650,46372,10439,5,1,2,1,0
1358815,38449,19550130_7970,74337,58,20000,RENT,0.0,EDUCATION,4000,9.99,0,2020-08-28 06:05:13.103000+00:00,2017-01-01 12:00:00+00:00,CHOUTEAU,OK,PRIMARY,1969,3716,59412230,1791,462670,19421,3583,8,7,1,0,2
1353348,38554,19970610_1743,58504,64,24000,RENT,1.0,MEDICAL,3000,6.99,0,2020-08-26 21:58:03.673000+00:00,2017-01-01 12:00:00+00:00,BISMARCK,ND,PRIMARY,11564,19832,469621263,5917,1780959,11835,27910,8,3,2,1,0


## Section 4. Create Model

### Data preparation

In [132]:
%%time
# Define categorical columns and the list of columns to remove for modeling purposes
categorical_features = [
    "person_home_ownership",
    "loan_intent",
    "location_type",
]

columns_to_drop = [
    "city",
    "state",
    "event_timestamp",
    "created_timestamp__",
    "loan_id",
    "loan_status",
    "dob_ssn",
    "zipcode",    
]
train_y = train_df.loan_status
train_X = train_df.drop(columns=columns_to_drop) # drop columns with no modeling role
train_X = train_X.reindex(sorted(train_X.columns), axis=1) # sort columns to keep the same order
for col in categorical_features: # "category" dtype of categorical columns
    train_X[col] = train_X[col].astype("category")
one_hot_encoder = ce.OneHotEncoder(cols=categorical_features) # create a categorical values transformer
train_X = one_hot_encoder.fit_transform(train_X) # Tit and apply a tranformer to the training dataset

CPU times: user 93.7 ms, sys: 108 ms, total: 201 ms
Wall time: 200 ms


In [133]:
print(f"train_X df size: {train_X.shape[0]} rows, {train_X.shape[1]} cols")
train_X.head(3)

train_X df size: 28638 rows, 28 cols


Unnamed: 0,bankruptcies,credit_card_due,hard_pulls,loan_amnt,loan_int_rate,loan_intent_1,loan_intent_2,loan_intent_3,loan_intent_4,loan_intent_5,loan_intent_6,location_type_1,missed_payments_1y,missed_payments_2y,missed_payments_6m,mortgage_due,person_age,person_emp_length,person_home_ownership_1,person_home_ownership_2,person_home_ownership_3,person_home_ownership_4,person_income,population,student_loan_due,tax_returns_filed,total_wages,vehicle_loan_due
1358886,0,1777,5,4000,13.92,1,0,0,0,0,0,1,2,1,1,690650,55,3.0,1,0,0,0,24543,16334,46372,8986,315061217,10439
1358815,2,1791,8,4000,9.99,0,1,0,0,0,0,1,1,7,0,462670,58,0.0,1,0,0,0,20000,3716,19421,1969,59412230,3583
1353348,0,5917,8,3000,6.99,0,0,1,0,0,0,1,2,3,1,1780959,64,1.0,1,0,0,0,24000,19832,11835,11564,469621263,27910


In [134]:
%%time
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, random_state=42) # split the dataset
dtrain = xgb.DMatrix(data=X_train, label=y_train) # DMatrix for XGBoost training
dtest = xgb.DMatrix(data=X_test, label=y_test) # DMatrix for XGBoost test

CPU times: user 2.98 s, sys: 751 ms, total: 3.73 s
Wall time: 423 ms


### Train an XGBoost model

In [135]:
%%time
# Set xgboost params
param = {
    'booster' : 'dart',
    'learning_rate': 0.1,
    'verbosity': 1,
    'max_depth': 7,  # the maximum depth of each tree
    'objective': 'binary:hinge',  # error evaluation for binary class training
    'eval_metric': ['logloss', 'error'],
    'rate_drop': 0.1,
    'skip_drop': 0.5,
}
num_rounds = 100  # the number of training iterations
model = xgb.train(param, dtrain, num_rounds) # train model
preds = model.predict(dtest)
print(f"ROC-AUC score: {roc_auc_score(y_test, preds, average='weighted')}") # Evaluate model's performance

ROC-AUC score: 0.8745599015164233
CPU times: user 45.6 s, sys: 2.72 s, total: 48.3 s
Wall time: 3.81 s


## Section 5. Make predictions using the online feature store

In [175]:
# Using loan id numbers, grab the zipcode and dob_ssn entities to make a prediction
infer_vectors = []
for loan_id in [10000, 38637]:
    loan_record = loans_df[loans_df.loan_id == loan_id]
    infer_vectors.append(loan_record)
    display(loan_record)

Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp
0,10000,19530219_5179,76104,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,2021-08-25 20:34:41.361000+00:00,2021-08-25 20:34:41.361000+00:00


Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp
28637,38637,19960703_3449,69033,66,42000,RENT,2.0,MEDICAL,6475,9.99,0,2020-08-25 20:34:41.361000+00:00,2020-08-25 20:34:41.361000+00:00


In [172]:
for vec in infer_vectors:
    vec = vec.iloc[0].to_dict()
    print(f"\n>>>> Build feature vector for loan_id {vec['loan_id']}")
    zipcode=vec['zipcode']
    dob_ssn=vec['dob_ssn']
    # Get the feature vector from the online store to enrich the inference vector
    feat_vec = fs.get_online_features(entity_rows=[{"zipcode": vec['zipcode'], "dob_ssn": vec['dob_ssn']}],
                          features=feast_features).to_dict()
    vec.update(feat_vec)
    inf_vec = pd.DataFrame.from_dict(vec)
    display(inf_vec) # show the complet inference vector
    
    # Apply transformations to inference vector
    for col in categorical_features: # "category" dtype of categorical columns
        inf_vec[col] = inf_vec[col].astype("category")
    to_drop = columns_to_drop.copy()
    to_drop = list(map(lambda x: x.replace('created_timestamp__', 'created_timestamp'), to_drop)) # Fix a weird column rename made by Feast
    inf_vec = inf_vec.drop(columns=to_drop) # Drop columns with no modeling role
    inf_vec = inf_vec.reindex(sorted(inf_vec.columns), axis=1) # Sort columns to keep the same order
    inf_vec = one_hot_encoder.transform(inf_vec) # Encode categorical values. We need to use the fitted transformer
    
    
    data = xgb.DMatrix(inf_vec)
    prediction = model.predict(data)
    print(f"\nPrediction for loan_id {vec['loan_id']}: {'approved' if prediction else 'denied'} <<<< \n")


>>>> Build feat vector for loan_id 10000


Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp,population,state,tax_returns_filed,city,total_wages,location_type,missed_payments_2y,bankruptcies,missed_payments_1y,mortgage_due,credit_card_due,hard_pulls,student_loan_due,vehicle_loan_due,missed_payments_6m
0,10000,19530219_5179,76104,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,2021-08-25 20:34:41.361000+00:00,2021-08-25 20:34:41.361000+00:00,10534,TX,6058,FORT WORTH,142325465,PRIMARY,1,0,0,91803,8419,0,22328,15078,0



Prediction for loan_id 10000: approved <<<< 


>>>> Build feat vector for loan_id 38637


Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp,population,state,tax_returns_filed,city,total_wages,location_type,missed_payments_2y,bankruptcies,missed_payments_1y,mortgage_due,credit_card_due,hard_pulls,student_loan_due,vehicle_loan_due,missed_payments_6m
0,38637,19960703_3449,69033,66,42000,RENT,2.0,MEDICAL,6475,9.99,0,2020-08-25 20:34:41.361000+00:00,2020-08-25 20:34:41.361000+00:00,2141,NE,1171,IMPERIAL,32596076,PRIMARY,4,0,0,1197324,1928,1,24208,4691,1



Prediction for loan_id 38637: denied <<<< 

