# Feature Store for Data Science and Machine Learning

In [None]:
# Start Google Meeting and Record!

## Setup

Use GCP Vertex AI Notebooks. If that doesn't work you can use Google Colab as well.

In [None]:
!conda install -q mamba -n base -c conda-forge -y

In [None]:
# There is no (officially supported) way to install feast using conda.
# This open issue asks for conda support: https://github.com/feast-dev/feast/issues/2748

!pip install -q feast

In [None]:
# We'll use only these direct dependencies.

!mamba install -q -c conda-forge numpy pandas scikit-learn kaggle auto-sklearn -y

In [None]:
!mamba install -q -c conda-forge black nb_black -y

In [None]:
# Formatting your code can be done automatically even in a notebook.
# Don't waste your time doing that yourself or, worse, not doing it at all.
# lab_black is the JupyterLab extension shipped with the nb_black package installed above.

%load_ext lab_black

In [None]:
# Sometimes there is some weird issue with autosklearn
# Just restart the kernel and it should be fine

import json
import pickle
from pathlib import Path

import autosklearn
import numpy as np
import pandas as pd
from autosklearn.classification import AutoSklearnClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
# The autoreload extension allows us to reload imported code without reloading the Jupyter lab.
# You can read more about it here: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
# Mode 2 reloads all imported modules before each cell is executed.

%load_ext autoreload
%autoreload 2

In [None]:
# Some constants

DATA_DIR = "data"  # the Kaggle archive is unpacked here; category_mappings.json is written here too
FEATURE_STORE_DIR = "feature_store"  # Feast repository created by `feast init`
MODELS_DIR = "models"  # holds model1.pkl and is the trainer's work_dir
N_CORES = 4  # passed as n_jobs to AutoSklearnClassifier
USABLE_MEMORY_PER_CORE = 1024 * 2  # passed as memory_limit; presumably MB (2 GB) — confirm in the auto-sklearn docs
RANDOM_SEED = 420  # random_state of both train_test_split calls

In [None]:
# Go to this website and accept the rules of the competition: https://www.kaggle.com/competitions/spaceship-titanic/rules

In [None]:
# Upload your kaggle.json file to the current working directory.
# This script will place it in the right place and download the dataset.
# P.S. if you don't have kaggle.json, you can get it using these instructions: https://github.com/Kaggle/kaggle-api#api-credentials

!rm -rf ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c spaceship-titanic
!unzip spaceship-titanic.zip -d {DATA_DIR}

## EDA

In this section we will do the exploratory data analysis.
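What is exploratory data analysis? It is the practice of inspecting a dataset (its columns, data types, distributions and missing values) before modelling, so that you know what you are working with.
If you'd like to learn more about EDA, I'd recommend reading "Making Sense of Data I: A Practical Guide to Exploratory Data Analysis and Data Mining, 2nd Edition" by Glenn J. Myatt and Wayne P. Johnson, or "Exploratory Data Analysis" by John Tukey.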

In [None]:
all_data = pd.read_csv(Path(DATA_DIR) / "train.csv")

In [None]:
all_data.sample(10)

In [None]:
target_name = "Transported"

In [None]:
feature_names = list(
    filter(
        lambda x: x not in ["PassengerId", "Cabin", "Name", target_name],
        all_data.columns,
    )
)

In [None]:
# If you don't like filters and lambdas you can use list comprehensions

feature_names = [
    x
    for x in all_data.columns
    if x not in ["PassengerId", "Cabin", "Name", target_name]
]

In [None]:
# Just making sure that it worked

feature_names

In [None]:
# train_test_split doesn't make a copy of the dataframe, but just points to the relevant sections of the original dataframe.
# We don't have too much data, so we can make a copy.
# It will be more convenient than overriding.

train_data, val_data = train_test_split(all_data.copy(), random_state=RANDOM_SEED)

In [None]:
# Let's inspect the data types of our data

train_data.dtypes

### Handling categorical features 

In [None]:
# Not all features are be useful for the model.
# First let's select the categorical features that are useful.

categorical_cols = ["HomePlanet", "Destination"]

In [None]:
# It's very important to remember the mappings.
# Otherwise it will be impossible to reverse the encodings.

category_mappings = {}

In [None]:
# pd.Categorical is the right tool to encode data.
# It shows the strings to the users and gives numerical data to the ML models.

for col in categorical_cols:
    train_data[col] = pd.Categorical(train_data[col])
    category_mappings[col] = dict(enumerate(train_data[col].cat.categories))

In [None]:
# And this is how to save this category_mappings for the future.
# JSON object keys are always strings, so the integer codes are written as "0", "1", ...

mappings_path = Path(DATA_DIR) / "category_mappings.json"
serialized = json.dumps(category_mappings, ensure_ascii=False, indent=4)
with mappings_path.open("w", encoding="utf-8") as file:
    file.write(serialized)

In [None]:
# Let's use the newly created mappings to convert the val_data.
# Labels that are not among the categories learned from train_data become NaN.

for col in categorical_cols:
    known_labels = list(category_mappings[col].values())
    val_data[col] = pd.Categorical(val_data[col], categories=known_labels)

### Handling binary features 

In [None]:
binary_cols = ["CryoSleep", "VIP", "Transported"]

In [None]:
# We don't need to save any info about binary features.

for col in binary_cols:
    train_data[col] = train_data[col].astype(bool)
    val_data[col] = val_data[col].astype(bool)

In [None]:
# Our data should be ready for training.
# Let's take a look.

train_data.dtypes

In [None]:
train_data

In [None]:
val_data

## Creating the model

In [None]:
# Task 1: get the highest F1 macro score.
# You only have 5 minutes to do that.

In [None]:
# Budget and resources for the AutoML search:
#   time_left_for_this_task - overall time budget in seconds (2 minutes here)
#   memory_limit            - memory cap; presumably MB per job — confirm in the auto-sklearn docs
#   n_jobs                  - number of parallel jobs
#   metric                  - score optimised by the search (ROC AUC here)
model = AutoSklearnClassifier(
    time_left_for_this_task=60 * 2,
    memory_limit=USABLE_MEMORY_PER_CORE,
    n_jobs=N_CORES,
    metric=autosklearn.metrics.roc_auc,
)

In [None]:
# Fit on the training split only; val_data is kept aside for the evaluation below.
model.fit(
    train_data[feature_names],
    train_data[target_name],
)

In [None]:
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
with open(Path(MODELS_DIR) / "model1.pkl", "wb") as file:
    pickle.dump(model, file)

In [None]:
with open(Path(MODELS_DIR) / "model1.pkl", "rb") as file:
    model = pickle.load(file)

In [None]:
pred_data = model.predict(val_data[feature_names], n_jobs=-1)

In [None]:
print(classification_report(val_data[[target_name]], pred_data))

In [None]:
# A reasonable expected performance

#               precision    recall  f1-score   support

#        False       0.83      0.75      0.79      1065
#         True       0.78      0.85      0.82      1109

#     accuracy                           0.80      2174
#    macro avg       0.81      0.80      0.80      2174
# weighted avg       0.81      0.80      0.80      2174

## Feature Store

Feature store allows you to store and later retrieve features.
Most importantly it allows you to get features for both training and (real-time) inference.

### Which features should be stored in the feature store?

Derived features. Doing aggregations and other transformations in backend is difficult or sometimes even impossible. Feature store is the perfect place for these types of features.

Common or shareable features. Put any features that are going to be useful for many projects or for future iterations of your project.

### What are the limitations of Vinted's feature store?

There is (for now) a 1-24 hour delay between the moment an event takes place and the moment a feature depending on that event becomes available. This is because of how our data warehouse jobs are scheduled: they run either every hour or every 24 hours.

Feature engineering has to be done in the data warehouse jobs.

In [None]:
# Q: what value does the feature store bring to you?
# A1: real-time derived features
# A2: prepared shareable features for training and inference
# A3: feature self-service - you won't depend on the backend engineers to create and send you features

In [None]:
# Let's make sure that the feature store directory is free and initialize the feature store.
# Careful: rm -rf deletes everything already in that directory, so edits made there are lost on re-run.

!rm -rf {FEATURE_STORE_DIR}
!feast init {FEATURE_STORE_DIR}

In [None]:
# Feast doesn't support CSV files, so we need to convert our CSV files to Parquet
# https://github.com/feast-dev/feast/issues/2563

In [None]:
# In addition, Parquet supports categorical encoding.

In [None]:
# We will only store some data in the feature store.
# This will replicate a typical work scenario, where only some of the features are available in the feature store.

In [None]:
# Columns served by the feature store. "Transported" (the target) is one of them,
# so the label is also fetched from the feature store when the training frame is assembled.
fs_feature_names = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "Age",
    "VIP",
    "Transported",
]
# Every remaining feature has to arrive with the request.
stored_in_fs = set(fs_feature_names)
request_feature_names = [name for name in feature_names if name not in stored_in_fs]

In [None]:
request_feature_names

In [None]:
# Build the frame that goes into the feature store from the full dataset,
# applying the same encodings that were used for train_data / val_data.
data_for_feast = all_data.copy()

# Reuse the categories learned from train_data; labels outside of them become NaN.
for col in categorical_cols:
    data_for_feast[col] = pd.Categorical(
        data_for_feast[col], categories=list(category_mappings[col].values())
    )

# Fill missing flags before the cast: astype(bool) on its own turns NaN into True.
# Here a missing flag is treated as False.
for col in binary_cols:
    data_for_feast[col] = data_for_feast[col].fillna(False).astype(bool)

In [None]:
# Feast also needs an event timestamp column named "event_timestamp" to keep track of the updates
# https://github.com/feast-dev/feast/issues/2257

In [None]:
# Every row gets the same event timestamp; partial_data uses the identical value for the lookup.
data_for_feast["event_timestamp"] = np.datetime64("2022-04-20")
# Columns that arrive with the request are not written to the feature store.
data_for_feast.drop(request_feature_names, axis=1, inplace=True)
# NOTE(review): absolute path, unlike the relative DATA_DIR used elsewhere — presumably it has to
# match the source path configured in feature_store/example.py; confirm before changing it.
data_for_feast.to_parquet("/home/jupyter/data/train.parquet")
# The frame is no longer needed once it is on disk.
del data_for_feast

In [None]:
# Manual step: replace the contents of feature_store/example.py with the contents of example.py from the workshop repo.

In [None]:
# Feast tries to read all Python files in the directory.
# Since you've opened the feature store dict Jupyter created ".ipynb_checkpoints" folder in there.
# It will cause issues, so you need to remove it by running:

!rm -rf feature_store/.ipynb_checkpoints/

In [None]:
# feast apply will create (or update) a feature store deployment

!cd feature_store && feast apply

In [None]:
from feast import FeatureStore

In [None]:
# We will assume that "partial_data" will contain features that are NOT going to be in the feature store
# Q: Where will they come from?
# A: The request to your model.

partial_data = all_data[["PassengerId", *request_feature_names]].copy()

In [None]:
partial_data["event_timestamp"] = np.datetime64("2022-04-20")

In [None]:
store = FeatureStore(repo_path=FEATURE_STORE_DIR)

In [None]:
# Here we merge two datasets, which we will use for training.
# "partial_data" stands in for data coming from the data warehouse, or any other source.
# get_historical_features joins it with data from the feature store (presumably on "PassengerId" —
# the entity key is defined in example.py, not here); the result is "combined_data".

combined_data = store.get_historical_features(
    entity_df=partial_data,
    features=[f"space_titanic:{name}" for name in fs_feature_names],
).to_df()

In [None]:
# Let's check that the categorical features still have the right data types.
# Namely some of the features have to be categorical and bool.

combined_data.dtypes

In [None]:
# Re-split with the same seed; this rebinds train_data / val_data from the first part of the notebook.
# NOTE(review): the rows only match the first split if combined_data keeps the row order of
# all_data — confirm before comparing the two classification reports closely.
train_data, val_data = train_test_split(combined_data, random_state=RANDOM_SEED)

In [None]:
# Don't write large classes in Jupyter, it is not the right tool for the job.
# Just write it as a module and load it in Jupyter.

from not_a_real_trainer import NotARealTrainer

In [None]:
# Martynas: to use the feature store in real-life you'll have to do the model schema changes.

In [None]:
# NotARealTrainer comes from the local not_a_real_trainer module.
# It gets the same memory limit, time limit and metric as model1, plus the list of
# features that live in the feature store.
trainer = NotARealTrainer(
    name="model2",
    work_dir=Path(MODELS_DIR),
    memory=USABLE_MEMORY_PER_CORE,
    time_limit_in_seconds=60 * 2,
    fs_feature_names=fs_feature_names,
    metric=autosklearn.metrics.roc_auc,
)

In [None]:
# fit takes a single (features, target) tuple.
trainer.fit((train_data[feature_names], train_data[target_name]))

In [None]:
# What is materialization? It just means load your features to the online feature store to make them available for inference.
# The argument is the current UTC time, produced by the shell's `date -u` in ISO-like format.

!cd {FEATURE_STORE_DIR} && feast materialize-incremental $(date -u +"%Y-%m-%dT%H:%M:%S")

In [None]:
# Let's simulate how we'd do a real inference by calling our model repeatedly with a batch of data.

%%time
step = 1000
predictions = []
for i in range(0, len(val_data), step):
    predictions.append(
        trainer.predict(store,
            val_data[["PassengerId", *request_feature_names]].iloc[i : i + step]
        )
    )
predictions = np.concatenate(predictions)

In [None]:
# We should get similar results as before.
# y_true is passed as a 1-D Series (single brackets) instead of a one-column DataFrame.

print(classification_report(val_data[target_name], predictions))

In [None]:
# You can read more about how to use the Feature Store for your projects here: https://github.com/vinted/vmip-docs