# Feature Store for Data Science and Machine Learning

In [90]:
# Start Google Meeting and Record!

## Setup

Use GCP Vertex AI Notebooks. If that doesn't work you can use Google Colab as well.

In [91]:
# The autoreload extension allows us to reload imported code without reloading the Jupyter lab.
# You can read more about it here: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [92]:
!conda install -q mamba -n base -c conda-forge -y

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [93]:
# There is no (officially supported) way to install feast using conda.
# This open issue asks for conda support: https://github.com/feast-dev/feast/issues/2748

!pip install -q feast

In [94]:
# We'll use only these direct dependencies.

!mamba install -q -c conda-forge numpy pandas scikit-learn kaggle auto-sklearn -y


                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (0.25.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████



In [95]:
!mamba install -q -c conda-forge black nb_black -y


                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (0.25.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████



In [7]:
# Formatting your code can be done automatically even in a notebook.
# Don't waste your time doing that yourself or wrose not doing it at all.

%load_ext lab_black

In [8]:
import json
import pickle
from pathlib import Path

import autosklearn
import numpy as np
import pandas as pd
from autosklearn.classification import AutoSklearnClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [9]:
# Some constants

DATA_DIR = "data"
FEATURE_STORE_DIR = "feature_store"
MODELS_DIR = "models"
N_CORES = 4
USABLE_MEMORY_PER_CORE = 1024 * 2
RANDOM_SEED = 420

In [19]:
# Upload your kaggle.json file to the current working directory.
# This script will place it in the right place and download the dataset.
# P.S. if you don't have kaggle.json, you can get it using these instructions: https://github.com/Kaggle/kaggle-api#api-credentials

!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c spaceship-titanic
!unzip spaceship-titanic.zip -d {DATA_DIR}

Downloading spaceship-titanic.zip to /home/jupyter
  0%|                                                | 0.00/299k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 299k/299k [00:00<00:00, 88.6MB/s]
Archive:  spaceship-titanic.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          


## EDA

In this section we will do the exploratory data analysis.
What is exploratory data analysis? 
If you'd like to learn more about EDA, I'd recommend reading "Making Sense of Data I: A Practical Guide to Exploratory Data Analysis and Data Mining, 2nd Edition" by Glenn J. Myatt and Wayne P. Johnson. Or the "Exploratory Data Analysis" by John Tukey.

In [20]:
all_data = pd.read_csv(Path(DATA_DIR) / "train.csv")

In [21]:
all_data.sample(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
3290,3535_01,Mars,False,,TRAPPIST-1e,36.0,False,1304.0,0.0,7.0,18.0,0.0,Clow Porki,False
4149,4432_01,Earth,True,G/728/P,TRAPPIST-1e,50.0,False,0.0,0.0,0.0,0.0,0.0,Robyna Loway,False
3937,4204_03,Earth,True,G/691/P,TRAPPIST-1e,12.0,False,0.0,0.0,0.0,0.0,0.0,Gailya Avisnydes,True
7810,8338_01,Mars,True,F/1716/P,TRAPPIST-1e,21.0,False,0.0,0.0,0.0,0.0,0.0,Choney Empie,True
5361,5726_01,Europa,True,B/192/P,55 Cancri e,27.0,False,0.0,0.0,,0.0,0.0,Bottom Dindeng,True
75,0082_01,Mars,False,F/16/P,TRAPPIST-1e,42.0,False,7406.0,0.0,0.0,0.0,0.0,Totse Datte,False
6842,7230_01,Earth,True,G/1177/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Trice Blancoy,True
3111,3353_05,Europa,False,C/125/S,55 Cancri e,19.0,False,0.0,4494.0,0.0,11.0,3460.0,Ankaan Reming,True
925,0992_01,Europa,False,D/35/S,TRAPPIST-1e,49.0,False,31.0,2727.0,0.0,184.0,135.0,Zino Kiling,True
2463,2645_01,Europa,False,C/95/S,TRAPPIST-1e,55.0,False,0.0,815.0,1558.0,3.0,1334.0,Alrais Fordulgaug,True


In [22]:
target_name = "Transported"

In [23]:
feature_names = list(
    filter(
        lambda x: x not in ["PassengerId", "Cabin", "Name", target_name],
        all_data.columns,
    )
)

In [24]:
# If you don't like filters and lambdas you can use list comprehensions

feature_names = [
    x
    for x in all_data.columns
    if x not in ["PassengerId", "Cabin", "Name", target_name]
]

In [25]:
feature_names

['HomePlanet',
 'CryoSleep',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck']

In [26]:
# train_test_split doesn't make a copy of the dataframe, but just points to the relevant sections of the original dataframe.
# We don't have too much data, so we can make a copy.
# It will be more convenient than overriding.

train_data, val_data = train_test_split(all_data.copy(), random_state=RANDOM_SEED)

In [27]:
train_data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

### Handling categorical features 

In [28]:
# Not all features are be useful for the model
# First let's select the categorical features that are useful

categorical_cols = ["HomePlanet", "Destination"]

In [29]:
# It's very important to remember the mappings.
# Otherwise it will be impossible to reverse the encodings.

category_mappings = {}

In [30]:
# pd.Categorical is the right tool to encode data.
# It shows the strings to the users and gives numerical data to the ML models.

for col in categorical_cols:
    train_data[col] = pd.Categorical(train_data[col])
    category_mappings[col] = dict(enumerate(train_data[col].cat.categories))

In [31]:
# And this is how to save this category_mappings for the future.

with open(Path(DATA_DIR) / "category_mappings.json", "w", encoding="utf-8") as file:
    json.dump(category_mappings, file, ensure_ascii=False, indent=4)

In [32]:
# Let's use the newly created mappings to convert the val_data

for col in categorical_cols:
    val_data[col] = pd.Categorical(
        val_data[col], categories=category_mappings[col].values()
    )

### Handling binary features 

In [33]:
binary_cols = ["CryoSleep", "VIP", "Transported"]

In [34]:
# We don't need to save any info about binary features.

for col in binary_cols:
    train_data[col] = train_data[col].astype(bool)
    val_data[col] = val_data[col].astype(bool)

In [35]:
# Our data should be ready for training.
# Let's take a look

train_data.dtypes

PassengerId       object
HomePlanet      category
CryoSleep           bool
Cabin             object
Destination     category
Age              float64
VIP                 bool
RoomService      float64
FoodCourt        float64
ShoppingMall     float64
Spa              float64
VRDeck           float64
Name              object
Transported         bool
dtype: object

In [36]:
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
2744,2944_04,Earth,False,G/469/S,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,Terta Nichoan,True
7013,7458_01,Earth,True,G/1211/S,TRAPPIST-1e,64.0,False,0.0,0.0,0.0,0.0,0.0,Fayey Vincenton,False
254,0282_01,Earth,False,F/54/S,TRAPPIST-1e,21.0,False,0.0,918.0,0.0,0.0,0.0,Valex Baketton,True
5689,6032_02,,False,E/397/S,TRAPPIST-1e,40.0,False,521.0,4.0,53.0,70.0,141.0,Lison Mcdowns,False
5849,6191_03,Europa,True,A/59/P,TRAPPIST-1e,45.0,False,0.0,0.0,0.0,0.0,0.0,Taurah Ametic,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,0290_03,Europa,True,B/7/S,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Dhenar Excialing,True
1209,1288_01,Earth,False,G/192/P,TRAPPIST-1e,24.0,False,621.0,0.0,0.0,0.0,0.0,Nance Flemaney,False
2675,2866_01,Europa,True,C/110/S,TRAPPIST-1e,36.0,True,0.0,0.0,0.0,0.0,0.0,Hadirk Wheededly,True
4671,4979_01,Earth,True,G/804/P,PSO J318.5-22,24.0,False,0.0,0.0,0.0,0.0,0.0,Arllia Mckinn,False


In [37]:
val_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
2276,2445_01,Earth,False,E/170/S,,43.0,False,602.0,3.0,27.0,89.0,69.0,Sanney Lerez,False
2155,2306_06,Europa,False,C/82/P,55 Cancri e,47.0,False,1320.0,428.0,4.0,24.0,0.0,Luxons Colensid,False
2940,3189_02,Mars,True,D/102/P,TRAPPIST-1e,25.0,False,1299.0,0.0,0.0,0.0,0.0,Womel Che,False
1731,1844_02,Earth,True,G/289/S,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,0.0,0.0,Garley Stannondez,True
4217,4493_01,Earth,False,F/845/S,TRAPPIST-1e,26.0,False,42.0,65.0,895.0,407.0,0.0,Sus Coolez,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7920,8458_04,Europa,True,C/319/S,TRAPPIST-1e,54.0,False,0.0,0.0,0.0,0.0,0.0,Tachib Sempreate,True
4504,4787_01,Earth,False,G/780/S,TRAPPIST-1e,19.0,False,2.0,0.0,,9.0,754.0,Milyla Lancock,False
8536,9115_01,Earth,False,F/1867/P,PSO J318.5-22,53.0,False,1215.0,0.0,0.0,0.0,0.0,Chesty Wolffy,False
2871,3102_01,Europa,True,B/101/P,55 Cancri e,30.0,False,0.0,0.0,0.0,0.0,0.0,Tachba Mirinanty,True


## Creating the model

In [38]:
# Task 1: get the highest F1 macro score.
# You only have 5 minutes to do that.

In [39]:
model = AutoSklearnClassifier(
    time_left_for_this_task=60 * 2,
    memory_limit=USABLE_MEMORY_PER_CORE,
    n_jobs=N_CORES,
    metric=autosklearn.metrics.roc_auc,
)

In [None]:
model.fit(
    train_data[feature_names],
    train_data[target_name],
)

In [None]:
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
with open(Path(MODELS_DIR) / "model1.pkl", "wb") as file:
    pickle.dump(model, file)

In [None]:
with open(Path(MODELS_DIR) / "model1.pkl", "rb") as file:
    model = pickle.load(file)

In [None]:
pred_data = model.predict(val_data[feature_names], n_jobs=-1)

In [58]:
print(classification_report(val_data[[target_name]], pred_data))

              precision    recall  f1-score   support

       False       0.85      0.71      0.77      1087
        True       0.75      0.88      0.81      1087

    accuracy                           0.79      2174
   macro avg       0.80      0.79      0.79      2174
weighted avg       0.80      0.79      0.79      2174



In [59]:
# A reasonable expected performance

#               precision    recall  f1-score   support

#        False       0.83      0.75      0.79      1065
#         True       0.78      0.85      0.82      1109

#     accuracy                           0.80      2174
#    macro avg       0.81      0.80      0.80      2174
# weighted avg       0.81      0.80      0.80      2174

## Feature Store

Feature store allows you to store and later get features.
Most importantly it allows you to get features for both training and (real-time) inference.

### Which features should stored be in the feature store?

Derived features. Doing aggregations and other transformations 

### What are the limitations of Vinted's feature store?

1-24 hour delay between when the event took place and the feature depending on that event is available to use (for now). This is because of how our data warehouse jobs are scheduled. There are 1 or 24 hour jobs.

Feature engineering has to be done in the data warehouse jobs.

In [60]:
# Q: what value does the feature store bring to you?
# A1: real-time derived features
# A2: prepared shareable features for training and inference
# A3: feature self-service - you won't depend on the backend engineers to create and send you features

In [61]:
# Let's make sure that the feature store directory is free and initialize the feature store.

!rm -rf {FEATURE_STORE_DIR}
!feast init {FEATURE_STORE_DIR}


Creating a new Feast repository in [1m[32m/home/jupyter/feature_store[0m.



In [62]:
# Feast doesn't support CSV files, so we need to convert our CSV files to Parquet
# https://github.com/feast-dev/feast/issues/2563

In [63]:
# In addition, Parquet supports categorical encoding

In [64]:
# We will only store some data in the feature store.
# This will replicate a typical work scenario, where only some of the features are available in the feature store.

In [65]:
fs_feature_names = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "Age",
    "VIP",
    "Transported",
]
request_feature_names = [name for name in feature_names if name not in fs_feature_names]

In [66]:
request_feature_names

['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [67]:
data_for_feast = all_data.copy()

for col in categorical_cols:
    data_for_feast[col] = pd.Categorical(
        data_for_feast[col], categories=category_mappings[col].values()
    )

for col in binary_cols:
    data_for_feast[col] = data_for_feast[col].astype(bool)

In [68]:
# Feast also needs an event timestamp column named "event_timestamp" to keep track of the updates
# https://github.com/feast-dev/feast/issues/2257

In [69]:
data_for_feast["event_timestamp"] = np.datetime64("2022-04-20")
data_for_feast.drop(request_feature_names, axis=1, inplace=True)
data_for_feast.to_parquet("/home/jupyter/data/train.parquet")
del data_for_feast

In [70]:
Replace the contents of the feature_store/example.py with the contents of example.py from the workshop repo

SyntaxError: invalid syntax (3934617454.py, line 1)

[ERROR] [2022-07-27 18:50:25,846:root] Cannot parse: 1:8: Replace the contents of the feature_store/example.py with the contents of example.py from the workshop repo
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/lab_black.py", line 218, in format_cell
    formatted_code = _format_code(cell)
  File "/opt/conda/lib/python3.7/site-packages/lab_black.py", line 29, in _format_code
    return format_str(src_contents=code, mode=FileMode())
  File "/opt/conda/lib/python3.7/site-packages/black/__init__.py", line 1163, in format_str
    dst_contents = _format_str_once(src_contents, mode=mode)
  File "/opt/conda/lib/python3.7/site-packages/black/__init__.py", line 1173, in _format_str_once
    src_node = lib2to3_parse(src_contents.lstrip(), mode.target_versions)
  File "/opt/conda/lib/python3.7/site-packages/black/parsing.py", line 128, in lib2to3_parse
    raise exc from None
black.parsing.InvalidInput: Cannot parse: 1:8: Replace the contents of the feature_st

In [71]:
# Feast tries to read all Python files in the directory.
# Since you've opened the feature store dict Jupyter created ".ipynb_checkpoints" folder in there.
# It will cause issues, so you need to remove it by running:

!rm -rf feature_store/.ipynb_checkpoints/

In [72]:
# feast apply will create (or update) a feature store deployment

!cd feature_store && feast apply

Created entity [1m[32mpassenger[0m
Created feature view [1m[32mspace_titanic[0m
Created feature service [1m[32mspace_titanic[0m

Created sqlite table [1m[32mfeature_store_space_titanic[0m



In [73]:
from feast import FeatureStore

In [74]:
# We will assume that "partial_data" will contain features that are NOT going to be in the feature store
# Q: Where will they come from?
# A: The request to your model.

partial_data = all_data[["PassengerId", *request_feature_names]].copy()

In [75]:
partial_data["event_timestamp"] = np.datetime64("2022-04-20")

In [76]:
store = FeatureStore(repo_path=FEATURE_STORE_DIR)

In [77]:
# Here we merge two datasets, which we will use for training.
# "combined_data" is data coming from data warehouse, or any other source
# Then we are merging it with data from the feature store using "PassengerId" as key

combined_data = store.get_historical_features(
    entity_df=partial_data,
    features=[f"space_titanic:{name}" for name in fs_feature_names],
).to_df()

In [78]:
# Let's check that the categorical features still have the right data types.
# Namely some of the features have to be categorical and bool.

combined_data.dtypes

PassengerId                     object
RoomService                    float64
FoodCourt                      float64
ShoppingMall                   float64
Spa                            float64
VRDeck                         float64
event_timestamp    datetime64[ns, UTC]
HomePlanet                    category
CryoSleep                         bool
Destination                   category
Age                            float64
VIP                               bool
Transported                       bool
dtype: object

In [79]:
train_data, val_data = train_test_split(combined_data, random_state=RANDOM_SEED)

In [80]:
# Don't write large classes in Jupyter, it is not the right tool for the job.
# Just write it as a module and load it in Jupyter.

from not_a_real_trainer import NotARealTrainer

In [82]:
# Martynas: to use the feature store in real-life you'll have to do the model schema changes.

In [83]:
trainer = NotARealTrainer(
    name="model2",
    work_dir=Path(MODELS_DIR),
    memory=USABLE_MEMORY_PER_CORE,
    time_limit_in_seconds=60 * 2,
    fs_feature_names=fs_feature_names,
    metric=autosklearn.metrics.roc_auc,
)

In [84]:
trainer.fit((train_data[feature_names], train_data[target_name]))

In [85]:
# What is materialization? It just means load your features to the online feature store to make them available for inference.

!cd {FEATURE_STORE_DIR} && feast materialize-incremental $(date -u +"%Y-%m-%dT%H:%M:%S")

Materializing [1m[32m1[0m feature views to [1m[32m2022-07-27 18:53:44+00:00[0m into the [1m[32msqlite[0m online store.

Since the ttl is 0 for feature view [1m[32mspace_titanic[0m, the start date will be set to 1 year before the current time.
[1m[32mspace_titanic[0m from [1m[32m2021-07-28 18:53:45+00:00[0m to [1m[32m2022-07-27 18:53:44+00:00[0m:
100%|█████████████████████████████████████████████████████████| 8693/8693 [00:01<00:00, 4995.26it/s]


In [88]:
# Let's simulate how we'd do a real inference by calling our model repeatedly with a batch of data.

%%time
step = 1000
predictions = []
for i in range(0, len(val_data), step):
    predictions.append(
        trainer.predict(store,
            val_data[["PassengerId", *request_feature_names]].iloc[i : i + step]
        )
    )
predictions = np.concatenate(predictions)

CPU times: user 6.37 s, sys: 416 ms, total: 6.78 s
Wall time: 9.01 s


In [89]:
# We should get similar results as before.

print(classification_report(val_data[[target_name]], predictions))

              precision    recall  f1-score   support

       False       0.79      0.79      0.79      1040
        True       0.80      0.81      0.81      1134

    accuracy                           0.80      2174
   macro avg       0.80      0.80      0.80      2174
weighted avg       0.80      0.80      0.80      2174



In [None]:
# You can read more about how to use the Feature Store for your projects here: https://github.com/vinted/vmip-docs