In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from datetime import datetime
import numpy as np

chosen_datetime = '2023-02-20'

data = load_iris()
data.feature_names = ['sepal_length_cm',
 'sepal_width_cm',
 'petal_length_cm',
 'petal_width_cm']
df = pd.DataFrame(data= np.c_[data['data'], data['target']],
                     columns=data['feature_names'] + ['target'])
df['event_timestamp'] = [datetime.fromisoformat(chosen_datetime) for _ in data.target]
df['iris_id'] = df.index.values
df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,target,event_timestamp,iris_id
0,5.1,3.5,1.4,0.2,0.0,2023-02-20,0
1,4.9,3.0,1.4,0.2,0.0,2023-02-20,1
2,4.7,3.2,1.3,0.2,0.0,2023-02-20,2
3,4.6,3.1,1.5,0.2,0.0,2023-02-20,3
4,5.0,3.6,1.4,0.2,0.0,2023-02-20,4


In [8]:
# init feast feature repository
# https://docs.feast.dev/reference/feast-cli-commands#init
!feast init -t local feature_repo


Creating a new Feast repository in [1m[32m/home/sbalawajder/projects/train/feast-mlflow/feast-mlflow-project/f[0m.



In [3]:
# save iris dataframe in feature repository in parquet format
df.to_parquet('feature_repo/feature_repo/data/iris_stats.parquet')

In [33]:
# overwrite example feast elements definition
import os 
with open("feature_repo/feature_repo/example_repo.py", "w") as my_frepo:
    my_frepo.write(f"""from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource, ValueType, FeatureService
from feast.types import Float32, Int64

iris_stats_source = FileSource(
    name = "iris_stats_source",
    path="{os.path.abspath(os.getcwd())}/feature_repo/feature_repo/data/iris_stats.parquet",
    timestamp_field="event_timestamp",
)

iris = Entity(name="iris", join_keys=["iris_id"])

# Our parquet files contain sample data that includes a driver_id column, timestamps and
# three feature column. Here we define a Feature View that will allow us to serve this
# data to our model online.
iris_stats_fv = FeatureView(
    name="iris_stats_fv",
    entities=[iris],  # reference entity by name
    ttl=timedelta(days=1),
    schema=[
        Field(name="sepal_length_cm", dtype=Float32),
        Field(name="sepal_width_cm", dtype=Float32),
        Field(name="petal_length_cm", dtype=Float32),
        Field(name="petal_width_cm", dtype=Float32),
        Field(name="target", dtype=Int64),
    ],
    online=True,
    source=iris_stats_source
)

iris_stats_fs = FeatureService(
    name="iris_stats_fs",
    features=[iris_stats_fv]
)

""")

In [4]:
# create/update feature store deployment
# https://docs.feast.dev/reference/feast-cli-commands#apply
!feast -c feature_repo/feature_repo apply

  schema = ParquetDataset(path).schema.to_arrow_schema()
[1m[94mNo changes to registry
[1m[94mNo changes to infrastructure


In [4]:
!feast -c feature_repo/feature_repo ui 

[32mINFO[0m:     Started server process [[36m1442[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8889[0m (Press CTRL+C to quit)
^C
[32mINFO[0m:     Shutting down


In [5]:
from feast import FeatureStore

store = FeatureStore(repo_path="feature_repo/feature_repo/.")

fv = store.get_feature_view("iris_stats_fv")

fv

<FeatureView(name = iris_stats_fv, entities = ['iris'], ttl = 1 day, 0:00:00, stream_source = None, batch_source = {
  "type": "BATCH_FILE",
  "timestampField": "event_timestamp",
  "fileOptions": {
    "uri": "/home/sbalawajder/projects/train/feast-mlflow/feast-mlflow-project/feature_repo/feature_repo/data/iris_stats.parquet"
  },
  "name": "iris_stats_source"
}, entity_columns = [iris_id-Int64], features = [sepal_length_cm-Float32, sepal_width_cm-Float32, petal_length_cm-Float32, petal_width_cm-Float32, target-Int64], description = , tags = {}, owner = , projection = FeatureViewProjection(name='iris_stats_fv', name_alias=None, desired_features=[], features=[sepal_length_cm-Float32, sepal_width_cm-Float32, petal_length_cm-Float32, petal_width_cm-Float32, target-Int64], join_key_map={}), created_timestamp = 2023-02-21 14:44:17.464122, last_updated_timestamp = 2023-02-21 14:44:17.464122, online = True, materialization_intervals = [])>

In [6]:
from feast_extra_functions import get_entity_df

training_df = store.get_historical_features(
    entity_df = get_entity_df(no_ids=150, dt = chosen_datetime, join_key = 'iris_id'),
    features=[
        "iris_stats_fv:sepal_length_cm",
        "iris_stats_fv:sepal_width_cm",
        "iris_stats_fv:petal_length_cm",
        "iris_stats_fv:petal_width_cm",
        "iris_stats_fv:target",
    ],
    full_feature_names = True,
).to_df()

training_df.head()

Unnamed: 0,iris_id,event_timestamp,iris_stats_fv__sepal_length_cm,iris_stats_fv__sepal_width_cm,iris_stats_fv__petal_length_cm,iris_stats_fv__petal_width_cm,iris_stats_fv__target
0,0,2023-02-20 00:00:00+00:00,5.1,3.5,1.4,0.2,0.0
1,96,2023-02-20 00:00:00+00:00,5.7,2.9,4.2,1.3,1.0
2,97,2023-02-20 00:00:00+00:00,6.2,2.9,4.3,1.3,1.0
3,98,2023-02-20 00:00:00+00:00,5.1,2.5,3.0,1.1,1.0
4,99,2023-02-20 00:00:00+00:00,5.7,2.8,4.1,1.3,1.0


In [9]:
import os
import warnings
import sys
import logging
import numpy as np
import importlib
import mlflow 
importlib.reload(mlflow)
import mlflow.sklearn
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LogisticRegression
from urllib.parse import urlparse

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [12]:
# get features from dataframe taken from offline_store

train_X = training_df[['iris_stats_fv__sepal_length_cm','iris_stats_fv__sepal_width_cm', 'iris_stats_fv__petal_length_cm', 'iris_stats_fv__petal_width_cm']].to_numpy()
train_Y = training_df['iris_stats_fv__target'].to_numpy()

assert(len(train_X) == len(train_Y))

# Split data in train & test
x_train, x_test, y_train, y_test = train_test_split(
    train_X,
    train_Y,
    test_size= 0.2,
    random_state= 1234
    )


In [14]:
# Define hiperparameter grid with ML algorithms
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10],
            'max_depth':[6,8,10], 
            'min_samples_split':[2,3,4,5],
            'min_samples_leaf':[2,3,4,5],
            'max_features': [2,3]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

# Grid search
scores = []
class_grid_fit_list = []

for model_name, mp in model_params.items():
    class_grid =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    class_grid_fit = class_grid.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': class_grid.best_score_,
        'best_params': class_grid.best_params_
        
    })
    class_grid_fit_list.append(class_grid_fit)
        
df_training_results = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_training_results

Unnamed: 0,model,best_score,best_params
0,svm,0.975,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.983333,"{'max_depth': 6, 'max_features': 3, 'min_sampl..."
2,logistic_regression,0.991667,{'C': 10}
