### Import general Python libraries, defined classes, and modules

In [1]:
# 3rd party imports
import category_encoders as ce
import feast
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
from sklearn.neural_network import MLPClassifier

### Setup the feature store

In [2]:
# Feature repository location and the entity (driver orders) table.
# NOTE(review): REPO_PATH is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
REPO_PATH = "/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_1/feature_repo"
DATA_SOURCE = './data/driver_orders.csv'

# The orders file is tab-separated even though it carries a .csv extension.
orders = pd.read_csv(DATA_SOURCE, sep="\t")
n_rows, n_cols = orders.shape
print(f"Driver orders table has {n_rows} rows and {n_cols} columns")

# Convert the timestamp column to datetimes before it is handed to Feast as
# the entity dataframe.
orders = orders.assign(event_timestamp=pd.to_datetime(orders["event_timestamp"]))
orders.head(10)

Driver orders table has 10 rows and 3 columns


Unnamed: 0,event_timestamp,driver_id,trip_completed
0,2021-04-16 20:29:28+00:00,1001,1
1,2021-04-17 04:29:28+00:00,1002,0
2,2021-04-17 12:29:28+00:00,1003,0
3,2021-04-17 20:29:28+00:00,1001,1
4,2021-04-18 04:29:28+00:00,1002,0
5,2021-04-18 12:29:28+00:00,1003,0
6,2021-04-18 20:29:28+00:00,1001,1
7,2021-04-19 04:29:28+00:00,1002,0
8,2021-04-19 12:29:28+00:00,1003,0
9,2021-04-19 20:29:28+00:00,1004,1


In [3]:
# Open the Feast repository and look up the feature service used for training.
# REPO_PATH repeats the value assigned in the previous cell so that this cell
# can also be run on its own.
REPO_PATH = "/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_1/feature_repo"
FEATURE_SERVICE_NAME = "driver_ranking_fv_svc"

fs = feast.FeatureStore(repo_path=REPO_PATH)
feature_service = fs.get_feature_service(FEATURE_SERVICE_NAME)

# List every feature view exposed by the service together with its features.
for projection in feature_service.feature_view_projections:
    print(f"Feature view: {projection.name}")
    for feature in projection.features:
        print(f"\tFeature: {feature.name}, type: {feature.dtype}")

Feature view: driver_hourly_stats
	Feature: conv_rate, type: Float32
	Feature: acc_rate, type: Float32
	Feature: avg_daily_trips, type: Int64


In [4]:
# Join the feature service's features onto the orders table (the entity
# dataframe) and materialise the result as a pandas DataFrame.
historical_job = fs.get_historical_features(
    features=feature_service,
    entity_df=orders,
)
training_df = historical_job.to_df()
training_df

Unnamed: 0,event_timestamp,driver_id,trip_completed,conv_rate,acc_rate,avg_daily_trips
360,2021-04-16 20:29:28+00:00,1001,1,0.175219,0.761434,385
721,2021-04-17 04:29:28+00:00,1002,0,0.312347,0.481786,810
1082,2021-04-17 12:29:28+00:00,1003,0,0.736727,0.936667,939
1445,2021-04-17 20:29:28+00:00,1001,1,0.175219,0.761434,385
1806,2021-04-18 04:29:28+00:00,1002,0,0.312347,0.481786,810
2167,2021-04-18 12:29:28+00:00,1003,0,0.736727,0.936667,939
2530,2021-04-18 20:29:28+00:00,1001,1,0.175219,0.761434,385
2891,2021-04-19 04:29:28+00:00,1002,0,0.312347,0.481786,810
3252,2021-04-19 12:29:28+00:00,1003,0,0.736727,0.936667,939
3615,2021-04-19 20:29:28+00:00,1004,1,0.094609,0.151163,166


In [5]:
""# Transform the driver_id column into a dummy variable
# Separate the label from the predictors; the event timestamp is dropped as
# well, so it is never fed to the model.
target = "trip_completed"
train_y = training_df.loc[:, target]
predictor_columns = training_df.columns.drop([target, "event_timestamp"])
train_X = training_df[predictor_columns]

# One-hot encode driver_id. The fitted encoder is kept in `cat_encoder` so the
# same mapping can be applied to other frames later.
cat_encoder = ce.OneHotEncoder(cols=['driver_id'])
train_X = cat_encoder.fit_transform(train_X)

# Alphabetical column order gives a deterministic feature layout.
train_X = train_X[sorted(train_X.columns)]
train_X

Unnamed: 0,acc_rate,avg_daily_trips,conv_rate,driver_id_1,driver_id_2,driver_id_3,driver_id_4
360,0.761434,385,0.175219,1,0,0,0
721,0.481786,810,0.312347,0,1,0,0
1082,0.936667,939,0.736727,0,0,1,0
1445,0.761434,385,0.175219,1,0,0,0
1806,0.481786,810,0.312347,0,1,0,0
2167,0.936667,939,0.736727,0,0,1,0
2530,0.761434,385,0.175219,1,0,0,0
2891,0.481786,810,0.312347,0,1,0,0
3252,0.936667,939,0.736727,0,0,1,0
3615,0.151163,166,0.094609,0,0,0,1


"### Train the model
 * Use the `DriverRankingTrainingModel` class, which fetches training data from the Feast data source
 * Use `mlflow.autolog` to automatically log the parameters and computed metrics during training
 * Use `mlflow` Fluent APIs to log the Feast training set

### Define MLPClassifier tuning parameters and run some experiments

In [6]:
# Point MLflow at the local SQLite database (tracking store / model registry)
# and enable scikit-learn autologging for the fits below.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.sklearn.autolog()

# Fixed seed: MLPClassifier initialises its weights (and samples mini-batches)
# randomly, so without it every re-run of this cell trains different models and
# the three activations would not be compared on equal footing.
RANDOM_SEED = 42

# Activation functions to compare; each one is trained in its own MLflow run.
params_list = ['logistic', 'tanh', 'relu']

for params in params_list:
    model = MLPClassifier(activation=params, random_state=RANDOM_SEED)

    with mlflow.start_run() as run:
        # Columns are passed in sorted order so the feature layout is explicit.
        model.fit(train_X[sorted(train_X)], train_y) # train model
        # Record the Feast provenance (features, feature service, feature view)
        # next to the model as a JSON artifact of the same run.
        mlflow.log_dict({"features": ["driver_hourly_stats:conv_rate",
                                      "driver_hourly_stats:acc_rate",
                                      "driver_hourly_stats:avg_daily_trips"],
                     "feast_feature_service": FEATURE_SERVICE_NAME,
                     "feast_feature_data": "driver_hourly_stats"}, "feast_data.json")
    # `run` stays bound after the `with` block, so its id can still be reported.
    print(f"MLPClassifier params: {params}")
    print(f"Model run id: {run.info.run_id}")



SGDClassifier params: logistic
Model run id: 107e130525bc40078eea006a1e03cff1




SGDClassifier params: tanh
Model run id: 2e78ee78e0734eb1a3fc61f6eb49ee20


  _warn_prf(average, modifier, msg_start, len(result))


SGDClassifier params: relu
Model run id: 83c0b890c4d041d9af1c84e74673c8c0


### Iterate and experiment with MLflow runs by tuning parameters

### Launch the MLflow ui, with Model Registry at the local SQLite database
 * Navigate and examine runs for the model
 * Register the best model with lowest RMSE with the Model Registry

In [7]:
# Shell command that starts the MLflow UI against the local SQLite store.
# Left commented out, presumably so the notebook can run top to bottom without
# launching the UI server (TODO confirm); uncomment it, or run it without the
# leading "!" in a terminal, to browse the runs.
#!mlflow ui  --backend-store-uri sqlite:///mlruns.db

### CI/CD Integration: Fetch the registered model from the Model Registry
 * Use the model URI (either by stage or version)
 * Make the prediction

In [8]:
# Drivers to score, expressed as Feast entity rows.
driver_ids = [1001, 1002, 1003]
entity_rows = [{"driver_id": driver_id} for driver_id in driver_ids]

# Fetch the online feature values for the same feature service used in training.
driver_features = fs.get_online_features(
    features=feature_service,
    entity_rows=entity_rows,
)
online_df = pd.DataFrame.from_dict(driver_features.to_dict())

# Re-use the encoder fitted on the training data, then apply the same
# alphabetical column ordering that was used for the training matrix.
encoded_df = cat_encoder.transform(online_df)
df = encoded_df[sorted(encoded_df.columns)]
df

Unnamed: 0,acc_rate,avg_daily_trips,conv_rate,driver_id_1,driver_id_2,driver_id_3,driver_id_4
0,0.927691,114,0.222534,1,0,0,0
1,0.357142,573,0.913528,0,1,0,0
2,0.558202,451,0.581626,0,0,1,0


In [9]:
# Load the trained scikit-learn model straight from a run's artifact directory.
# Alternative kept for reference: resolve the model through the Model Registry
# instead of a run path, e.g.
#   mlflow.set_tracking_uri("sqlite:///mlruns.db")
#   model_uri = "models:/sklearn_feast_integration/staging"
model_uri = "./mlruns/0/e96717fbfdc84c0895078346568ed2e2/artifacts/model"
saved_model = mlflow.sklearn.load_model(model_uri=model_uri)

In [10]:
# Score the online feature rows and report each prediction as a boolean.
predictions = saved_model.predict(df)
for pred in predictions:
    print(f"Prediction: Trip completed?  {bool(pred)}")

Prediction: Trip completed?  True
Prediction: Trip completed?  False
Prediction: Trip completed?  False


### CI/CD Integration: Transition the model to production 

In [11]:
# Move version 1 of the registered "MLP_relu" model to the Production stage.
# The client is created without arguments, so it presumably relies on the
# tracking URI configured earlier in the notebook (TODO confirm).
client = MlflowClient()
client.transition_model_version_stage(name="MLP_relu", version=1, stage="Production")

<ModelVersion: creation_timestamp=1655745422910, current_stage='Production', description='', last_updated_timestamp=1655756747605, name='MLP_relu', run_id='e96717fbfdc84c0895078346568ed2e2', run_link='', source='./mlruns/0/e96717fbfdc84c0895078346568ed2e2/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>