## Import General Python libs, defined classes, and modules

In [None]:
# 3rd party imports
import category_encoders as ce
import feast
import mlflow
from mlflow.tracking import MlflowClient
import pandas as pd
from sklearn.neural_network import MLPClassifier

## Setup the feature store

In [None]:
# Feature store location and the raw orders table used as the entity dataframe.
# NOTE(review): REPO_PATH is an absolute path on one specific machine — point it
# at your own checkout of the feature repo before running.
REPO_PATH = "/Users/kike/Library/CloudStorage/OneDrive-VMware,Inc/OCTO/2022-H1/Taurus/Feast/feast_workshops-master/module_1/feature_repo"
DATA_SOURCE = './data/driver_orders.csv'

# The orders file is read as tab-separated values.
orders = pd.read_csv(DATA_SOURCE, sep="\t")
print(f"Driver orders table has {len(orders.index)} rows and {len(orders.columns)} columns")

# Convert the event timestamps to pandas datetimes before showing the table.
orders["event_timestamp"] = pd.to_datetime(orders["event_timestamp"])
display(orders)

In [None]:
def _feature_service_lines(service):
    """Yield the report lines for one feature service: its name, views and features."""
    yield f"Feature service name: {service.name}"
    for view in service.feature_view_projections:
        yield f"\tFeature view: {view.name}"
        for feature in view.features:
            yield f"\t\tFeature: {feature.name}, type: {feature.dtype}"


# Create the feature store object from the local repo, then print every
# feature service it knows about together with its views and features.
fs = feast.FeatureStore(repo_path=REPO_PATH)
for service in fs.list_feature_services():
    for line in _feature_service_lines(service):
        print(line)

In [None]:
# Look up the feature service that defines the features used for training.
FEATURE_SERVICE_NAME = "driver_ranking_fv_svc"
feature_service = fs.get_feature_service(FEATURE_SERVICE_NAME)

# Build the training data with a point-in-time join between the orders table
# (the entity dataframe) and the features exposed by the feature service.
historical_job = fs.get_historical_features(
    features=feature_service,
    entity_df=orders,
)
training_df = historical_job.to_df()
display(training_df)

In [None]:
# Label column: we want to predict if a trip would be completed.
target = "trip_completed"
train_y = training_df[target].astype(float)

# Standardize avg_daily_trips. The mean and standard deviation stay available
# as col_mean / col_std so the same scaling can be re-applied to new data.
daily_trips = training_df['avg_daily_trips']
col_mean = daily_trips.mean()
col_std = daily_trips.std()
train_X = training_df.assign(avg_daily_trips_std=(daily_trips - col_mean) / col_std)

# Remove the label, the timestamp and the raw (unscaled) trips column.
train_X = train_X.drop(columns=[target, "event_timestamp", "avg_daily_trips"])

# One-hot encode driver_id; the fitted encoder is kept in cat_encoder.
cat_encoder = ce.OneHotEncoder(cols=['driver_id'])
train_X = cat_encoder.fit_transform(train_X)

# Cast every column to float to avoid MLflow warnings.
train_X = train_X.astype(float)

# Order the columns alphabetically by name.
train_X = train_X.reindex(columns=sorted(train_X.columns))
display(train_X)

In [None]:
# Use the local SQLite database as the MLflow tracking URI and enable
# autologging for scikit-learn.
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.sklearn.autolog()

# Activation functions to try during the model search.
activations_list = ['logistic', 'tanh', 'relu']

# Feast details stored with every run: features, feature service and feature data.
feast_metadata = {
    "features": [
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
    "feast_feature_service": FEATURE_SERVICE_NAME,
    "feast_feature_data": "driver_hourly_stats",
}

for activation in activations_list:
    model = MLPClassifier(max_iter=500, activation=activation)

    # One MLflow run per candidate activation function.
    with mlflow.start_run() as run:
        # Train on the features with columns in sorted order.
        model.fit(train_X[sorted(train_X.columns)], train_y)
        # Store the Feast details as a JSON artifact of this run.
        mlflow.log_dict(feast_metadata, "feast_data.json")
        print(f"MLPClassifier params: {activation}")
        print(f"Model run id: {run.info.run_id}")

### CI/CD Integration: Fetch the registered model from the Model Registry
 * Use the model URI (either by stage or version)
 * Make the prediction

In [None]:
# Drivers (entity rows) to look up in the online store.
driver_ids = [1001, 1002, 1003]
entity_rows = [{"driver_id": driver_id} for driver_id in driver_ids]

# Query the online store for the features of the feature service.
online_response = fs.get_online_features(
    features=feature_service,
    entity_rows=entity_rows,
)
driver_features = online_response.to_df()
df = driver_features  # both names refer to the same DataFrame
display(df)

In [None]:
# Re-apply the training-time transformations to the online features (this is
# why the fitted encoder and the scaling statistics had to be kept).
df = cat_encoder.transform(df)  # encode categorical variables with the fitted encoder

# Standardize with the mean / std computed on the training data.
df['avg_daily_trips_std'] = df['avg_daily_trips'].sub(col_mean).div(col_std)

# Remove the raw column in place, then order the columns by name.
del df['avg_daily_trips']
df = df.reindex(columns=sorted(df.columns))
df

In [None]:
# Load the scikit-learn model stored by MLflow at the given artifact location.
# NOTE(review): the run ID in this path is hard-coded — replace it with the ID
# of one of your own training runs before executing this cell.
model_uri = "./mlruns/0/fd4c8846f63b4b29af58d5fa88360d99/artifacts/model"
saved_model = mlflow.sklearn.load_model(model_uri=model_uri)

In [None]:
# Run predictions on the transformed online features and report one result
# per driver. Predictions are paired with driver_ids by position, so the
# dataframe rows are expected to be in the same order as driver_ids.
preds = saved_model.predict(df)
for driver, pred in zip(driver_ids, preds):
    print(f"Prediction: Trip completed by driver {driver}?  {bool(pred)}")

### CI/CD Integration: Transition the model to production 

In [None]:
# Move version 1 of the registered model "MLP_relu" to the Production stage.
# NOTE(review): assumes a model named "MLP_relu" with a version 1 already
# exists in the registry — confirm it was registered beforehand.
client = MlflowClient()
client.transition_model_version_stage(name="MLP_relu", version=1, stage="Production")