# Tutorial

This tutorial shows how to use FeatHub to achieve the following objectives:

- Define, extract, transform and materialize features into feature stores.
- Transform features into a Pandas DataFrame for offline training.
- Materialize features into an online feature store.
- Fetch features with on-demand feature transformation from the online feature
  store for online feature serving.

This tutorial demonstrates these FeatHub capabilities by walking you through an
example that trains a GradientBoostingRegressor model on the NYC Taxi Records
[dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) to
predict taxi fares, evaluates the prediction accuracy, and materializes features
into an online feature store for online feature serving.

## Install dependencies and download resource files

In [None]:
# Install the nightly FeatHub build quietly (-q) and fetch the sample CSV that
# the rest of the notebook reads as "sample_data.csv".
# wget's -nc (no-clobber) flag skips the download when the file already exists.
!pip install -q feathub-nightly
!wget -nc https://raw.githubusercontent.com/alibaba/feathub/master/python/feathub/examples/sample_data.csv

## Import Python dependencies

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
from math import sqrt

from feathub.table.schema import Schema

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

from feathub.feature_tables.sources.file_system_source import FileSystemSource
from feathub.feature_tables.sinks.memory_store_sink import MemoryStoreSink
from feathub.feature_views.feature import Feature
from feathub.feature_views.on_demand_feature_view import OnDemandFeatureView
from feathub.common import types
from feathub.feature_tables.sources.memory_store_source import MemoryStoreSource
from feathub.feathub_client import FeathubClient
from feathub.feature_views.transforms.over_window_transform import OverWindowTransform
from feathub.feature_views.derived_feature_view import DerivedFeatureView

## Initialize FeatHub client

In [None]:
# Client configuration: the processor, registry and feature service are all
# configured with type "local"; the registry additionally uses the "default"
# namespace.
feathub_props = {
    "processor": {"type": "local", "local": {}},
    "registry": {"type": "local", "local": {"namespace": "default"}},
    "feature_service": {"type": "local", "local": {}},
}

client = FeathubClient(props=feathub_props)

## Specify source dataset

In [None]:
source_file_path = "sample_data.csv"

# (column name, FeatHub type) pairs, listed in the order they are added to the
# schema builder.
column_specs = [
    ("trip_id", types.Int64),
    ("VendorID", types.Float64),
    ("lpep_pickup_datetime", types.String),
    ("lpep_dropoff_datetime", types.String),
    ("store_and_fwd_flag", types.String),
    ("RatecodeID", types.Float64),
    ("PULocationID", types.Int64),
    ("DOLocationID", types.Int64),
    ("passenger_count", types.Float64),
    ("trip_distance", types.Float64),
    ("fare_amount", types.Float64),
    ("extra", types.Float64),
    ("mta_tax", types.Float64),
    ("tip_amount", types.Float64),
    ("tolls_amount", types.Float64),
    ("ehail_fee", types.Float64),
    ("improvement_surcharge", types.Float64),
    ("total_amount", types.Float64),
    ("payment_type", types.Float64),
    ("trip_type", types.Float64),
    ("congestion_surcharge", types.Float64),
]

# Feed every column through the fluent builder, keeping whatever builder
# object each .column() call returns.
schema_builder = Schema.new_builder()
for column_name, column_type in column_specs:
    schema_builder = schema_builder.column(column_name, column_type)
schema = schema_builder.build()

# CSV-backed source; the drop-off time column is declared as the timestamp
# field and is parsed with the given strftime-style format.
source = FileSystemSource(
    name="source_1",
    path=source_file_path,
    data_format="csv",
    schema=schema,
    timestamp_field="lpep_dropoff_datetime",
    timestamp_format="%Y-%m-%d %H:%M:%S",
)

## Define features as transformations on the source dataset

In [None]:
def _location_fare_window_feature(name, expr, agg_func):
    """Build a feature that aggregates `expr` with `agg_func`.

    The aggregation is grouped by DOLocationID and uses a 90-day window.
    """
    return Feature(
        name=name,
        transform=OverWindowTransform(
            expr=expr,
            agg_func=agg_func,
            group_by_keys=["DOLocationID"],
            window_size=timedelta(days=90),
        ),
    )


# Difference between the UNIX timestamps of drop-off and pick-up.
f_trip_time_duration = Feature(
    name="f_trip_time_duration",
    transform=(
        "UNIX_TIMESTAMP(lpep_dropoff_datetime) - "
        "UNIX_TIMESTAMP(lpep_pickup_datetime)"
    ),
)

# Per-drop-off-location fare aggregates over the 90-day window.
f_location_avg_fare = _location_fare_window_feature(
    name="f_location_avg_fare",
    expr="fare_amount",
    agg_func="AVG",
)
f_location_max_fare = _location_fare_window_feature(
    name="f_location_max_fare",
    expr="fare_amount",
    agg_func="MAX",
)
f_location_total_fare_cents = _location_fare_window_feature(
    name="f_location_total_fare_cents",
    expr="fare_amount * 100",
    agg_func="SUM",
)

# First view: the raw source columns plus the four features defined above.
view_1_features = [
    f_trip_time_duration,
    f_location_avg_fare,
    f_location_max_fare,
    f_location_total_fare_cents,
]
feature_view_1 = DerivedFeatureView(
    name="feature_view_1",
    source=source,
    features=view_1_features,
    keep_source_fields=True,
)

# Expression features evaluated on top of feature_view_1's columns.
f_trip_time_rounded = Feature(
    name="f_trip_time_rounded",
    transform="f_trip_time_duration / 10",
    input_features=[f_trip_time_duration],
)
f_is_long_trip_distance = Feature(
    name="f_is_long_trip_distance",
    transform="trip_distance > 30",
)

# Second view: refers to feature_view_1 (and two of its features) by name and
# adds the two expression features.
view_2_features = [
    "f_location_avg_fare",
    f_trip_time_rounded,
    f_is_long_trip_distance,
    "f_location_total_fare_cents",
]
feature_view_2 = DerivedFeatureView(
    name="feature_view_2",
    source="feature_view_1",
    features=view_2_features,
    keep_source_fields=True,
)

# Assign the result to `_` so the notebook does not echo it.
_ = client.build_features(features_list=[feature_view_1, feature_view_2])

## Transform features into Pandas DataFrame for offline training

In [None]:
# Ask the client for feature_view_2's features, then convert the result to a
# Pandas DataFrame.
feature_view_2_table = client.get_features(feature_view_2)
final_df = feature_view_2_table.to_pandas()

## Train a model using the dataset and evaluate the model accuracy

In [None]:
# Remove the pick-up/drop-off time and store-and-forward flag columns before
# training; errors="ignore" tolerates columns that are already absent.
final_df.drop(
    ["lpep_pickup_datetime", "lpep_dropoff_datetime", "store_and_fwd_flag"],
    axis=1,
    inplace=True,
    errors="ignore",
)

# Replace missing values with 0 and make sure the label column is float64.
final_df.fillna(0, inplace=True)
final_df["fare_amount"] = final_df["fare_amount"].astype("float64")

# fare_amount is the label; every remaining column is a model input.
# 80/20 train/test split with a fixed seed for reproducibility.
train_x, test_x, train_y, test_y = train_test_split(
    final_df.drop(["fare_amount"], axis=1),
    final_df["fare_amount"],
    test_size=0.2,
    random_state=42,
)
model = GradientBoostingRegressor()
model.fit(train_x, train_y)

y_predict = model.predict(test_x)
y_actual = test_y.values.flatten().tolist()

# mean_squared_error returns the MSE, so its square root is the RMSE.
rmse = sqrt(mean_squared_error(y_actual, y_predict))

# Total absolute error and total actual fare over the test set.
sum_errors = sum(
    abs(actual_val - predict_val)
    for actual_val, predict_val in zip(y_actual, y_predict)
)
sum_actuals = sum(y_actual)

# NOTE: this is the sum of absolute errors divided by the sum of actual values
# (a weighted form), not the mean of per-row percentage errors.
mean_abs_percent_error = sum_errors / sum_actuals

print(f"Model RMSE {rmse}")
print(f"Model MAPE {mean_abs_percent_error}")
print(f"Model Accuracy: {1 - mean_abs_percent_error}")

## Materialize features into online feature store

In [None]:
# In-memory sink; the features are written under table "table_name_1".
sink = MemoryStoreSink(table_name="table_name_1")

# Third view: only the two per-location fare aggregates, taken from
# feature_view_2 by name.
selected_features = DerivedFeatureView(
    name="feature_view_3",
    source="feature_view_2",
    features=["f_location_avg_fare", "f_location_max_fare"],
)
client.build_features([selected_features])

# Time range passed to the materialization job.
materialize_start = datetime(2020, 1, 1)
materialize_end = datetime(2020, 5, 20)

job = client.materialize_features(
    features=selected_features,
    sink=sink,
    start_datetime=materialize_start,
    end_datetime=materialize_end,
    allow_overwrite=True,
)
# Wait on the job with a 10000 ms timeout.
job.wait(timeout_ms=10_000)

## Fetch features from online feature store with on-demand transformations

In [None]:
# Use a dedicated variable name instead of rebinding `source`, which still
# refers to the FileSystemSource that feature_view_1 is defined on; rebinding
# it would make re-running the feature-definition cell pick up the wrong
# source.
online_store_source = MemoryStoreSource(
    name="online_store_source",
    keys=["DOLocationID"],
    table_name="table_name_1",
)

# The first two features are read from the online store source by name; the
# third is an expression evaluated on top of them.
on_demand_feature_view = OnDemandFeatureView(
    name="on_demand_feature_view",
    features=[
        "online_store_source.f_location_avg_fare",
        "online_store_source.f_location_max_fare",
        Feature(
            name="max_avg_ratio",
            transform="f_location_max_fare / f_location_avg_fare",
        ),
    ],
    request_schema=Schema.new_builder().column("DOLocationID", types.Int64).build(),
)
client.build_features([online_store_source, on_demand_feature_view])

# Single-row request looking up features for DOLocationID 247.
request_df = pd.DataFrame(np.array([[247]]), columns=["DOLocationID"])
online_features = client.get_online_features(
    request_df=request_df,
    feature_view=on_demand_feature_view,
)

print(online_features)