# Baseline model for batch monitoring example

In [1]:
import datetime

import evidently
import evidently.metrics
import evidently.sdk.panels
import evidently.ui.workspace
import pandas as pd
import requests

from tqdm import tqdm


In [2]:
# Create the local directory that the download cell writes into
# (-p: do not fail if the directory already exists).
! mkdir -p data

In [3]:
# Monthly trip-record files to fetch, as (file name, target directory) pairs.
# Append further months here, e.g. ('green_tripdata_2022-01.parquet', './data').
files = [('green_tripdata_2024-03.parquet', './data')]

print('Download files:')
for file, path in files:
    url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{file}'
    save_path = f'{path}/{file}'
    # Stream the body so the file never has to fit in memory; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .parquet file.
        resp.raise_for_status()
        # Content-Length can be absent (e.g. chunked transfer); tqdm accepts total=None.
        total_bytes = int(resp.headers.get('Content-Length', 0)) or None
        with open(save_path, 'wb') as handle, tqdm(
            total=total_bytes, desc=file, postfix=f'save to {save_path}', unit='B', unit_scale=True
        ) as progress:
            # iter_content() defaults to 1-byte chunks, i.e. one write per byte;
            # read in 64 KiB chunks and advance the bar by the bytes actually written.
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                handle.write(chunk)
                progress.update(len(chunk))

Download files:


green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:03<00:00, 401145.24it/s, save to ./data/green_tripdata_2024-03.parquet]


In [4]:
# Load the downloaded March 2024 green-taxi trip records.
march_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')
print(f'Initial dataframe shape: {march_data.shape}')

# Keep only trips whose pickup timestamp falls inside March 2024
# (lower bound inclusive, upper bound exclusive).
picked_up_in_march = (
    (datetime.datetime(2024, 3, 1) <= march_data['lpep_pickup_datetime']) &
    (march_data['lpep_pickup_datetime'] < datetime.datetime(2024, 4, 1))
)

# .loc + .assign builds a new frame, so the added column is never written onto a
# slice of the original (avoids pandas' SettingWithCopyWarning).
march_data = march_data.loc[picked_up_in_march].assign(
    # Day-level key (YYYY-MM-DD string) used later to group metrics per day.
    pickup_date=lambda frame: frame['lpep_pickup_datetime'].dt.strftime('%Y-%m-%d')
)
march_data.describe()

Initial dataframe shape: (57457, 20)


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,57447.0,57447,57447,55350.0,57447.0,57447.0,55350.0,57447.0,57447.0,57447.0,57447.0,57447.0,57447.0,0.0,57447.0,57447.0,55350.0,55343.0,55350.0
mean,1.877313,2024-03-16 08:44:02.534353,2024-03-16 09:02:10.326718,1.179729,95.522708,138.633384,1.309359,13.524739,17.312937,0.904577,0.577476,2.386274,0.19257,,0.979409,22.904645,1.320994,1.037981,0.737864
min,1.0,2024-03-01 00:00:57,2024-03-01 00:08:23,1.0,1.0,1.0,0.0,0.0,-295.08,-2.5,-0.5,-1.56,0.0,,-1.0,-296.08,1.0,1.0,-2.75
25%,2.0,2024-03-08 13:56:18.500000,2024-03-08 14:16:35,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,,1.0,13.44,1.0,1.0,0.0
50%,2.0,2024-03-15 22:51:40,2024-03-15 23:14:44,1.0,75.0,138.0,1.0,1.79,13.5,0.0,0.5,2.0,0.0,,1.0,18.5,1.0,1.0,0.0
75%,2.0,2024-03-23 20:12:35.500000,2024-03-23 20:35:28.500000,1.0,97.0,220.0,1.0,3.1,19.8,1.0,0.5,3.61,0.0,,1.0,27.05,2.0,1.0,2.75
max,2.0,2024-03-31 23:54:09,2024-04-01 16:11:00,99.0,265.0,265.0,9.0,125112.2,841.6,10.0,4.25,150.0,26.76,,1.0,856.98,5.0,2.0,2.75
std,0.32808,,,1.356452,57.281061,76.295054,0.96737,770.483292,14.956774,1.38253,0.366888,3.159001,1.184652,,0.154148,17.011924,0.497767,0.191153,1.218108


In [5]:
# Columns handed to Evidently as numerical features.
num_features = [
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'total_amount',
    'tip_amount',
]

# Columns handed to Evidently as categorical features.
cat_features = [
    'PULocationID',
    'DOLocationID',
    'pickup_date',
]

# Evidently Report

In [6]:
# Describe the column roles for Evidently. Only columns that actually exist in
# march_data are declared; the frame has no 'prediction' column, so it is not listed.
data_definition = evidently.DataDefinition(
    numerical_columns=num_features,
    categorical_columns=cat_features,
)

# Wrap the filtered March frame together with its column definition.
evidently_dataset = evidently.Dataset.from_pandas(
    march_data, data_definition=data_definition
)

In [7]:
# Median (0.5 quantile) of fare_amount, evaluated separately per pickup_date.
daily_median_fare = evidently.metrics.group_by.GroupBy(
    evidently.metrics.QuantileValue(column='fare_amount', quantile=0.5), 'pickup_date'
)

fare_report = evidently.Report(metrics=[daily_median_fare])
fare_report_result = fare_report.run(evidently_dataset)

# Each entry under 'metrics' carries one value; take the largest across all entries.
max_median_fare_amount = max(metric['value'].item() for metric in fare_report_result.dict()['metrics'])
print(f'Maximum daily median fare amount in March: {max_median_fare_amount}')

Maximum daily medium fare amount in march: 14.2


# Evidently Dashboard

In [8]:
# Open the Evidently workspace identified by 'workspace'
# (presumably a local directory relative to the notebook; confirm).
workspace_location = 'workspace'
ws = evidently.ui.workspace.Workspace(workspace_location)

In [9]:
# Register a project in the workspace and persist it.
# NOTE(review): re-running this cell calls create_project again; it likely adds
# another project with the same name; confirm.
project_name = 'NYC Taxi Data Quality Project'
project = ws.create_project(project_name)
project.save()

In [10]:
# Metrics recorded for every run: median fare and mean tip.
median_fare_metric = evidently.metrics.QuantileValue(column='fare_amount', quantile=0.5)
mean_tip_metric = evidently.metrics.MeanValue(column='tip_amount')
regular_report = evidently.Report(metrics=[median_fare_metric, mean_tip_metric])

# Evaluate the report on the March dataset and store the result in the project.
regular_snapshot = regular_report.run(current_data=evidently_dataset)
# The trailing semicolon suppresses the cell's output value.
ws.add_run(project.id, regular_snapshot);


note: To view a report please run `evidently ui` command in a separate tab in your terminal.

In [11]:
# Configure the project dashboard: a heading panel followed by one bar plot.
# NOTE(review): each run of this cell calls add_panel again, so re-running it
# likely appends duplicate panels; confirm.
project.dashboard.add_panel(
    evidently.sdk.panels.text_panel(title='NYC taxi data dashboard')
)

# The plotted metric is QuantileValue (computed above as the 0.5 quantile of
# fare_amount), so the title and legend name the median fare rather than a count.
project.dashboard.add_panel(
    evidently.sdk.panels.bar_plot_panel(
        title='Median Fare Amount',
        values=[
            evidently.sdk.panels.PanelMetric(
                metric='QuantileValue',
                legend='median fare_amount',
            )
        ],
        size='half'
    )
)

# Persist the dashboard configuration with the project.
project.save()

To view a dashboard please run `evidently ui` command in a separate tab in your terminal.