In [1]:
import datetime
import os

import pandas as pd
import requests

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [11]:
# Download the monthly green-taxi trip files used in this notebook.
# Each entry is (file name on the CDN, local directory to save it in).
files = [('green_tripdata_2023-03.parquet', './data'), ('green_tripdata_2024-03.parquet', './data')]

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"

    # Create the target directory on first run so open() below cannot fail
    # on a fresh checkout.
    os.makedirs(path, exist_ok=True)

    # The context manager releases the connection even if the download fails
    # half-way; the timeout keeps a stalled server from hanging the cell.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .parquet file.
        resp.raise_for_status()

        # Content-Length may be absent; tqdm then shows progress without a total.
        content_length = resp.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None

        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to 1-byte chunks; read in 64 KiB chunks
            # and advance the bar by the number of bytes actually written.
            for data in resp.iter_content(chunk_size=64 * 1024):
                handle.write(data)
                progress.update(len(data))

Download files:


green_tripdata_2023-03.parquet:  15%|█▍        | 258937/1730999 [00:01<00:07, 184682.90it/s, save to ./data/green_tripdata_2023-03.parquet]

green_tripdata_2023-03.parquet: 100%|██████████| 1730999/1730999 [00:09<00:00, 179051.10it/s, save to ./data/green_tripdata_2023-03.parquet]
green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:07<00:00, 178731.30it/s, save to ./data/green_tripdata_2024-03.parquet]


In [3]:
# Load the March 2023 green-taxi trips into a DataFrame.
mar_data = pd.read_parquet('data/green_tripdata_2023-03.parquet')


In [14]:
# Summary statistics (count, mean, min, quartiles, max, std) of the March 2023 trips.
mar_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,72044.0,72044,72044,67507.0,72044.0,72044.0,67507.0,72044.0,72044.0,72044.0,72044.0,72044.0,72044.0,72044.0,72044.0,72044.0,67507.0,67500.0,67507.0
mean,1.865443,2023-03-16 08:02:42.689842,2023-03-16 08:20:56.525748,1.145703,98.437413,137.613556,1.286844,12.449511,17.018203,0.878109,0.576929,2.157651,0.183268,0.0,0.961604,22.29231,1.37368,1.028681,0.714837
min,1.0,2023-02-23 16:46:29,2023-02-23 16:55:25,1.0,1.0,1.0,0.0,0.0,-115.0,-5.0,-0.5,-1.1,0.0,0.0,-1.0,-116.0,1.0,1.0,-2.75
25%,2.0,2023-03-08 13:55:49.500000,2023-03-08 14:15:59.750000,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,0.0,1.0,12.98,1.0,1.0,0.0
50%,2.0,2023-03-16 08:05:27.500000,2023-03-16 08:22:27,1.0,75.0,138.0,1.0,1.84,13.5,0.0,0.5,1.6,0.0,0.0,1.0,18.1,1.0,1.0,0.0
75%,2.0,2023-03-23 21:51:25.500000,2023-03-23 22:12:31.500000,1.0,129.0,216.0,1.0,3.3,20.0,1.0,0.5,3.36,0.0,0.0,1.0,26.73,2.0,1.0,2.75
max,2.0,2023-04-01 00:09:27,2023-04-01 17:32:21,99.0,265.0,265.0,9.0,92064.68,477.0,12.5,4.25,270.27,36.05,0.0,1.0,478.0,5.0,2.0,2.75
std,0.341252,,,1.225735,60.48187,76.169705,0.923652,641.094653,13.832399,1.297642,0.384129,3.136215,1.139159,0.0,0.185185,15.852047,0.512307,0.166911,1.206435


In [4]:
# (rows, columns) of the March 2023 frame.
mar_data.shape

(72044, 20)

In [5]:
# Disabled cell: baseline LinearRegression training on the March 2023 data,
# kept for reference only (every line below is commented out).
# NOTE(review): as written, the train predictions are stored in 'fare_amount'
# (overwriting the target), while the MAE lines read a 'prediction' column that
# is never created, and the validation predictions are commented out twice.
# Fix these before re-enabling the cell.
# # create target
# mar_data["duration_min"] = mar_data.lpep_dropoff_datetime - mar_data.lpep_pickup_datetime
# mar_data.duration_min = mar_data.duration_min.apply(lambda td : float(td.total_seconds())/60)
# # filter out outliers
# mar_data = mar_data[(mar_data.duration_min >= 0) & (mar_data.duration_min <= 60)]
# mar_data = mar_data[(mar_data.passenger_count > 0) & (mar_data.passenger_count <= 8)]
# # data labeling
# target = "fare_amount"
# num_features = ["passenger_count", "trip_distance", "total_amount"]
# cat_features = ["PULocationID", "DOLocationID"]
# # mar_data.shape
# train_data = mar_data[:30000]
# val_data = mar_data[30000:]
# model = LinearRegression()
# model.fit(train_data[num_features + cat_features], train_data[target])
# train_preds = model.predict(train_data[num_features + cat_features])
# train_data['fare_amount'] = train_preds
# # val_preds = model.predict(val_data[num_features + cat_features])
# # val_data['fare_amount'] = val_preds
# print(mean_absolute_error(train_data.fare_amount, train_data.prediction))
# print(mean_absolute_error(val_data.fare_amount, val_data.prediction))

# # Dump model and reference data
# with open('models/lin_reg.bin', 'wb') as f_out:
#     dump(model, f_out)
# val_data.to_parquet('data/reference.parquet')


In [6]:
from evidently.metrics import ColumnQuantileMetric


In [7]:
# Column roles for Evidently: no target column, 'fare_amount' is treated as
# the prediction, and the remaining listed columns are split into numerical
# and categorical features.
num_features = ["passenger_count", "trip_distance", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]
column_mapping = ColumnMapping(
    categorical_features=cat_features,
    numerical_features=num_features,
    prediction='fare_amount',
    target=None,
)

In [8]:
# Replace missing 'ehail_fee' values with 0 in the March 2023 frame.
mar_data['ehail_fee'] = mar_data['ehail_fee'].fillna(0)

# Report with four metrics, in this order: drift of 'fare_amount',
# dataset-level drift, missing-value statistics, and the 0.5 quantile
# of 'fare_amount'.
report = Report(
    metrics=[
        ColumnDriftMetric(column_name="fare_amount"),
        DatasetDriftMetric(),
        DatasetMissingValuesMetric(),
        ColumnQuantileMetric(quantile=0.5, column_name="fare_amount"),
    ]
)


In [None]:
# Generate the report, comparing the March 2023 frame against itself.
# The column mapping is passed explicitly so Evidently uses the declared
# numerical/categorical/prediction roles instead of inferring a role for
# every column in the frame.
report.run(
    reference_data=mar_data,
    current_data=mar_data,
    column_mapping=column_mapping,
)
# report.show(mode='inline')

Q3. Monitoring
Let’s start monitoring. Run expanded monitoring for a new batch of data (March 2024).

What is the maximum value of the quantile = 0.5 metric (the median) on the "fare_amount" column during March 2024 (calculated daily)?

In [43]:
# Read the March 2024 trips, then append a 'date' column holding the calendar
# day of each pickup so the rows can be grouped per day.
data_march_2024 = pd.read_parquet('data/green_tripdata_2024-03.parquet')
data_march_2024 = data_march_2024.assign(
    date=data_march_2024['lpep_pickup_datetime'].dt.date
)

data_march_2024.shape


(57457, 21)

In [45]:
# Keep only the trips whose pickup day falls inside March 2024 (both bounds inclusive).
start_date = datetime.date(2024, 3, 1)
end_date = datetime.date(2024, 3, 31)

march_2024_data = data_march_2024[
    (data_march_2024['date'] >= start_date) & (data_march_2024['date'] <= end_date)
]

# Display the shape of the *filtered* frame; showing data_march_2024.shape here
# would just repeat the unfiltered shape and hide the effect of the filter.
march_2024_data.shape

(57457, 21)

In [37]:
# Column roles for the daily monitoring run: no target, no prediction,
# and 'fare_amount' declared as the only numerical feature.
column_mapping = ColumnMapping(
    numerical_features=['fare_amount'],
    prediction=None,
    target=None,
)

# Single-metric report: the 0.5 quantile of 'fare_amount'.
report = Report(
    metrics=[ColumnQuantileMetric(quantile=0.5, column_name="fare_amount")]
)



In [48]:
# Run the quantile report once per pickup day in March 2024 and print the
# largest daily 0.5 quantile of 'fare_amount'.
daily_results = []
start_date = datetime.date(2024, 3, 1)
end_date = datetime.date(2024, 3, 31)

for date, group in data_march_2024.groupby("date"):
    # Skip days outside the range *before* running the report, so no report
    # is computed only to be thrown away.
    if not (start_date <= date <= end_date):
        continue

    # Each day's rows serve as both reference and current data; only the
    # 'current' value is read below. The column mapping is passed so the
    # declared column roles are actually applied.
    report.run(reference_data=group, current_data=group, column_mapping=column_mapping)
    result = report.as_dict()

    # The report holds a single metric, hence index 0.
    quantile_value = result['metrics'][0]['result']['current']['value']
    daily_results.append((date, quantile_value))

# Convert results to a DataFrame for analysis
daily_df = pd.DataFrame(daily_results, columns=["date", "quantile_0.5"])

# report.show(mode='inline')

max_quantile = daily_df["quantile_0.5"].max()
print(f"Maximum quantile value: {max_quantile}")


Maximum quantile value: 14.2
