In [89]:
import requests
import datetime
import pandas as pd
from calendar import Calendar

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnQuantileMetric, DatasetMissingValuesMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Download Data

In [90]:
year = 2024
month = 3

In [91]:
files = [(f'green_tripdata_{year}-{format(month,"02d")}.parquet', './data')]

print("Download files:")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    print(url)
    df = pd.read_parquet(url)
    save_path=f"{path}/{file}"
    df.to_parquet(save_path)

Download files:
https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet


In [92]:
mar_data = pd.read_parquet('./data/green_tripdata_2024-03.parquet')

In [93]:
mar_data.shape

(57457, 20)

# Create Metric

In [94]:
column_mapping = ColumnMapping(
    target=None,
    prediction=None,
    numerical_features=["fare_amount"],
    categorical_features=None
)

In [100]:
report = Report(metrics=[
    ColumnQuantileMetric(column_name= "fare_amount", quantile= 0.5),
    DatasetMissingValuesMetric()
]
)

In [105]:
result = pd.DataFrame(columns = ['date', 'metric','column', 'value'])

In [106]:
dates = [i for i in Calendar().itermonthdates(2024,3) if i.month == month]
for date in dates:
    curr_data = mar_data[mar_data['lpep_pickup_datetime'].dt.date == date]
    report.run(reference_data=None, current_data=curr_data)
    r = report.as_dict()
    new_row1 = {"date": date, "metric": r['metrics'][0]['metric'], "column": "fare_amount", 'value': r['metrics'][0]['result']['current']['value']}
    result.loc[len(result)] = new_row1
    new_row2 = {"date": date, "metric": r['metrics'][1]['metric'], "column": "all", 'value': r['metrics'][1]['result']['current']['number_of_missing_values']}
    result.loc[len(result)] = new_row2

In [108]:
result

Unnamed: 0,date,metric,column,value
0,2024-03-01,ColumnQuantileMetric,fare_amount,13.5
1,2024-03-01,DatasetMissingValuesMetric,all,2443.0
2,2024-03-02,ColumnQuantileMetric,fare_amount,13.5
3,2024-03-02,DatasetMissingValuesMetric,all,2178.0
4,2024-03-03,ColumnQuantileMetric,fare_amount,14.2
...,...,...,...,...
57,2024-03-29,DatasetMissingValuesMetric,all,2178.0
58,2024-03-30,ColumnQuantileMetric,fare_amount,14.2
59,2024-03-30,DatasetMissingValuesMetric,all,1771.0
60,2024-03-31,ColumnQuantileMetric,fare_amount,13.5


In [109]:
result[result['metric'] == 'ColumnQuantileMetric']['value'].max()

np.float64(14.2)

# Create Dashboard

In [138]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [139]:
ws = Workspace("workspace")

In [140]:
project = ws.create_project("NYC Taxi Data Quality Project")
# project.description = "My project descriotion"
project.save()

Project(id=UUID('8f773c5d-e663-4d4a-acb2-c59e7d507d58'), name='NYC Taxi Data Quality Project', description=None, dashboard=DashboardConfig(name='NYC Taxi Data Quality Project', panels=[], tabs=[], tab_id_to_panel_ids={}), team_id=None, date_from=None, date_to=None, created_at=datetime.datetime(2024, 6, 23, 7, 45, 28, 792786))

In [146]:
dates = [i for i in Calendar().itermonthdates(2024,3) if i.month == month]
for date in dates:
    curr_data = mar_data[mar_data['lpep_pickup_datetime'].dt.date == date]
    report = Report(metrics=[
    ColumnQuantileMetric(column_name= "fare_amount", quantile= 0.5),
    DatasetMissingValuesMetric()],
                    timestamp = datetime.datetime.combine(date, datetime.datetime.min.time()))
    report.run(reference_data=None, current_data=curr_data)
    ws.add_report(project.id, report)

In [147]:
project.dashboard.add_panel(
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="NYC taxi data dashboard"
    )
)

project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Number of Missing Values",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_missing_values",
                legend="count"
            ),
        ],
        plot_type=PlotType.LINE,
        size=WidgetSize.HALF,
    ),
)

In [148]:
project.save()

Project(id=UUID('8f773c5d-e663-4d4a-acb2-c59e7d507d58'), name='NYC Taxi Data Quality Project', description=None, dashboard=DashboardConfig(name='NYC Taxi Data Quality Project', panels=[DashboardPanelCounter(type='evidently.ui.dashboards.reports.DashboardPanelCounter', id=UUID('fa6fbcb2-b25f-4303-b267-445c3c7be041'), title='NYC taxi data dashboard', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.FULL: 2>, agg=<CounterAgg.NONE: 'none'>, value=None, text=None), DashboardPanelPlot(type='evidently.ui.dashboards.reports.DashboardPanelPlot', id=UUID('dcadf5a2-5c86-410a-8312-32658f85b300'), title='Number of Missing Values', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, values=[PanelValue(field_path='current.number_of_missing_values', metric_id='DatasetSummaryMetric', metric_fingerprint=None, metric_args={}, legend='count')], plot_type=<PlotType.LINE: 'line'>)], tabs=[], tab_id_t