In [1]:
import requests
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric
from evidently.metrics import ColumnQuantileMetric, ColumnSummaryMetric
from evidently.metrics.base_metric import generate_column_metrics

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib

In [2]:
files = [('green_tripdata_2024-03.parquet', './data')]

print("Download files:")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp=requests.get(url, stream=True)
    save_path=f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=f"{file}",
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

Download files:


green_tripdata_2024-03.parquet: 100%|█| 1372372/1372372 [00:08<00:00, 163564.90it/s, save to ./data/green_tripdata_2024-


In [3]:
mar_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')

In [4]:
mar_data.shape

(57457, 20)

In [5]:
mar_data["duration_min"] = mar_data.lpep_dropoff_datetime - mar_data.lpep_pickup_datetime
mar_data.duration_min = mar_data.duration_min.apply(lambda td : float(td.total_seconds())/60)

In [6]:
mar_data = mar_data[(mar_data.duration_min >= 0) & (mar_data.duration_min <= 60)]
mar_data = mar_data[(mar_data.passenger_count > 0) & (mar_data.passenger_count <= 8)]

In [7]:
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

In [8]:
model = LinearRegression()

In [9]:
model.fit(mar_data[num_features + cat_features], mar_data[target])

In [10]:
mar_preds = model.predict(mar_data[num_features + cat_features])
mar_data['prediction'] = mar_preds

In [11]:
with open('models/lin_reg.bin', 'wb') as f_out:
    dump(model, f_out)

In [12]:
mar_data.to_parquet('data/reference.parquet')

In [19]:
maxVal = 0
report = Report(metrics=[ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)])
for i in range(1, 10):
    daily_data =  mar_data.loc[mar_data.lpep_pickup_datetime.between(f'2024-03-{i}', f'2024-03-{i+1}', inclusive="left")]
    #print(daily_data)
    report.run(reference_data=None, current_data=daily_data)
    result = report.as_dict()
    maxVal = max(maxVal,result['metrics'][0]['result']['current']['value'])
print(maxVal)
result


14.2


{'metrics': [{'metric': 'ColumnQuantileMetric',
   'result': {'column_name': 'fare_amount',
    'column_type': 'num',
    'quantile': 0.5,
    'current': {'value': np.float64(13.5)},
    'reference': None}},
  {'metric': 'ColumnQuantileMetric',
   'result': {'column_name': 'fare_amount',
    'column_type': 'num',
    'quantile': 0.5,
    'current': {'value': np.float64(13.5)},
    'reference': None}},
  {'metric': 'ColumnQuantileMetric',
   'result': {'column_name': 'fare_amount',
    'column_type': 'num',
    'quantile': 0.5,
    'current': {'value': np.float64(13.5)},
    'reference': None}},
  {'metric': 'ColumnQuantileMetric',
   'result': {'column_name': 'fare_amount',
    'column_type': 'num',
    'quantile': 0.5,
    'current': {'value': np.float64(13.5)},
    'reference': None}},
  {'metric': 'ColumnQuantileMetric',
   'result': {'column_name': 'fare_amount',
    'column_type': 'num',
    'quantile': 0.5,
    'current': {'value': np.float64(13.5)},
    'reference': None}},
  {'

In [15]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize



In [22]:
ws = Workspace("workspace")

In [23]:
project = ws.create_project("NYC Taxi Data Quality Project")
project.description = "My project description"
project.save()

Project(id=UUID('0f0c627c-6168-4d63-a04b-40be65434782'), name='NYC Taxi Data Quality Project', description='My project description', dashboard=DashboardConfig(name='NYC Taxi Data Quality Project', panels=[], tabs=[], tab_id_to_panel_ids={}), team_id=None, date_from=None, date_to=None, created_at=datetime.datetime(2024, 6, 23, 15, 16, 17, 352967))

In [24]:
ws.add_report(project.id, report)

In [25]:
project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Fare amount",
        values=[
            PanelValue(
                metric_id="ColumnQuantileMetric",
                field_path="current.value",
                legend="count"
            ),
        ],
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
    ),
)

project.save()

Project(id=UUID('0f0c627c-6168-4d63-a04b-40be65434782'), name='NYC Taxi Data Quality Project', description='My project description', dashboard=DashboardConfig(name='NYC Taxi Data Quality Project', panels=[DashboardPanelPlot(type='evidently.ui.dashboards.reports.DashboardPanelPlot', id=UUID('d4d3e11c-8bb4-4519-a393-9e74339b1c4a'), title='Fare amount', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, values=[PanelValue(field_path='current.value', metric_id='ColumnQuantileMetric', metric_fingerprint=None, metric_args={}, legend='count')], plot_type=<PlotType.BAR: 'bar'>)], tabs=[], tab_id_to_panel_ids={}), team_id=None, date_from=None, date_to=None, created_at=datetime.datetime(2024, 6, 23, 15, 16, 17, 352967))