# Monitoring

In [1]:
import requests
import prefect
import pandas as pd
import datetime

from joblib import dump, load
from tqdm.auto import tqdm

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [75]:
from evidently import Report
from evidently import DataDefinition, Dataset
from evidently.presets import DataDriftPreset, DataSummaryPreset
from evidently.metrics import QuantileValue, DriftedColumnsCount, DatasetMissingValueCount, ValueDrift
import evidently
from evidently.core.report import Snapshot


In [20]:
# Download the March 2024 green-taxi trip file into ./data.
# NOTE(review): the January file read further down is not downloaded by any
# cell visible here - confirm it is already present in ./data.
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"

# The context manager releases the streamed connection once the body has been
# consumed; the timeout keeps the cell from hanging on a stalled server.
with requests.get(url, stream=True, timeout=60) as resp:
    # Fail fast on 4xx/5xx so an error page is never written out as a .parquet file.
    resp.raise_for_status()

    with open('./data/green_tripdata_2024-03.parquet', "wb") as f_in:
        # Write the body in 1024-byte chunks, with tqdm counting chunks as "KB".
        for data in tqdm(resp.iter_content(chunk_size=1024), unit="KB"):
            f_in.write(data)
        
        


0KB [00:00, ?KB/s]

In [4]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes) and a
    ``date`` column (the pick-up timestamp), keeps only trips lasting between
    1 and 60 minutes inclusive, and casts the pick-up / drop-off location IDs
    to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips, keeping the original row index.
    """
    df = pd.read_parquet(filename)

    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60
    df['date'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Take an explicit copy of the filtered rows: assigning columns on a
    # boolean-mask slice is what triggers SettingWithCopyWarning and may not
    # write to the object that is returned.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


In [5]:
# January 2024 trips; used further down as the training set.
jan_data = read_dataframe("./data/green_tripdata_2024-01.parquet")

In [6]:
# March 2024 trips; used further down as the validation set.
mar_data = read_dataframe('./data/green_tripdata_2024-03.parquet')

In [7]:
mar_data.shape

(55139, 22)

In [8]:
mar_data.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'duration', 'date'],
      dtype='object')

In [40]:
# Column roles: the prediction target, the model inputs split by type, and the
# timestamp column used by the monitoring data definition.

target = ["duration"]
num_features = ["fare_amount", "trip_distance"]
cat_features = ["PULocationID", "DOLocationID"]
time_features = ["date"]

In [10]:
jan_data[num_features+cat_features+time_features]

Unnamed: 0,fare_amount,trip_distance,PULocationID,DOLocationID,date
0,12.80,1.98,236,239,2024-01-01 00:46:55
1,30.30,6.54,65,170,2024-01-01 00:31:42
2,19.80,3.08,74,262,2024-01-01 00:30:21
3,14.20,2.40,74,116,2024-01-01 00:30:20
4,22.60,5.14,74,243,2024-01-01 00:32:38
...,...,...,...,...,...
56546,11.58,0.00,33,25,2024-01-31 20:46:00
56547,11.58,0.49,72,72,2024-01-31 21:06:00
56548,11.58,0.52,72,72,2024-01-31 21:36:00
56549,14.22,1.17,41,42,2024-01-31 22:45:00


In [11]:
# January supplies the training features/target, March the validation ones.
# Both use the same numeric + categorical feature columns.

X_train, y_train = jan_data[num_features + cat_features], jan_data['duration']
X_val, y_val = mar_data[num_features + cat_features], mar_data['duration']


In [12]:
# Baseline regressor with default settings.
model = LinearRegression()

In [13]:
# Fit on the January features/target.
# NOTE(review): PULocationID / DOLocationID were cast to str in read_dataframe
# and are passed here without any encoding; presumably the estimator treats
# them as plain numbers rather than categories - confirm this is intended.
model.fit(X_train, y_train)

In [14]:
# Score both splits with the fitted model (training first, then validation).
train_preds, val_preds = (model.predict(split) for split in (X_train, X_val))

In [15]:
# Serialize the fitted model to ./models/lin_reg.bin with joblib's dump.
with open("./models/lin_reg.bin", "wb") as model_file:
    dump(model, model_file)

In [16]:
# Assemble the frames handed to the monitoring step: the model features plus
# the true duration, the model's prediction and the pick-up timestamp.
train_data = X_train.assign(
    duration=y_train,
    prediction=train_preds,
    date=jan_data['date'],
)

val_data = X_val.assign(
    duration=y_val,
    prediction=val_preds,
    date=mar_data['date'],
)

In [41]:
# Data definition: declares the timestamp, numerical, categorical and datetime columns

# Declare which columns are the timestamp, numerical, categorical and datetime
# ones for the datasets built from the training and validation frames.
column_def = DataDefinition(
    timestamp='date',
    numerical_columns=num_features,
    categorical_columns=cat_features,
    datetime_columns=time_features,
)

In [42]:
# Wrap both pandas frames as Evidently datasets sharing one data definition.
# Despite the `_df` suffix these are Dataset objects, not DataFrames.
train_data_df, val_data_df = (
    Dataset.from_pandas(frame, data_definition=column_def)
    for frame in (train_data, val_data)
)

In [62]:
type(train_data_df)

evidently.core.datasets.PandasDataset

In [53]:
train_data.describe()

Unnamed: 0,fare_amount,trip_distance,duration,prediction,date
count,54373.0,54373.0,54373.0,54373.0,54373
mean,16.427333,28.97473,13.778175,13.778175,2024-01-16 21:34:14.512681
min,-70.0,0.0,1.0,-42.350858,2023-12-31 14:38:47
25%,9.3,1.15,7.583333,9.212552,2024-01-09 11:26:36
50%,13.5,1.82,11.566667,11.827249,2024-01-17 07:38:46
75%,19.59,3.1,17.3,15.829876,2024-01-24 15:28:12
max,400.0,201421.68,60.0,262.131826,2024-01-31 23:57:29
std,11.430829,1355.349789,9.021503,7.418638,


In [54]:
val_data.describe()

Unnamed: 0,fare_amount,trip_distance,duration,prediction,date
count,55139.0,55139.0,55139.0,55139.0,55139
mean,16.699886,12.550098,13.965343,13.94953,2024-03-16 09:07:54.800485
min,-167.94,0.0,1.0,-105.706015,2024-02-23 23:14:54
25%,9.3,1.15,7.666667,9.304862,2024-03-08 13:58:52.500000
50%,13.5,1.82,11.733333,11.907494,2024-03-15 23:12:53
75%,19.8,3.11,17.6,15.987997,2024-03-23 20:59:42.500000
max,245.0,125112.2,60.0,162.063928,2024-04-01 00:01:45
std,11.755443,749.960019,9.133786,7.626887,


In [93]:
# Report definition: dataset-level missing-value and drifted-column counts,
# plus the median (0.5 quantile) of each numeric feature.
report = Report(
    metrics=[
        DatasetMissingValueCount(),
        DriftedColumnsCount(),
        *(
            QuantileValue(column=feature, quantile=0.5)
            for feature in ('fare_amount', 'trip_distance')
        ),
    ],
    include_tests=True,
)

In [94]:
# Create the snapshot that will hold the results computed for `report`.
snapshot = Snapshot(
    report=report,
    name="this is name",
    # `timestamp` identifies when the snapshot was taken, so it must be a
    # datetime - not the name of the 'date' column, which is already declared
    # in the data definition.
    timestamp=datetime.datetime.now(),
    metadata={},
    tags=[],
)

In [95]:
# Compute the report metrics, passing the March (validation) dataset first and
# the January (training) dataset second.
# NOTE(review): presumably the arguments are (current, reference) - confirm
# against Evidently's Snapshot.run signature.
snapshot.run(val_data_df,train_data_df)

In [96]:
# Export the computed results as a plain dict; its 'metrics' list is what
# print_snapshot_metrics reads.
dict_1 = snapshot.dict()

In [84]:
def print_snapshot_metrics(snapshot_dict):
    """Print a numbered, human-readable listing of a snapshot's metrics.

    Parameters
    ----------
    snapshot_dict : dict
        Mapping with an optional ``"metrics"`` list. Each entry is a dict that
        may carry ``"metric_id"`` (defaults to ``"Unknown"``) and ``"value"``
        (defaults to ``"N/A"``).

    Returns
    -------
    None
        Output goes to stdout. Dict-valued metrics are expanded into one
        indented ``- key: value`` line per item; any other value is printed
        on the same line as the metric id. If there are no metrics,
        ``"No metrics found."`` is printed instead.
    """
    entries = snapshot_dict.get("metrics", [])
    if not entries:
        print("No metrics found.")
        return

    position = 0
    for entry in entries:
        position += 1
        label = entry.get("metric_id", "Unknown")
        payload = entry.get("value", "N/A")

        # Scalar results fit on a single line.
        if not isinstance(payload, dict):
            print(f"{position}. {label}: {payload}")
            continue

        # Nested results: header line followed by one bullet per field.
        print(f"{position}. {label}:")
        for field, number in payload.items():
            print(f"   - {field}: {number}")


In [97]:
print_snapshot_metrics(dict_1)

1. DatasetMissingValueCount():
   - count: 0.0
   - share: 0.0
2. DriftedColumnsCount(drift_share=0.5):
   - count: 0.0
   - share: 0.0
3. QuantileValue(column=fare_amount,quantile=0.5): 13.5
4. QuantileValue(column=trip_distance,quantile=0.5): 1.82


In [106]:
snapshot.dict()['metrics'][3]

{'id': '9e6652d83cddd2d55921cbface872360',
 'metric_id': 'QuantileValue(column=trip_distance,quantile=0.5)',
 'value': np.float64(1.82)}

In [86]:
dict_1

{'metrics': [{'id': '89b8d715e93a15e3acbc483410aff2f4',
   'metric_id': 'DatasetMissingValueCount()',
   'value': {'count': 0.0, 'share': 0.0}},
  {'id': '15e89f895b482f9b84ba7274ed18a106',
   'metric_id': 'DriftedColumnsCount(drift_share=0.5)',
   'value': {'count': 0.0, 'share': 0.0}},
  {'id': 'f6e3fad8f5b0bc4766dea6099806687e',
   'metric_id': 'ValueDrift(column=fare_amount)',
   'value': np.float64(0.025693556088609862)},
  {'id': '32bb865e004396989318986fca20adc1',
   'metric_id': 'ValueDrift(column=trip_distance)',
   'value': np.float64(0.012125903301272011)},
  {'id': '55ac0d3ab973e295b757b905344717f8',
   'metric_id': 'ValueDrift(column=PULocationID)',
   'value': np.float64(0.05338930538622606)},
  {'id': '4982f3e54d389610ea57eac0b6f86a8d',
   'metric_id': 'ValueDrift(column=DOLocationID)',
   'value': np.float64(0.046590741780082635)},
  {'id': '641f3d487377ef8aeba674733f8705f4',
   'metric_id': 'QuantileValue(column=fare_amount,quantile=0.5)',
   'value': np.float64(13.5)}