# Monintoring

In [1]:
import requests
import prefect
import pandas as pd
import datetime

from joblib import dump, load
from tqdm.auto import tqdm

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [4]:
from evidently import Report
from evidently import DataDefinition, Dataset
from evidently.presets import DataDriftPreset, DataSummaryPreset
from evidently.metrics import QuantileValue
import evidently


In [20]:
url="https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
resp = requests.get(url, stream = True)

with open('./data/green_tripdata_2024-03.parquet', "wb") as f_in:
    for data in tqdm(resp.iter_content(chunk_size = 1024),
                        unit = "KB"
                        ):
        f_in.write(data)
        
        


0KB [00:00, ?KB/s]

In [8]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.dt.total_seconds() / 60
    df['date'] = pd.to_datetime(df['lpep_pickup_datetime'])

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df


In [9]:
jan_data = read_dataframe("./data/green_tripdata_2024-01.parquet")

In [10]:
mar_data = read_dataframe('./data/green_tripdata_2024-03.parquet')

In [11]:
mar_data.shape

(55139, 22)

In [12]:
mar_data.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'duration', 'date'],
      dtype='object')

In [13]:
# data labelling

target = ['duration']
num_features = ['fare_amount', 'trip_distance']
cat_features = ['PULocationID', 'DOLocationID']
time_features = ['date']

In [14]:
jan_data[num_features+cat_features+time_features]

Unnamed: 0,fare_amount,trip_distance,PULocationID,DOLocationID,date
0,12.80,1.98,236,239,2024-01-01 00:46:55
1,30.30,6.54,65,170,2024-01-01 00:31:42
2,19.80,3.08,74,262,2024-01-01 00:30:21
3,14.20,2.40,74,116,2024-01-01 00:30:20
4,22.60,5.14,74,243,2024-01-01 00:32:38
...,...,...,...,...,...
56546,11.58,0.00,33,25,2024-01-31 20:46:00
56547,11.58,0.49,72,72,2024-01-31 21:06:00
56548,11.58,0.52,72,72,2024-01-31 21:36:00
56549,14.22,1.17,41,42,2024-01-31 22:45:00


In [15]:
# train data

X_train = jan_data[num_features + cat_features]
y_train = jan_data['duration']


In [16]:
model = LinearRegression()

In [17]:
model.fit(X_train, y_train)

In [18]:
train_preds = model.predict(X_train)

In [19]:
with open ("./models/lin_reg.bin", "wb") as f_out:
    dump(model, f_out)

In [20]:
train_data = X_train.copy(deep = True)
train_data['duration'] = y_train
train_data['prediction'] = train_preds
train_data['date'] = jan_data['date']

In [21]:
# Column mapping

column_def = DataDefinition(
    timestamp= time_features,
    numerical_columns=num_features,
    categorical_columns= cat_features,
    datetime_columns= time_features
)

In [22]:
train_data_df = Dataset.from_pandas(train_data, data_definition= column_def)

In [23]:
train_data.head()

Unnamed: 0,fare_amount,trip_distance,PULocationID,DOLocationID,duration,prediction,date
0,12.8,1.98,236,239,11.5,11.382683,2024-01-01 00:46:55
1,30.3,6.54,65,170,20.866667,22.952229,2024-01-01 00:31:42
2,19.8,3.08,74,262,19.033333,16.445502,2024-01-01 00:30:21
3,14.2,2.4,74,116,11.866667,12.307879,2024-01-01 00:30:20
4,22.6,5.14,74,243,10.983333,18.193331,2024-01-01 00:32:38


In [24]:
report = Report(metrics=[
    # DataSummaryPreset(),
    # DataSummaryPreset(['fare_amount']),
    # DataDriftPreset(),
    QuantileValue(column = 'fare_amount', quantile = 0.5)]
)

In [45]:
report1 = Report(metrics=[
    DataDriftPreset()
])

In [47]:
report1.run(train_data)

ValueError: Reference dataset should be present

In [26]:
snapshot = report.run(train_data_df, timestamp = 'date')

In [27]:
type(snapshot)

evidently.core.report.Snapshot

In [28]:
print(snapshot)

<evidently.core.report.Snapshot object at 0x7fe4242e6c60>


In [59]:
report.__dict__

{'metrics': [QuantileValue(type='evidently:metric_v2:QuantileValue', tests=None, column='fare_amount', quantile=0.5)],
 'metadata': {},
 'tags': [],
 '_timestamp': None,
 'include_tests': False}

In [29]:
from evidently.core.report import Snapshot

In [30]:
snapshot = Snapshot(
    report = report,
    name = "this is name",
    timestamp= 'date',
    metadata={},
    tags= []
)

In [32]:
snapshot.run(train_data_df, None)

In [37]:
snapshot.save_json("./snapshot.json")

In [45]:
dict_1 = snapshot.dict()

In [46]:
print(dict_1)

{'metrics': [{'id': '641f3d487377ef8aeba674733f8705f4', 'metric_id': 'QuantileValue(column=fare_amount,quantile=0.5)', 'value': np.float64(13.5)}], 'tests': []}
