In [14]:
# import dependencies
import json
import os
import pickle
import pandas as pd
from datetime import datetime

import pandas
import pyarrow.parquet as pq
from evidently import ColumnMapping
from evidently.metrics import DataDriftTable, RegressionPerformanceMetrics
from evidently.metrics import RegressionErrorPlot, RegressionErrorDistribution
from evidently.metrics import DataDriftTable
from evidently.metrics import DatasetDriftMetric
from evidently.metric_preset import TargetDriftPreset
from evidently.report import Report


from prefect import flow, task

from pymongo import MongoClient

In [2]:
MONGO_CLIENT_ADDRESS = "mongodb://localhost:27017/"
MONGO_DATABASE = "prediction_service"
PREDICTION_COLLECTION = "data"
REPORT_COLLECTION = "report"
REFERENCE_DATA_FILE = "../data/green_tripdata_2021-03.parquet"
TARGET_DATA_FILE = "target.csv"
MODEL_FILE = os.getenv('MODEL_FILE', '../prediction_service/lin_reg.bin') 

In [3]:
# create a task for prefect, a function to upload the target variable
#@task
def upload_target(filename):
    # declare the mongo client by accessing the mongo client address
    client = MongoClient(MONGO_CLIENT_ADDRESS)
    # get  the dataase and assigining the collection a collection table name
    collection = client.get_database(MONGO_DATABASE).get_collection(PREDICTION_COLLECTION)
    # open the uploaded file
    with open(filename) as f_target:
        # read each line, for each line
        for line in f_target.readlines():
            # split the row since it is a csv file
            row = line.split(",")
            # access the id and the target, place them in the collection table called data
            collection.update_one({"id": row[0]},
                                  {"$set": {"target": float(row[1])}}
                                 )

In [4]:
upload_target(TARGET_DATA_FILE)

In [5]:
# task to load reference data
#@task
def load_reference_data(filename):
    # load the trained and tested model, with its pickle file
    with open(MODEL_FILE, 'rb') as f_in:
        dv, model = pickle.load(f_in)
    # read the ref data from the filename, coinvert it to pandas and sample 5000 rows
    reference_data = pq.read_table(filename).to_pandas().sample(n=5000,random_state=42) #Monitoring for 1st 5000 records
    # Create features
    reference_data['PU_DO'] = reference_data['PULocationID'].astype(str) + "_" + reference_data['DOLocationID'].astype(str)

    # add target column which is the duration
    #reference_data['lpep_dropoff_datetime'] = reference_data['lpep_dropoff_datetime'].astype(datetime)
    #reference_data['lpep_pickup_datetime'] = reference_data['lpep_pickup_datetime'].astype(datetime)
    reference_data['target'] = reference_data.lpep_dropoff_datetime - reference_data.lpep_pickup_datetime
    # transform the target feature into minutes
    reference_data.target = reference_data.target.apply(lambda td: td.total_seconds() / 60)
    # make sure the minutes are greater than 1 and less than 60
    reference_data = reference_data[(reference_data.target >= 1) & (reference_data.target <= 60)]
    # create a list of teh features that will be used or are of interest
    features = ['PU_DO', 'PULocationID', 'DOLocationID', 'trip_distance']
    # transform the features usingg the dict vectorizer
    x_pred = dv.transform(reference_data[features].to_dict(orient='records'))
    # place the rpediction in a columnn of the reference data
    reference_data['prediction'] = model.predict(x_pred)
    return reference_data


In [6]:
ref_data = load_reference_data(REFERENCE_DATA_FILE)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [7]:
def fetch_data():
    '''create a dataframe of the data that i stored in mongo db'''
    client = MongoClient(MONGO_CLIENT_ADDRESS)
    data = client.get_database(MONGO_DATABASE).get_collection(PREDICTION_COLLECTION).find()
    df = pandas.DataFrame(list(data))
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    # df['lpep_dropoff_datetime'] = df['lpep_dropoff_datetime'].astype(datetime)
    # df['lpep_pickup_datetime'] = df['lpep_pickup_datetime'].astype(datetime)
    df.drop(['_id'], axis = 1, inplace=True)
    return df

In [8]:
data = fetch_data()

In [9]:
ref_data.head(1)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,PU_DO,target,prediction
55780,2,2021-03-10 07:00:00,2021-03-10 07:10:00,,,51,185,,4.08,18.96,...,0.0,,0.3,22.01,,,,51_185,10.0,11.241891


In [13]:
data.head(2)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,id,PU_DO,prediction,target
0,2,1970-01-01 00:27:00.060840,1970-01-01 00:27:00.062940,,,35,256,,5.97,20.93,...,,0.3,23.98,,,,34d604de-2ba9-4941-8b3a-aec26af3e3f5,35_256,27.751079,35.0
1,2,1970-01-01 00:27:00.822900,1970-01-01 00:27:00.823860,,,16,252,,5.2,27.06,...,,0.3,30.11,,,,578fc3c6-9d56-4d89-985b-53a5ae5d3f19,16_252,27.751022,16.0


In [11]:
ref_data.shape

(4774, 23)

In [12]:
data.shape

(5000, 24)

In [16]:
ref_data.drop(['ehail_fee'], axis=1, inplace=True)
data.drop('ehail_fee', axis=1, inplace=True)

In [17]:
#dataset-level metrics
data_drift_dataset_report = Report(metrics=[
    DatasetDriftMetric(),
    DataDriftTable(num_stattest='kl_div', cat_stattest='psi'),    
])

data_drift_dataset_report.run(reference_data=ref_data, current_data=data)
data_drift_dataset_report

In [18]:
type(data_drift_dataset_report)

evidently.report.report.Report

In [19]:
data_drift_dataset_report.show()

In [21]:
data_drift_dataset_report.save_html(filename='data_drift.html')