In [67]:
import json
import uuid
from datetime import datetime
from time import sleep

import pyarrow.parquet as pq
import requests

table = pq.read_table("green_tripdata_2022-01.parquet")

data = table.to_pylist()


class DateTimeEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, datetime):
            return o.isoformat()
        return json.JSONEncoder.default(self, o)

In [68]:
from pydantic import BaseModel
from typing import Union

In [79]:
df = pd.read_parquet("green_tripdata_2021-01.parquet")
df.columns = df.columns.str.upper()
df.to_parquet("green_tripdata_2021-01.parquet")

In [69]:
class Ride(BaseModel):
    VENDORID: str
    LPEP_PICKUP_DATETIME: datetime
    LPEP_DROPOFF_DATETIME: datetime
    STORE_AND_FWD_FLAG: str
    RATECODEID: int
    PULOCATIONID: str
    DOLOCATIONID: str
    PASSENGER_COUNT: int
    TRIP_DISTANCE: float
    FARE_AMOUNT: float
    EXTRA: float
    MTA_TAX: float
    TIP_AMOUNT: float
    TOLLS_AMOUNT: float
    EHAIL_FEE: Union[float, None]
    IMPROVEMENT_SURCHARGE: float
    TOTAL_AMOUNT: float
    PAYMENT_TYPE: int
    TRIP_TYPE: int
    CONGESTION_SURCHARGE: float
    ID: str
    

In [70]:
from pymongo import MongoClient

In [71]:
mongo_client = MongoClient("mongodb://127.0.0.1:27017")
db = mongo_client.get_database("prediction_service")
collection = db.get_collection("data")

In [74]:
list(collection.find())[-1]

{'_id': ObjectId('62c7249f8c6ba847ae6f943a'),
 'PULOCATIONID': 7,
 'DOLOCATIONID': 129,
 'TRIP_DISTANCE': 3.26,
 'PU_DO': '7_129',
 'prediction': 13.566366978374369}

In [18]:
collection.insert_one({"prediction": 1.023})

<pymongo.results.InsertOneResult at 0x11cf85fd0>

In [19]:
db.list_collections().address

('127.0.0.1', 27017)

In [25]:
import pickle
with open("lin_reg.bin", "rb") as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [63]:
def predict(record):
    record["PU_DO"] = str(record["PULOCATIONID"]) + "_" + str(record["DOLOCATIONID"])
    X = dv.transform(record)
    y_pred = model.predict(X)
    
    
    return {
        "duration": y_pred
    }

In [64]:
for i in data[0:25]:
  row = {key.upper() if type(key) == str else key: value for key, value in i.items()}
  row["ID"] = str(uuid.uuid4())
  req = Ride(**row).json()
  record = json.loads(req)
  print(predict(record))

{'duration': array([5.11869385])}
{'duration': array([10.20102853])}
{'duration': array([20.63146117])}
{'duration': array([6.64319515])}
{'duration': array([19.88736889])}
{'duration': array([7.3526372])}
{'duration': array([26.43757842])}
{'duration': array([5.14106809])}
{'duration': array([8.23193295])}
{'duration': array([26.43757842])}
{'duration': array([16.53869167])}
{'duration': array([10.52649735])}
{'duration': array([6.06953197])}
{'duration': array([19.36880346])}
{'duration': array([20.72490798])}
{'duration': array([8.64865525])}
{'duration': array([26.41821026])}
{'duration': array([16.77341692])}
{'duration': array([12.82696392])}
{'duration': array([29.47930536])}
{'duration': array([9.00501793])}
{'duration': array([9.66328401])}
{'duration': array([16.68926104])}
{'duration': array([19.75016234])}
{'duration': array([26.44129867])}


{'VENDORID': '2',
 'LPEP_PICKUP_DATETIME': '2022-01-01T00:14:21',
 'LPEP_DROPOFF_DATETIME': '2022-01-01T00:15:33',
 'STORE_AND_FWD_FLAG': 'N',
 'RATECODEID': 1,
 'PULOCATIONID': '42',
 'DOLOCATIONID': '42',
 'PASSENGER_COUNT': 1,
 'TRIP_DISTANCE': 0.44,
 'FARE_AMOUNT': 3.5,
 'EXTRA': 0.5,
 'MTA_TAX': 0.5,
 'TIP_AMOUNT': 0.0,
 'TOLLS_AMOUNT': 0.0,
 'EHAIL_FEE': None,
 'IMPROVEMENT_SURCHARGE': 0.3,
 'TOTAL_AMOUNT': 4.8,
 'PAYMENT_TYPE': 2,
 'TRIP_TYPE': 1,
 'CONGESTION_SURCHARGE': 0.0,
 'ID': '81ca1a63-61ff-44e7-8647-2cf6cda4b602'}

In [56]:
predict(record)

{'duration': array([26.43757842])}

In [22]:
requests.post("http://127.0.0.1:9696/predict",
                             headers={"Content-Type": "application/json"},
                             data=json.dumps(row, cls=DateTimeEncoder))

<Response [200]>

In [35]:

requests.post("http://127.0.0.1:9696/predict",
                             headers={"Content-Type": "application/json"},
                             data=json.dumps(row, cls=DateTimeEncoder)).json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [4]:
data[0].upper()

AttributeError: 'dict' object has no attribute 'upper'

In [24]:



with open("target.csv", 'w') as f_target:
    for row in data:
        row['ID'] = str(uuid.uuid4())
        row = {key.upper() if type(key) == str else key: value for key, value in row.items()}
        duration = (row['LPEP_DROPOFF_DATETIME'] - row['LPEP_PICKUP_DATETIME']).total_seconds() / 60
        f_target.write(f"{row['ID']},{duration}\n")
        resp = requests.post("http://127.0.0.1:9696/predict",
                             headers={"Content-Type": "application/json"},
                             data=json.dumps(row, cls=DateTimeEncoder)).json()
        print(f"prediction: {resp}")
        sleep(1)

prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}
prediction: {'ride_duration': 21.578590757985687}


KeyboardInterrupt: 

In [66]:
 import pandas as pd
 df = pd.read_parquet(f'https://nyc-tlc.s3.amazonaws.com/trip+data/yellow_tripdata_2021-01.parquet')

HTTPError: HTTP Error 403: Forbidden

In [80]:
import json
import os
import pickle

import pandas
from prefect import flow, task
from pymongo import MongoClient
import pyarrow.parquet as pq

from evidently import ColumnMapping

from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab,RegressionPerformanceTab

from evidently.model_profile import Profile
from evidently.model_profile.sections import DataDriftProfileSection, RegressionPerformanceProfileSection


In [103]:
filename = "target.csv"

In [104]:
client = MongoClient("mongodb://localhost:27017/")
collection = client.get_database("prediction_service").get_collection("data")
with open(filename) as f_target:
    for line in f_target.readlines():
        row = line.split(",")
        collection.update_one({"_id": row[0]}, {"$set": {"target": float(row[1])}})

In [83]:
filename = "./evidently_service/datasets/green_tripdata_2021_01.parquet"
MODEL_FILE = os.getenv('MODEL_FILE', './prediction_service/lin_reg.bin')
with open(MODEL_FILE, 'rb') as f_in:
    dv, model = pickle.load(f_in)

reference_data = pq.read_table(filename).to_pandas()
# Create features
reference_data['PU_DO'] = reference_data['PULOCATIONID'].astype(str) + "_" + reference_data['DOLOCATIONID'].astype(str)
# add target column
reference_data['target'] = reference_data["LPEP_DROPOFF_DATETIME"] - reference_data["LPEP_PICKUP_DATETIME"]
reference_data.target = reference_data.target.apply(lambda td: td.total_seconds() / 60)
features = ['PU_DO', 'PULOCATIONID', 'DOLOCATIONID', 'TRIP_DISTANCE']
x_pred = dv.transform(reference_data[features].to_dict(orient='records'))
reference_data['prediction'] = model.predict(x_pred)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [84]:
reference_data

Unnamed: 0,VENDORID,LPEP_PICKUP_DATETIME,LPEP_DROPOFF_DATETIME,STORE_AND_FWD_FLAG,RATECODEID,PULOCATIONID,DOLOCATIONID,PASSENGER_COUNT,TRIP_DISTANCE,FARE_AMOUNT,...,TOLLS_AMOUNT,EHAIL_FEE,IMPROVEMENT_SURCHARGE,TOTAL_AMOUNT,PAYMENT_TYPE,TRIP_TYPE,CONGESTION_SURCHARGE,PU_DO,target,prediction
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.50,...,0.00,,0.3,6.80,2.0,1.0,0.00,43_151,3.933333,6.120171
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.00,...,0.00,,0.3,16.86,1.0,1.0,2.75,166_239,8.750000,10.265737
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.00,...,0.00,,0.3,8.30,1.0,1.0,0.00,41_42,5.966667,6.757073
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.00,...,0.00,,0.3,9.30,2.0,1.0,0.00,168_75,7.083333,14.444309
4,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2.0,265,265,3.0,0.00,-52.00,...,0.00,,-0.3,-52.80,3.0,1.0,0.00,265_265,0.066667,15.236120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,2,2021-01-31 21:38:00,2021-01-31 22:16:00,,,81,90,,17.63,56.23,...,6.12,,0.3,65.40,,,,81_90,38.000000,41.961900
76514,2,2021-01-31 22:43:00,2021-01-31 23:21:00,,,35,213,,18.36,46.66,...,6.12,,0.3,65.28,,,,35_213,38.000000,33.436942
76515,2,2021-01-31 22:16:00,2021-01-31 22:27:00,,,74,69,,2.50,18.95,...,0.00,,0.3,22.00,,,,74_69,11.000000,16.366728
76516,2,2021-01-31 23:10:00,2021-01-31 23:37:00,,,168,215,,14.48,48.87,...,6.12,,0.3,58.04,,,,168_215,27.000000,33.323312


In [85]:
client = MongoClient("mongodb://localhost:27017/")
data = client.get_database("prediction_service").get_collection("data").find()
df = pandas.DataFrame(list(data))

In [90]:
reference_data = reference_data.drop('EHAIL_FEE', axis=1)

In [94]:
profile = Profile(sections=[DataDriftProfileSection(), RegressionPerformanceProfileSection()])
mapping = ColumnMapping(prediction="prediction", numerical_features=['TRIP_DISTANCE'],
                        categorical_features=['PULOCATIONID', 'DOLOCATIONID'],
                        datetime_features=[])

In [98]:
df

Unnamed: 0,_id,prediction,PULOCATIONID,DOLOCATIONID,TRIP_DISTANCE,ROUTE,PU_DO
0,62c51f03146bd6672df7d948,1.023000,,,,,
1,62c5225f146bd6672df7d94a,1.023000,,,,,
2,62c67dd74a12ef4de4860a2d,1.023000,,,,,
3,62c67de328a31bc17624a008,21.578591,42.0,42.0,0.44,42_42,
4,62c67ebf28a31bc17624a009,21.578591,42.0,42.0,0.44,42_42,
...,...,...,...,...,...,...,...
8813,62c74454b0784dc7fc4b61a2,15.191871,244.0,167.0,3.47,,244_167
8814,62c74455b0784dc7fc4b61a3,4.717365,159.0,159.0,2.45,,159_159
8815,62c74456b0784dc7fc4b61a4,26.437578,66.0,238.0,8.98,,66_238
8816,62c74457b0784dc7fc4b61a5,8.996568,41.0,75.0,0.62,,41_75


In [97]:
reference_data

Unnamed: 0,VENDORID,LPEP_PICKUP_DATETIME,LPEP_DROPOFF_DATETIME,STORE_AND_FWD_FLAG,RATECODEID,PULOCATIONID,DOLOCATIONID,PASSENGER_COUNT,TRIP_DISTANCE,FARE_AMOUNT,...,TIP_AMOUNT,TOLLS_AMOUNT,IMPROVEMENT_SURCHARGE,TOTAL_AMOUNT,PAYMENT_TYPE,TRIP_TYPE,CONGESTION_SURCHARGE,PU_DO,target,prediction
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.50,...,0.00,0.00,0.3,6.80,2.0,1.0,0.00,43_151,3.933333,6.120171
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.00,...,2.81,0.00,0.3,16.86,1.0,1.0,2.75,166_239,8.750000,10.265737
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.00,...,1.00,0.00,0.3,8.30,1.0,1.0,0.00,41_42,5.966667,6.757073
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.00,...,0.00,0.00,0.3,9.30,2.0,1.0,0.00,168_75,7.083333,14.444309
4,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2.0,265,265,3.0,0.00,-52.00,...,0.00,0.00,-0.3,-52.80,3.0,1.0,0.00,265_265,0.066667,15.236120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,2,2021-01-31 21:38:00,2021-01-31 22:16:00,,,81,90,,17.63,56.23,...,0.00,6.12,0.3,65.40,,,,81_90,38.000000,41.961900
76514,2,2021-01-31 22:43:00,2021-01-31 23:21:00,,,35,213,,18.36,46.66,...,12.20,6.12,0.3,65.28,,,,35_213,38.000000,33.436942
76515,2,2021-01-31 22:16:00,2021-01-31 22:27:00,,,74,69,,2.50,18.95,...,0.00,0.00,0.3,22.00,,,,74_69,11.000000,16.366728
76516,2,2021-01-31 23:10:00,2021-01-31 23:37:00,,,168,215,,14.48,48.87,...,0.00,6.12,0.3,58.04,,,,168_215,27.000000,33.323312


In [95]:
profile.calculate(reference_data, df, mapping)

KeyError: 'target'

In [None]:


dashboard = Dashboard(tabs=[DataDriftTab(), RegressionPerformanceTab(verbose_level=0)])
dashboard.calculate(reference_data, df, mapping)

In [106]:
list(collection.find())

[{'_id': ObjectId('62c51f03146bd6672df7d948'), 'prediction': 1.023},
 {'_id': ObjectId('62c5225f146bd6672df7d94a'), 'prediction': 1.023},
 {'_id': ObjectId('62c67dd74a12ef4de4860a2d'), 'prediction': 1.023},
 {'_id': ObjectId('62c67de328a31bc17624a008'),
  'PULOCATIONID': 42,
  'DOLOCATIONID': 42,
  'TRIP_DISTANCE': 0.44,
  'ROUTE': '42_42',
  'prediction': 21.578590757985687},
 {'_id': ObjectId('62c67ebf28a31bc17624a009'),
  'PULOCATIONID': 42,
  'DOLOCATIONID': 42,
  'TRIP_DISTANCE': 0.44,
  'ROUTE': '42_42',
  'prediction': 21.578590757985687},
 {'_id': ObjectId('62c67ec028a31bc17624a00a'),
  'PULOCATIONID': 116,
  'DOLOCATIONID': 41,
  'TRIP_DISTANCE': 2.1,
  'ROUTE': '116_41',
  'prediction': 21.578590757985687},
 {'_id': ObjectId('62c67ec128a31bc17624a00b'),
  'PULOCATIONID': 41,
  'DOLOCATIONID': 140,
  'TRIP_DISTANCE': 3.7,
  'ROUTE': '41_140',
  'prediction': 21.578590757985687},
 {'_id': ObjectId('62c67ec228a31bc17624a00c'),
  'PULOCATIONID': 181,
  'DOLOCATIONID': 181,
  'TRI