In [93]:
# Report the installed scikit-learn version in pure Python. The previous
# `!pip freeze | grep scikit-learn` relied on `grep`, which does not exist in
# the Windows command shell, so the cell failed instead of printing a version.
from importlib.metadata import version

# Same "name==version" shape that `pip freeze` prints.
print(f"scikit-learn=={version('scikit-learn')}")


In [94]:
!python -V

Python 3.11.9


In [95]:
import os
import pickle

import pandas as pd

In [96]:
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# 'model.bin' must only ever come from a trusted source.
# The pickled object is unpacked into two names: `dv` (later used through
# dv.transform) and `model` (later used through model.predict) — presumably a
# scikit-learn vectorizer and regressor; confirm against the training code.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [97]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    """Load a trip-data parquet file and prepare it for prediction.

    Adds a ``duration`` column (drop-off time minus pick-up time, in
    minutes), keeps only trips lasting between 1 and 60 minutes inclusive,
    and converts the ``categorical`` columns to strings, encoding missing
    values as ``'-1'``.

    Parameters
    ----------
    filename
        Path or URL handed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy; surviving rows keep their original index labels.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta, then expressed in minutes.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends by default: 1 <= duration <= 60.
    in_range = trips['duration'].between(1, 60)
    trips = trips[in_range].copy()

    # Missing location ids become -1 before the int -> str conversion.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [104]:
# Run parameters: which monthly trip file to score.
year = 2023
month = 4
taxi_type = 'yellow'

# Input and output share one zero-padded YYYY-MM stamp. Previously the output
# name used the unpadded month (e.g. "2023-4"), which did not match the
# "2023-04" trip-data file it was scored from and does not sort by date.
period = f'{year:04d}-{month:02d}'

input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{period}.parquet'
output_file = f'output/{taxi_type}/{taxi_type}_tripdata_{period}.parquet'

# Reads the parquet file from the URL above and applies the duration filter
# and categorical string conversion defined in read_data.
df = read_data(input_file)

In [105]:
# One dict per trip, holding only the categorical (pick-up / drop-off) columns.
dicts = df[categorical].to_dict(orient='records')
# Encode with the unpickled `dv`; only transform is called here, never fit —
# presumably so the feature columns match those seen in training (confirm).
X_val = dv.transform(dicts)
# Output of the unpickled model; later cells treat these as predicted durations.
y_pred = model.predict(X_val)

In [106]:
# Summary statistics of the predictions produced by the previous cell.
import numpy as np
# np.std is called with its defaults, i.e. ddof=0 (population standard deviation).
std_dev = np.std(y_pred)
mean_duration = np.mean(y_pred)
print('std_dev is: ', std_dev)
print('mean predicted duration is: ', mean_duration)

std_dev is:  6.353996941249663
mean predicted duration is:  14.292282936862449


In [80]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.00,1.0,N,238,42,2,8.60,1.0,0.5,0.00,0.0,1.0,11.10,0.0,0.00,10.000000
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.40,1.0,N,138,231,1,52.70,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.30,1.0,N,140,186,1,18.40,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.00,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.90,1.0,N,140,43,1,15.60,3.5,0.5,4.10,0.0,1.0,24.70,2.5,0.00,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.20,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.00,3.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,12.13,0.0,0.5,4.23,0.0,1.0,20.36,,,16.483333
3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,40.92,0.0,0.5,8.98,0.0,1.0,53.90,,,39.366667
3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,24.02,0.0,0.5,0.00,0.0,1.0,28.02,,,23.133333
3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,8.51,0.0,0.5,3.50,0.0,1.0,16.01,,,8.083333


In [81]:
import uuid

In [82]:
str(uuid.uuid4())

'6b93c863-b098-4ce1-bf09-753eb6956252'

In [83]:
# One freshly generated random (version 4) UUID string per row of df.
n = len(df)
ride_ids = [str(uuid.uuid4()) for _ in range(n)]

In [84]:
ride_ids[:10]

['f374e318-4f50-4d62-b9ab-f166acb0ff15',
 '7594dff6-e4f8-4f72-aaf8-f46edcdd50e1',
 '1938d8a6-d27e-4fe8-a119-33e97a4f453a',
 '6d300895-e9ec-4096-ac0c-89f6ecbe3eeb',
 'e5ff3e45-9445-471a-9730-cbd5811cc9b0',
 '0cbbb853-2393-49dc-9d77-c9a2061a3d14',
 '9905b175-95f0-4707-b67f-f6b6e32c2a6d',
 '6f9fbd24-0b05-4d63-a1ba-8558ced2af88',
 'c29581dc-f2f7-4370-bfed-0cf6eb32eeb1',
 '099bd9f7-aadc-42d3-bb56-1e78fccb7b30']

In [85]:
df['ride_id'] = ride_ids

In [86]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,...,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0,f374e318-4f50-4d62-b9ab-f166acb0ff15
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,...,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333,7594dff6-e4f8-4f72-aaf8-f46edcdd50e1
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,...,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667,1938d8a6-d27e-4fe8-a119-33e97a4f453a
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,...,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667,6d300895-e9ec-4096-ac0c-89f6ecbe3eeb
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,...,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333,e5ff3e45-9445-471a-9730-cbd5811cc9b0


In [87]:
df_result = pd.DataFrame()

In [88]:
# Build the output table: the generated ride id plus the model's prediction.
df_result['ride_id'] = df['ride_id']
# Optional diagnostic columns, currently disabled:
# df_result['tpep_pickup_datetime'] = df['tpep_pickup_datetime']
# df_result['PULocationID'] = df['PULocationID']
# df_result['DOLocationID'] = df['DOLocationID']
# df_result['actual_duration'] = df['duration']
# NOTE(review): assuming y_pred is the NumPy array returned by model.predict,
# it is assigned by position (not by index label), so it must have the same
# length as df and must have been computed from this same df — confirm the
# cells were executed in order.
df_result['predicted_duration'] = y_pred
# df_result['diff'] = df_result['actual_duration'] - df_result['predicted_duration']

In [89]:
df_result

Unnamed: 0,ride_id,predicted_duration
0,f374e318-4f50-4d62-b9ab-f166acb0ff15,16.245906
1,7594dff6-e4f8-4f72-aaf8-f46edcdd50e1,26.134796
2,1938d8a6-d27e-4fe8-a119-33e97a4f453a,11.884264
3,6d300895-e9ec-4096-ac0c-89f6ecbe3eeb,11.997720
4,e5ff3e45-9445-471a-9730-cbd5811cc9b0,10.234486
...,...,...
3403761,efa213d5-0a96-4031-9fca-a9320d42435f,11.952635
3403762,3ea6d064-e16d-4bc0-8034-a2feeea35630,20.049958
3403763,55a6e87d-321a-41c8-9616-30f59d5f1903,11.595336
3403764,b1d0f630-4f70-4b54-8dcd-9d0bbc745aed,13.113178


In [90]:
# Create the destination folder first: to_parquet does not create missing
# parent directories, so on a fresh checkout the write would otherwise fail.
output_dir = os.path.dirname(output_file)
if output_dir:  # an output_file with no directory part needs no folder
    os.makedirs(output_dir, exist_ok=True)

# Persist the ride_id / predicted_duration table.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,  # written uncompressed
    index=False        # the row index is not stored in the file
)

In [91]:
df_result

Unnamed: 0,ride_id,predicted_duration
0,f374e318-4f50-4d62-b9ab-f166acb0ff15,16.245906
1,7594dff6-e4f8-4f72-aaf8-f46edcdd50e1,26.134796
2,1938d8a6-d27e-4fe8-a119-33e97a4f453a,11.884264
3,6d300895-e9ec-4096-ac0c-89f6ecbe3eeb,11.997720
4,e5ff3e45-9445-471a-9730-cbd5811cc9b0,10.234486
...,...,...
3403761,efa213d5-0a96-4031-9fca-a9320d42435f,11.952635
3403762,3ea6d064-e16d-4bc0-8034-a2feeea35630,20.049958
3403763,55a6e87d-321a-41c8-9616-30f59d5f1903,11.595336
3403764,b1d0f630-4f70-4b54-8dcd-9d0bbc745aed,13.113178


In [92]:
# Report how large the written prediction file is on disk.
import os
file_size = os.path.getsize(output_file)  # Size in bytes
file_size_mb = file_size / (1024 * 1024)  # Divides by 1024 * 1024, i.e. mebibytes, although the label below says "MB"

print(f"Size of the output file: {file_size_mb:.2f} MB")


Size of the output file: 132.68 MB
