In [17]:
!pip freeze | grep scikit-learn

scikit-learn==1.6.1


In [18]:
!python -V

Python 3.9.12


In [35]:
import pickle
import pandas as pd
import os

In [20]:
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [21]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [22]:
year = 2023
month = 3
df = read_data(f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet')
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [23]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.00,1.0,N,238,42,2,8.60,1.0,0.5,0.00,0.0,1.0,11.10,0.0,0.00,10.000000
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.40,1.0,N,138,231,1,52.70,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.30,1.0,N,140,186,1,18.40,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.00,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.90,1.0,N,140,43,1,15.60,3.5,0.5,4.10,0.0,1.0,24.70,2.5,0.00,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.20,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.00,3.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,12.13,0.0,0.5,4.23,0.0,1.0,20.36,,,16.483333
3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,40.92,0.0,0.5,8.98,0.0,1.0,53.90,,,39.366667
3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,24.02,0.0,0.5,0.00,0.0,1.0,28.02,,,23.133333
3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,8.51,0.0,0.5,3.50,0.0,1.0,16.01,,,8.083333


In [24]:
print(f"Standard deviation of predictions: {y_pred.std():.2f}")

Standard deviation of predictions: 6.25


In [33]:
def save_results(output_file, df_result):
    df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
    )
    print(f"Results saved to {output_file}")

In [37]:
df_results = pd.DataFrame()
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
df_results['prediction_duration'] = y_pred

output_file = f'yellow_tripdata_{year:04d}-{month:02d}_predictions.parquet'
save_results(output_file, df_results)

Results saved to yellow_tripdata_2023-03_predictions.parquet


In [38]:

file_size = os.path.getsize(output_file) / (1024 * 1024)  # size in MB
print(f"Size of the parquet file: {file_size:.2f} MB")

Size of the parquet file: 5.97 MB
