In this homework, we'll deploy the ride duration model in batch mode. As in homeworks 1 and 3, we'll use the FHV data.

In [None]:
# show which scikit-learn version is installed in this environment
!pip freeze | grep scikit-learn

In [None]:
# import dependencies
import pandas as pd
import numpy as np
import pickle


In [None]:
# Run parameters, kept in one place so the period is easy to change.
year = 2021  # only the 2021 data is of interest
month = 2  # February

# Where the trips are read from and where the predictions are written to;
# both file names are derived from the year and month above.
# NOTE(review): confirm the source bucket is still publicly readable.
input_file = (
    'https://nyc-tlc.s3.amazonaws.com/trip+data/'
    f'fhv_tripdata_{year:04d}-{month:02d}.parquet'
)
output_file = f'output/fhv_tripdata_{year:04d}-{month:02d}.parquet'

In [None]:
# model.bin holds a pickled pair: the dict vectorizer (dv) and the model (lr).
# NOTE: unpickling can execute arbitrary code, so only load a model.bin that
# comes from a trusted source.
with open('model.bin', 'rb') as model_file:
    dv, lr = pickle.load(model_file)

In [None]:
# The only features used for prediction: the pick-up and drop-off location IDs.
categorical = [
    'PUlocationID',
    'DOlocationID',
]
def read_data(filename):
    """Load a parquet file of trips and prepare it for prediction.

    Adds a ``duration`` column (drop-off minus pickup, in minutes), keeps
    only the rides lasting between 1 and 60 minutes inclusive, and converts
    the columns listed in ``categorical`` to strings, with missing IDs
    replaced by ``-1``.

    Args:
        filename: Path or URL of the parquet file to read.

    Returns:
        The filtered DataFrame, with ``duration`` added and the categorical
        columns stored as strings.
    """
    trips = pd.read_parquet(filename)

    # Ride length in minutes, from pickup to drop-off.
    elapsed = trips['dropOff_datetime'] - trips['pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep rides of 1 to 60 minutes (both ends inclusive); copy so the
    # column assignment below works on an independent frame.
    trips = trips[trips['duration'].between(1, 60)].copy()

    # Missing location IDs become -1; going through int first drops the
    # float formatting before the IDs are turned into strings.
    trips[categorical] = trips[categorical].fillna(-1).astype('int').astype('str')

    return trips

In [None]:
# load and prepare the trips from the input file
df = read_data(input_file)
# build a ride_id for every row: "<year>/<month>_" followed by the row's index label
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [None]:
# convert the categorical features into a list of dicts, one dict per ride
dicts = df[categorical].to_dict(orient='records')
# transform the dicts into a feature matrix with the loaded dict vectorizer
X_val = dv.transform(dicts)
# predict using the model loaded from the pickle above
y_pred = lr.predict(X_val)

In [None]:
# average of all predictions
np.mean(y_pred)


In [None]:
# Collect the predictions in a single frame: ride_id is the distinct
# identifier of each ride and predicted_duration is the model's prediction.
df_result = pd.DataFrame({
    'ride_id': df['ride_id'],
    'predicted_duration': y_pred,
})

In [None]:
# save the dataframe of results as a parquet file
from pathlib import Path

# to_parquet does not create missing folders, so make sure the target
# directory (the "output" part of output_file) exists before writing.
# NOTE: this assumes output_file is a local path, as configured above.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)