In [14]:
pip install pandas==2.2.2 numpy==1.26.4 scikit-learn==1.5.0 pyarrow==15.0.2 jupyterlab


Note: you may need to restart the kernel to use updated packages.


In [15]:
import pickle
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
import pyarrow as pa


In [16]:
# Restore the (dv, model) pair stored in model.bin (downloaded from Docker image).
# NOTE(review): unpickling can execute arbitrary code — only use a model.bin from a trusted source.
with open('model.bin', mode='rb') as f_in:
    dv, model = pickle.loads(f_in.read())


In [33]:
def read_data(year, month):
    """Load one month of yellow-taxi trips and prepare it for scoring.

    Reads the parquet file for ``year``/``month`` from the trip-data URL,
    adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, casts both
    location ID columns to strings and adds a ``ride_id`` column of the
    form ``YYYY/MM_<index label>``.

    The index is not reset after filtering, so the ``ride_id`` suffix is the
    row's index label from the frame as it was read.

    Parameters
    ----------
    year : int
        Four-digit year of the data file.
    month : int
        Month number of the data file (formatted as two digits).

    Returns
    -------
    pandas.DataFrame
        The filtered trips with the added/converted columns.
    """
    url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
    trips = pd.read_parquet(url)

    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    trips = trips[trips.duration.between(1, 60)].copy()

    for location_col in ('PULocationID', 'DOLocationID'):
        trips[location_col] = trips[location_col].astype(str)

    id_prefix = f'{year:04d}/{month:02d}_'
    trips['ride_id'] = id_prefix + trips.index.astype('str')

    return trips


In [34]:
# Persist the predictions for the selected month as an uncompressed parquet file.
# NOTE(review): this cell reads `df`, `y_pred`, `year` and `month`, which are defined
# by cells further down in the notebook — on a fresh kernel those must run first.
# `df` already carries `ride_id` (read_data adds it), so the column is reused here
# instead of being rebuilt and written back into `df`.
df_result = pd.DataFrame({
    'ride_id': df['ride_id'],
    'predicted_duration': y_pred,
})

output_file = f'predicted_{year:04d}_{month:02d}.parquet'

# index=False keeps the frame's index out of the written file.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)


In [35]:
# Report the size of the file at `output_file`, converted from bytes by dividing by 1024 * 1024.
# `os` is already imported in the notebook's import cell, so it is not imported again here.
size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"✅ Q2: Output file size = {size_mb:.0f} MB")


✅ Q2: Output file size = 63 MB


In [46]:
# Period to score. The reporting cell prints the Q1 answer when month == 3,
# the Q5 answer when month == 4 and the Q6 answer when month == 5.
year, month = 2023, 5


In [47]:
# Score the selected month with the objects unpickled from model.bin:
# the pickup/dropoff IDs become one dict per trip, `dv` turns them into the
# feature matrix, and `model` predicts a value per trip.
df = read_data(year, month)
features = df[['PULocationID', 'DOLocationID']].to_dict(orient='records')
X = dv.transform(features)
y_pred = model.predict(X)

mean_pred = np.mean(y_pred)

# Print only the answer that matches the selected month; any other month prints
# nothing (previously a blank line was emitted whenever month != 3).
if month == 3:
    print(f'✅ Q1: Std deviation (March): {np.std(y_pred):.2f} minutes')
elif month == 4:
    print(f"✅ Q5: Mean predicted duration for April 2023 = {mean_pred:.2f} minutes")
elif month == 5:
    print(f"✅ Q6: Mean predicted duration for May 2023 (Docker) = {mean_pred:.2f} minutes")




✅ Q6: Mean predicted duration for May 2023 (Docker) = 14.24 minutes


In [50]:
# Ask nbconvert to export starter.ipynb in the `script` format.
# NOTE(review): the output recorded for this cell is nbconvert's usage/help text,
# not a conversion log — confirm that starter.ipynb exists in the working directory.
!jupyter nbconvert --to script starter.ipynb



This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

