In [6]:
import pickle
from pathlib import Path

import pandas as pd
import sklearn

In [7]:
# NOTE(review): `tri` is not referenced by any cell shown in this notebook —
# this looks like an accidental editor auto-import; confirm and remove.
from numpy import tri


# Run parameters: which taxi data set and which month to score.
trip_type = 'yellow'
year = 2023
month = 3

# The remote source file and the local result file share the same file name,
# e.g. "yellow_tripdata_2023-03.parquet".
file_name = f'{trip_type}_tripdata_{year:04d}-{month:02d}.parquet'

input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{file_name}'
output_file = f'output/{file_name}'

In [8]:
# Load the pickled (dv, lr) pair used for scoring. Judging by how they are
# used further down, `dv` is the feature vectorizer (`.transform`) and `lr`
# the fitted model (`.predict`).
# NOTE: unpickling can execute arbitrary code, so only load a model.bin that
# comes from a trusted source.
with open('model.bin', 'rb') as model_file:
    artifacts = pickle.load(model_file)
dv, lr = artifacts

In [9]:
# Location-ID columns that are fed to the model as categorical features.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip-data parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up time, in minutes),
    keeps only trips lasting between 1 and 60 minutes (both inclusive), and
    converts the columns listed in ``categorical`` to strings, with missing
    values encoded as ``'-1'``.

    Parameters
    ----------
    filename : path or URL accepted by ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels preserved.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes, from the pick-up and drop-off timestamps.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends by default.
    in_range = trips['duration'].between(1, 60)
    trips = trips.loc[in_range].copy()

    # Missing IDs become -1 so the integer cast succeeds before stringifying.
    for column in categorical:
        trips[column] = trips[column].fillna(-1).astype('int').astype('str')

    return trips

In [10]:
# Fetch and prepare the trip records for the configured month; `input_file`
# is a remote URL, so this cell reads the parquet file over the network.
df = read_data(input_file)

In [11]:
# Turn every ride into a {column: value} dict of its categorical features,
# encode those dicts with the vectorizer loaded from model.bin, and predict a
# duration for each ride with the loaded model.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [12]:
# Peek at the predictions; NumPy abbreviates long arrays in the repr, so only
# the first and last few values are shown.
y_pred

array([16.24590642, 26.1347962 , 11.88426424, ..., 11.59533603,
       13.11317847, 12.89999218])

### Q1. Standard deviation

In [13]:
# Spread of the predictions. NumPy's .std() uses ddof=0 by default, i.e. the
# population standard deviation.
y_pred.std()

np.float64(6.247488852238703)

#### Ans: 6.24

### Q2. Preparing the output

In [14]:
# Give every ride an identifier of the form "YYYY/MM_<row index label>".
ride_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_prefix + df.index.astype('str')

# Result table: one row per ride, holding its id and predicted duration.
# Columns are assigned in order onto an empty frame, so the frame takes over
# the index of `df['ride_id']`.
df_result = pd.DataFrame().assign(
    ride_id=df['ride_id'],
    predicted_duration=y_pred,
)

In [16]:
# `to_parquet` does not create missing parent directories, and nothing else in
# this notebook creates `output/`, so make sure the target folder exists first
# (a no-op when it is already there). Without this the cell fails on a fresh
# checkout.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Write the predictions uncompressed and without the DataFrame index.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [17]:
# List the output directory with human-readable sizes to check how large the
# written parquet file is.
!ls -lh output

total 66M
-rw-r--r-- 1 varshithvvs varshithvvs 66M Sep 27 06:54 yellow_tripdata_2023-03.parquet


#### Ans: 66M

### Q3. Creating the scoring script

In [19]:
# Export this notebook's code to a plain Python script (written as
# homework.py).
!jupyter nbconvert --to script homework.ipynb

[NbConvertApp] Converting notebook homework.ipynb to script
[NbConvertApp] Writing 1708 bytes to homework.py


#### Ans: `jupyter nbconvert --to script homework.ipynb`

### Q4. Virtual environment

#### Ans: sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445

### Q5. Parametrize the script

In [20]:
# Run the exported script with two positional arguments — presumably the year
# and the month to score (here April 2023); confirm against homework.py.
!python3 homework.py 2023 4

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
predicted mean duration: 14.292282936862449


#### Ans: 14.29

### Q6. Docker container

In [24]:
!cd .. && docker build -t module4_homework -f module_4/Dockerfile .

[1A[1B[0G[?25l[+] Building 0.0s (0/0)  docker:default
[?25h[1A[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 516B                                       0.0s
[0m => [internal] load metadata for docker.io/agrigorev/zoomcamp-model:mlops  0.2s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 516B                                       0.0s
[0m => [internal] load metadata for docker.io/agrigorev/zoomcamp-model:mlops  0.3s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.5s (1/2)                                          docker:default
[

In [26]:
# Run the module4_homework image; the values after the image name are passed
# to the container as arguments — presumably year and month, here May 2023.
!docker run module4_homework 2023 5

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
predicted mean duration: 0.19174419265916945


#### Ans: 0.19