In [1]:
#@title Notebook & DagsHub pre-configurations  🏗🐶
# Colab form cell: every `#@param` annotation turns the assignment on its line into an
# editable form field, and `#@markdown` lines render as the field's description.
# The values set here are read by the environment-setup cells further down.

#@markdown Enter the branch name:
# WEEK is used both as the Git branch name and as the per-week working directory name.
WEEK = "week4" #@param {type:"string"}

#@markdown Enter the DAGsHub repository owner name:
DAGSHUB_REPO_OWNER= "wonhyeongseo" #@param {type:"string"}

#@markdown Enter the DAGsHub repository name:
DAGSHUB_REPO_NAME= "mlops-zoomcamp" #@param {type:"string"}

#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "wonhyeongseo" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "wonhseo@kakao.com" #@param {type:"string"}

#@markdown Clone the Git repo to the Colab runtime
CLONE = True #@param {type:"boolean"}

#@markdown Pull the changes from the Git server to Colab runtime
PULL_GIT = True #@param {type:"boolean"}

#@markdown Set DVC’s user configurations for DagsHub user (will be set locally - should only be done **per runtime**)
# NOTE(review): no cell visible in this notebook reads SET_DVC_USER — confirm it is still needed.
SET_DVC_USER = True #@param {type:"boolean"}

#@markdown Pull the changes from the DagsHub storage to Colab runtime
# NOTE(review): no cell visible in this notebook reads PULL_DVC — confirm it is still needed.
PULL_DVC = True #@param {type:"boolean"}

#@markdown Configure MLflow remote tracking server
MLFLOW = True #@param {type:"boolean"}

# Additional information 💡

**DagsHub**

In [2]:
import getpass
# Prompt without echoing the input so the secret never appears in the notebook source
# or in the saved cell output. The value is later interpolated into the Git remote URLs
# and exported as the MLflow tracking password.
DAGSHUB_TOKEN = getpass.getpass('Please enter your DAGsHub token or password: ')

Please enter your DAGsHub token or password: ··········


# Helper Functions 🚁

In [3]:
# Imports
import requests
import datetime
import os
from pathlib import Path

In [4]:
def git_push():
  "Push files to remote Git server on DAGsHub or GitHub"
  !git push https://{DAGSHUB_USER_NAME}:{DAGSHUB_TOKEN}@dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.git

# 0. Environment Setup

**Configure Git**

In [5]:
# Set the committer identity used by the `git commit` at the end of the notebook.
# `--global` writes to the runtime user's Git config, so it applies to every repo in this runtime.
# NOTE(review): the values are interpolated unquoted; a name containing spaces would need quoting.
!git config --global user.email {DAGSHUB_EMAIL}
!git config --global user.name {DAGSHUB_USER_NAME}

**Clone the Repository**

In [6]:
if CLONE:
  !git clone https://{DAGSHUB_USER_NAME}:{DAGSHUB_TOKEN}@dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.git
  %cd {DAGSHUB_REPO_NAME}
if PULL_GIT:
  !git pull

Cloning into 'mlops-zoomcamp'...
remote: Enumerating objects: 197, done.[K
remote: Counting objects: 100% (197/197), done.[K
remote: Compressing objects: 100% (145/145), done.[K
remote: Total 197 (delta 50), reused 106 (delta 34), pack-reused 0[K
Receiving objects: 100% (197/197), 11.36 MiB | 16.78 MiB/s, done.
Resolving deltas: 100% (50/50), done.
/content/mlops-zoomcamp
Already up to date.


In [7]:
# `-C` creates the branch, or resets it to the current HEAD if a local branch with that name already exists.
# NOTE(review): this does not base the branch on a same-named remote branch — confirm that is intended.
!git switch -C {WEEK}

Switched to a new branch 'week4'


**Install Requirements**

In [8]:
from pathlib import Path

!pip install --upgrade pip --quiet

req_path = Path("requirements.txt")
if req_path.is_file():
  !pip install -qr requirements.txt

!cat requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.4/421.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.2/112.2 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.7/72.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

**Configure MLflow**

In [9]:
if MLFLOW:

  # `name = !cmd` captures the command's output lines in a list; it is empty (falsy)
  # when grep finds no line containing "mlflow" in the installed-package listing.
  mlflow_installed = !pip list -v | grep mlflow
  if not mlflow_installed:
    print("Installing MLflow")
    !pip install mlflow --quiet

  import mlflow

  # Export the DagsHub credentials and tracking URI through environment variables
  # (`os` is imported in the imports cell near the top of the notebook).
  os.environ['MLFLOW_TRACKING_USERNAME'] = DAGSHUB_USER_NAME
  os.environ['MLFLOW_TRACKING_PASSWORD'] = DAGSHUB_TOKEN
  os.environ['MLFLOW_TRACKING_URI'] = f'https://dagshub.com/{DAGSHUB_REPO_OWNER}/{DAGSHUB_REPO_NAME}.mlflow'
  # Also set the URI on the already-imported module for the current process.
  mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
  # NOTE(review): MLflow's built-in experiment is usually named "Default" — confirm the lowercase name is intended.
  os.environ['MLFLOW_EXPERIMENT_NAME'] = 'default'

In [10]:
!mkdir -p {WEEK}
%cd {WEEK}
!mkdir -p {data,models}

/content/mlops-zoomcamp/week4


# 1. Deployment as containers

Download and preprocess data:

In [12]:
import pickle
import pandas as pd

# Load the (vectorizer, model) pair from the current working directory.
# NOTE(review): no visible cell downloads or creates model.bin — confirm where it comes from.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only load a trusted model.bin.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

# Columns used as model features; read_data() converts them to strings.
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    """Load a trip-record parquet file and prepare it for scoring.

    Steps:
      1. Read `filename` with `pd.read_parquet`.
      2. Add a `duration` column: drop-off minus pick-up time, in minutes.
      3. Keep only rides whose duration lies in [1, 60] minutes (inclusive);
         the original row index is retained.
      4. Convert the columns listed in the module-level `categorical` to strings,
         filling missing values with -1 first.

    Returns the filtered DataFrame (an independent copy).
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Series.between is inclusive on both ends by default.
    within_bounds = trips['duration'].between(1, 60)
    trips = trips.loc[within_bounds].copy()

    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips

In [13]:
year, month = 2022, 2
# Zero-pad through the format spec rather than a literal "0" prefix, so months 10-12
# produce a valid file name too (e.g. 2022-11, not 2022-011).
df = read_data(f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet')

In [14]:
# One {column: value} record per ride, restricted to the categorical feature columns.
dicts = df[categorical].to_dict(orient='records')
# `dv` and `model` were unpickled from model.bin — presumably a fitted DictVectorizer
# and a regressor; confirm against the training code.
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

### Q1: Standard deviation of predicted duration

In [15]:
import numpy as np
# np.std defaults to the population standard deviation (ddof=0).
np.std(y_pred)

5.28140357655334

### Q2: Preparing output parquet with pyarrow

In [16]:
# Build an identifier of the form "YYYY/MM_<index>". read_data() filters rows without
# resetting the index, so <index> is the row's position label from the source file.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [17]:
# Pair each ride id with its prediction; y_pred is assumed to be in the same row order as df.
df_result = pd.DataFrame({'ride_id': df['ride_id'], 'predicted_duration': y_pred})

In [18]:
# Derive the file name from the same year/month that selected the input data, so the
# output cannot silently keep an old period's name when those values are changed.
output_file = f'data/{year:04d}-{month:02d}-predictions.parquet'
# Written uncompressed and without the index (the homework asks for the resulting file size).
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [19]:
# Show the size of the written predictions file(s) in human-readable units.
!ls -lhtra data/*.parquet

-rw-r--r-- 1 root root 58M Jun 20 11:57 data/2022-02-predictions.parquet


### Q3. Convert to script

In [None]:
# Export the notebook as a plain Python script.
# NOTE(review): by default nbconvert names the output after the notebook (homework.py),
# while the following cells run score.py — presumably it was renamed by hand;
# consider adding `--output score` to make this step reproducible.
!jupyter nbconvert --to python homework.ipynb

Note: in the resulting `score.py` script I added a statement that prints the mean predicted duration, which is needed for Q5.

### Q4. Creating a virtual environment

In [21]:
# pipenv is used below to create the project's virtual environment and its Pipfile/Pipfile.lock.
!pip install --quiet pipenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.5/468.5 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
# Create the virtualenv and record the dependencies in the Pipfile in the current directory.
# scikit-learn is pinned; NOTE(review): mlflow is left unpinned — confirm that is acceptable.
!python3 -m pipenv install mlflow scikit-learn==1.2.2

[1mCreating a virtualenv for this project...[0m
Pipfile: [33m[1m/content/mlops-zoomcamp/week4/Pipfile[0m
[1mUsing default python from[0m [33m[1m/usr/bin/python3[0m [32m(3.10.12)[0m [1mto create virtualenv...[0m
[2K[32m⠦[0m Creating virtual environment...[36mcreated virtual environment CPython3.10.12.final.0-64 in 1822ms
  creator CPython3Posix(dest=/root/.local/share/virtualenvs/week4-C3sBiBYm, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/.local/share/virtualenv)
    added seed packages: pip==23.1.2, setuptools==67.8.0, wheel==0.40.0
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
[0m
✔ Successfully created virtual environment!
[2K[32m⠧[0m Creating virtual environment...
[1A[2K[32mVirtualenv location: /root/.local/share/virtualenvs/week4-C3sBiBYm[0m
[1mCreating a Pipfile for this projec

### Q5. Mean predicted duration for March 2022 Yellow dataset

In [23]:
# Run the exported script inside the pipenv environment for March 2022.
!python3 -m pipenv run python score.py --year 2022 --month 3

Mean of predicted duration 2022-03: 12.758556818790902


Note: I had to change the model path in the script from `models/model.bin` back to `model.bin` for the next exercise (the Dockerfile).

In [25]:
# Score April 2022 in the local pipenv environment to compare against the Docker run in Q6.
!python3 -m pipenv run python score.py --year 2022 --month 4

Mean of predicted duration 2022-04: 12.865128336784926


### Q6. Mean of April 2022 Yellow dataset with Docker

I had to finish this task on my local machine; here is the Dockerfile:
```dockerfile
# Course-provided base image; presumably it already ships model.bin — confirm.
FROM svizor/zoomcamp-model:mlops-3.10.0-slim

WORKDIR /app
# Output directory, presumably where score.py writes its predictions parquet.
RUN ["mkdir", "data"]

RUN ["pip", "install", "pipenv"]
COPY [ "Pipfile", "Pipfile.lock", "./" ]
# --system installs into the image's Python instead of a virtualenv;
# --deploy fails the build if Pipfile.lock is out of date with the Pipfile.
RUN pipenv install --system --deploy

# Copy and run the scoring script that the build log below actually shows (`COPY score.py .`).
COPY score.py .
ENTRYPOINT ["python", "score.py"]
```

Then I executed
```bash
$ docker build --tag nyc_taxi_deploy:0.0.1 .
[+] Building 54.6s (12/12) FINISHED
 => [internal] load build definition from Dockerfile                                                               0.0s
 => => transferring dockerfile: 291B                                                                               0.0s
 => [internal] load .dockerignore                                                                                  0.0s
 => => transferring context: 2B                                                                                    0.0s
 => [internal] load metadata for docker.io/svizor/zoomcamp-model:mlops-3.10.0-slim                                 1.9s
 => [internal] load build context                                                                                  0.0s
 => => transferring context: 90B                                                                                   0.0s
 => [1/7] FROM docker.io/svizor/zoomcamp-model:mlops-3.10.0-slim@sha256:595bf690875f5b9075550b61c609be10f05e69156  0.0s
 => CACHED [2/7] WORKDIR /app                                                                                      0.0s
 => [3/7] RUN ["mkdir", "data"]                                                                                    0.5s
 => [4/7] RUN ["pip", "install", "pipenv"]                                                                         6.5s
 => [5/7] COPY [ Pipfile, Pipfile.lock, ./ ]                                                                       0.0s
 => [6/7] RUN pipenv install --system --deploy                                                                    40.8s
 => [7/7] COPY score.py .                                                                                          0.0s
 => exporting to image                                                                                             4.6s
 => => exporting layers                                                                                            4.6s
 => => writing image sha256:5e2a2ef27ac4bc4a1c4b4a4c0c10c0dbcc6baec7d767a5ff0764bb19757dae25                       0.0s
 => => naming to docker.io/library/nyc_taxi_deploy:0.0.1                                                           0.0s
$ docker run --rm --name nyc_taxi nyc_taxi_deploy:0.0.1 --year=2022 --month=4
Mean of predicted duration 2022-04: 12.827242870079969
```
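The Docker run gives a mean of 12.827, slightly different from the 12.865 obtained with the local pipenv run above, and it is indeed closer to the expected answer.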

**Push the files to the remotes** 🏁

In [26]:
# Stage everything under the current directory, commit it, and push with the git_push() helper.
# NOTE(review): `git add .` also stages the generated data/*.parquet prediction files
# (they appear in the commit output) — consider tracking them with DVC or ignoring them.
!git add .
!git commit -m 'homework: week4'
git_push()

[week4 715c0cb] homework: week4
 4 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 week4/Dockerfile
 rename week4/data/{{year:04d}-{month:02d}-predictions.parquet => 2022-03-predictions.parquet} (100%)
 create mode 100644 week4/data/2022-04-predictions.parquet
Enumerating objects: 11, done.
Counting objects: 100% (11/11), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 14.48 MiB | 6.15 MiB/s, done.
Total 7 (delta 2), reused 0 (delta 0)
To https://dagshub.com/wonhyeongseo/mlops-zoomcamp.git
   8505871..715c0cb  week4 -> week4
