In [1]:
# Print the scikit-learn pin reported by `pip freeze` for the current environment.
!pip freeze | grep scikit-learn

scikit-learn==1.1.1


In [2]:
import pickle
import pandas as pd

In [3]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# model.bin must come from a trusted source.
with open('model.bin', 'rb') as model_file:
    artifacts = pickle.load(model_file)

# The pickle stores a two-item sequence; unpack it in its stored order.
# `dv` is later used via .transform() and `lr` via .predict().
dv, lr = artifacts

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Columns that are turned into string categories before vectorisation.
categorical = ['PUlocationID', 'DOlocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Steps:
      1. Read ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: ``dropOff_datetime - pickup_datetime``
         expressed in minutes.
      3. Keep only rows whose duration lies in the closed range [1, 60].
      4. For every column in ``categorical``: replace missing values with -1,
         cast to int, then cast to str.

    The original row index is kept (it is not reset after filtering).

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy with the extra ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta, then converted to minutes.
    elapsed = trips['dropOff_datetime'] - trips['pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    in_range = trips['duration'].between(1, 60)
    trips = trips.loc[in_range].copy()

    # Missing ids become the sentinel -1 so the int cast cannot fail on NaN.
    encoded = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = encoded

    return trips

In [5]:
# Input: the FHV trip-data parquet file for 2021-02.
input_url = 'https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet'
df = read_data(input_url)

In [6]:
# One {column: value} record per row, restricted to the categorical columns.
feature_frame = df.loc[:, categorical]
dicts = feature_frame.to_dict(orient='records')

# Vectorise the records with the loaded `dv`, then score them with `lr`.
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [7]:
# Mean of the predictions over all scored rows (shown as the cell output).
y_pred.mean()

16.191691679979066

In [8]:
# Period used to build the ride ids.
year, month = 2021, 2

# ride_id = "<YYYY>/<MM>_" followed by the row's index label as a string.
ride_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_prefix + df.index.astype('str')

In [9]:
# Pair each ride id with its prediction in a two-column results frame.
df_result = pd.DataFrame({'ride_id': df['ride_id'], 'prediction': y_pred})

# Persist the results with pyarrow, uncompressed and without the index.
parquet_options = dict(engine='pyarrow', compression=None, index=False)
df_result.to_parquet('output_file', **parquet_options)

In [11]:
# Report the total on-disk size of output_file in human-readable units.
!du -sh output_file

19M	output_file


ANS: 19M

### Q3. Creating the scoring script
Now let's turn the notebook into a script.

Which command do you need to execute for that?

In [13]:
# Export starter.ipynb as a plain script (the log below shows it written to starter.py).
!jupyter nbconvert --to script starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 860 bytes to starter.py


### Q4. Virtual environment
Now let's put everything into a virtual environment. We'll use pipenv for that.

Install all the required libraries. Pay attention to the Scikit-Learn version: check the starter notebook for details.

After installing the libraries, pipenv creates two files: Pipfile and Pipfile.lock. The Pipfile.lock file keeps the hashes of the dependencies we use for the virtual env.

What's the first hash for the Scikit-Learn dependency?

In [14]:
# Create the pipenv environment on Python 3.9. scikit-learn is pinned to 1.1.1
# (the version `pip freeze` reported at the top of this notebook) and prefect
# to 2.0b6; flask, mlflow, pandas and boto3 are left unpinned.
!pipenv install scikit-learn==1.1.1 flask --python=3.9 prefect==2.0b6 mlflow pandas boto3

[39m[1mInstalling [32m[1mscikit-learn==1.1.1[39m[22m...[39m[22m
[K[39m[1mAdding[39m[22m [32m[1mscikit-learn[39m[22m [39m[1mto Pipfile's[39m[22m [33m[1m[packages][39m[22m[39m[1m...[39m[22m
[K[?25h✔ Installation Succeeded[0m 
[39m[1mInstalling [32m[1mflask[39m[22m...[39m[22m
[K[39m[1mAdding[39m[22m [32m[1mflask[39m[22m [39m[1mto Pipfile's[39m[22m [33m[1m[packages][39m[22m[39m[1m...[39m[22m
[K[?25h✔ Installation Succeeded[0m 
[39m[1mInstalling [32m[1mprefect==2.0b6[39m[22m...[39m[22m
[K[39m[1mAdding[39m[22m [32m[1mprefect[39m[22m [39m[1mto Pipfile's[39m[22m [33m[1m[packages][39m[22m[39m[1m...[39m[22m
[K[?25h✔ Installation Succeeded[0m 
[39m[1mInstalling [32m[1mmlflow[39m[22m...[39m[22m
[K[39m[1mAdding[39m[22m [32m[1mmlflow[39m[22m [39m[1mto Pipfile's[39m[22m [33m[1m[packages][39m[22m[39m[1m...[39m[22m
[K[?25h✔ Installation Succeeded[0m 
[39m[1mInstalling [32m

In [17]:
# NOTE(review): `pipenv shell` launches an interactive subshell, which a
# notebook cell cannot drive -- the captured output below ends with ^C.
# For non-interactive use, `pipenv run <command>` is the usual alternative.
!pipenv shell

[39m[1mLoading .env environment variables...[39m[22m
Launching subshell in virtual environment...
 . /home/ubuntu/.local/share/virtualenvs/mlops-zoomcamp-DKPaRvwM/bin/activate
[?2004h(base) ]0;ubuntu@ip-172-31-4-164: ~/mlops-zoomcamp[01;32mubuntu@ip-172-31-4-164[00m:[01;34m~/mlops-zoomcamp[00m$  . /home/ubuntu/.local/share/virtualenvs/mlops-zoomcamp-DKPaRvwM/bin/activate
[?2004h(mlops-zoomcamp) (base) ]0;ubuntu@ip-172-31-4-164: ~/mlops-zoomcamp[01;32mubuntu@ip-172-31-4-164[00m:[01;34m~/mlops-zoomcamp[00m$ ^C[?2004l
[?2004l
[?2004h(mlops-zoomcamp) (base) ]0;ubuntu@ip-172-31-4-164: ~/mlops-zoomcamp[01;32mubuntu@ip-172-31-4-164[00m:[01;34m~/mlops-zoomcamp[00m$ 

 "scikit-learn": {
            "hashes": [
                "sha256:0403ad13f283e27d43b0ad875f187ec7f5d964903d92d1ed06c51439560ecea0",
                "sha256:102f51797cd8944bf44a038d106848ddf2804f2c1edf7aea45fba81a4fdc4d80",
                "sha256:22145b60fef02e597a8e7f061ebc7c51739215f11ce7fcd2ca9af22c31aa9f86",
                "sha256:33cf061ed0b79d647a3e4c3f6c52c412172836718a7cd4d11c1318d083300133",
                "sha256:3be10d8d325821ca366d4fe7083d87c40768f842f54371a9c908d97c45da16fc",
                "sha256:3e77b71e8e644f86c8b5be7f1c285ef597de4c384961389ee3e9ca36c445b256",
                "sha256:45c0f6ae523353f1d99b85469d746f9c497410adff5ba8b24423705b6956a86e",
                "sha256:47464c110eaa9ed9d1fe108cb403510878c3d3a40f110618d2a19b2190a3e35c",
                "sha256:542ccd2592fe7ad31f5c85fed3a3deb3e252383960a85e4b49a629353fffaba4",
                "sha256:723cdb278b1fa57a55f68945bc4e501a2f12abe82f76e8d21e1806cbdbef6fc5",
                "sha256:8fe80df08f5b9cee5dd008eccc672e543976198d790c07e5337f7dfb67eaac05",
                "sha256:8ff56d07b9507fbe07ca0f4e5c8f3e171f74a429f998da03e308166251316b34",
                "sha256:b2db720e13e697d912a87c1a51194e6fb085dc6d8323caa5ca51369ca6948f78",
                "sha256:b928869072366dc138762fe0929e7dc88413f8a469aebc6a64adc10a9226180c",
                "sha256:c2dad2bfc502344b869d4a3f4aa7271b2a5f4fe41f7328f404844c51612e2c58",
                "sha256:e851f8874398dcd50d1e174e810e9331563d189356e945b3271c0e19ee6f4d6f",
                "sha256:e9d228ced1214d67904f26fb820c8abbea12b2889cd4aa8cda20a4ca0ed781c1",
                "sha256:f2d5b5d6e87d482e17696a7bfa03fe9515fdfe27e462a4ad37f3d7774a5e2fd6"
            ],
            "index": "pypi",
            "version": "==1.1.1"
        },