# Setup

The livecodes of the lecture are based on the code used by the students during the challenges.

We will use the **data-lecture-cloud-training** challenge for all the livecodes of the lecture.

Myriad batches:

``` bash
cd data-lecture-cloud-training
```

Legacy batches:

``` bash
cd data-challenges/07-ML-Ops/02-Cloud-training/00-Lecture-livecode
```

Download data:

``` bash
curl https://storage.googleapis.com/datascience-mlops/taxi-fare-ny/train_10k.csv > ~/.lewagon/mlops/data/raw/train_10k.csv
curl https://storage.googleapis.com/datascience-mlops/taxi-fare-ny/val_10k.csv > ~/.lewagon/mlops/data/raw/val_10k.csv
```

Then use VSCode:

``` bash
code .
```

# Application parameters

Make sure the model trains:

``` bash
make run_preprocess run_train
```

This is equivalent to running **interface/main.py** with the following entry point:

In [None]:
# Entry point: the steps below run only when the module is executed as a script.
if __name__ == '__main__':
    # preprocess once with the default arguments, then once with source_type='val'
    preprocess()
    preprocess(source_type='val')
    train()
    # the prediction and evaluation steps are left disabled here
    # pred()
    # evaluate()


### Setting up `direnv` to manage environment variables

Install `direnv` by following the instructions for your machine in the [direnv docs](https://direnv.net/docs/installation.html).

1. After installation, add `direnv` to the list of oh-my-zsh plugins in your `~/.zshrc` (run `zsh` or open a new terminal window in order to apply the change)
2. If `direnv` does not load in the shell you can run `eval "$(direnv hook zsh)"` (add it to the `~/.zshrc` as well)

**model_target/local_model.py**

In [None]:
import os

def save_local_model(model, suffix):
    """
    Save `model` in the local registry, if a model was provided.

    The destination is ``<LOCAL_REGISTRY_PATH>/models/<suffix>.pickle``, where
    ``LOCAL_REGISTRY_PATH`` is read from the environment. The path is printed
    and then handed to ``model.save``. A falsy `model` makes this a no-op.
    """
    # guard clause: nothing to persist
    if not model:
        return

    registry_root = os.environ.get("LOCAL_REGISTRY_PATH")
    file_name = suffix + ".pickle"
    model_path = os.path.join(registry_root, "models", file_name)

    print(f"- model path: {model_path}")

    model.save(model_path)


**model_target/cloud_model.py**

In [None]:
def save_cloud_model(model, suffix):
    """
    Placeholder for the cloud model upload.

    Both arguments are currently ignored: the function only prints a TODO
    reminder.
    """
    todo_message = "TODO: save model in the cloud 🧬"
    print(todo_message)


**data_sources/cloud_data.py**

In [None]:
def get_cloud_chunk(path, index, chunk_size, dtypes, columns):
    """
    Placeholder for loading a chunk of data from the cloud.

    All arguments are currently ignored: the function only prints a TODO
    reminder.
    """
    todo_message = "TODO: get cloud chunk 🧩"
    print(todo_message)


**ml_logic/registry.py**

In [None]:
from taxifare.model_target.local_model import save_local_model
from taxifare.model_target.cloud_model import save_cloud_model

        # Choose the save backend from the MODEL_TARGET environment variable
        # (indexing os.environ raises KeyError when the variable is not set).
        if os.environ["MODEL_TARGET"] == "local":
            save_local_model(model, suffix)
        elif os.environ["MODEL_TARGET"] == "cloud":
            save_cloud_model(model, suffix)
        else:
            # any other value is rejected as an invalid configuration
            raise ValueError(f"Invalid .env config for model: {os.environ['MODEL_TARGET']} 🤕")


**ml_logic/data.py**

In [None]:
from taxifare.data_sources.cloud_data import get_cloud_chunk

    # Choose the chunk loader from the DATA_SOURCE environment variable
    # (indexing os.environ raises KeyError when the variable is not set).
    if os.environ["DATA_SOURCE"] == "local":
        chunk_df = get_pandas_chunk(path=source_name,
                                    index=index,
                                    chunk_size=chunk_size,
                                    dtypes=dtypes,
                                    columns=columns)
    elif os.environ["DATA_SOURCE"] == "cloud":
        # here source_name is passed as the table name; unlike the local
        # branch, no `columns` argument is forwarded
        chunk_df = get_cloud_chunk(table=source_name,
                                   index=index,
                                   chunk_size=chunk_size,
                                   dtypes=dtypes)
    else:
        # any other value is rejected as an invalid configuration
        raise NameError(f"Invalid .env conf for data: {os.environ['DATA_SOURCE']} 😬")


# Model in the cloud

**raw code**

In [None]:
from google.cloud import storage

# name of the bucket that receives the upload
BUCKET_NAME = "my-bucket"

# object name inside the bucket, and the local file that is uploaded to it
storage_filename = "models/random_forest_model.joblib"
local_filename = "model.joblib"

# client -> bucket -> blob, then upload the local file to that blob
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(storage_filename)
blob.upload_from_filename(local_filename)


**model_target/cloud_model.py**

In [None]:
from google.cloud import storage

import glob
import os

def save_cloud_model(model, suffix):
    """
    Save `model` in the local registry, then upload its files to Cloud Storage.

    The model is written with ``model.save`` to
    ``<LOCAL_REGISTRY_PATH>/models/<suffix>.pickle``. Every file found at that
    location is then uploaded to the bucket named by the ``BUCKET_NAME``
    environment variable, using its path relative to ``LOCAL_REGISTRY_PATH``
    as the object name (``models/<suffix>.pickle/...``).

    A falsy `model` makes this a no-op. ``KeyError`` is raised when there are
    files to upload and ``BUCKET_NAME`` is not set.
    """
    if not model:
        return

    registry_path = os.environ.get("LOCAL_REGISTRY_PATH")
    model_path = os.path.join(registry_path, "models", suffix + ".pickle")

    # save the model
    model.save(model_path)

    # list model files: `model.save` may have written a single file or a
    # directory tree; directories themselves cannot be uploaded
    if os.path.isfile(model_path):
        local_files = [model_path]
    else:
        pattern = os.path.join(model_path, "**", "*")
        local_files = [path
                       for path in glob.glob(pattern, recursive=True)
                       if os.path.isfile(path)]

    if not local_files:
        return

    # one client and one bucket handle are enough for all the uploads
    client = storage.Client()
    bucket = client.bucket(os.environ["BUCKET_NAME"])

    for local_file in local_files:
        # object name = path relative to the registry root, with "/" separators
        # (replaces a hard-coded 17-character prefix strip)
        relative_path = os.path.relpath(local_file, registry_path)
        storage_filename = relative_path.replace(os.sep, "/")

        blob = bucket.blob(storage_filename)
        blob.upload_from_filename(local_file)


# Data in the cloud

**raw code**

In [None]:
from google.cloud import bigquery

# NOTE: `table`, `index`, `chunk_size` and `dtypes` are not defined in this
# snippet, and the `return` statements mean it is meant to be used as a
# function body.
client = bigquery.Client()
# request at most `chunk_size` rows, starting at row offset `index`
rows = client.list_rows(table, start_index=index, max_results=chunk_size)
big_query_df = rows.to_dataframe()

if big_query_df.shape[0] == 0:
    return None  # end of data

# cast the columns using the `dtypes` argument
big_query_df = big_query_df.astype(dtypes)

return big_query_df

**data_sources/cloud_data.py**

In [None]:
from google.cloud import bigquery

import os


def get_cloud_chunk(table, index, chunk_size, dtypes):
    """
    Load one chunk of rows from a BigQuery table.

    The table is addressed as ``<PROJECT>.<DATASET>.<table>``, with ``PROJECT``
    and ``DATASET`` read from the environment. At most `chunk_size` rows are
    requested, starting at row offset `index`, and the result is cast with
    `dtypes`.

    Returns the chunk (also printing its head), or None when no row came back.
    """
    project = os.environ['PROJECT']
    dataset = os.environ['DATASET']
    full_table_id = f"{project}.{dataset}.{table}"

    bq_client = bigquery.Client()
    row_iterator = bq_client.list_rows(full_table_id,
                                       start_index=index,
                                       max_results=chunk_size)
    chunk_df = row_iterator.to_dataframe()

    # an empty result marks the end of the data
    if chunk_df.shape[0] == 0:
        return None

    chunk_df = chunk_df.astype(dtypes)

    print(f"Data loaded from BQ 🔥")
    print(chunk_df.head())

    return chunk_df


# Training in the cloud

Set up a VM by following the **training in the cloud** challenge.

# Code essentials

## Cloud Storage

In [None]:
from google.cloud import storage

# name of the bucket that receives the upload
BUCKET_NAME = "my-bucket"

# object name inside the bucket, and the local file that is uploaded to it
storage_filename = "models/random_forest_model.joblib"
local_filename = "model.joblib"

# client -> bucket -> blob, then upload the local file to that blob
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(storage_filename)
blob.upload_from_filename(local_filename)


## BigQuery

In [None]:
from google.cloud import bigquery

# NOTE: `table`, `index`, `chunk_size` and `dtypes` are not defined in this
# snippet, and the `return` statements mean it is meant to be used as a
# function body.
client = bigquery.Client()
# request at most `chunk_size` rows, starting at row offset `index`
rows = client.list_rows(table, start_index=index, max_results=chunk_size)
big_query_df = rows.to_dataframe()

if big_query_df.shape[0] == 0:
    return None  # end of data

# cast the columns using the `dtypes` argument
big_query_df = big_query_df.astype(dtypes)

return big_query_df