In this notebook, we perform the training process for the environment classifier, defined in the `model.py` script.

We will use two datasets for this purpose:

- `config/bpe/dataset.csv`. This dataset was preprocessed with the BPE algorithm, and its vocab is defined in `config/bpe/vocab.json`.
- `config/simple_proc/dataset.csv`. This dataset's preprocessing consists only of lowercasing and number removal. Its vocab is defined in `config/simple_proc/vocab.json`.
<hr>

In [None]:
import os
import json

import numpy as np
import pandas as pd
import tensorflow as tf

from pprint import pprint
from pathlib import Path

In [None]:
# Sanity check: print PYTHONPATH. It must contain the paths of both the
# conabio_ml and conabio_ml_text libraries, which the imports below rely on.
!echo $PYTHONPATH

In [None]:
# Remember to update PYTHONPATH so that both libraries are importable, e.g.:
# export PYTHONPATH=`pwd`:`pwd`/conabio_ml_text/conabio_ml:`pwd`/conabio_ml_text

from conabio_ml_text.datasets.dataset import Dataset, Partitions
from conabio_ml_text.preprocessing.preprocessing import Tokens
from conabio_ml_text.preprocessing.transform import Transform

from conabio_ml_text.trainers.bcknds.tfkeras import TFKerasTrainer, TFKerasTrainerConfig
from conabio_ml_text.trainers.bcknds.tfkeras import CHECKPOINT_CALLBACK, TENSORBOARD_CALLBACK

from conabio_ml.evaluator.generic.evaluator import Evaluator, Metrics

from conabio_ml_text.utils.constraints import TransformRepresentations as TR, LearningRates as LR
from conabio_ml_text.trainers.builders import create_learning_rate

from model import EnvironmentClassifier,  multilabel_topk, HammingLoss, multilabel_converter

We start by using a config file that contains all the params/hyperparams to load the dataset and to train the model.

In [None]:
# JSON file holding the dataset/vocab locations, the layer settings and the
# training hyperparameters read by the following cells.
config_file = Path("configs/simple_proc_multilabel/config.json")

# Reset the name first: if the load below fails, `config` is left empty
# instead of still pointing at a configuration from a previous run.
config = {}

# Decode explicitly as UTF-8 (the encoding JSON files are exchanged in)
# rather than relying on the platform's locale-dependent default.
with open(config_file, encoding="utf-8") as _f:
    config = json.load(_f)

# Show the loaded configuration for reference.
pprint(config)

In [None]:
# Directory under which the checkpoint and TensorBoard callbacks write.
results_path = Path("results")

# Dataset and vocabulary locations come straight from the config file.
# Passing the values directly (no f-string) makes a missing or null entry
# fail here with a TypeError instead of silently becoming Path("None").
dataset_filepath = Path(config["dataset"])
vocab_filepath = Path(config["vocab"])

# Model layers
layers = config["layers"]

# Params
initial_lr = config["params"]["initial_learning_rate"]     # Learning rate
decay_steps = config["params"]["decay_steps"]              # Decay steps for the lr
batch_size = config["params"]["batch_size"]                # Batch size
epochs = config["params"]["epochs"]                        # Epochs of training

# Threshold from "hamming_loss_threshold"; not referenced by the cells
# visible in this notebook section.
hamming_loss_th = config["params"]["hamming_loss_threshold"]

# Threshold handed to multilabel_converter when predicting, and the value
# handed to the multilabel_topk metric when training.
multilabel_th = config["params"]["multilabel_threshold"]
multilabel_k_classes = config["params"]["multilabel_classes"]

In [None]:
# Display the layer settings read from config["layers"] as the cell output.
layers

We load the dataset and the vocab file that correspond to each preprocessing method.

- Simple preprocessing: `configs/simple_proc`
- BPE preprocessing: `configs/bpe`

In [None]:
# Read the samples from the CSV file referenced by the config.
dataset = Dataset.from_csv(dataset_filepath)

# Options forwarded to the transform: the pad length is the `T` value of the
# input layer settings, and the unknown-token marker is the library constant.
generator_args = {
    "pad_length": layers["input"]["T"],
    "unk_token": Tokens.UNK_TOKEN,
}

# Rebind `dataset` to the version that carries the data generators, built
# with the vocab file referenced by the config and with shuffling enabled.
dataset = Transform.as_data_generator(
    dataset,
    vocab=vocab_filepath,
    shuffle=True,
    transform_args=generator_args,
)

In [None]:
## Debugging aid (kept disabled): prints the first test item next to the first data-generator output mapped through the vocab
#vocab = dataset.representations["vocab"]
#test_datagen = dataset.representations["data_generators"]["test"]
#test = dataset.get_partition("test")["item"]
#pprint("TEST")
#pprint(test.iloc[0])
#pprint("-------")
#pprint("FROM DATAGEN")
#print([vocab[x] for x in next(test_datagen())[0]])

In [None]:
# Dataset representation the trainer is told to consume.
TRAIN_REPRESENTATION = TR.DATA_GENERATORS

# Exponential learning-rate schedule built from the config file values.
lr_params = {
    "initial_learning_rate": initial_lr,
    "decay_steps": decay_steps,
}
lr_schedule = create_learning_rate(lr_params,
                                   learning_rate_name=LR.EXPONENTIAL_LR)

# Both callbacks write below the results directory.
checkpoint_dir = os.path.join(results_path, "checkpoints")
tensorboard_dir = os.path.join(results_path, "tb_logs")

callbacks_config = {
    CHECKPOINT_CALLBACK: {
        "filepath": checkpoint_dir,
        "save_best_only": False,
    },
    TENSORBOARD_CALLBACK: {
        "log_dir": tensorboard_dir,
    },
}

# No distribution strategy is set; only the two callbacks are configured.
trainer_config = TFKerasTrainerConfig.create(
    config={
        "strategy": None,
        "callbacks": callbacks_config,
    }
)

In [None]:
# By now the Dataset exposes, under `representations`:
# - representations[TRAIN_REPRESENTATION]: the data generators
# - representations["vocab"]: the vocabulary used to convert the dataset
#   into tensors
#
# Print the vocabulary size configured for the embedding layer next to the
# size of the vocabulary attached to the dataset, so both can be compared.
configured_vocab_size = layers["embedding"]["V"]
print(f"Vocab size {configured_vocab_size}")

loaded_vocab = dataset.representations["vocab"]
print(f"From dataset {len(loaded_vocab)}")

In [None]:
# Layer name used in the model config -> key of the same layer inside the
# config file's "layers" section (the LSTM keys are spelled differently).
layer_sources = {
    "input": "input",
    "embedding": "embedding",
    "lstm_1": "lstm1",
    "lstm_2": "lstm2",
    "dense_1": "dense_1",
    "dense_2": "dense_2",
}
model_layers = {
    layer_name: layers[config_key]
    for layer_name, config_key in layer_sources.items()
}

# Build the classifier from the assembled layer settings.
model = EnvironmentClassifier.create(
    model_config={"ENV_CLASSIFIER": {"layers": model_layers}}
)

In [None]:
# Training settings for the "ENV_CLASSIFIER" entry: Adam driven by the
# learning-rate schedule, binary cross-entropy loss, and the top-k
# multilabel metric alongside plain accuracy.
env_classifier_training = {
    "representation": TRAIN_REPRESENTATION,
    "optimizer": tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    "loss": tf.keras.losses.BinaryCrossentropy(),
    "batch_size": batch_size,
    "epochs": epochs,
    "metrics": [multilabel_topk(multilabel_k_classes), "accuracy"],
}

# Run the training with the callbacks configured in `trainer_config`.
trained_model = TFKerasTrainer.train(
    dataset=dataset,
    model=model,
    execution_config=trainer_config,
    train_config={"ENV_CLASSIFIER": env_classifier_training},
)

In [None]:
# Print the summary of the model held by the trained object
# (presumably a tf.keras model, given the TFKerasTrainer backend — confirm).
trained_model.model.summary()

In [None]:
# The prediction converter is built from the multilabel threshold read
# from the config file.
prediction_config = {
    "pred_converter_fn": multilabel_converter(multilabel_th),
}

# Predict with the trained model; no execution config is supplied.
predict_dataset = trained_model.predict(
    dataset=dataset,
    execution_config=None,
    prediction_config=prediction_config,
)

In [None]:
# Options for the multilabel metric set: per-class results, micro averaging,
# and 1.0 as the value reported on zero division.
multilabel_metric_options = {
    "per_class": True,
    "average": "micro",
    "zero_division": 1.0,
}

# Evaluate on the test partition only.
eval_config = {
    "metrics_set": {
        Metrics.Sets.MULTILABEL: multilabel_metric_options,
    },
    "dataset_partition": Partitions.TEST,
}

# Compare the true dataset against the predictions and show the results.
metrics = Evaluator.eval(dataset, predict_dataset, eval_config)
pprint(metrics.results)