In this notebook, we prepare 2 types of preprocessing for the environments dataset.
<hr>

Remember to add the paths of `conabio_ml` and `conabio_ml_text` to your PYTHONPATH with:

    export PYTHONPATH=`pwd`:`pwd`/conabio_ml_text/conabio_ml:`pwd`/conabio_ml_text

In [None]:
# Sanity check: print the current PYTHONPATH. It must list the paths of both
# the conabio_ml and conabio_ml_text libs before running the import cell.
!echo $PYTHONPATH

In [None]:
import numpy as np
import pandas as pd
import pydash
import json

import conabio_ml

from pathlib import Path
from pprint import pprint

from conabio_ml_text.datasets.dataset import Dataset, Partitions
from conabio_ml_text.preprocessing.preprocessing import Tokens, PreProcessing
from conabio_ml_text.preprocessing.transform import Transform

from conabio_ml.utils.logger import get_logger, debugger
from conabio_ml_text.utils.constraints import TransformRepresentations, LearningRates, Optimizers

from model import simple_preprocess
from preprocessing import BPE

In [None]:
# Alias for the `debug` attribute of conabio_ml's `debugger` object
debug = debugger.debug
# Logger for this notebook, named after the current module
log = get_logger(__name__)

To compare the model performance when using the BPE preprocessing, we will create 2 datasets based on the `prunned_dataset_X.csv` file produced in `eda.ipynb`.

Both of them will be partitioned and then used in `envs.ipynb`/`pipeline.py`.

In [None]:
# Root folders, relative to the notebook's working directory
base_dataset_path = Path("dataset")
base_config_path = Path("configs")
results_path = Path("results")

# Multilabel CSV that is loaded by the preprocessing cells
dataset_path = base_dataset_path / "dataset_multilabel.csv"

First, we create a dataset using a simple preprocessing that only performs the following:
- lowercasing
- number removal

In [None]:
# Load the CSV and apply `simple_preprocess` to it. No vocabulary is built
# here; this copy is the one inspected (labels, tokens, sizes) further down.
dataset = PreProcessing.preprocess(
    Dataset.from_csv(dataset_path),
    build_vocab=False,
    preprocess_fn=simple_preprocess,
)

In [None]:
# Distinct values of the "label" column; their count is stored as `K` in the
# model configs written by this notebook.
labels = pd.unique(dataset.data["label"])
NUM_LABELS = len(labels)
pprint(labels)

In [None]:
# Collect the distinct whitespace-separated tokens across all items.
# A single set is built in one pass; the previous reduce with `set.union`
# allocated a new, ever-growing set for every row.
items = dataset.data["item"]
tokens = {token for text in items for token in text.split()}
total_tokens = len(tokens)

# `print` instead of `pprint`: pprint shows the string's repr (quoted) and
# wraps it over several lines because it exceeds the default width of 80.
print(f"It broadly contains {total_tokens} tokens. They will be considered to build the vocabulary")

In [None]:
# --- Values used to create the basic config ---

# Passed as `word_size` when the vocabulary is built
VOCAB_SIZE = 10_000

# Sentence length, taken from `eda.ipynb`:
# approximately mean(word_count) + std(word_count)
SPAN_SENTENCES = 450

In [None]:
# Total number of rows vs. number of distinct values in the "item" column
(len(dataset.data), len(pd.unique(dataset.data["item"])))

In [None]:
# Output folder for the simple-preprocessing artifacts; created if missing
destination_path = base_config_path.joinpath("simple_proc_multilabel")
destination_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Reload the CSV, apply `simple_preprocess` and, this time, build the
# vocabulary from the "item" field, passing VOCAB_SIZE as `word_size`.
non_processed_dataset = PreProcessing.preprocess(
    Dataset.from_csv(dataset_path),
    build_vocab=True,
    preprocess_fn=simple_preprocess,
    vocab_args={"word_size": VOCAB_SIZE, "field": "item"},
)

# Partition into 80% train / 10% test / 10% validation
non_processed_dataset = Dataset.split(
    non_processed_dataset,
    train_perc=0.8,
    test_perc=0.1,
    val_perc=0.1,
)


`simple_proc_multilabel/dataset.csv` only contains tokens with simple processing and is constrained to the top `10K` most frequent words (see `VOCAB_SIZE`).

In [None]:
# Persist the partitioned dataset into the destination folder.
# NOTE(review): `to_csv` receives the folder, while `dataset_filepath` points
# to `dataset.csv` inside it — presumably the default file name; confirm.
non_processed_dataset.to_csv(destination_path)
dataset_filepath = f"{destination_path}/dataset.csv"

We also persist the vocabulary obtained in the preprocessing stage.

In [None]:
# Vocabulary produced by the preprocessing stage. From here on VOCAB_SIZE
# holds its actual length instead of the value set in the config cell.
vocab = non_processed_dataset.representations["vocab"]
VOCAB_SIZE = len(vocab)

# Written as plain text, one vocabulary entry per line
vocab_filepath = f"{destination_path}/vocab"
Path(vocab_filepath).write_text("\n".join(vocab))

pprint(VOCAB_SIZE)

Finally, we create the basic config template to train the model.

Note: Some params will be changed in the actual training stage.

In [None]:
config_filepath = f"{destination_path}/config.json"

# Basic training template for the simple-preprocessing dataset: artifact
# locations, layer sizes and training params.
CONFIG_SETTINGS = {
    "vocab": vocab_filepath,
    "dataset": dataset_filepath,
    "layers": {
        "input": {"T": SPAN_SENTENCES},
        "embedding": {"V": VOCAB_SIZE, "D": 200},
        "lstm1": {"M": 16, "dropout": 0.5},
        "lstm2": None,
        "dense_1": {"M": 64, "dropout": 0.5},
        "dense_2": {"K": NUM_LABELS},
    },
    "params": {
        "initial_learning_rate": 1e-4,
        "decay_steps": 200,
        "batch_size": 32,
        "epochs": 7,
        "hamming_loss_threshold": 0.7,
        "multilabel_threshold": 0.7,
        "multilabel_classes": 3,
    },
}

# Serialize the template next to the dataset and the vocabulary
Path(config_filepath).write_text(json.dumps(CONFIG_SETTINGS))

In [None]:
# Round-trip check: reload the CSV referenced by the config and show its
# first 10 rows.
ds = Dataset.from_csv(CONFIG_SETTINGS["dataset"])
ds.data.head(n=10)

We will also preprocess the original dataset using the BPE algorithm.

Then, in training we will compare the performance of both methods.

In [None]:
# BPE only needs one extra param: `num_merges`
NUM_MERGES = 200

In [None]:
# Output folder for the BPE artifacts; created if missing
destination_path = base_config_path.joinpath("bpe_multilabel")
destination_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Apply BPE to the "item" field of the already partitioned dataset.
# NOTE(review): when the notebook runs top to bottom, VOCAB_SIZE has been
# reassigned to the length of the previous vocabulary, so that value (not the
# initial 10000) is what gets passed as `word_size` — confirm this is intended.
processed_dataset = BPE.preprocess(
    non_processed_dataset,
    preprocess_args={"field": "item", "num_merges": NUM_MERGES},
    vocab_args={"word_size": VOCAB_SIZE, "field": "item"},
)

In [None]:
# Vocabulary produced by the BPE preprocessing; both V and VOCAB_SIZE are
# set to its length.
vocab = processed_dataset.representations["vocab"]
V = VOCAB_SIZE = len(vocab)

pprint(VOCAB_SIZE)

In [None]:
# Persist the BPE-processed dataset into the destination folder.
# NOTE(review): `to_csv` receives the folder, while `dataset_filepath` points
# to `dataset.csv` inside it — presumably the default file name; confirm.
processed_dataset.to_csv(destination_path)
dataset_filepath = f"{destination_path}/dataset.csv"

And again, we persist the auxiliary files.

In [None]:
# Persist the BPE vocabulary as plain text, one entry per line
vocab_filepath = f"{destination_path}/vocab"
Path(vocab_filepath).write_text("\n".join(vocab))

pprint(VOCAB_SIZE)

# Training template for the BPE dataset; besides layers and params it also
# records the `num_merges` used in preprocessing.
config_filepath = f"{destination_path}/config.json"
CONFIG_SETTINGS = {
    "vocab": vocab_filepath,
    "dataset": dataset_filepath,
    "preprocessing": {"num_merges": NUM_MERGES},
    "layers": {
        "input": {"T": SPAN_SENTENCES},
        "embedding": {"V": VOCAB_SIZE, "D": 200},
        "lstm1": {"M": 8},
        "lstm2": None,
        "dense": {"K": NUM_LABELS},
    },
    "params": {
        "initial_learning_rate": 0.02,
        "decay_steps": 200,
        "clipvalue": 0.3,
        "batch_size": 32,
        "epochs": 7,
        "multilabel_threshold": 0.3,
        "multilabel_classes": 3,
    },
}

# Serialize the template next to the dataset and the vocabulary
Path(config_filepath).write_text(json.dumps(CONFIG_SETTINGS))

In [None]:
# Round-trip check: reload the CSV referenced by the config and show its
# first 10 rows.
ds = Dataset.from_csv(CONFIG_SETTINGS["dataset"])
ds.data.head(n=10)