In [None]:
from pathlib import Path

import gin
import torch
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer import tokenizer, Trainer, metrics
from mltrainer import TrainerSettings, ReportTypes
from mltrainer.rnn_models import NLPmodel, AttentionNLP
from mltrainer.tokenizer import IMDBTokenizer
from torch import optim
from torch.utils.data import DataLoader

# Parse the gin bindings from imdb.gin; a later cell reads the NLPmodel
# "config" binding from them.
gin.parse_config_file("imdb.gin")

We load the streamers from the datasetfactory

In [None]:
# Build the dataset factory for the IMDB dataset type.
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)

In [None]:
# Create the datasets; the result is indexed by split name ("train" is read next).
datasets = imdbdatasetfactory.create_dataset()

In [None]:
# Keep a handle on the training split; the tokenizer is built from it.
traindataset = datasets["train"]

In [None]:
# Display the factory settings: they are handed to the tokenizer, and their
# `maxvocab` is compared against the gin config further down.
imdbdatasetfactory.settings

In [None]:
# Build the IMDB tokenizer from the training split and the dataset-factory settings.
# NOTE: this rebinds `tokenizer`, which the first cell imported as the
# `mltrainer.tokenizer` module; from here on the name refers to this instance.
# (IMDBTokenizer is imported in the import cell at the top of the notebook.)
tokenizer = IMDBTokenizer.fromSettings(
    traindataset=traindataset,
    settings=imdbdatasetfactory.settings,
)

In [None]:
# Create the data streamers; the tokenizer is plugged in as the preprocessor
# and batches hold 32 examples.
streamers = imdbdatasetfactory.create_datastreamer(
    batchsize=32,
    preprocessor=tokenizer,
)

In [None]:
# Sanity check: fetch one batch with batchloop() and run the tokenizer on it
# by hand, so the cell output shows what the preprocessor produces.
train = streamers["train"]
batch = train.batchloop()
tokenizer(batch)

In [None]:
train = streamers["train"]
# Report how many batches the train streamer has (its len()).
print(f"number of batches {len(train)}")
# These two streams are what both trainers below consume.
trainstreamer = train.stream()
validstreamer = streamers["valid"].stream()
# Draw a single batch to inspect the shapes of inputs and labels.
X, y = next(iter(trainstreamer))
X.shape, y.shape

In [None]:
# Display the input batch drawn from the train stream in the previous cell.
X

The full dataset has 782 batches of 32 examples

Set up accuracy and loss_fn (this is a classification problem with two classes, 0 and 1).

In [None]:
# Absolute path of the directory handed to the trainer settings as `logdir`.
log_dir = Path("modellogs").resolve()

# Cross-entropy loss for the two-class problem, plus accuracy as the reported metric.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()


Basic config. We need to specify the vocabulary length for the embedding layer.
Train steps are set to just 100 batches to speed up the demo.

In [None]:
# Trainer configuration; the same `settings` object is reused by both training
# runs below. (TrainerSettings and ReportTypes are imported in the import cell
# at the top of the notebook.)
settings = TrainerSettings(
    epochs=10,
    metrics=[accuracy],
    logdir=log_dir,
    train_steps=100,  # only 100 batches, to keep the demo fast
    valid_steps=25,
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.GIN],
    # Keyword arguments for the LR scheduler; the trainers below pass
    # ReduceLROnPlateau, which accepts `factor` and `patience`.
    scheduler_kwargs={"factor": 0.5, "patience": 5},
)
settings

In [None]:
# The vocabulary size bound to NLPmodel's config in the gin file must equal the
# dataset's `maxvocab`; fail early with both values if they diverge.
gin_vocab = gin.get_bindings("NLPmodel")["config"]["vocab"]
max_vocab = imdbdatasetfactory.settings.maxvocab
assert gin_vocab == max_vocab, (
    f"NLPmodel config vocab from gin is {gin_vocab}, "
    f"but the dataset settings use maxvocab={max_vocab}"
)

In [None]:
# No arguments are passed here; the model's config is expected to come from the
# gin bindings (the NLPmodel "config" binding checked in the previous cell).
model = NLPmodel()
model

The base NLP model is just a GRU, with an embedding as a first layer.


In [None]:
# Pick the compute device, preferring Apple MPS, then CUDA, then CPU.
# Every branch yields a plain device string, so `device` has the same type
# regardless of the hardware the notebook runs on.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
    print("Using MPS")
elif torch.cuda.is_available():
    device = "cuda:0"
    print("using cuda")
else:
    device = "cpu"
    print("using cpu")

In [None]:
# The optimizer and scheduler are handed over as classes, not instances.
optimizer = optim.Adam
scheduler = optim.lr_scheduler.ReduceLROnPlateau

# Baseline run: the plain NLPmodel on the IMDB train/valid streams.
trainer = Trainer(
    model=model,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optimizer,
    scheduler=scheduler,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    device=device,
)

In [None]:
# Run the training loop for the baseline model.
trainer.loop()

Compare the impact of attention

In [None]:
# Second run for comparison: same settings, loss, data streams and device as
# the baseline trainer; only the model is swapped for the attention variant.
attentionmodel = AttentionNLP()

attentiontrainer = Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    device=device,
)

# Train immediately so the logged results can be compared with the baseline run.
attentiontrainer.loop()