In [None]:
from pathlib import Path

import torch

from banhxeo.data import IMDBDataset
from banhxeo.model import MLP
from banhxeo.core import NLTKTokenizer, Vocabulary
from banhxeo.data.transforms import RemoveHTMLTag, RemoveURL, Strip
from banhxeo.train.config import OptimizerConfig, LossConfig, TrainerConfig
from banhxeo.train.trainer import Trainer
from banhxeo.train.callbacks import AccuracyCallback
from banhxeo import GPU_DEVICE, CPU_DEVICE

### Create dataset

In [None]:
# Create the raw IMDB "train" and "test" splits, both rooted at the current directory.
train_raw, test_raw = (
    IMDBDataset(root_dir="./", split_name=split) for split in ("train", "test")
)

In [None]:
# Last expression of the cell, so whatever get_data() returns for the training split is displayed.
train_raw.get_data()

In [None]:
# Last expression of the cell, so whatever get_data() returns for the test split is displayed.
test_raw.get_data()

### Create Tokenizer and Vocabulary

In [None]:
# Single tokenizer instance, reused when loading the vocabulary and when converting both splits.
tokenizer = NLTKTokenizer()

In [None]:
# Path of the cached vocabulary file.
VOCAB_PATH = "./vocab/imdb_2.json"

if Path(VOCAB_PATH).exists():
    # A vocabulary file is already on disk: load it instead of rebuilding.
    vocab = Vocabulary.load(VOCAB_PATH, tokenizer=tokenizer)
else:
    # No cached file yet (e.g. a fresh checkout): build the vocabulary from the
    # training texts and save it, so the notebook still runs top to bottom.
    # NOTE(review): bos_tok is the literal string "bos_tok", unlike the other
    # "<...>"-style special tokens -- confirm this is intended.
    vocab = Vocabulary.build(
        corpus=train_raw.get_all_texts(),
        tokenizer=tokenizer,
        pad_tok="<pad>",
        unk_tok="<unk>",
        sep_tok="<eos>",
        bos_tok="bos_tok",
    )
    # Make sure the target directory exists before saving.
    Path(VOCAB_PATH).parent.mkdir(parents=True, exist_ok=True)
    vocab.save(VOCAB_PATH)

In [None]:
# Optional: uncomment to write the current vocabulary to disk at this path.
# vocab.save("./vocab/imdb_2.json")

### Convert the raw datasets to torch datasets

In [None]:
# Keyword arguments shared by both to_torch_dataset(...) calls (train and test split).
config = dict(
    add_special_tokens=True,
    max_length=256,
    truncation=True,
    padding=True,
    is_classification=True,
    transforms=[RemoveURL(), RemoveHTMLTag(), Strip(metadata={"lower": True})],
    # Maps the string labels to integer class ids.
    label_map={"pos": 1, "neg": 0},
    text_column_name="content",
    label_column_name="label",
)

In [None]:
# Convert the raw training split using the shared tokenizer, vocabulary and config.
train_dataset = train_raw.to_torch_dataset(
    tokenizer=tokenizer,
    vocab=vocab,
    **config,
)

In [None]:
# Convert the raw test split with exactly the same tokenizer, vocabulary and config.
test_dataset = test_raw.to_torch_dataset(
    tokenizer=tokenizer,
    vocab=vocab,
    **config,
)

### Create MLP model

In [None]:
# Hyperparameters forwarded as keyword arguments to the MLP constructor.
mlp_config = dict(
    embedding_dim=128,
    output_size=2,  # Binary classification (pos/neg)
    hidden_sizes=[256, 128],
    activation_fn="relu",
    dropout_rate=0.3,
    aggregate_strategy="average",
)

In [None]:
# Instantiate the MLP from the shared vocabulary and the hyperparameters in mlp_config.
model = MLP(vocab=vocab, **mlp_config)

In [None]:
# Call the model's summary() helper; anything it prints or returns is shown below the cell.
model.summary()

In [None]:
# Rebind `model` to the result of to_gpu(). The step functions below move every
# batch to GPU_DEVICE, so the model presumably has to live on that same device
# (NOTE(review): confirm to_gpu() targets GPU_DEVICE and returns the model).
model = model.to_gpu()

### Configure training and define the train/eval functions for **one** step

In [None]:
# SGD with learning rate 1e-4 and weight decay 0.01.
# optimizer_kwargs is presumably passed through to the optimizer constructor -- confirm.
optim_config = OptimizerConfig(
    name="SGD",
    optimizer_kwargs=dict(lr=1e-4, weight_decay=0.01),
)

In [None]:
# Select the loss by name; "CrossEntropyLoss" presumably resolves to
# torch.nn.CrossEntropyLoss -- confirm against LossConfig.
loss_config = LossConfig(name="CrossEntropyLoss")

In [None]:
# Run settings: 3 epochs, batch size 32 for both training and evaluation,
# shuffled training data, plus the optimizer and loss configs defined above.
trainer_config = TrainerConfig(
    optim=optim_config,
    loss=loss_config,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    training_shuffle=True,
)

In [None]:
def mlp_train_step(
    trainer,
    model,
    batch,
):
    """Run one optimization step on a single batch.

    Args:
        trainer: Object exposing ``loss`` (called with logits and labels) and
            ``optimizer`` (with ``step()`` and ``zero_grad()``).
        model: Callable as ``model(input_ids, attention_mask)``; its result is
            indexed with ``"logits"``.
        batch: Mapping holding ``"input_ids"``, ``"attention_mask"`` and
            ``"labels"`` tensors.

    Returns:
        dict with ``"loss"`` (Python float), ``"correct"`` (number of labels
        matched by the arg-max over dim 1 of the logits) and ``"total"``
        (size of the labels' first dimension).
    """
    # Every batch is moved to GPU_DEVICE on each call.
    features = {
        key: batch[key].to(GPU_DEVICE) for key in ("input_ids", "attention_mask")
    }
    targets = batch["labels"].to(GPU_DEVICE)

    # Forward pass; the model returns a dictionary, only the logits are needed.
    logits = model(features["input_ids"], features["attention_mask"])["logits"]
    batch_loss = trainer.loss(logits, targets)

    # Count the predictions that agree with the labels.
    hits = (torch.max(logits, dim=1).indices == targets).sum().item()
    n_examples = targets.size(0)

    # Back-propagate, update the weights, then clear gradients for the next step.
    batch_loss.backward()
    trainer.optimizer.step()
    trainer.optimizer.zero_grad()

    return {"loss": batch_loss.item(), "correct": hits, "total": n_examples}

In [None]:
def mlp_eval_step(trainer, model, batch):
    """Compute the loss of the model on a single evaluation batch.

    No gradients are needed here, so the whole step runs under
    ``torch.no_grad()`` to avoid building an autograd graph.

    Args:
        trainer: Object exposing ``loss`` (called with logits and labels).
        model: Callable as ``model(input_ids, attention_mask)``; its result is
            indexed with ``"logits"``.
        batch: Mapping holding ``"input_ids"``, ``"attention_mask"`` and
            ``"labels"`` tensors.

    Returns:
        dict with a single key ``"eval_loss"`` holding the loss as a Python float.

    Note:
        This function does not switch the model to eval mode; presumably the
        trainer handles that -- verify against ``Trainer.evaluate``.
    """
    with torch.no_grad():
        # Remember to move data to device each time we loop
        input_ids = batch["input_ids"].to(GPU_DEVICE)
        attention_mask = batch["attention_mask"].to(GPU_DEVICE)
        labels = batch["labels"].to(GPU_DEVICE)

        # Output of forward pass (return a dictionary)
        outputs = model(input_ids, attention_mask)

        # Calculate loss
        loss_val = trainer.loss(outputs["logits"], labels).item()

    return {"eval_loss": loss_val}

In [None]:
# Wire everything together: the model, the run config, the two per-step functions
# and an accuracy callback. The test split is used as the evaluation dataset.
trainer = Trainer(
    model=model,
    config=trainer_config,
    device=GPU_DEVICE,
    train_dataset=train_dataset,
    train_step_fn=mlp_train_step,
    eval_dataset=test_dataset,
    eval_step_fn=mlp_eval_step,
    callbacks=[AccuracyCallback()],
)

In [None]:
# Start training with the settings in trainer_config.
trainer.train()

In [None]:
# Run the trainer's evaluation; the trainer was given the test split as its
# eval_dataset, so this presumably reports the loss on the test data.
trainer.evaluate()