In [None]:
# Uncomment the next line to install banhxeo when running on Colab or Kaggle:
# %pip install banhxeo

In [None]:
from pathlib import Path

import torch
from torch.utils.data import DataLoader

from banhxeo.core.tokenizer import NLTKTokenizer
from banhxeo.core.vocabulary import Vocabulary
from banhxeo.dataset import IMDBDataset
from banhxeo.dataset.transforms import (
    RemoveHTMLTag,
    RemovePunctuation,
    RemoveURL,
    Strip,
)
from banhxeo.models.neural.mlp import MLP

### Create raw dataset

In [None]:
# Load the "train" split of the IMDB dataset. "./" is the first positional
# argument -- presumably the data root / download directory; confirm against
# IMDBDataset's signature.
raw_imdb = IMDBDataset("./", split="train")

In [None]:
# Show the underlying data object through the notebook's display hook (which
# uses a rich repr when the type provides one) instead of print()'s plain text.
raw_imdb.data

In [None]:
# Peek at the first five items of text_data rather than the whole collection.
raw_imdb.text_data[:5]

### Create tokenizer and vocabulary

In [None]:
# Instantiate the tokenizer with its default arguments; the vocabulary and the
# torch dataset created further down both receive this same instance.
tokenizer = NLTKTokenizer()

In [None]:
# Reuse the cached vocabulary file when it is present; otherwise build the
# vocabulary from the raw corpus. Loading unconditionally breaks
# "Restart & Run All" on a fresh checkout, because the JSON file is only
# written by the save cell further down.
VOCAB_PATH = "vocab/imdb.json"
if Path(VOCAB_PATH).is_file():
    vocab = Vocabulary.load(path=VOCAB_PATH, tokenizer=tokenizer)
else:
    # Building scans the whole training corpus, so it may take a while.
    vocab = Vocabulary.build(corpus=raw_imdb.text_data, tokenizer=tokenizer)

In [None]:
# Inspect the entries at indices 100-119 of the index -> token table.
vocab.idx_to_token[100:120]

In [None]:
# Persist the vocabulary to disk. The target directory is created first in case
# it does not exist yet (e.g. first run on a fresh checkout) -- harmless if
# vocab.save() already takes care of that.
Path("vocab/imdb.json").parent.mkdir(parents=True, exist_ok=True)
vocab.save("vocab/imdb.json")

### Create torch dataset and dataloader

In [None]:
# Cleaning steps handed to the dataset as `transforms`, kept in their original
# order: URL removal, HTML tag removal, punctuation removal, then Strip with
# the "lower" flag set.
text_cleanup = [
    RemoveURL(),
    RemoveHTMLTag(),
    RemovePunctuation(),
    Strip(metadata={"lower": True}),
]

# Encoding options: special tokens on, padding and truncation enabled with
# max_length=128.
encoding_options = dict(
    add_special_tokens=True,
    max_length=128,
    padding=True,
    truncation=True,
)

# Convert the raw dataset into a torch dataset using the shared vocab/tokenizer.
train_imdb = raw_imdb.to_torch_dataset(
    vocab=vocab,
    tokenizer=tokenizer,
    **encoding_options,
    transforms=text_cleanup,
)

In [None]:
# Make shuffling reproducible across "Restart & Run All":
#  - torch.manual_seed seeds torch's global RNG, which any later cell drawing
#    from it will consume;
#  - the dedicated generator fixes the loader's shuffle order independently of
#    whatever else consumes the global RNG.
SEED = 42
torch.manual_seed(SEED)

# Batches of 32 examples, loaded by 2 worker processes, reshuffled every epoch.
train_loader = DataLoader(
    train_imdb,
    batch_size=32,
    num_workers=2,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)

### Train model

In [None]:
# Hyper-parameters for the classifier: output_size=2 (presumably the two
# sentiment classes -- confirm) and hidden_sizes of 256 and 128.
mlp_config = {"output_size": 2, "hidden_sizes": [256, 128]}

# Build the MLP on top of the shared vocabulary.
model = MLP(vocab=vocab, **mlp_config)

In [None]:
# Call the model's to_gpu() helper -- presumably moves the model to a CUDA
# device; NOTE(review): confirm how it behaves on a CPU-only runtime.
model.to_gpu()