In [1]:
# Resolve the notebook's working directory; the dataset paths in the cells
# below are all built from PATH.
# os.getcwd() replaces the `!pwd` shell escape: it needs no external `pwd`
# binary and returns a plain str directly (no list indexing required).
import os
import sys

PATH = os.getcwd()

# Make the local `languagemodels` package importable.
# NOTE(review): machine-specific absolute path — adjust it to your checkout.
PROJECT_ROOT = "/nethome/mmosbach/projects/languagemodels/"
if PROJECT_ROOT not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(PROJECT_ROOT)

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [None]:
# download a dateset
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip -P {PATH}/datasets

In [None]:
# unzip the data
!unzip {PATH}/datasets/wikitext-103-raw-v1.zip -d {PATH}/datasets/

In [None]:
# read the validation split into memory, one list entry per line of the file
with open(
    f"{PATH}/datasets/wikitext-103-raw/wiki.valid.raw",
    encoding="utf-8",
) as f:
    data = list(f)

# show the first 10 lines; note that a single line can contain one or more sentences
print(data[:10])

In [None]:
# word-level model with "<unk>" as its unknown token, pre-tokenized with Whitespace()
tokenizer = Tokenizer(model=WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

# trainer with a vocabulary size of 30000; the special tokens will be useful later
trainer = WordLevelTrainer(
    special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
    vocab_size=30000,
)

# train on the test, train and valid files of the dataset
files = [
    f"{PATH}/datasets/wikitext-103-raw/wiki.{split}.raw"
    for split in ("test", "train", "valid")
]
tokenizer.train(files, trainer)

In [None]:
# write the trained tokenizer to a JSON file next to the dataset
tokenizer.save(
    f"{PATH}/datasets/wikitext-103-raw/tokenizer-wiki.json"
)

In [None]:
# restore the tokenizer from the JSON file written by the previous cell
tokenizer = Tokenizer.from_file(
    f"{PATH}/datasets/wikitext-103-raw/tokenizer-wiki.json"
)

In [None]:
# tokenize and encode a sample sentence, then show the encoding object
# followed by its ids, tokens and character offsets (one per line)
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output, output.ids, output.tokens, output.offsets, sep="\n")

In [None]:
# a post-processor lets us surround each single sequence with the special
# symbols <s> ... </s>; each special token is passed together with its id
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

In [None]:
# encode the same sentence once more and print the same four views
# (encoding object, ids, tokens, offsets), one per line
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output, output.ids, output.tokens, output.offsets, sep="\n")

In [None]:
# encode the first 10 lines of our data in one batch call
outputs = tokenizer.encode_batch(data[:10])

# per sequence: ids, tokens and offsets on separate lines, then a blank line
for output in outputs:
    print(output.ids, output.tokens, output.offsets, sep="\n", end="\n\n")

In [None]:
# turn on padding with "<pad>" so that every encoded sequence has at least
# 100 tokens
# NOTE: auto-regressive LMs typically do not use padding tokens
tokenizer.enable_padding(
    length=100,
    pad_token="<pad>",
    pad_id=tokenizer.token_to_id("<pad>"),
)

In [None]:
# encode the same 10 lines again, now with padding enabled
outputs = tokenizer.encode_batch(data[:10])

# per sequence: length, ids, attention mask, tokens and offsets, then a blank line
for output in outputs:
    print(
        len(output.ids),
        output.ids,
        output.attention_mask,  # takes padding into account
        output.tokens,
        output.offsets,
        sep="\n",
        end="\n\n",
    )

# --> based on this we can create batches to train our language model on

In [None]:
from languagemodels.tokenization import train_word_level_tokenizer

In [None]:
# paths of the test, train and valid files of the dataset
files = [
    f"{PATH}/datasets/wikitext-103-raw/wiki.{split}.raw"
    for split in ("test", "train", "valid")
]

# train a tokenizer with the project helper, this time with vocab_size=1000
tokenizer = train_word_level_tokenizer(files=files, vocab_size=1000)

In [None]:
# two short hand-written sentences to use as toy data
data = ["This is the first example sentence .", "Another one, but shorter ."]

# encode both sentences in a single batch call
outputs = tokenizer.encode_batch(data)

In [None]:
# per encoded sentence: length, ids, attention mask, tokens and offsets,
# each on its own line and followed by a blank line
for output in outputs:
    print(
        len(output.ids),
        output.ids,
        output.attention_mask,  # takes padding into account
        output.tokens,
        output.offsets,
        sep="\n",
        end="\n\n",
    )