In [1]:
# Resolve the notebook's working directory in pure Python instead of shelling
# out to `pwd`, so this cell does not depend on an external `pwd` binary.
import os
import sys

PATH = os.getcwd()

# Make the project package importable without hard-coding one user's home
# directory. NOTE(review): assumes the notebook is run from the project's
# `examples/` directory, so the project root is the parent of PATH.
PROJECT_ROOT = os.path.dirname(PATH)
if PROJECT_ROOT not in sys.path:  # keep re-runs of this cell idempotent
    sys.path.append(PROJECT_ROOT)

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [3]:
# download a dateset
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip -P {PATH}/datasets

--2022-09-26 14:38:39--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.42.160
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.42.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘/nethome/mmosbach/projects/languagemodels/examples/datasets/wikitext-103-raw-v1.zip.1’



In [6]:
# unzip the data
!unzip {PATH}/datasets/wikitext-103-raw-v1.zip -d {PATH}/datasets/

Archive:  /nethome/mmosbach/projects/languagemodels/examples/datasets/wikitext-103-raw-v1.zip
replace /nethome/mmosbach/projects/languagemodels/examples/datasets/wikitext-103-raw/wiki.test.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [7]:
# Peek at the raw validation split before doing anything with it.
valid_path = f"{PATH}/datasets/wikitext-103-raw/wiki.valid.raw"
with open(valid_path, encoding="utf-8") as fh:
    data = list(fh)  # one string per line of the file

# Show the first 10 lines; note that a single line can hold one or more sentences.
print(data[:10])

[' \n', ' = Homarus gammarus = \n', ' \n', ' Homarus gammarus , known as the European lobster or common lobster , is a species of clawed lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into planktonic larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n', ' \n', ' = = Description = = \n', ' \n', ' Homarus gammarus is a large crustacean , with a body length up to 60 centimetres ( 24 in ) and weighing up to 5 – 6 kilograms ( 11 – 13 lb ) , although the lobsters caught in lobster pots are usually 23 – 38

In [8]:
# Train a word-level tokenizer on the test, train and valid splits.
files = [f"{PATH}/datasets/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]

# Reserve the special tokens up front; they will be useful later.
special_tokens = ["<unk>", "<s>", "</s>", "<pad>"]
trainer = WordLevelTrainer(vocab_size=30000, special_tokens=special_tokens)

tokenizer = Tokenizer(model=WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train(files, trainer)

In [9]:
# Persist the trained tokenizer to tokenizer-wiki.json next to the dataset files.
tokenizer_file = f"{PATH}/datasets/wikitext-103-raw/tokenizer-wiki.json"
tokenizer.save(tokenizer_file)

In [10]:
# Reload the trained tokenizer from tokenizer-wiki.json.
saved_tokenizer_file = f"{PATH}/datasets/wikitext-103-raw/tokenizer-wiki.json"
tokenizer = Tokenizer.from_file(saved_tokenizer_file)

In [11]:
# Tokenize and encode a sample sentence, then show the encoding summary
# followed by its ids, tokens and offsets, one per line.
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output, output.ids, output.tokens, output.offsets, sep="\n")

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[14046, 5, 5660, 15, 76, 517, 4812, 40, 308, 0, 801]
['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '<unk>', '?']
[(0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17), (18, 21), (22, 25), (26, 27), (28, 29)]


In [12]:
# Use a post-processor to wrap every encoded sequence in <s> ... </s>.
boundary_tokens = [(tok, tokenizer.token_to_id(tok)) for tok in ["<s>", "</s>"]]
tokenizer.post_processor = TemplateProcessing(single="<s> $A </s>", special_tokens=boundary_tokens)

In [13]:
# Encode the same sample sentence again and show the encoding summary
# followed by its ids, tokens and offsets, one per line.
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output, output.ids, output.tokens, output.offsets, sep="\n")

Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[1, 14046, 5, 5660, 15, 76, 517, 4812, 40, 308, 0, 801, 2]
['<s>', 'Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '<unk>', '?', '</s>']
[(0, 0), (0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17), (18, 21), (22, 25), (26, 27), (28, 29), (0, 0)]


In [14]:
# Encode the first 10 lines of our data as one batch and show, per sequence,
# its ids, tokens and offsets followed by a blank separator line.
outputs = tokenizer.encode_batch(data[:10])
for output in outputs:
    print(output.ids, output.tokens, output.offsets, sep="\n", end="\n\n")

[1, 2]
['<s>', '</s>']
[(0, 0), (0, 0)]

[1, 12, 0, 0, 12, 2]
['<s>', '=', '<unk>', '<unk>', '=', '</s>']
[(0, 0), (1, 2), (3, 10), (11, 19), (20, 21), (0, 0)]

[1, 2]
['<s>', '</s>']
[(0, 0), (0, 0)]

[1, 0, 0, 5, 138, 22, 4, 681, 24414, 50, 554, 24414, 5, 27, 11, 271, 7, 0, 24414, 29, 4, 912, 1039, 2018, 5, 2768, 1090, 8, 789, 7, 4, 839, 1090, 6, 66, 27, 2111, 1003, 9, 4, 137, 24414, 5, 1020, 6, 0, 6, 66, 171, 2994, 9, 11, 890, 7, 975, 1091, 26, 420, 10, 25, 8, 11, 1343, 7, 150, 8582, 26, 324, 2524, 25, 5, 8, 5294, 11, 12774, 1430, 7, 14126, 6, 36, 200, 5, 4, 0, 40, 1679, 5, 79, 897, 13, 24414, 960, 13, 19, 8690, 6, 0, 2661, 10, 4, 928, 5, 2408, 3146, 37, 40, 878, 24, 4, 2627, 21, 80, 9, 11, 101, 99, 16452, 65, 0, 8014, 6, 0, 0, 27, 11, 1270, 0, 959, 5, 8, 27, 1557, 1958, 401, 24414, 24995, 5, 1005, 169, 4, 152, 6371, 6, 2]
['<s>', '<unk>', '<unk>', ',', 'known', 'as', 'the', 'European', 'lobster', 'or', 'common', 'lobster', ',', 'is', 'a', 'species', 'of', '<unk>', 'lobster', 'from'

In [15]:
# Enable padding so that every encoded sequence has at least 100 tokens.
# NOTE: auto-regressive LMs typically do not use padding tokens
pad_token = "<pad>"
tokenizer.enable_padding(pad_id=tokenizer.token_to_id(pad_token), pad_token=pad_token, length=100)

In [16]:
# Encode the same 10 lines again and show, per sequence, its length, ids,
# attention mask, tokens and offsets followed by a blank separator line.
outputs = tokenizer.encode_batch(data[:10])
for output in outputs:
    print(
        len(output.ids),
        output.ids,
        output.attention_mask,  # takes padding into account
        output.tokens,
        output.offsets,
        sep="\n",
        end="\n\n",
    )

# --> based on this we will create batches to train our language model on

100
[1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['<s>', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

In [3]:
from languagemodels.tokenization import train_word_level_tokenizer

In [4]:
# Train a word-level tokenizer through the project helper, with vocab_size=1000,
# on the test, train and valid splits.
splits = ["test", "train", "valid"]
files = [f"{PATH}/datasets/wikitext-103-raw/wiki.{split}.raw" for split in splits]
tokenizer = train_word_level_tokenizer(files=files, vocab_size=1000)

In [8]:
# Two short toy sentences, encoded together as one batch.
data = ["This is the first example sentence .", "Another one, but shorter ."]
outputs = tokenizer.encode_batch(data)

In [9]:
# Show, per encoded sequence, its length, ids, attention mask, tokens and
# offsets, followed by a blank separator line.
for output in outputs:
    print(
        len(output.ids),
        output.ids,
        output.attention_mask,  # takes padding into account
        output.tokens,
        output.offsets,
        sep="\n",
        end="\n\n",
    )

9
[1, 101, 26, 3, 42, 646, 0, 5, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1]
['<s>', 'This', 'is', 'the', 'first', 'example', '<unk>', '.', '</s>']
[(0, 0), (0, 4), (5, 7), (8, 11), (12, 17), (18, 25), (26, 34), (35, 36), (0, 0)]

8
[1, 0, 50, 4, 38, 0, 5, 2]
[1, 1, 1, 1, 1, 1, 1, 1]
['<s>', '<unk>', 'one', ',', 'but', '<unk>', '.', '</s>']
[(0, 0), (0, 7), (8, 11), (11, 12), (13, 16), (17, 24), (25, 26), (0, 0)]

