## Run the following pip commands if necessary

In [1]:
#!pip install datasets

In [2]:
#!pip install tokenizers

In [3]:
#!pip install transformers

## Create directories to store dataset and tokenizer files

In [4]:
import os

# Create the output folders up front. exist_ok=True keeps this cell re-runnable:
# plain os.mkdir raises FileExistsError as soon as the folder already exists
# (for example on a second "Restart & Run All").
os.makedirs("./oscar", exist_ok=True)          # plain-text dataset shards
os.makedirs("./new_tokenizer", exist_ok=True)  # trained tokenizer files

## Select the dataset we want to use

In [5]:
from datasets import load_dataset

# Hub repository and configuration to pull; only the "train" split is requested.
oscar_repo = "nthngdy/oscar-small"
oscar_config = "unshuffled_deduplicated_en"
dataset = load_dataset(oscar_repo, oscar_config, split="train")

## Break up data into smaller files

In [6]:
from tqdm.auto import tqdm

SAMPLES_PER_FILE = 5000  # number of samples written to each shard file


def _write_shard(lines, shard_index):
    """Write ``lines`` (one sample per line) to ./oscar/file_<shard_index>.txt."""
    with open(f"./oscar/file_{shard_index}.txt", "w", encoding="utf-8") as fp:
        fp.write("\n".join(lines))


text_data = []
file_count = 0
for sample in tqdm(dataset):
    # Flatten each sample onto a single line so that one line == one sample.
    text_data.append(sample["text"].replace("\n", " "))
    # Once the buffer is full, flush it to its own file and start a new one.
    if len(text_data) == SAMPLES_PER_FILE:
        _write_shard(text_data, file_count)
        text_data = []
        file_count += 1

# Flush whatever is left, but only if there is something left: when the number
# of samples is an exact multiple of SAMPLES_PER_FILE the buffer is empty here
# and an unconditional write would leave a stray zero-byte file behind.
if text_data:
    _write_shard(text_data, file_count)

  0%|          | 0/595810 [00:00<?, ?it/s]

## Get list of files we will feed the tokenizer

In [7]:
from pathlib import Path

# Recursively gather every .txt file below ./oscar and keep them as plain strings.
paths = list(map(str, Path("./oscar").rglob("*.txt")))
paths[:5]  # peek at the first few entries

['oscar\\file_0.txt',
 'oscar\\file_1.txt',
 'oscar\\file_10.txt',
 'oscar\\file_100.txt',
 'oscar\\file_101.txt']

In [8]:
# Number of .txt files collected in `paths`.
len(paths)

120

## Initialize tokenizer

In [9]:
from tokenizers import BertWordPieceTokenizer
# Untrained WordPiece tokenizer. The comments describe each flag as it is set
# here, following the tokenizers documentation (confirm against the installed version).
tokenizer = BertWordPieceTokenizer(
    clean_text=True, # Removes control characters and converts all white space into plain spaces
    handle_chinese_chars=False, # False: do NOT put spaces around Chinese characters (True would isolate each one)
    strip_accents=False, # False: accents are kept (True would strip them, e.g. é -> e)
    lowercase=True # Convert uppercase letters to lowercase
)

## Train tokenizer

In [10]:
# Training settings gathered in one place so they are easy to read and tweak.
train_options = dict(
    vocab_size=10000,        # Number of tokens the tokenizer may hold
    min_frequency=2,         # A pair of tokens/characters must be seen together at least
                             # this many times before it becomes a token of its own
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],  # Special tokens used by Bert
    limit_alphabet=1000,     # Cap on the number of single-character tokens in the vocab
    wordpieces_prefix="##",  # Marks a piece that continues a word rather than starting one
)

# Learn the vocabulary from the text files listed in `paths`.
tokenizer.train(files=paths, **train_options)

## Save tokenizer

In [11]:
# Write the trained vocabulary into ./new_tokenizer; the call returns the list
# of files it wrote, which the notebook displays as the cell output.
tokenizer.save_model("./new_tokenizer")

['./new_tokenizer\\vocab.txt']

## Load Tokenizer

In [12]:
from transformers import BertTokenizer

# save_model() only wrote vocab.txt, so none of the normalization settings used
# for training travel with it. BertTokenizer's defaults differ from them: with
# do_lower_case=True and strip_accents left unset it strips accents, and it
# splits Chinese characters by default. Repeat the training-time choices
# (lowercase=True, strip_accents=False, handle_chinese_chars=False) explicitly,
# otherwise accented or Chinese text is normalized differently from the text
# the vocabulary was built on.
tokenizer = BertTokenizer.from_pretrained(
    "./new_tokenizer",
    do_lower_case=True,
    strip_accents=False,
    tokenize_chinese_chars=False,
)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## Tokenize a sentence

In [13]:
# Encode one sentence; the displayed result holds input_ids, token_type_ids and attention_mask.
tokenizer("hello! how are you?")

{'input_ids': [2, 6684, 5, 2195, 2017, 1962, 35, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

## Split tokenizer words and save into variable

In [14]:
# vocab.txt holds one token per line, and the line index is the token id.
# Decode it explicitly as UTF-8 (assumed to be what the tokenizer wrote; verify
# if decoding fails). Without an explicit encoding, Windows falls back to a
# legacy code page, and errors="ignore" would then silently drop or garble
# non-ASCII tokens instead of reporting the problem.
with open("./new_tokenizer/vocab.txt", "r", encoding="utf-8") as fp:
    # Strip only the line terminator so that len(vocab) matches the number of
    # tokens; read().split("\n") would leave a trailing empty entry whenever
    # the file ends with a newline.
    vocab = [line.rstrip("\n") for line in fp]

In [15]:
# First five vocabulary entries (ids 0-4).
vocab[:5]

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']

## Get the token values from a sentence

In [16]:
# Keep only the list of token ids from the encoded sentence.
tokenizer("hello! how are you?")["input_ids"]

[2, 6684, 5, 2195, 2017, 1962, 35, 3]

## Recreate a sentence with tokens

In [17]:
# Map every id back to its vocabulary entry and show the pieces on one line.
token_ids = tokenizer("hello! how are you?")["input_ids"]
print(*(vocab[token_id] for token_id in token_ids), end=" ")

[CLS] hello ! how are you ? [SEP] 

In [18]:
# Map every id back to its vocabulary entry and show the pieces on one line.
token_ids = tokenizer("I understood nothing")["input_ids"]
print(*(vocab[token_id] for token_id in token_ids), end=" ")

[CLS] i understood nothing [SEP] 

## Use a word that will have pieces

In [19]:
# A word missing from the vocabulary is expected to come back as several
# pieces; look each id up and show the pieces on one line.
token_ids = tokenizer("responsability")["input_ids"]
print(*(vocab[token_id] for token_id in token_ids), end=" ")

[CLS] respons ##ability [SEP] 