# Download and unzip the given two datasets.

In [3]:
!wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt

--2022-03-15 13:32:11--  http://www.gutenberg.org/cache/epub/16457/pg16457.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/16457/pg16457.txt [following]
--2022-03-15 13:32:11--  https://www.gutenberg.org/cache/epub/16457/pg16457.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 617622 (603K) [text/plain]
Saving to: ‘pg16457.txt’


2022-03-15 13:32:11 (4.85 MB/s) - ‘pg16457.txt’ saved [617622/617622]



In [4]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip

--2022-03-15 13:32:11--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.85.53
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.85.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip’


2022-03-15 13:32:15 (54.8 MB/s) - ‘wikitext-103-raw-v1.zip’ saved [191984949/191984949]



In [5]:
!unzip wikitext-103-raw-v1.zip

Archive:  wikitext-103-raw-v1.zip
   creating: wikitext-103-raw/
  inflating: wikitext-103-raw/wiki.test.raw  
  inflating: wikitext-103-raw/wiki.valid.raw  
  inflating: wikitext-103-raw/wiki.train.raw  


# Installing the required tokenizers

In [6]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 2.6 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.6


## Importing the tokenizer and subword BPE trainer

In [7]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer

## A pretokenizer to segment the text into words

In [8]:
from tokenizers.pre_tokenizers import Whitespace

# Define the special tokens familar to BERT pre-trained model.

In [9]:
Unknown_token = "<UNK>" # Unknown words
Special_tokens = ["<UNK>", "<SEP>", "<MASK>", "<CLS>"]  # Special tokens used in BERT model

# Prepares the tokenizer and trainer 

In [10]:
def prepare_tokenizer_trainer(algorithm):
    if algorithm == 'BPE': # Stands for Binary Pair Encoding.
        tokenizer = Tokenizer(BPE(unk_token = Unknown_token))
        trainer = BpeTrainer(special_tokens = Special_tokens)

    elif algorithm == 'WPC': # Stands for Word Piece.
        tokenizer = Tokenizer(WordPiece(unk_token = Unknown_token))
        trainer = WordPieceTrainer(special_tokens = Special_tokens)
    else:
      print("Invalid Algorithm, Try again !")
    
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer

# Takes the files and trains the tokenizer.

In [11]:
def train_tokenizer(input_files, algorithm='BPE'):
 
    tokenizer, trainer = prepare_tokenizer_trainer(algorithm)
    tokenizer.train(input_files, trainer) # training the tokenzier
    #tokenizer.save("./tokenizer-trained.json")
    #tokenizer = Tokenizer.from_file("./tokenizer-trained.json")
    return tokenizer

# Tokenizes the input string using the trained tokenizer.

In [12]:
def tokenize(input_string, tokenizer):
    output = tokenizer.encode(input_string)
    return output

# Training each model on the available datasets.

## Define our datasets.

In [13]:
small_file = ['pg16457.txt'] # single .txt file

In [14]:
large_files = [f"./wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]] # Consist of three parts,tarin set, test set and validation set.

## Define an input text.

In [15]:
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"

## Create an empty dictionary to append tokenized letters.

In [19]:
def return_tokenized(input_string ,input_files):
  tokens_dict = {}
  len_tokens=[]
  for file in input_files:
      print(f"========Using vocabulary from {file}=======")
      for algorithm in ['BPE','WPC']:
          trained_tokenizer = train_tokenizer(file, algorithm)
          output = tokenize(input_string, trained_tokenizer)
          tokens_dict[algorithm] = output.tokens
          len_tokens.append(len(output.tokens))
          print("----", algorithm, "----")
          print(output.tokens, "->", len(output.tokens))
  return len_tokens

In [20]:
len_tokens=return_tokenized(input_string ,[small_file, large_files])

---- BPE ----
['This', 'is', 'a', 'deep', 'learning', 'to', 'ken', 'ization', 't', 'ut', 'or', 'ial', '.', 'T', 'ok', 'en', 'ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'N', 'L', 'P', 'pi', 'pe', 'line', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', 'k', 'ens', 'generated', 'by', 'each', 'to', 'ken', 'ization', 'model', '.', 'Ex', 'c', 'ited', 'much', '?', '!', '<UNK>'] -> 55
---- WPC ----
['This', 'is', 'a', 'deep', 'learning', 'to', '##ken', '##ization', 't', '##ut', '##oria', '##l', '.', 'To', '##ken', '##ization', 'is', 'the', 'first', 'step', 'in', 'a', 'deep', 'learning', 'N', '##L', '##P', 'pip', '##el', '##ine', '.', 'We', 'will', 'be', 'comparing', 'the', 'to', '##ken', '##s', 'generated', 'by', 'each', 'to', '##ken', '##ization', 'model', '.', 'Ex', '##ci', '##ted', 'much', '<UNK>'] -> 52
---- BPE ----
['This', 'is', 'a', 'deep', 'learning', 'to', 'ken', 'ization', 'tut', 'orial', '.', 'Tok', 'en', 'ization', 'is', 'the', 'first', 'step', 'in',

In [23]:
data = [['  ','BPE', "WPC"], ['pg16457.txt',len_tokens[0],len_tokens[2]], 

             ['wiki.raw',len_tokens[1],len_tokens[3]]]

In [24]:
from tabulate import tabulate
print(tabulate(data, headers='firstrow', tablefmt='fancy_grid'))

╒═════════════╤═══════╤═══════╕
│             │   BPE │   WPC │
╞═════════════╪═══════╪═══════╡
│ pg16457.txt │    55 │    47 │
├─────────────┼───────┼───────┤
│ wiki.raw    │    52 │    48 │
╘═════════════╧═══════╧═══════╛


In [37]:
with open('pg16457.txt') as f:
    lines = f.readlines()

In [38]:
new_input=" ".join(str(x) for x in lines)

In [39]:
len_tokens=return_tokenized(new_input ,[small_file, large_files])

Output hidden; open in https://colab.research.google.com to view.

In [40]:
data = [['  ','BPE', "WPC"], ['pg16457.txt',len_tokens[0],len_tokens[2]], 

             ['wiki.raw',len_tokens[1],len_tokens[3]]]

In [41]:
print(tabulate(data, headers='firstrow', tablefmt='fancy_grid'))

╒═════════════╤════════╤════════╕
│             │    BPE │    WPC │
╞═════════════╪════════╪════════╡
│ pg16457.txt │ 122739 │ 140872 │
├─────────────┼────────┼────────┤
│ wiki.raw    │ 122739 │ 140735 │
╘═════════════╧════════╧════════╛
