### Language Model (Tokenization and BLEU)

Wei Li

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import sacrebleu
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Tuple
import torchdata
import spacy
# Print the installed torch and sacrebleu versions so the recorded outputs can be tied to them.
print(torch.__version__)
print(sacrebleu.__version__)

1.12.1
2.3.1


In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import Multi30k
from collections import Counter
from tqdm import tqdm

### Tokenization

In [3]:
# Build the English and German tokenizers with torchtext's get_tokenizer.
# "spacy" selects the spaCy backend; 'en_core_web_sm' is the small English model
# and 'de_core_news_sm' is the small German model.
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
# Each tokenizer is called directly on a string; the sample printed in this cell
# shows it returning a plain list of token strings.
de_tokenizer = get_tokenizer("spacy", language="de_core_news_sm")

# Define a function to tokenize English text
# Uses the English tokenizer to split the text into tokens
def tokenize_en(text):
    """Tokenize ``text`` with the English tokenizer.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Returns a new list holding the tokens that ``en_tokenizer``
    produces, in order.
    """
    return list(en_tokenizer(str(text)))

# Sanity check: the raw tokenizer and the tokenize_en wrapper should print the same tokens.
sample="Hello, world! This is a test sentence."
print(en_tokenizer(sample))
print(tokenize_en(sample))


['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', '.']
['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', '.']


In [4]:
# Define a class to create a vocabulary from text
class VOCAB:
    """Vocabulary mapping tokens to integer indices and back.

    Special tokens occupy the first indices, followed by every corpus token
    whose count reaches ``min_freq``, in order of first appearance.

    Args:
        tokenizer: callable that turns one text into an iterable of tokens.
        min_freq: minimum number of occurrences for a token to be kept.
        data: iterable of texts to build the vocabulary from; ``None`` builds
            a vocabulary that contains only the special tokens.
        special_tokens: tokens placed at the start of the vocabulary
            (like <pad>, <sos>, etc.).

    Attributes set by ``build_vocab``:
        stoi: dict mapping token -> index.
        itos: list mapping index -> token.
    """

    def __init__(self, tokenizer, min_freq=2, data=None, special_tokens=('<pad>', '<sos>', '<eos>', '<unk>')):
        self.tokenizer = tokenizer
        self.min_freq = min_freq
        # Store a private, de-duplicated list: instances never share the
        # default sequence, and repeated entries cannot desynchronise
        # stoi and itos.
        self.special_tokens = list(dict.fromkeys(special_tokens))
        self.build_vocab(data)

    def build_vocab(self, data):
        """Count the tokens in ``data`` and (re)build ``stoi`` / ``itos``."""
        counter = Counter()
        # data=None (the constructor default) means "no corpus" rather than an error.
        for text in tqdm(data if data is not None else []):
            tokens = self.tokenizer(text)
            counter.update(tokens)

        # Keep tokens that meet the frequency threshold. A special token that
        # also occurs in the corpus is skipped here so it keeps its reserved
        # index and is not listed twice.
        reserved = set(self.special_tokens)
        voc_ls = [
            token
            for token, freq in counter.items()
            if freq >= self.min_freq and token not in reserved
        ]

        # Special tokens come first, so their indices are stable across corpora.
        voc_ls = self.special_tokens + voc_ls

        self.stoi = {token: index for index, token in enumerate(voc_ls)}
        self.itos = voc_ls

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.stoi)

    def __getitem__(self, token):
        """Return the index of ``token``, or the index of '<unk>' if it is unknown.

        Raises:
            KeyError: if ``token`` is unknown and '<unk>' is not in the vocabulary.
        """
        index = self.stoi.get(token)
        if index is None:
            # Resolve '<unk>' only when it is needed, so known tokens can be
            # looked up even when the vocabulary has no '<unk>' entry.
            index = self.stoi['<unk>']
        return index

#### counter

When `counter.update(tokens)` is called, it iterates over each token in the tokens list.
For each token, if it's not already in the Counter, it's added with a count of 1. If it's already present, its count is incremented by 1.

In [5]:
# Toy corpus and the minimum count a token needs to stay in the vocabulary
min_freq = 2
data = ["hello world", "hello python world", "hello python!"]

# Running tally of how often each token occurs across the corpus
counter = Counter()
for text in tqdm(data):
    tokens = en_tokenizer(text)
    print(tokens)
    counter.update(tokens)

# Keep only the tokens seen at least `min_freq` times, in first-seen order
voc_ls = [tok for tok in counter if counter[tok] >= min_freq]

# Show the surviving tokens next to the complete tally
print("Tokens above frequency threshold:", voc_ls)
print("Full token frequency counter:", counter)
print()

# string-to-index mapping: each kept token paired with its position in voc_ls
stoi = dict(zip(voc_ls, range(len(voc_ls))))
print(stoi)

# index-to-string mapping is the token list itself
itos = voc_ls
print(itos)


100%|██████████| 3/3 [00:00<00:00, 2949.58it/s]

['hello', 'world']
['hello', 'python', 'world']
['hello', 'python', '!']
Tokens above frequency threshold: ['hello', 'world', 'python']
Full token frequency counter: Counter({'hello': 3, 'world': 2, 'python': 2, '!': 1})

{'hello': 0, 'world': 1, 'python': 2}
['hello', 'world', 'python']





In [6]:
# Location of the English training file
en_file = "../data/Multi30k/train/train.en"

# Load one sentence per line, trimming surrounding whitespace
with open(en_file, "r", encoding="utf8") as f:
    train_data_en = [line.strip() for line in f]

# Peek at the first three sentences and report how many were loaded
print(train_data_en[:3], len(train_data_en))

# Build the English vocabulary; min_freq=1 keeps every token that occurs at all
EN_VOCAB = VOCAB(tokenize_en, data=train_data_en, min_freq=1)

# Report the size of the English vocabulary
print("\nVocab Size English", len(EN_VOCAB))

['Two young, White males are outside near many bushes.', 'Several men in hard hats are operating a giant pulley system.', 'A little girl climbing into a wooden playhouse.'] 29001


100%|██████████| 29001/29001 [00:00<00:00, 46783.64it/s]


Vocab Size English 10837





In [7]:
# Example: turn a padded index sequence (with special tokens) back into the plain sentence

# Toy German vocabulary: token -> index
DE_VOCAB = {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'Hallo': 3, 'Welt': 4, 'wie': 5, 'geht': 6, 'es': 7, 'dir': 8, '?': 9}

# index -> token lookup: the vocabulary keys ordered by their index
itos = sorted(DE_VOCAB, key=DE_VOCAB.get)

# Padded target sequence, i.e. "<sos> Hallo Welt wie geht es dir ? <eos> <pad> <pad>"
tgt = [1, 3, 4, 5, 6, 7, 8, 9, 2, 0, 0]

# Decode to text, skipping the padding / start / end markers
ref = ' '.join(
    itos[idx]
    for idx in tgt
    if itos[idx] not in ('<pad>', '<eos>', '<sos>')
)

print(itos)  # list of tokens
print(ref)   # the decoded sentence


['<pad>', '<sos>', '<eos>', 'Hallo', 'Welt', 'wie', 'geht', 'es', 'dir', '?']
Hallo Welt wie geht es dir ?


### BLEU (Bilingual Evaluation Understudy) Score

Note that there are two modules that compute BLEU score, one is `sacrebleu` and one is from
`nltk.translate.bleu_score`. We use `sacrebleu` here.

https://www.nltk.org/api/nltk.translate.bleu_score.html

https://github.com/mjpost/sacrebleu

# BLEU Score Explanation with Example

BLEU (Bilingual Evaluation Understudy) score is a metric for evaluating a machine-translated text against a set of high-quality reference translations. Let's understand this using a simple example.

Example Setup
- **Reference Sentence**: "The quick brown fox jumps over the lazy dog."
- **Machine-Translated Sentence**: "The fast brown fox jumped over the lazy dog."

N-gram Precision: we compare n-grams (continuous sequences of n items from the text) between the machine-translated sentence and the reference sentence.

1-gram (Unigram) Precision
- Reference: ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
- Translation: ["The", "fast", "brown", "fox", "jumped", "over", "the", "lazy", "dog"]
- Matches: 7 out of 9 ("The", "brown", "fox", "over", "the", "lazy", "dog")
- **Precision**: 7/9

2-gram (Bigram) Precision
- Reference bigrams: ["The quick", "quick brown", ... , "lazy dog"]
- Translation bigrams: ["The fast", "fast brown", ... , "lazy dog"]
- Matches: 4 out of 8 ("brown fox", "over the", "the lazy", "lazy dog")
- **Precision**: 4/8

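(Similar calculations for 3-gram and 4-gram. Note that this hand count ignores the final period; the sacrebleu output below is consistent with the period being counted as a separate token, giving 8/10, 5/9, 3/8 and 2/7.)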

Brevity Penalty (BP): BP penalizes translations that are too short.

- **Hypothesis Length (h)**: 9 (Length of the machine-translated sentence)
- **Reference Length (r)**: 9 (Length of the reference sentence)
- **BP Calculation**:
  - If h ≤ r, BP = exp(1 - r/h)
  - If h > r, BP = 1
- In our example, h = r, so BP = 1 (no penalty).

Calculate BLEU Score:
$$
\text{BLEU} = \text{BP} \cdot \big(\prod_{n=1}^N p_n\big)^{1/N}= \text{BP} \cdot \exp\left(  \frac{1}{N}\sum_{n=1}^{N} \log p_n \right)
$$

$p_n$ is the precision for $n$-grams, $\sum_{n=1}^{N}$ sums over the n-gram orders (from 1-grams up to $N$-grams), and $\frac{1}{N}$ is the weight given to each order (equal for all orders in the standard BLEU score calculation).

The geometric mean $\big(\prod_{n=1}^N p_n\big)^{1/N}$ ensures that a translation needs to perform well across all these different n-gram sizes to achieve a high score. The geometric mean is particularly sensitive to low values in a way that the arithmetic mean is not. If a translation scores very poorly in any n-gram category, the geometric mean will significantly lower the overall score. 

The formula above yields a value between 0 and 1; `sacrebleu` reports it multiplied by 100, i.e. as a value between 0 and 100, where higher scores indicate better translations. It considers precision for up to 4-grams and adjusts for translation length. However, it does not account for semantic accuracy or grammatical correctness.


In [8]:
import sacrebleu
print(sacrebleu.__version__)
# The outputs recorded in this notebook were produced with sacrebleu 2.3.1

# One reference stream containing a single sentence, and the hypothesis to score
reference = ["The quick brown fox jumps over the lazy dog."]
translation = "The fast brown fox jumped over the lazy dog."

# corpus_bleu takes a list of hypotheses and a list of reference streams
bleu = sacrebleu.corpus_bleu([translation], [reference])

# Print the BLEU score and additional details
print(f"BLEU Score: {bleu.score}")
print(f"Individual n-gram scores: {bleu.precisions}")
print(f"Brevity Penalty: {bleu.bp}")
print(f"Translation length ratio: {bleu.sys_len / bleu.ref_len}")

# NOTE: the precisions printed here (80.0 / 55.6 / 37.5 / 28.6) differ from the hand count
# in the text (7/9, 4/8, ...) because the hand count ignored the final period. The printed
# figures are consistent with the period being scored as a separate token:
# 8/10 unigrams, 5/9 bigrams, 3/8 trigrams and 2/7 4-grams.

2.3.1
BLEU Score: 46.713797772819994
Individual n-gram scores: [80.0, 55.55555555555556, 37.5, 28.571428571428573]
Brevity Penalty: 1.0
Translation length ratio: 1.0


In [9]:
# We can also use more than one reference set.
# Each inner list of `refs` is one complete set of references: one entry per
# hypothesis, in the same order as `sys`.
# NOTE(review): the name `sys` is also the name of a standard-library module;
# it is not imported in this notebook, so nothing is shadowed here.

refs = [ # First set of references
             ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
            # Second set of references
             ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
       ]
# Hypotheses 1 and 3 are copied from the first set, hypothesis 2 from the second set.
sys = ['The dog bit the man.', "No one was surprised.", 'The man bit him first.']

sacrebleu.corpus_bleu(sys, refs)


BLEU = 100.00 100.0/100.0/100.0/100.0 (BP = 1.000 ratio = 1.000 hyp_len = 17 ref_len = 17)

In [10]:
# Same two reference sets, but a different mix of hypotheses.
refs = [ # First set of references
             ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
            # Second set of references
             ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
       ]
# Hypotheses 1 and 2 are copied from the first set, hypothesis 3 from the second set.
sys = ['The dog bit the man.', "It was not unexpected.", 'The man had bitten the dog.']

sacrebleu.corpus_bleu(sys, refs)

BLEU = 100.00 100.0/100.0/100.0/100.0 (BP = 1.000 ratio = 1.000 hyp_len = 18 ref_len = 18)

1. References (refs):

`refs` is a list of lists, where each inner list is one complete set of reference translations: one reference per sentence, in the same order as the system output.
There are two sets of reference translations in this example, so each sentence has two references.
For instance, the first sentence has two reference translations: "The dog bit the man." and "The dog had bit the man." If the translated sentence matches either one of the two exactly, then it is a perfect match.

2. System Output (sys):

sys is a list containing the hypothesis sentences (or system translations).
These are the sentences produced by the translation model that you want to evaluate.
In this case, there are three sentences in sys.

3. BLEU Score Calculation:

`sacrebleu.corpus_bleu(sys, refs)` calculates the BLEU score by comparing each sentence in sys against the corresponding set of reference sentences in refs.
The BLEU score is calculated over the entire corpus (all sentences) rather than individual sentences.
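The printed result includes several components. The values below illustrate the output format with an imperfect translation, e.g. `BLEU = 48.53 82.4/50.0/45.5/37.5 (BP = 0.943 ratio = 0.944 hyp_len = 17 ref_len = 18)`; they are not the output of the two cells above, which both score 100.00: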

- BLEU = 48.53: This is the overall BLEU score, the geometric mean of the n-gram precisions multiplied by the Brevity Penalty.
- 82.4/50.0/45.5/37.5: These are the individual n-gram precision scores for 1-gram, 2-gram, 3-gram, and 4-gram matches, respectively.
- BP = 0.943: The Brevity Penalty value. A BP of 1 means no penalty, and a value less than 1 penalizes shorter translations.
- ratio = 0.944: The ratio of the hypothesis length to the reference length.
- hyp_len = 17: The total length of the hypothesis (system output).
- ref_len = 18: The total length of the reference translations.

The BLEU score and its components offer insights into various aspects of translation accuracy and fluency.

In [11]:
# An empty string '' leaves a sentence without a reference in that set
# (None is said to work the same way — TODO confirm).
# Here only the first hypothesis has references. Per the recorded output, the
# other two hypotheses still count toward hyp_len (17) while ref_len is only 6,
# so their n-grams lower the precisions.

refs = [ # First set of references
             ['The dog bit the man...ouch','',''],
            # Second set of references
             ['The dog bit the man.','','']
       ]
sys = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

sacrebleu.corpus_bleu(sys, refs)

BLEU = 36.21 35.3/35.7/36.4/37.5 (BP = 1.000 ratio = 2.833 hyp_len = 17 ref_len = 6)

In [12]:
# Same setup with empty strings for sentences 2 and 3, but the first reference
# set now holds a non-matching sentence for hypothesis 1.
# The second set still matches hypothesis 1 exactly, and the recorded score is
# the same as in the previous cell.

refs = [ # First set of references
             ["It wasn't surprising.",'',''],
            # Second set of references
             ['The dog bit the man.','','']
       ]
sys = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

sacrebleu.corpus_bleu(sys, refs)

BLEU = 36.21 35.3/35.7/36.4/37.5 (BP = 1.000 ratio = 2.833 hyp_len = 17 ref_len = 6)

In [13]:
# What happens if the reference sets are shorter than the hypothesis list
# (the '' placeholders are dropped entirely)?
# The recorded output reports hyp_len = 6, i.e. only the first hypothesis was
# scored. The two hypotheses without a reference entry appear to be silently
# ignored rather than raising an error — TODO confirm for the sacrebleu version in use.

refs = [ # First set of references
             ["It wasn't surprising."],
            # Second set of references
             ['The dog bit the man.']
       ]
sys = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

sacrebleu.corpus_bleu(sys, refs)

BLEU = 100.00 100.0/100.0/100.0/100.0 (BP = 1.000 ratio = 1.000 hyp_len = 6 ref_len = 6)

#### playground: explain sacrebleu.corpus_bleu in translation

In [14]:
# Example to illustrate ground_truth_sentences.extend([[tgt] for tgt in tgt_sentences])
# Each target sentence is wrapped in its own one-element list, and the batches
# are appended in order.
# NOTE(review): this gives one list per sentence. The corpus_bleu calls in this
# notebook pass references the other way round (one list per reference set,
# each holding all sentences), so this structure would need reshaping before
# being passed as `refs` — verify against the caller.

# Let's assume we have two batches of ground truth sentences (tgt_sentences) from a test dataset
batch1_tgt_sentences = ["Der Hund läuft.", "Das Wetter ist schön."]
batch2_tgt_sentences = ["Das Auto ist neu.", "Das Haus ist groß."]

# Initialize an empty list for ground_truth_sentences
ground_truth_sentences = []

# Extending ground_truth_sentences with the first batch
ground_truth_sentences.extend([[tgt] for tgt in batch1_tgt_sentences])
# Extending ground_truth_sentences with the second batch
ground_truth_sentences.extend([[tgt] for tgt in batch2_tgt_sentences])

# Display the accumulated list (last expression of the cell)
ground_truth_sentences

[['Der Hund läuft.'],
 ['Das Wetter ist schön.'],
 ['Das Auto ist neu.'],
 ['Das Haus ist groß.']]

In [15]:
# Translated sentences (the hypotheses to score)
translated_sentences = [
    "Der Hund rennt.",
    "Das Wetter ist gut.",
    "Ein neues Auto.",
    "Das Haus ist großartig.",
]

# Ground truth sentences: a single reference set with one entry per hypothesis.
# The entries are identical to the hypotheses here, so a perfect score is expected.
ground_truth_sentences = [
    [
        "Der Hund rennt.",
        "Das Wetter ist gut.",
        "Ein neues Auto.",
        "Das Haus ist großartig.",
    ]
]

# Calculate the corpus-level BLEU score using sacrebleu and keep only the numeric score
bleu_score = sacrebleu.corpus_bleu(translated_sentences, ground_truth_sentences).score

# The recorded value is 100 up to floating-point rounding
print(f"BLEU score: {bleu_score}")

BLEU score: 100.00000000000004
