# Imports

In [1]:
import math
import random
import numpy as np
import pandas as pd
import requests
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load and Preprocess Data

### Load the data

In [2]:
# with open("https://raw.githubusercontent.com/yeesem/NLP_Dataset/main/en_US.twitter.txt", "r") as f:
#     data = f.read()

# URL of the raw file
# Extract the text from the
url = "https://raw.githubusercontent.com/yeesem/NLP_Dataset/main/en_US.twitter.txt"

# Send a GET request to fetch the raw file content
response = requests.get(url)

data = response.text

print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(data[-300:])
print("-------")

Data type: <class 'str'>
Number of letters: 3383438
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\r\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\r\nthey've decided its more fun if I don't.\r\nSo Tired D; Played Lazer Tag & Ran"

-------
Last 300 letters of the data
-------


"had one a few weeks back....hopefully we will be back soon! wish you the best yo\r\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\r\n#GutsiestMovesYouCanMake Giving a cat a bath.\r\nCoffee after 5 was a TERRIBLE idea.\r\n"

-------


### Pre-process the Data

### Split the sentences

In [3]:
# Split the sentences
def split_to_sentences(data):

  sentences = data.split('\n')

  # - Remove leading and trailing spaces from each sentence
  # - Drop sentences if they are empty strings.
  sentences = [s.strip() for s in sentences]
  sentences = [s for s in sentences if len(s) > 0]

  return sentences

### Tokenize sentences

In [4]:
def tokenize_sentences(sentences):

  # Initialize the list of lists of tokenzed sentences
  tokenized_sentences = []

  for sentence in sentences:

    # Convert to lowercase letters
    sentence = sentence.lower()

    # Convert into a list of word
    tokenized = nltk.tokenize.word_tokenize(sentence)

    # Append the list of words to the list of lists
    tokenized_sentences.append(tokenized)

  return tokenized_sentences

### Get tokenized data

In [5]:
def get_tokenized_data(data):

  # Get the sentences by splitting up the data
  sentences = split_to_sentences(data)

  # Get the list of lists of tokens by tokenizing the sentences
  tokenized_sentences = tokenize_sentences(sentences)

  return tokenized_sentences

In [6]:
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [7]:
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

print("First training sample:")
print(train_data[0])

print("First test sample")
print(test_data[0])

47961 data are split into 38368 train and 9593 test set
First training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']
First test sample
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']


### count_words

In [8]:
def count_words(tokenized_sentences):
  word_counts = {}

  # Loop through each sentence
  for sentence in tokenized_sentences:

    # Go through each token in the sentence
    for token in sentence:

      if token not in word_counts:
        word_counts[token] = 1

      else:
        word_counts[token] += 1

  return word_counts

### get_words_with_nplus_frequency

In [9]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):

    # Initialize an empty list to contain the words that
    # appear at least 'minimum_freq' times.
    closed_vocab = []

    # Get the word couts of the tokenized sentences
    # Use the function that you defined earlier to count the words
    word_counts = count_words(tokenized_sentences)

    # for each word and its count
    for word, cnt in word_counts.items(): # complete this line

        # check that the word's count
        # is at least as great as the minimum count
        if cnt >= count_threshold: # complete this line with the proper condition

            # append the word to the list
            closed_vocab.append(word)

    return closed_vocab

### Replace oov_words by unk

In [10]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):

    # Place vocabulary into a set for faster search
    vocabulary = set(vocabulary)

    # Initialize a list that will hold the sentences
    # after less frequent words are replaced by the unknown token
    replaced_tokenized_sentences = []

    # Go through each sentence
    for sentence in tokenized_sentences:

        # Initialize the list that will contain
        # a single sentence with "unknown_token" replacements
        replaced_sentence = []
        ### START CODE HERE (Replace instances of 'None' with your code) ###

        # for each token in the sentence
        for token in sentence: # complete this line

            # Check if the token is in the closed vocabulary
            if token in vocabulary: # complete this line with the proper condition
                # If so, append the word to the replaced_sentence
                replaced_sentence.append(token)
            else:
                # otherwise, append the unknown token instead
                replaced_sentence.append(unknown_token)

        # Append the list of tokens to the list of lists
        replaced_tokenized_sentences.append(replaced_sentence)
    return replaced_tokenized_sentences

### Preprocess data

In [11]:
def preprocess_data(train_data, test_data, count_threshold, unknown_token="<unk>", get_words_with_nplus_frequency=get_words_with_nplus_frequency, replace_oov_words_by_unk=replace_oov_words_by_unk):

    # Get the closed vocabulary using the train data
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)

    # For the train data, replace less common words with "<unk>"
    train_data_replaced = replace_oov_words_by_unk(train_data,vocabulary,unknown_token)

    # For the test data, replace less common words with "<unk>"
    test_data_replaced = replace_oov_words_by_unk(test_data,vocabulary,unknown_token)

    return train_data_replaced, test_data_replaced, vocabulary

In [12]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data,
                                                                        test_data,
                                                                        minimum_freq)

In [13]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']

First preprocessed test sample:
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']

First 10 vocabulary:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the']

Size of vocabulary: 14823


# Develop n-gram based Language Models

### count_n_grams

In [14]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):

    # Initialize dictionary of n-grams and their counts
    n_grams = {}

    # Go through each sentence in the data
    for sentence in data:

        # prepend start token n times, and  append the end token one time
        sentence = [start_token] * n + sentence + [end_token]

        # convert list to tuple
        # So that the sequence of words can be used as
        # a key in the dictionary
        sentence = tuple(sentence)

        # Use 'i' to indicate the start of the n-gram
        # from index 0
        # to the last index where the end of the n-gram
        # is within the sentence.

        for i in range(len(sentence) - n + 1):

            # Get the n-gram from i to i+n
            n_gram = sentence[i : i + n]

            # check if the n-gram is in the dictionary
            if n_gram in n_grams: # complete this line with the proper condition

                # Increment the count for this n-gram
                n_grams[n_gram] += 1
            else:
                # Initialize this n-gram count to 1
                n_grams[n_gram] = 1

    return n_grams

In [15]:
# Testing
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print("Bi-gram:")
print(count_n_grams(sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


### Estimate Probability

In [16]:
def estimate_probability(word, previous_n_gram,
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):

    # convert list to tuple to use it as a dictionary key
    previous_n_gram = tuple(previous_n_gram)

    # Set the denominator
    # If the previous n-gram exists in the dictionary of n-gram counts,
    # Get its count.  Otherwise set the count to zero
    # Use the dictionary that has counts for n-grams
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0

    # Calculate the denominator using the count of the previous n gram
    # and apply k-smoothing
    denominator = previous_n_gram_count + k * vocabulary_size

    # Define n plus 1 gram as the previous n-gram plus the current word as a tuple
    n_plus1_gram = previous_n_gram + (word,)

    # Set the count to the count in the dictionary,
    # otherwise 0 if not in the dictionary
    # use the dictionary that has counts for the n-gram plus current word
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0

    # Define the numerator use the count of the n-gram plus current word,
    # and apply smoothing
    numerator = n_plus1_gram_count + k

    # Calculate the probability as the numerator divided by denominator
    probability = numerator / denominator

    return probability

In [17]:
# testing
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
tmp_prob = estimate_probability("cat", ["a"], unigram_counts, bigram_counts, len(unique_words), k=1)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


### Estimate probabilities for all words

In [18]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>",  k=1.0):

    # convert list to tuple to use it as a dictionary key
    previous_n_gram = tuple(previous_n_gram)

    # add <e> <unk> to the vocabulary
    # <s> is not needed since it should not appear as the next word
    vocabulary = vocabulary + [end_token, unknown_token]
    vocabulary_size = len(vocabulary)

    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary_size, k=k)

        probabilities[word] = probability

    return probabilities

In [19]:
# Testing
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

estimate_probabilities(["a"], unigram_counts, bigram_counts, unique_words, k=1)

{'is': 0.09090909090909091,
 'cat': 0.2727272727272727,
 'dog': 0.09090909090909091,
 'like': 0.09090909090909091,
 'a': 0.09090909090909091,
 'this': 0.09090909090909091,
 'i': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [20]:
# Additional test
trigram_counts = count_n_grams(sentences, 3)
estimate_probabilities(["<s>", "<s>"], bigram_counts, trigram_counts, unique_words, k=1)

{'is': 0.09090909090909091,
 'cat': 0.09090909090909091,
 'dog': 0.09090909090909091,
 'like': 0.09090909090909091,
 'a': 0.09090909090909091,
 'this': 0.18181818181818182,
 'i': 0.18181818181818182,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

### Count and probability matrices

In [21]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    # add <e> <unk> to the vocabulary
    # <s> is omitted since it should not appear as the next word
    vocabulary = vocabulary + ["<e>", "<unk>"]

    # obtain unique n-grams
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))

    # mapping from n-gram to row
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    # mapping from next word to column
    col_index = {word:j for j, word in enumerate(vocabulary)}

    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count

    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix

In [22]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)

print('bigram counts')
display(make_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,is,cat,dog,like,a,this,i,<e>,<unk>
"(dog,)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(this,)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(a,)",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(<s>,)",0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(like,)",0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
"(is,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Show trigram counts
print('\ntrigram counts')
trigram_counts = count_n_grams(sentences, 3)
display(make_count_matrix(trigram_counts, unique_words))


trigram counts


Unnamed: 0,is,cat,dog,like,a,this,i,<e>,<unk>
"(<s>, <s>)",0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
"(a, cat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(like, a)",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog, is)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(i, like)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(is, like)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(this, dog)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, this)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, i)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [25]:
def make_probability_matrix(n_plus_gram_counts, vocabulary, k):
  count_matrix = make_count_matrix(n_plus_gram_counts, vocabulary)
  count_matrix += k
  prob_matrix = count_matrix.div(count_matrix.sum(axis = 1), axis = 0)
  return prob_matrix

In [26]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,is,cat,dog,like,a,this,i,<e>,<unk>
"(dog,)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(this,)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1
"(a,)",0.090909,0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(i,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(<s>,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.181818,0.181818,0.090909,0.090909
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(like,)",0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909
"(is,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1


In [27]:
print("trigram probability")
trigram_counts = count_n_grams(sentences, 3)
display(make_probability_matrix(trigram_counts, unique_words, k=1))

trigram probability


Unnamed: 0,is,cat,dog,like,a,this,i,<e>,<unk>
"(<s>, <s>)",0.090909,0.090909,0.090909,0.090909,0.090909,0.181818,0.181818,0.090909,0.090909
"(a, cat)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(like, a)",0.090909,0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(dog, is)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(i, like)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(is, like)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(this, dog)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, this)",0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, i)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1


# Perplexity

In this section, you will generate the perplexity score to evaluate your model on the test set.
- You will also use back-off when needed.
- Perplexity is used as an evaluation metric of your language model.
- To calculate the perplexity score of the test set on an n-gram model, use:

$$ PP(W) =\sqrt[N]{ \prod_{t=n+1}^N \frac{1}{P(w_t | w_{t-n} \cdots w_{t-1})} } \tag{4}$$

- where $N$ is the length of the sentence.
- $n$ is the number of words in the n-gram (e.g. 2 for a bigram).
- In math, the numbering starts at one and not zero.

In code, array indexing starts at zero, so the code will use ranges for $t$ according to this formula:

$$ PP(W) =\sqrt[N]{ \prod_{t=n}^{N-1} \frac{1}{P(w_t | w_{t-n} \cdots w_{t-1})} } \tag{4.1}$$

The higher the probabilities are, the lower the perplexity will be.
- The more the n-grams tell us about the sentence, the lower the perplexity score will be.

### Calculate perplexity