In [1]:
import re
from collections import defaultdict
from functools import partial

import dill
import nltk
import numpy as np
import spacy
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm

In [2]:
# Name of the spaCy pipeline package. Kept in one place so that the install
# check, the download and the load below always refer to the same model.
SPACY_MODEL = "en_core_web_sm"

# Make sure that the spacy english model is installed
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

# Load the english model with the "ner" pipeline component disabled
nlp = spacy.load(SPACY_MODEL, disable=["ner"])

## N-Gram

In [3]:
# Define the nGram function
def nGram(doc_string_or_list, n, docIsAlreadyTokenized):
    """
    This function takes a string or list of tokens and returns a list of nGrams.

    A raw string is tokenized with the module-level spaCy pipeline ``nlp``;
    punctuation and whitespace tokens are dropped. In both cases every token
    is lowercased before the nGrams are built.

    Args:
        doc_string_or_list (str or list): The string or list of tokens to be nGrammed
        n (int): The number of tokens per nGram (must be at least 1)
        docIsAlreadyTokenized (bool): Whether or not the doc is already tokenized

    Returns:
        ngrams (list): A list of nGrams in document order, each one a list of
            ``n`` lowercased tokens. Empty when the doc has fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1
    """

    # Without this guard n == 0 yields len(doc) + 1 empty lists and a negative
    # n yields slices of the wrong size, which silently corrupts any model
    # built from the result.
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    tokens = doc_string_or_list

    # If the doc is not already tokenized, tokenize it
    if not docIsAlreadyTokenized:
        tokens = [
            token.text
            for token in nlp(doc_string_or_list)
            if not token.is_punct and not token.is_space
        ]

    # Lowercase all the tokens
    tokens = [token.lower() for token in tokens]

    # Slide a window of size n over the tokens; the range is empty when the
    # doc is shorter than n, so no partial nGrams are produced.
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

In [6]:
# Test the function on a raw (untokenized) string: n=2 requests bigrams and
# docIsAlreadyTokenized=False makes nGram tokenize the text itself
doc = "Don't stop me now, I'm having such a good time, I'm having a ball"
print(nGram(doc, 2, False))

[['do', "n't"], ["n't", 'stop'], ['stop', 'me'], ['me', 'now'], ['now', 'i'], ['i', "'m"], ["'m", 'having'], ['having', 'such'], ['such', 'a'], ['a', 'good'], ['good', 'time'], ['time', 'i'], ['i', "'m"], ["'m", 'having'], ['having', 'a'], ['a', 'ball']]


In [7]:
# Create a function to build the N-Gram model
def build_model(corpus, n, docIsAlreadyTokenized):
    """
    This function takes a corpus and returns a N-Gram model.

    The model maps each context (a tuple holding the first ``n - 1`` tokens of
    an nGram) to a dict of ``{next_token: probability}``, where the probability
    is the count of that next token divided by the total count of all tokens
    seen after the same context.

    Args:
        corpus (list): A list of documents
        n (int): The number of tokens per nGram
        docIsAlreadyTokenized (bool): Whether or not the doc is already tokenized

    Returns:
        model (defaultdict): A N-Gram model. Because it is a defaultdict,
            looking up an unseen context inserts an empty entry for it and an
            unseen next token reads as 0.
    """

    # partial(defaultdict, int) gives the same defaults as nested lambdas
    # (missing context -> empty inner dict, missing token -> 0) but, unlike a
    # lambda, it can be serialized by the standard pickle module as well.
    model = defaultdict(partial(defaultdict, int))

    # Loop through all the documents and count how often each token follows
    # each context
    print("Building the model...")
    for doc in tqdm(corpus):
        for ngram in nGram(doc, n, docIsAlreadyTokenized):
            # Tuples are hashable, so the context can be used as a dict key
            ngram = tuple(ngram)
            model[ngram[:-1]][ngram[-1]] += 1

    # Transform the counts to probabilities
    print("Transforming the counts to probabilities...")
    for previous_tokens in tqdm(model):
        next_counts = model[previous_tokens]

        # Total number of times this context was observed
        total_count = float(sum(next_counts.values()))

        # Only values are rewritten, never keys, so mutating while iterating is safe
        for next_token in next_counts:
            next_counts[next_token] /= total_count

    return model

In [8]:
# Test the function: build a bigram (n=2) model from a one-document corpus of raw text
corpus = ["Don't stop me now, I'm having such a good time, I'm having a ball"]
model = build_model(corpus, 2, False)

# Print the model to see what it looks like: one line per
# (previous tokens, next token, probability) entry
for previous_tokens in model:
    for next_token in model[previous_tokens]:
        print(previous_tokens, next_token, model[previous_tokens][next_token])

Building the model...


100%|██████████| 1/1 [00:00<00:00, 226.23it/s]


Transforming the counts to probabilities...


100%|██████████| 12/12 [00:00<00:00, 165564.63it/s]

('do',) n't 1.0
("n't",) stop 1.0
('stop',) me 1.0
('me',) now 1.0
('now',) i 1.0
('i',) 'm 1.0
("'m",) having 1.0
('having',) such 0.5
('having',) a 0.5
('such',) a 1.0
('a',) good 0.5
('a',) ball 0.5
('good',) time 1.0
('time',) i 1.0





In [9]:
def save_model(model, filename):
    """
    Serialize a model to disk with dill.

    Args:
        model (defaultdict): The model to write out
        filename (str): Path of the file to write; it is opened in binary
            write mode, so an existing file is overwritten
    """

    with open(filename, 'wb') as out_file:
        dill.dump(model, out_file)

In [10]:
def load_model(filename):
    """
    Read a dill-serialized model back from disk.

    Note: like pickle, dill can execute arbitrary code while deserializing,
    so only load files that come from a trusted source.

    Args:
        filename (str): Path of the file to read

    Returns:
        model (defaultdict): The deserialized model
    """

    with open(filename, 'rb') as in_file:
        return dill.load(in_file)

In [11]:
# Save the model: the relative path means the file is written to the
# current working directory
save_model(model, 'ngram_model.dill')

In [12]:
# Load the model back from disk under a new name (dill_model), leaving the
# in-memory `model` untouched
dill_model = load_model('ngram_model.dill')

In [13]:
# Print the reloaded model to see what it looks like: one line per
# (previous tokens, next token, probability) entry
for previous_tokens in dill_model:
    for next_token in dill_model[previous_tokens]:
        print(previous_tokens, next_token, dill_model[previous_tokens][next_token])

('do',) n't 1.0
("n't",) stop 1.0
('stop',) me 1.0
('me',) now 1.0
('now',) i 1.0
('i',) 'm 1.0
("'m",) having 1.0
('having',) such 0.5
('having',) a 0.5
('such',) a 1.0
('a',) good 0.5
('a',) ball 0.5
('good',) time 1.0
('time',) i 1.0


### The language model with a bigger dataset