In [46]:
import string
import random
import time
import os
from typing import List, Dict, Tuple


class NGramModel(object):
    def __init__(self, n: int) -> None:
        self.n = n
        self.n_grams = dict() # Stores unique n-grams
        self.context_count = dict() # Stores count of contexts
        self.ngram_count = dict() # Stores count of ngrams: (context, token)
        
        
    def tokenize(self, text: str) -> List[str]:
        # Treat punctuation as a separate token.
        # Add space before punctuation.
        curr=text
        punct="!\"#$%&'()*+,-—./:;<=>?@[\]^_`{|}~" #’
        for p in punct:
            curr=curr.replace(p,' '+p)
        # Split using spaces.
        tokens=curr.split(' ')
        return tokens
        

    def generate_n_grams(self, tokens: List[str]) -> List[Tuple[List[str], str]]:
        """
        n_grams is a list where each element is
        ([n-1 context tokens], token)
        """
        # Prefix the <START> tokens before each sentence
        tokens = (self.n-1)*["<START>"] + tokens
        n_grams = list()
        n=self.n
        for i in range(len(tokens)-n+1):
            context=tokens[i:i+n-1]
            tok=tokens[i+n-1]
            n_grams.append((context,tok))
        return n_grams

    def fit(self, text: str) -> None:
        new_n_grams = self.generate_n_grams(self.tokenize(text))
        for context, target in new_n_grams:
            # Add context to context dict and store count 
            if tuple(context) in self.context_count:
                self.context_count[tuple(context)] += 1.0
            else:
                self.context_count[tuple(context)] = 1.0

            # Save unique n_grams.
            if tuple(context) in self.n_grams:        
                if target not in self.n_grams[tuple(context)]:
                    self.n_grams[tuple(context)].append(target)
            else:
                self.n_grams[tuple(context)] = [target]

            # Store n_gram counts
            new_n_gram = (tuple(context), target)
            if new_n_gram in self.ngram_count:
                self.ngram_count[new_n_gram] += 1.0
            else:
                self.ngram_count[new_n_gram] = 1.0


    def get_prob(self, context: List[str], target: str) -> float:
        """
        Calculates the probability of each token
        associated with the context.
        """
        ctxt_tup=tuple(context)
        denominator=self.context_count[ctxt_tup]
        numerator=self.ngram_count[(ctxt_tup,target)]
        prob=numerator/denominator

        return prob

    def predict_token(self, context: List[str]) -> str:
        """
        Predicts token.
        A slight randomness ensures we generate a diverse token
        with the same context.
        """
        r = random.random()
        # store the probability of each token.
        token_probs = dict()

        tokens_of_interest = self.n_grams[tuple(context)]
        for token in tokens_of_interest:
            token_probs[token] = self.get_prob(context, token)

        sum = 0.0
        for key in sorted(token_probs):
            sum += token_probs[key]
            # When the probability sum is 
            # greater than the random number
            # we return the current token.
            if sum > r:
                return key

def generate_text(model: NGramModel, n_outs: int) -> str:
    """
    Genertes n_outs words using the trained
    ngram model.
    """
    n = model.n
    # All sentence are initialized with the <START> token
    context_queue = (n-1) * ["<START>"]
    result = list()

    for _ in range(n_outs):
        pred_token = model.predict_token(context_queue)
        result.append(pred_token)

        context_queue.pop(0)
        
        if pred_token == ".":
            # If sentence done. Start a new sentence.
            context_queue = (n-1) * ["<START>"]
        else:
            context_queue.append(pred_token)

    return " ".join(result)


def create_ngram_model(model: NGramModel, path: str) -> None:
    """
    fits the ngram model.
    """
    with open(path, 'r', encoding="utf8") as f:
        text = f.read()
        # Split into sentences.
        text = text.split('.')
        for sentence in text:
            # add back the fullstop
            sentence += '.'
            model.fit(sentence)


In [47]:
def writetext(leng):
    start = time.time()
    
    m = NGramModel(n=leng) # Initialize NGram model

    # Fit the ngram model on Shakespeare plays
    ROOT="./Shakespeare"
    for root, dird, files in os.walk(ROOT):
        for text_file in files:
            path = os.path.join(root, text_file)
            print("Loading ", path)
            create_ngram_model(m, path)

    print (f'Language Model creating time: {time.time() - start}')
    
    # Generate Text using the trained ngram model.
    num_gen_words = 100
    
    print(f'{"="*50}\nGenerated text:')
    print("\n")
    print(generate_text(m, num_gen_words))
    print(f'{"="*50}')

In [48]:
if __name__ == "__main__":
    leng=[2,3,4,5]
    for n in leng:
        print("====================================================================================================")
        print("========================================== n=",n," ==============================================")
        writetext(n)

Loading  ./Shakespeare\Hamlet.txt
Loading  ./Shakespeare\Macbeth.txt
Loading  ./Shakespeare\Othello.txt
Loading  ./Shakespeare\Romeo_and_Juliet.txt
Loading  ./Shakespeare\Tempest.txt
Language Model creating time: 0.6798264980316162
Generated text:




 Enter Ghost . 

IAGO .  There’s a back to pieces .   That both friend ?

CLOWN .    Too much is well , my soule her fair for my old cakes of . 

OPHELIA . 

Burthen : Then gave it needs be wrencht with the wall . 

    Nurse .  [ _Sings .  Oh , that is some other shelter hereabout ,
And Princesse can it hard , wormwood on mine own kisses sin . 

HORATIO .  You speak to the dust ; say to be brain 'd in love
Loading  ./Shakespeare\Hamlet.txt
Loading  ./Shakespeare\Macbeth.txt
Loading  ./Shakespeare\Othello.txt
Loading  ./Shakespeare\Romeo_and_Juliet.txt
Loading  ./Shakespeare\Tempest.txt
Language Model creating time: 0.7434020042419434
Generated text:


 I tremble at this haste ?
  Ross .  I did stay .  I that name , Iago and Gentlemen .  

In [17]:
def tokenize(text: str) -> List[str]:
    # Treat punctuation as a separate token.
    # Add space before punctuation.
    punct="!\"#$%&'()*+,-—./:;<=>?@[\]^_`{|}~"
    curr=text
    for p in punct:
        curr=curr.replace(p,' '+p)
    # Split using spaces.
    return curr.split(' ')

print(tokenize("BARNARDO. Looks it not like the King? Mark it, Horatio."))

['BARNARDO', '.', 'Looks', 'it', 'not', 'like', 'the', 'King', '?', 'Mark', 'it', ',', 'Horatio', '.']
