In [346]:
import numpy as np
from collections import Counter
from collections import defaultdict
import string

In [449]:
def normalize(counter):
    """ Turns raw letter counts into relative frequencies.

        Parameters
        -----------
        counter : collections.Counter
            Maps each letter to the number of times it was observed.

        Returns
        -------
        List[Tuple[str, float]]
            One (letter, frequency) pair per letter, ordered from the
            most to the least frequent letter. Each frequency is the
            letter's count divided by the sum of all counts. An empty
            counter yields an empty list. """

    denominator = sum(counter.values())

    ranked = []
    # most_common() already yields the letters in descending count order.
    for letter, occurrences in counter.most_common():
        ranked.append((letter, occurrences / denominator))
    return ranked

In [450]:
def train_lm(text, n):
    """ Builds a character-level language model from a training text.

        The text is scanned with a sliding window: the first `n`
        characters form the initial history, and for every later
        character the model records that it followed the current
        history, after which the history drops its first character
        and gains the one just seen.

        Parameters
        -----------
        text: str
            the training text.

        n: int
            the number of leading characters of `text` used as the
            initial history.

        Returns
        -------
        Dict[str, List[Tuple[str, float]]]
            Maps each observed history to the (character, frequency)
            pairs produced by `normalize` for the characters that
            followed it. """

    follower_counts = defaultdict(Counter)
    context = text[:n]

    for following in text[n:]:
        follower_counts[context][following] += 1
        # Slide the window forward by one character.
        context = context[1:] + following

    model = {}
    for context_key, tally in follower_counts.items():
        model[context_key] = normalize(tally)
    return model

In [692]:
def generate_letter(lm, history):
    """ Randomly generates a letter according to the probability 
    distribution associated with the specified history.
        
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]] 
            the n-gram language model: history -> list of 
            (letter, probability) pairs.
        
        history: str
            the preceding characters, used as the lookup key into `lm`.
        
        Returns
        -------
        str
            The predicted character, always a built-in `str`. If 
            `history` is not in `lm` (or maps to an empty list), a 
            uniformly random lowercase ASCII letter is returned. """
    
    candidates = lm.get(history)
    if not candidates:
        # Unseen context (or nothing recorded for it): fall back to a
        # uniformly random letter in 'a'..'z'.
        return chr(np.random.randint(ord("a"), ord("z") + 1))

    letters, probs = zip(*candidates)
    # Sample an index rather than the letter itself: np.random.choice over
    # a sequence of strings returns numpy.str_, whereas the fallback branch
    # returns a plain str. Indexing keeps the return type consistent.
    picked = np.random.choice(len(letters), p=probs)
    return letters[picked]

In [693]:
def generate_phrase(lm, n, total_words = 1):
    """ Randomly generates a phrase by drawing from the probability 
    distributions stored in the n-gram language model.

    Generation is seeded with a randomly chosen history that starts 
    with a space (i.e. at a word boundary); the characters after that 
    leading space become the start of the phrase. Letters are then 
    drawn with `generate_letter` until `total_words` words have been 
    completed, a word being terminated by a space. Spaces inside the 
    seed history count towards `total_words` as well.
    
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]]
            the n-gram language model. 
            
        n: int
            order of n-gram model. Not used by this function; kept so
            existing callers do not break.
            
        total_words : int
            the number of words to be generated. Values below 1 
            produce an empty phrase.
        
        Returns
        -------
        str
            Model-generated phrase, without a trailing space.

        Raises
        ------
        ValueError
            If `lm` contains no history that starts with a space.

        Notes
        -----
        There is no cap on the phrase length: the loop only ends once 
        the model emits the final word-terminating space. """

    # Nothing to generate; without this guard the loop below could never
    # satisfy `spaces == total_words` and would run forever.
    if total_words < 1:
        return ""

    word_start_hist = [hist for hist in lm if hist.startswith(' ')]
    if not word_start_hist:
        raise ValueError(
            "language model has no history starting with a space, "
            "so no word-start seed can be chosen")
    history = word_start_hist[np.random.randint(len(word_start_hist))]
    
    text = []
    spaces = 0

    # Feed the seed (minus its leading space) through the same word
    # counting as generated letters, so a seed that already spans one or
    # more word boundaries does not make the phrase too long.
    for c in history[1:]:
        if c == ' ':
            spaces += 1
            if spaces == total_words:
                return "".join(text)
        text.append(c)
    
    while True:
        c = generate_letter(lm, history)
        if c == ' ':
            spaces += 1
            if spaces == total_words:
                break
        text.append(c)
        # Slide the history window forward by one character.
        history = history[1:] + c
        
    return "".join(text)

In [694]:
def n_gram_jokes(path_to_nouns, path_to_verbs, n):
    """ Generates a really funny joke from two text files of words.

    A separate language model is trained on each file. One of five 
    joke templates is picked at random and only that template is 
    filled with model-generated nouns and verbs.
    
        Parameters
        ----------
        path_to_nouns : string
            path to the text file of nouns to train the noun model on.

        path_to_verbs : string
            path to the text file of verbs to train the verb model on.
            
        n : scalar
            the value of n for the n-gram model; passed on to 
            `train_lm` and `generate_phrase`.
                                
        Returns
        -------
        str
            A really funny joke. """

    def read_words(path):
        # Collapse every run of whitespace (including newlines) into a
        # single space so that words are separated uniformly.
        with open(path, "r") as f:
            return " ".join(f.read().split())

    lm_noun = train_lm(read_words(path_to_nouns), n)
    lm_verb = train_lm(read_words(path_to_verbs), n)

    def noun():
        # Noun phrase of one or two words.
        return generate_phrase(lm_noun, n, np.random.randint(1, 3))

    def single_noun():
        return generate_phrase(lm_noun, n)

    def verb():
        return generate_phrase(lm_verb, n)

    # Each template is wrapped in a callable so phrases are generated only
    # for the joke that is actually returned, not for all five.
    jokes = [
        lambda: "Knock knock. \nWho's there? \n{0} \n{0} who? \n{0} {1} ".format(
            noun().capitalize(), noun()),
        lambda: "Why did the {} {} the {}? \nTo {} {}!".format(
            noun(), verb(), noun(), verb(), noun()),
        lambda: "How many {} does it take to {} a {}? \n{}".format(
            noun(), verb(), noun(), single_noun().capitalize()),
        lambda: "*slaps roof of {0}* \nThis {0} can fit so much {1} in it".format(
            single_noun(), single_noun()),
        lambda: "Thank you {} very {}".format(noun(), noun()),
    ]
    i = np.random.randint(len(jokes))
    return jokes[i]()

In [701]:
# Demo: build the noun and verb models from nouns.txt / verbs.txt with n = 5
# and print one joke. The joke is chosen and filled in randomly and no seed is
# set in the visible cells, so the output below is just one sample run.
print(n_gram_jokes("nouns.txt", "verbs.txt", 5))

Knock knock. 
Who's there? 
Necessary 
Necessary who? 
Necessary death 
