In [84]:
import pickle
from collections import Counter
from collections import defaultdict

import numpy as np

In [2]:
def normalize(counter):
    """ Turns raw letter counts into relative frequencies.
    
        Parameters
        -----------
        counter : collections.Counter
            Maps each letter to the number of times it was seen.
            
        Returns
        -------
        A list of (letter, frequency) tuples ordered from the most to 
        the least frequent letter, where each frequency is the 
        letter's count divided by the sum of all counts. """

    denominator = sum(counter.values())
    pairs = []
    # most_common() already yields the letters by descending count.
    for letter, count in counter.most_common():
        pairs.append((letter, count / denominator))
    return pairs

In [3]:
def train_lm(text, n):
    """ Trains a character-based n-gram language model.
        
        Parameters
        -----------
        text: str 
            The training corpus; every character is one observation.
            
        n: int
            The length of the n-gram to analyze. Each character is 
            conditioned on the (n - 1) characters before it. Values 
            below 1 behave like n = 1 (empty history).
        
        Returns
        -------
        A dictionary that maps each history (a string of n - 1 
        characters, padded with "~" at the start of the text) to the 
        list of (character, probability) pairs that `normalize` 
        produces for the characters seen right after that history. """
    
    raw_lm = defaultdict(Counter)
    # Pad so the first characters of the text also get a full-length history.
    history = "~" * (n - 1)
    
    for char in text:
        raw_lm[history][char] += 1
        # Append first, then drop the oldest character. Slicing before 
        # appending would let an empty history (n = 1) grow to length 1 
        # and silently turn the unigram model into a bigram model.
        history = (history + char)[1:]
    
    return {hist: normalize(counter) for hist, counter in raw_lm.items()}

In [4]:
def generate_letter(lm, history):
    """ Randomly generates a letter according to the probability 
    distribution associated with the specified history.
        
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]] 
            the n-gram language model. 
        
        history: str
            a string of length (n-1) to use as history when generating 
            the next character.
        
        Returns
        -------
        A tuple containing the predicted character and the history 
        that was actually used. The returned history differs from the 
        argument when the argument is unknown to the model and a 
        replacement history had to be picked.

        Raises
        ------
        ValueError
            if the history is unknown, does not end in a newline, 'A' 
            or ':', and the model is empty, so no replacement history 
            exists. """
    
    if history not in lm:
        # Unknown history: spell out the answer prefix "A: " one
        # character at a time, keyed on the last character seen.
        # endswith() is used instead of history[-1] so that an empty
        # history (n = 1) does not raise IndexError.
        if history.endswith('\n'):
            return ('A', history)
        if history.endswith('A'):
            return (':', history)
        if history.endswith(':'):
            return (' ', history)

        if not lm:
            raise ValueError("cannot generate a letter from an empty language model")

        # forcibly change history: prefer one that sits right after "\nA: "
        candidates = [hist for hist in lm if hist.endswith('\nA: ')]
        if not candidates:
            # No such history exists (always the case when the histories
            # are shorter than 4 characters); back off to any known
            # history instead of crashing in randint(0).
            candidates = list(lm)
        history = candidates[np.random.randint(len(candidates))]

    letters, probs = zip(*lm[history])
    letter = np.random.choice(letters, p=probs)
    return (letter, history)

In [5]:
def generate_text(lm, n, nletters = 200):
    """ Randomly generates text by drawing from the probability 
    distributions stored in the n-gram language model.
    
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]]
            the n-gram language model. 
            
        n: int
            order of n-gram model.
            
        nletters: int
            maximum number of letters to randomly generate.
        
        Returns
        -------
        Model-generated text. Generation stops after `nletters` 
        characters, or earlier: once a line break has been followed by 
        the forced 'A', the next generated line break ends the text 
        (that final line break is not included). """
    
    history = "~" * (n - 1)
    text = []
    answer_started = False
    for _ in range(nletters):
        # keeps joke in Q&A format: a line break is always followed by 'A'.
        # endswith() rather than history[-1] so an empty history (n = 1)
        # does not raise IndexError.
        if history.endswith('\n'):
            c = 'A'
            answer_started = True
        else:
            # generate_letter may swap in a different history; keep it.
            c, history = generate_letter(lm, history)
        if answer_started and c == '\n':
            break
        text.append(c)
        # Append, then drop the oldest character, so the history length
        # stays constant even when it is empty (n = 1).
        history = (history + c)[1:]
    return "".join(text)

In [128]:
# Read the whole joke corpus into memory as one string (text mode is the default).
with open("jokes.txt") as f:
    jokes = f.read()

# Train an order-8 character model on the corpus.
lm_jokes = train_lm(jokes, n=8)

In [129]:
# Persist the trained model to disk; gen_joke reloads it from this file.
# (`pickle` is imported with the other imports in the first cell.)
with open("lm_jokes.pkl", mode="wb") as f:
    pickle.dump(lm_jokes, f)

In [130]:
def gen_joke(n):
    """ Generates one joke from the pickled language model and strips 
    the Q&A markers from it.

        Parameters
        ----------
        n: int
            order of n-gram model, forwarded to `generate_text`. 
            NOTE(review): presumably this has to match the order the 
            pickled model was trained with -- confirm.

        Returns
        -------
        The generated text with a leading 'Q: ' or 'A: ' removed from 
        each line. """

    # NOTE: pickle.load can execute arbitrary code. Only load a model
    # file written by this notebook, never one from an untrusted source.
    with open("lm_jokes.pkl", mode="rb") as f:
        lm_jokes = pickle.load(f)

    joke = generate_text(lm_jokes, n)

    # Strip the markers only where they label a line. A blanket
    # str.replace would also eat into words that merely contain
    # "Q: " / "A: " (e.g. "NASA: " would become "NAS").
    markers = ('Q: ', 'A: ')
    marker_len = len(markers[0])  # both markers have the same length
    lines = [
        line[marker_len:] if line.startswith(markers) else line
        for line in joke.split('\n')
    ]
    return '\n'.join(lines)

In [136]:
# Generate one joke with order 8 (the same order passed to train_lm above)
# and print it so the line break between question and answer is rendered.
print(gen_joke(8))

What do you call the sound a dog makes while trying to hold a yoga pose? 
The dande-lion.
