In [3]:
from pathlib import Path
import numpy as np
from collections import defaultdict, Counter
from pickle import load, dump

In [46]:
def globData(root = 'data'):
    
    '''
    Collects text files from three folders and returns their contents.
    INPUT:
        root (optional): the folder containing the poem folders
        ('regularpoems', 'happypoems' and 'sadpoems'). May be a str or a Path.
        
    OUTPUT:
        ret: a list of three lists, in the order [normal, happy, sad].
        Each inner list holds one string per '*.txt' file found in the
        matching folder, ordered by file path.
        
    RAISES:
        AssertionError: if root does not exist.
    '''
    
    root = Path(root)
    
    assert root.exists(), 'Path input does not lead anywhere.'
    
    # Order matters: callers unpack the result as (normal, happy, sad).
    folders = ('regularpoems', 'happypoems', 'sadpoems')
    
    ret = []
    
    for folder in folders:
        
        texts = []
        
        # glob() makes no promise about ordering; sorting keeps the corpus
        # (and anything trained on it) reproducible between runs.
        for file in sorted((root / folder).glob('*.txt')):
            
            # 'utf-8-sig' drops a leading byte-order mark so it does not end
            # up as a character in the text; undecodable bytes are
            # skipped rather than aborting the whole load.
            texts.append(file.read_bytes().decode('utf-8-sig', errors = 'ignore'))
        
        ret.append(texts)
    
    return ret

In [5]:
# Load the three corpora from the default 'data' folder. globData returns them
# in the order (regular, happy, sad), each as a list of file contents.
norm, happy, sad = globData()

In [6]:
from collections import defaultdict, Counter

In [7]:
def normalize(counter):
    """ Turn raw counts into relative frequencies, most frequent first.
    
        Parameters
        -----------
        counter : collections.Counter
            letter -> count
            
        Returns
        -------
        List[Tuple[str, float]]
           (letter, frequency) pairs in the order given by
           `counter.most_common()`; each frequency is the letter's count
           divided by the sum of all counts.
        
        Examples
        --------
        >>> from collections import Counter
        >>> normalize(Counter({"a": 1, "b": 3}))
        [('b', 0.75), ('a', 0.25)]
    """
    ranked = counter.most_common()
    denominator = sum(counter.values())
    
    frequencies = []
    for symbol, occurrences in ranked:
        frequencies.append((symbol, occurrences/denominator))
    return frequencies

In [8]:
def train_lm(texts, n):
    """ Train character-based n-gram language model.
        
        This will learn: given a sequence of n-1 characters, what the probability
        distribution is for the n-th character in the sequence.
        
        For example if we train on the text:
            text = "cacao"
        
        Using a n-gram size of n=3, then the following dict would be returned.
        See that we *normalize* each of the counts for a given history
        
            {'ac': [('a', 1.0)],
             'ca': [('c', 0.5), ('o', 0.5)],
             '~c': [('a', 1.0)],
             '~~': [('c', 1.0)]}

        Tildes ("~") are used for padding the history when necessary, so that it's 
        possible to estimate the probability of seeing a character when there 
        aren't (n - 1) previous characters of history available. The padding is
        restarted at the beginning of every text, so histories never span two
        texts.
        
        So, according to this text we trained on, if you see the sequence 'ac',
        our model predicts that the next character should be 'a' 100% of the time.
           
        Parameters
        -----------
        texts: str or Iterable[str]
            The training text(s) (don't need to be lowercased). A single string
            is treated as one text; any other iterable is treated as a
            collection of texts whose counts are pooled.
        n: int
            The length of n-gram to analyze. Intended for n >= 2: with n == 1
            the sliding window `history[1:] + char` grows the empty history to
            one character, so every character after the first of a text is
            counted under its preceding character.
        
        Returns
        -------
        Dict[str, List[Tuple[str, float]]]
          {n-1 history -> [(letter, normalized count), ...]}
        A dict that maps histories (strings of length (n-1)) to lists of (char, prob) 
        pairs, where prob is the probability (i.e frequency) of char appearing after 
        that specific history.

        Examples
        --------
        >>> train_lm("cacao", 3)
        {'~~': [('c', 1.0)],
         '~c': [('a', 1.0)],
         'ca': [('c', 0.5), ('o', 0.5)],
         'ac': [('a', 1.0)]}
    """
    # A bare string would otherwise be iterated character by character, with
    # every character treated as its own one-letter text.
    if isinstance(texts, str):
        texts = [texts]
    
    raw_lm = defaultdict(Counter)
    padding = "~" * (n - 1)
    
    # count number of times characters appear following different histories
    # `raw_lm`: {history -> Counter}
    for text in texts:
        history = padding
        for char in text:
            raw_lm[history][char] += 1
            # slide history window to the right by one character
            history = history[1:] + char
    
    # create final dictionary, normalizing the counts for each history
    lm = {history : normalize(counter) for history, counter in raw_lm.items()}
    
    return lm

In [9]:
def unzip(pairs):
    """
    Transposes a sequence of equally sized groups into one tuple per position.
    
    Example: pairs = [("a", 1), ("b", 2), ...] --> (("a", "b", ...), (1, 2, ...))
    
    An empty input gives an empty tuple.
    """
    columns = zip(*pairs)
    return tuple(columns)

In [10]:
def generate_letter(lm, history):
    """ Draws the next character at random, weighted by the distribution the
        language model stores for `history`.
    
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]] 
            The n-gram language model. 
            I.e. the dictionary: history -> [(char, freq), ...]
        
        history: str
            The context to look up in `lm`; for an n-gram model this is the
            previous (n-1) characters.
        
        Returns
        -------
        str
            The sampled character (as produced by `np.random.choice`), or the
            dummy character '~' if `history` is not a key of `lm`.
    """
    if history in lm:
        # split [(char, freq), ...] into parallel tuples for the sampler
        candidates, weights = unzip(lm[history])
        return np.random.choice(candidates, p=weights)
    return "~"

In [11]:
def generate_text(lm, n, nletters=100):
    """ Produces `nletters` characters of text by repeatedly sampling the next
        character from the n-gram language model `lm`.
        
        Generation starts from a history of (n - 1) "~" padding characters.
        After each draw the history is shifted by one: its first character is
        dropped and the new character is appended.
    
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]]
            The n-gram language model. 
            I.e. the dictionary: history -> [(char, freq), ...]
        n: int
            Order of n-gram model.
        nletters: int
            Number of letters to randomly generate.
        
        Returns
        -------
        str
            Model-generated text: the drawn characters joined in order.
    """
    context = "~" * (n - 1)
    pieces = []
    for _ in range(nletters):
        drawn = generate_letter(lm, context)
        pieces.append(drawn)
        # slide the context window one character to the right
        context = context[1:] + drawn
    return "".join(pieces)

In [13]:
# Fold the normal (base) texts into both the happy and the sad corpus, so each
# model is trained on its mood-specific texts plus the shared base.
# NOTE: the lists are extended in place, so running this cell a second time
# appends the base texts again. Reload the data before re-running it.
happy.extend(norm)
sad.extend(norm)

In [14]:
# Order of the character n-gram model (each character is predicted from the
# previous N - 1 characters).
# NOTE(review): no visible cell assigns N, so a fresh "Run All" would fail
# here. 13 matches the default `N` of genHappy/genSad - confirm it is the
# value the saved models were trained with.
N = 13

# Train the happy model and pickle it to 'happy.dat'. The `with` block closes
# the file even if dump() raises, so the data is flushed to disk.
happy_lm = train_lm(happy, N)
with open('happy.dat', 'wb') as model_file:
    dump(happy_lm, model_file)

In [15]:
# Train the sad model with the same n-gram order N as the happy model (N is
# not assigned in this cell) and pickle it to 'sad.dat'. The `with` block
# closes the file even if dump() raises, so the data is flushed to disk.
sad_lm = train_lm(sad, N)
with open('sad.dat', 'wb') as model_file:
    dump(sad_lm, model_file)

In [16]:
# See it work: sample 500 characters from the happy model and print them.
# N is passed so generation starts from the same (N - 1)-character "~" padding
# that train_lm uses at the start of every text.
print(generate_text(happy_lm, N, 500))

Chapter I 


AN UNEXPECTED PARTY 


In a different at first seem. Certainly not going to."
"No."
"I see."

"My family all died and I came into this dive 
you get held in this sort of questions here," he said, 
"I've got great news! I've located the ship's artificial night closed in they were after him. Even temporarily refracted 
into a tense crouch, feeling for the moment floating soggily on the 
drawing-room and thought. The word yellow wandered through the dark rustling
trees. 


In [17]:
# See it work: sample 500 characters from the sad model and print them.
# N is passed so generation starts from the same (N - 1)-character "~" padding
# that train_lm uses at the start of every text.
print(generate_text(sad_lm, N, 500))

﻿The Angel of the Odd,when this hog, I say, which hitherto had
been driving a little, things for myself only, and trying to find out what they had seen his parents were brave.... I killed your father gave me no clew in this respect, in as good order at the expiration of an hour, the fair and debonair, that now so lowly lies,
The life upon her yellow hair but not without some claim to attention, and get me to come
from over the hill, and there was a knock. 

"Hello," said Harry, trying to hi


In [45]:
def genHappy(letters = 500, N = 13):
    '''
    Generates a short, happy story.
    
    INPUTS:
        letters (optional): the number of letters to generate before processing.
        NOTE: This argument does not guarantee the length of the story: the
        text is cut back to its last '.', so the result can be shorter.
        N (optional): the N value used in training the data.
        
    OUTPUT:
        text: a string containing the happy story. It ends with '.' whenever
        the generated text contains one; otherwise the generated text is
        returned unchanged.
    '''
    text = generate_text(happy_lm, N, letters)
    
    # Drop the unfinished sentence after the last '.' with a single slice
    # instead of removing one character at a time.
    last_period = text.rfind('.')
    if last_period != -1:
        text = text[:last_period + 1]
    return text
# Demo: generate one happy story with the default settings and print it.
print(genHappy())

﻿Far out in the end, not quite sure neither of which tended to make breakfast. 

They all nodded. A favorite game in quarry had been quite expensive. 

"I mean it," he added.
"You know sometimes politer in word than in deed. The time would come out of it, first on one side. "Right, then," said Adam, who had accepted his
hospitality and so become authorities on his past, had increased
her value in his eyes. Now he felt 
he ought to first sort out," said Arthur.


In [39]:
def genSad(letters = 500, N = 13):
    '''
    Generates a short, sad story.
    
    INPUTS:
        letters (optional): the number of letters to generate before processing.
        NOTE: This argument does not guarantee the length of the story: the
        text is cut back to its last '.', so the result can be shorter.
        N (optional): the N value used in training the data.
        
    OUTPUT:
        text: a string containing the sad story. It ends with '.' whenever
        the generated text contains one; otherwise the generated text is
        returned unchanged.
    '''
    text = generate_text(sad_lm, N, letters)
    
    # Drop the unfinished sentence after the last '.' with a single slice
    # instead of removing one character at a time.
    last_period = text.rfind('.')
    if last_period != -1:
        text = text[:last_period + 1]
    return text
# Demo: generate one sad story with the default settings and print it.
print(genSad())

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley on the cheek, and her hand was still very 
skilled. 

As they stood, the
upper limbs being partially closed, precautions. You shall conquer, for my sake."

This I sat engaged in the chamber, down the stairs, crawling up them like a black and ominous crow. The only 
sound was the shadow of the Mountain. Booking 
down they saw that he must be about His Father's business,
the service of a vast, vulgar, and meretricious beauty.
