In [4]:
import nltk
import numpy as np

In [8]:
# Text to be tokenized below. NOTE(review): the value looks like a
# placeholder -- presumably meant to be replaced by a real corpus; confirm.
corpus = "Temp corpus"

In [10]:
# Split the corpus string into tokens with NLTK's `word_tokenize`.
tokens = nltk.word_tokenize(corpus)

In [None]:
import numpy as np
import time
from collections import Counter
from collections import defaultdict

def unzip(pairs):
    """Transpose a sequence of pairs into one tuple per position.

    Example: pairs = [("a", 1), ("b", 2)] --> ("a", "b") and (1, 2)

    The result is a tuple of tuples (not lists). An empty `pairs` yields
    an empty tuple.
    """
    columns = zip(*pairs)
    return tuple(columns)

def normalize(counter):
    """ Convert counter to a list of (letter, frequency) pairs, sorted in descending order of frequency.

        Parameters
        -----------
        counter: A Counter-instance

        Returns
        -------
        A list of tuples - (letter, frequency) pairs, where each frequency is
        the letter's count divided by the sum of all counts. An empty counter
        yields an empty list.

        For example, if counter had the counts:

            {'a': 1, 'b': 3}

        `normalize(counter)` will return:

            [('b', 0.75), ('a', 0.25)]
    """
    total = sum(counter.values())
    return [(char, cnt / total) for char, cnt in counter.most_common()]


def train_lm(text, n):
    """ Train character-based n-gram language model.

        This will learn: given a sequence of n-1 characters, what the probability
        distribution is for the n-th character in the sequence.

        For example if we train on the text:
            text = "cacao"

        Using a n-gram size of n=3, then the following dict would be returned:

            {'ac': [('a', 1.0)],
             'ca': [('c', 0.5), ('o', 0.5)],
             '~c': [('a', 1.0)],
             '~~': [('c', 1.0)]}

        Tildes ("~") are used for padding the history when necessary, so that it's
        possible to estimate the probability of seeing a character when there
        aren't (n - 1) previous characters of history available.

        So, according to this text we trained on, if you see the sequence 'ac',
        our model predicts that the next character should be 'a' 100% of the time.

        With n=1 there is no history at all: the returned dict has the single
        key "" mapping to the overall character frequencies of `text`.

        Parameters
        -----------
        text: str
            A string (doesn't need to be lowercased).
        n: int
            The length of n-gram to analyze. Must be at least 1.

        Returns
        -------
        A dict that maps histories (strings of length (n-1)) to lists of (char, prob)
        pairs, where prob is the probability (i.e. frequency) of char appearing after
        that specific history. An empty `text` yields an empty dict.

        Raises
        ------
        ValueError
            If n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {!r}".format(n))

    raw_lm = defaultdict(Counter)
    history = "~" * (n - 1)

    # count number of times characters appear following different histories
    for char in text:
        raw_lm[history][char] += 1
        # Append first, then drop the oldest character. This keeps the
        # history at exactly n-1 characters, including the n=1 case where
        # it must stay empty (`history[1:] + char` would grow it to 1).
        history = (history + char)[1:]

    # create final dictionary by normalizing
    return {hist: normalize(counter) for hist, counter in raw_lm.items()}


def generate_letter(lm, history):
    """ Randomly picks letter according to probability distribution associated with
        the specified history.

        Note: returns dummy character "~" if history not found in model (or if
        the model holds an empty distribution for it).

        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]]
            The n-gram language model. I.e. the dictionary: history -> [(char, freq), ...]

        history: str
            A string of length (n-1) to use as context/history for generating
            the next character.

        Returns
        -------
        str
            The predicted character. '~' if history is not in language model.
    """
    distribution = lm.get(history)
    if not distribution:
        return "~"
    # Transpose [(char, prob), ...] into parallel sequences for sampling.
    letters, probs = zip(*distribution)
    # np.random.choice returns a NumPy string scalar; hand back a plain str.
    return str(np.random.choice(letters, p=probs))


def generate_text(lm, n, nletters=100):
    """ Randomly generates nletters of text with n-gram language model lm.

        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]]
            The n-gram language model. I.e. the dictionary: history -> [(char, freq), ...]
        n: int
            Order of n-gram model. Must be at least 1 and should match the
            value the model was trained with.
        nletters: int
            Number of letters to randomly generate.

        Returns
        -------
        str
            Model-generated text.

        Raises
        ------
        ValueError
            If n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {!r}".format(n))

    history = "~" * (n - 1)
    text = []
    for _ in range(nletters):
        c = generate_letter(lm, history)
        text.append(c)
        # Same sliding-window update as in training: keeps the history at
        # n-1 characters, and empty when n=1.
        history = (history + c)[1:]
    return "".join(text)

# Disabled exploratory code: everything below is wrapped in a bare string
# literal, so none of it is executed. If re-enabled, it would read a text
# file, lowercase it, count a-z letter frequencies, and count
# whitespace-separated tokens.
"""path_to_wikipedia = "wikipedia2text-extracted.txt"
with open(path_to_wikipedia, "rb") as f:
    wikipedia = f.read().decode().lower()
    wikipedia
print(str(len(wikipedia)) + " character(s)")

counter = Counter(wikipedia)
letters = "abcdefghijklmnopqrstuvwxyz"
counts = [(char, cnt) for char, cnt in counter.most_common() if char in letters]

total = sum(cnt for _, cnt in counts)
freqs = [(char, cnt/total) for char, cnt in counts]

tokens = wikipedia.split()
word_counter = Counter(tokens)"""