# Markov Chains

## Implementation 1

In [1]:
import string
import numpy as np

In [2]:
# Path of the text file containing the training data.
# train_markov_model() reads this file line by line and treats every line as one sentence.
training_data_file = 'eminem_songs_lyrics.txt'

In [3]:
def remove_punctuation(line):
    '''
    Return a copy of a string with every punctuation character dropped.

    "Punctuation" means the characters listed in string.punctuation; every other
    character (letters, digits, whitespace, non-ASCII symbols) is kept, in order.

    args:
        line (String) : The sentence you want to remove punctuation from

    example:
        line = 'bonjour! hello world@~ whatt a time to, be alive~!@#$%^&*'
        remove_punctuation(line)
        >> 'bonjour hello world whatt a time to be alive'
    '''
    unwanted = set(string.punctuation)
    kept_characters = [character for character in line if character not in unwanted]
    return ''.join(kept_characters)

In [4]:
def add2dict(dictionary, key, value):
    '''
    Append value to the list stored under key, modifying the dictionary in place.

    When the key has not been seen yet, an empty list is created for it first, so
    the dictionary always maps each key to the list of values added for that key
    (in insertion order, duplicates kept).
    '''
    bucket = dictionary.setdefault(key, [])
    bucket.append(value)

In [5]:
def list2probabilitydict(given_list):
    '''
    Turn a list of words into a dictionary mapping each distinct word to its
    relative frequency in the list (occurrences / length of the list).

    Keys appear in order of first occurrence. An empty list gives an empty dictionary.
    '''
    total = len(given_list)

    # First pass: count how many times each word occurs
    occurrences = {}
    for word in given_list:
        if word in occurrences:
            occurrences[word] += 1
        else:
            occurrences[word] = 1

    # Second pass: convert the counts into probabilities
    return {word: count / total for word, count in occurrences.items()}

In [6]:
# Module-level tables filled in by train_markov_model() and read by generate().

# First word of a line -> count while training, probability once training has finished
initial_word = {}
# First word -> words seen in second position after it (a list while training,
# a {word: probability} dictionary once training has finished)
second_word = {}
# (word, following word) pair -> what came next, where the marker 'END' means the line ended
# (a list while training, a {word: probability} dictionary once training has finished)
transitions = {}

In [7]:
# Trains a Markov model based on the data in training_data_file
def train_markov_model():
    '''
    Build a second-order, word-level Markov model from training_data_file.

    Each line of the file is lower-cased, stripped of punctuation and split on
    whitespace. The module-level tables are then filled in place:
        initial_word : first word of a line -> probability
        second_word  : first word -> {second word: probability}
        transitions  : (word, following word) -> {next word or 'END': probability}
    'END' marks that the pair was the last two words of a line.

    Lines with a single word only contribute to initial_word; empty lines are ignored.

    The tables are normalised in place at the end, so this function is meant to be
    called once on freshly created (empty) tables.
    '''
    # The context manager guarantees the file is closed, even if a line fails to process
    with open(training_data_file) as training_file:
        for line in training_file:
            tokens = remove_punctuation(line.rstrip().lower()).split()
            last_index = len(tokens) - 1
            for i, token in enumerate(tokens):
                if i == 0:
                    initial_word[token] = initial_word.get(token, 0) + 1
                    continue

                prev_token = tokens[i - 1]
                if i == last_index:
                    # The pair (prev_token, token) closes the line
                    add2dict(transitions, (prev_token, token), 'END')
                if i == 1:
                    add2dict(second_word, prev_token, token)
                else:
                    prev_prev_token = tokens[i - 2]
                    add2dict(transitions, (prev_prev_token, prev_token), token)

    # Normalize the distributions
    initial_word_total = sum(initial_word.values())
    for key, value in initial_word.items():
        initial_word[key] = value / initial_word_total

    for prev_word, next_word_list in second_word.items():
        second_word[prev_word] = list2probabilitydict(next_word_list)

    for word_pair, next_word_list in transitions.items():
        transitions[word_pair] = list2probabilitydict(next_word_list)

    print('Training successful.')

In [8]:
# Fill initial_word, second_word and transitions from the training file
train_markov_model()

Training successful.


In [9]:
def sample_word(dictionary):
    '''
    Draw one key at random from a {key: probability} dictionary.

    A uniform number in [0, 1) is drawn and the keys are walked in insertion order
    while accumulating their probabilities; the first key whose cumulative
    probability exceeds the drawn number is returned.

    args:
        dictionary (Dictionary) : Maps each candidate to its probability; the
                                  probabilities are expected to sum to 1

    returns:
        One of the keys of the dictionary

    raises:
        ValueError : If the dictionary is empty
    '''
    if not dictionary:
        raise ValueError('Cannot sample from an empty distribution')

    p0 = np.random.random()
    cumulative = 0
    for key, value in dictionary.items():
        cumulative += value
        if p0 < cumulative:
            return key

    # Floating-point rounding can leave the accumulated total marginally below 1,
    # so a draw very close to 1 may fall through the loop. The draw then belongs
    # to the last key rather than being an error.
    return key

In [11]:
# Function to generate sample text
def generate(number_of_sentences):
    '''
    Print number_of_sentences randomly generated sentences, one per line.

    Every sentence starts with a word sampled from initial_word, continues with a
    word sampled from second_word and is then extended from transitions, using the
    last two words as the state, until the 'END' marker is sampled.

    args:
        number_of_sentences (Integer) : How many sentences to print
    '''
    for _ in range(number_of_sentences):
        # Initial word
        word0 = sample_word(initial_word)
        sentence = [word0]

        # A word that only ever appeared on one-word lines is a valid initial word
        # but has no entry in second_word: the sentence is then that single word.
        second_word_distribution = second_word.get(word0)
        if second_word_distribution is None:
            print(' '.join(sentence))
            continue

        # Second word
        word1 = sample_word(second_word_distribution)
        sentence.append(word1)

        # Subsequent words until END
        while True:
            word2 = sample_word(transitions[(word0, word1)])
            if word2 == 'END':
                break
            sentence.append(word2)
            # Slide the two-word window forward
            word0, word1 = word1, word2
        print(' '.join(sentence))

In [12]:
# Print five sentences sampled from the trained tables
generate(number_of_sentences = 5)

enough rhymes to
what else can i say if life was a highway
but as rude and as indecent as all hell
to meet rundmc and induct them
its not hiphop and i’m just not eminem


## Implementation 2

In [1]:
from collections import defaultdict
import string
import random

In [16]:
class Markov():
    '''
    First-order, word-level Markov chain trained on the content of a text file.

    Creating an instance reads the file, removes the punctuation and builds the
    chain immediately. Afterwards:
        file_path : the path given to the constructor
        text      : the content of the file without punctuation
        model     : the trained chain, a dictionary {word: [words that followed it]}
    '''

    def __init__(self, file_path):
        self.file_path = file_path

        self.text = self.remove_punctuations(self.get_text())
        # From here on the name ``model`` on this instance refers to the trained
        # dictionary and no longer to the method below; callers read it as an attribute.
        self.model = self.model()

    def get_text(self):
        '''
        This function will read the input file and return its content as a single string, where the
        lines of the file (line endings included) are joined by a space
        '''
        # The context manager closes the file as soon as it has been read
        with open(self.file_path) as text_file:
            return ' '.join(text_file)

    def remove_punctuations(self, text):
        '''
        Given a string of text this function will return the same input text without any punctuations
        (the characters listed in string.punctuation)
        '''
        return text.translate(str.maketrans('', '', string.punctuation))

    def model(self):
        '''
        This function will take the text stored in self.text and map each word in the text to a key where
        the values associated to that key are the words which directly follow it. A word appears in the
        list as many times as it followed the key, so picking uniformly from the list follows the
        observed frequencies.

        Words are separated by any run of whitespace (spaces, tabs, line breaks). The very last word of
        the text is only a key if it also occurs earlier in the text.

        example:
            self.text = 'hello my name is V hello my name is G hello my current name is F world today is a good day'
            self.model()
            >> {'F': ['world'],
                'G': ['hello'],
                'V': ['hello'],
                'a': ['good'],
                'current': ['name'],
                'good': ['day'],
                'hello': ['my', 'my', 'my'],
                'is': ['V', 'G', 'F', 'a'],
                'my': ['name', 'name', 'current'],
                'name': ['is', 'is', 'is'],
                'today': ['is'],
                'world': ['today']}
        '''

        # Split on any whitespace: splitting on a single space would keep line breaks glued to the
        # words ('day\n') and produce empty words wherever two spaces follow each other
        words = self.text.split()

        markov_dict = defaultdict(list)

        # Walk through all pairs of consecutive words
        for current_word, next_word in zip(words[0:-1], words[1:]):
            markov_dict[current_word].append(next_word)

        # Return a plain dictionary so that looking up an unknown word does not silently create a key
        markov_dict = dict(markov_dict)
        print('Successfully Trained')
        return markov_dict

In [20]:
def predict_words(chain, first_word, number_of_words=5):
    '''
    Generate a short sentence by walking through a word-level Markov chain.

    Starting from first_word, the next word is repeatedly picked at random among the words that
    followed the current one in the training text. The walk stops early if it reaches a word that
    has no known successor.

    args:
        chain (Dictionary) : Maps each word to the list of words that followed it
        first_word (String) : The word you want to start your prediction from, note this word must be available in chain
        number_of_words (Integer) : The maximum number of words in the sentence, first_word included

    returns:
        The sentence, with its first word capitalized and a final period, or the string
        "Word not in corpus" if first_word is not a key of chain

    example:
        chain = {'do': ['not'], 'not': ['fail']}
        predict_words(chain, first_word = 'do', number_of_words = 3)
        >> Do not fail.
    '''
    if first_word not in chain:
        return "Word not in corpus"

    current_word = str(first_word)
    words = [current_word.capitalize()]

    # Pick the next word among the successors of the current one, move on to it, repeat
    for _ in range(number_of_words - 1):
        successors = chain.get(current_word)
        if not successors:
            # Dead end: this word was never followed by anything in the training text
            break
        current_word = random.choice(successors)
        words.append(current_word)

    # End it with a period
    return ' '.join(words) + '.'
        

In [24]:
if __name__ == '__main__':
    # Creating the instance reads the lyrics file and trains the chain right away
    m = Markov(file_path='eminem_songs_lyrics.txt')
    # On an instance, ``model`` holds the trained dictionary (it is an attribute, not a call)
    chain = m.model
    # Print a sentence of up to five words (the default length) starting with 'do'
    print(predict_words(chain, first_word = 'do'))

Successfully Trained
Do not get one spot.


## Implementation 3

In [1]:
import random
from collections import Counter
import re


class MarkovModel:
    """
    A simple discrete-time, discrete-space first-order Markov model.
    The transition matrix is a square matrix represented this way:
    ```
          +-----+-----+-----+
          |  A  |  B  |  C  |
    +-----+-----+-----+-----+
    |  A  |  a  |  b  |  c  |
    +-----+-----+-----+-----+
    |  B  |  d  |  e  |  f  |
    +-----+-----+-----+-----+
    |  C  |  i  |  j  |  k  |
    +-----+-----+-----+-----+
    ```
    with:
     - `a` the probability for the state A to go to state A
     - `b` the probability for the state A to go to state B
     - `c` the probability for the state A to go to state C
     - ...
    Instead of using a 2D array, we use a dictionary of counters.
    The dictionary contains the rows indexed by each state, each row contains the probabilities indexed again by
    each state. Only the transitions that were actually observed are stored; a missing entry means probability 0.
    This lets us address rows and columns directly by state, without maintaining a separate state-to-index mapping.

    Each row is normalized on its own: the probabilities of a row with at least one observed transition sum to 1.
    A state that was never followed by anything has an empty row.
    """

    def __init__(self, states):
        """
        Create a markov chain
        :param states: a set of all the different states
        """
        self.states = states
        # Raw number of observed transitions. They are kept apart from the probabilities so that train() can be
        # called several times and always normalizes actual counts.
        self._counts = {state: Counter() for state in self.states}
        # We create the matrix of transition probabilities (empty until train() is called)
        self.matrix = {state: Counter() for state in self.states}

    def next_state(self, current_state):
        """
        Generate a next state according to the matrix's probabilities
        :param current_state: the state to start with
        :return: a next state, or None if no transition out of current_state is known
        :raises KeyError: if current_state is not one of the states of the model
        """
        row = self.matrix[current_state]  # We get the row associated with the current state

        candidates = list(row)
        weights = [row[candidate] for candidate in candidates]

        # A state without any outgoing transition (for instance the last state of the training chain when it
        # appears nowhere else) has nothing to pick from
        if sum(weights) <= 0:
            return None

        # Weighted random pick; the weights do not need to sum to 1
        return random.choices(candidates, weights=weights)[0]

    def probability_of_chain(self, chain):
        """
        Compute the probability for a given chain of states to occur, knowing that it starts with its first state.
        This is the product of the probabilities of each transition of the chain.
        :param chain: the chain of states as an ordered list
        :return: the probability for it to happen
        """
        # If the chain is empty, we return a null probability
        if len(chain) == 0:
            return 0.0

        # A chain containing a state the model does not know cannot happen
        if chain[0] not in self.matrix:
            return 0.0

        # With a single known state there is no transition to make: the probability is 1
        probability = 1.0
        for state, next_state in zip(chain, chain[1:]):
            row = self.matrix.get(state)  # The row associated with the state

            # If the state is unknown or the transition between state and next_state is impossible,
            # the probability of the chain is 0
            if row is None or next_state not in row:
                return 0.0

            probability *= row[next_state]
        return probability

    def generate_chain(self, start_state, size):
        """
        Generate a probable chain of states, respecting the probabilities in the matrix
        :param start_state: the starting state of the chain
        :param size: the number of states to generate after start_state
        :return: the chain as an ordered list: start_state followed by `size` generated states, or fewer if a
                 state without any known successor is reached
        :raises KeyError: if start_state is not one of the states of the model
        """
        chain = [start_state]
        state = start_state
        for _ in range(size):
            state = self.next_state(state)
            if state is None:
                # Dead end: we stop here instead of looking up a row that does not exist
                break
            chain.append(state)
        return chain

    def train(self, chain):
        """
        Train the model on an example chain. Can be called several times: the transitions of every chain seen so
        far are taken into account. A chain with fewer than two states contains no transition and changes nothing.
        :param chain: the chain of state as an ordered list
        :raises KeyError: if the chain contains a state that was not given to the constructor
        """
        # We read the chain two states by two states
        for s1, s2 in zip(chain, chain[1:]):
            self._counts[s1][s2] += 1

        # We normalize each row by its own total, transforming occurrences into transition probabilities
        for state, counts in self._counts.items():
            total = sum(counts.values())
            if total == 0:
                self.matrix[state] = Counter()
                continue
            self.matrix[state] = Counter(
                {next_state: occurrences / total for next_state, occurrences in counts.items()}
            )

In [2]:
class TextMarkovModel(MarkovModel):
    """
    A Markov model that can be trained with a text and that is able to generate sentences from it.
    Here the states are the words in the vocabulary of the text. The states are directly observable,
    so this is a plain Markov chain rather than a hidden Markov model.
    """

    # Any character or sequence of characters that is not a valid word character (the \w regex class).
    # Written as a raw string so that the backslash reaches the regex engine untouched.
    _SEPARATOR = re.compile(r'[^\w]+')

    def __init__(self, text):
        # We split the text into words
        self.words = self._lex(text)
        # The vocabulary is the set of different states
        self.states = set(self.words)
        super().__init__(self.states)

    def train(self):
        """
        Train the model on the words of the text given to the constructor
        """
        super().train(self.words)

    def _lex(self, text):
        """
        Splits the text into words
        :param text: the text
        :return: a list of words, in the order of the text. When the text starts or ends with a separator
                 (a line break, a final period...), the list starts or ends with an empty string.
        """
        return self._SEPARATOR.split(text)

In [3]:
text = '''
Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had been supplied
by an excellent woman as governess, who had fallen little short
of a mother in affection.

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.  Between _them_ it was more the intimacy
of sisters.  Even before Miss Taylor had ceased to hold the nominal
office of governess, the mildness of her temper had hardly allowed
her to impose any restraint; and the shadow of authority being
now long passed away, they had been living together as friend and
friend very mutually attached, and Emma doing just what she liked;
highly esteeming Miss Taylor's judgment, but directed chiefly by
her own.
'''
# Build the vocabulary from the excerpt (the variable is called ``hmm`` but the model is a plain
# first-order Markov model over words)
hmm = TextMarkovModel(text)
# Learn the word-to-word transitions of the excerpt
hmm.train()
# Ask for 7 words generated after the start word "the" and print them as one line
print(' '.join(hmm.generate_chain("the", 7)))

the two daughters of a most affectionate indulgent
