# Markov Chain

Use a Markov chain to generate author names from the Poetry Foundation collection of poems.

##  Import and config

In [1]:
import string
import re
import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm
import wget
import sys
import math

# Order of the Markov chain: the number of consecutive letters that form one
# state (subword). Every embedding function below reads this constant.
ORDER = 2

## Load Data

In [2]:
def download_file(url, filename):
    """Fetch ``url`` into ``_downloaded/<filename>`` unless it is already there.

    The ``_downloaded`` directory is created in the current working directory
    when missing. A file that already exists at the target path is reused
    and nothing is downloaded.

    Returns:
        A ``(filepath, 0)`` tuple; the second element is always 0.
    """
    cache_dir = '_downloaded'

    def bar_progress(current, total, width=80):
        # Progress callback handed to wget: rewrites a single console line.
        percent = current / total * 100
        line = "Downloading: %d%% [%d / %d] bytes" % (percent, current, total)
        sys.stdout.write("\r" + line)
        sys.stdout.flush()

    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)

    filepath = os.path.join(cache_dir, filename)
    if not os.path.exists(filepath):
        wget.download(url, filepath, bar=bar_progress)

    return filepath, 0

def load_names_from_poetry_foundations():
    """Download the Poetry Foundation poem dataset and extract author names.

    Each value of the ``Author`` column is lower-cased and every run of
    characters outside ``a-z`` is treated as a separator. Only authors that
    split into exactly two tokens contribute: the first token is collected as
    a first name, the second as a family name.

    Returns:
        A ``(first_names, family_names)`` pair of duplicate-free lists, each
        sorted alphabetically.
    """
    url = 'https://raw.githubusercontent.com/vincentbonnetcg/Numerical-Bric-a-Brac/master/data/datasets/kaggle_poem_dataset.csv'
    filename = 'kaggle_poem_dataset.csv'
    filepath, _ = download_file(url, filename)

    # Dataset originally from https://www.kaggle.com/tgdivy/poetry-foundation-poems
    data = pd.read_csv(filepath)
    first_names, family_names = set(), set()
    # read_csv represents empty cells as NaN (a float); drop them so that
    # .lower() is only ever called on real text.
    for author_names in data['Author'].dropna():
        names = re.sub('[^a-z]+', ' ', str(author_names).lower()).split()
        if len(names) == 2:
            first_names.add(names[0])
            family_names.add(names[1])

    # Sort so the result does not depend on set iteration order, which varies
    # between interpreter runs because of string hash randomization.
    return sorted(first_names), sorted(family_names)

# Executed when the cell runs: fetches the dataset (if not already present)
# and builds the two name lists used by the generator below.
first_names, family_names = load_names_from_poetry_foundations()

## Word embedding

Word embedding consists of converting strings into integer values: each run of `ORDER` consecutive lowercase letters is encoded as a base-26 integer.

In [3]:
def max_word_embedding():
    """Return the number of distinct subword codes: alphabet size ** ORDER."""
    alphabet_size = len(string.ascii_lowercase)
    return pow(alphabet_size, ORDER)

'''
Word embedding functions
'''
def subword_embedding(subword):
    """Encode an ORDER-letter subword as a single integer.

    The letters are read as base-26 digits ('a' -> 0, 'b' -> 1, ...) with the
    first letter as the most significant digit, e.g. for ORDER == 2 the code
    is ``26 * c0 + c1``.
    """
    assert len(subword)==ORDER
    radix = len(string.ascii_lowercase)
    origin = ord('a')
    return sum(
        (ord(letter) - origin) * radix ** (ORDER - 1 - position)
        for position, letter in enumerate(subword)
    )

def word_embedding(word):
    """Encode a word as the codes of all its overlapping ORDER-letter windows.

    A word shorter than ORDER has no window and yields an empty list.
    """
    window_count = len(word) - ORDER + 1
    return [
        subword_embedding(word[start:start + ORDER])
        for start in range(window_count)
    ]

'''
Word embedding reverse functions
'''
def subword_reverse_embedding(embedded):
    """Decode an integer code into the list of its ORDER lowercase letters."""
    alphabet = string.ascii_lowercase
    letters = []
    remainder = embedded
    # Peel off base-26 digits from the most significant place downwards.
    for exponent in range(ORDER - 1, -1, -1):
        place_value = len(alphabet) ** exponent
        digit = math.floor(remainder / place_value)
        remainder = remainder - (digit * place_value)
        letters.append(alphabet[digit])
    return letters

def word_reverse_embedding(embedded):
    """Rebuild a word from the sequence of codes of its overlapping subwords.

    The first code contributes all of its letters; each following code
    overlaps the previous one, so only its last letter is new.
    """
    letters = []
    for position, code in enumerate(embedded):
        decoded = subword_reverse_embedding(code)
        if position == 0:
            letters.extend(decoded)
        else:
            letters.append(decoded[-1])
    return ''.join(letters)

# Round-trip check: encoding a word and then decoding the codes should print
# the original word back.
encoded = word_embedding('testembedding')
decoded = word_reverse_embedding(encoded)
print("encoded ", encoded)
print("decoded ", decoded)

encoded  [498, 122, 487, 498, 116, 313, 30, 107, 81, 86, 221, 344]
decoded  testembedding


## Transition matrix

A **transition matrix** (also known as a stochastic matrix) is a square matrix that describes the transitions in a Markov chain $M$. Each entry $M_{i, j}$ is a non-negative real number that represents the probability of moving from state $i$ to state $j$ in a single time step, so the entries of each row sum to 1.

In [4]:
def transition_matrix(names):
    """Build the row-normalised transition matrix between subword states.

    Entry ``[i, j]`` is the fraction of occurrences of subword ``i`` in
    ``names`` that are immediately followed (one letter further) by subword
    ``j``. Rows for subwords that are never followed by anything stay zero.
    """
    size = max_word_embedding()
    counts = np.zeros((size, size))

    # Count every pair of consecutive overlapping subwords.
    for name in names:
        states = [
            subword_embedding(name[start:start + ORDER])
            for start in range(len(name) - ORDER + 1)
        ]
        for source, target in zip(states, states[1:]):
            counts[source, target] += 1

    # Turn counts into probabilities, leaving all-zero rows untouched.
    row_totals = counts.sum(axis=1, keepdims=True)
    np.divide(counts, row_totals, out=counts, where=row_totals != 0.0)

    return counts

## Generate author names

In [7]:
def generate_name(names, num_words):
    """Generate a random name with a Markov chain trained on ``names``.

    The chain starts from the first ORDER letters of a randomly chosen
    training name, then repeatedly samples the next subword from the
    transition matrix and appends its last letter.

    Args:
        names: training names made of lowercase letters.
        num_words: maximum number of letters appended after the ORDER-letter
            seed.

    Returns:
        The generated name. It is shorter than ``ORDER + num_words`` when the
        chain reaches a subword that has no observed successor.

    Raises:
        ValueError: if no training name has at least ORDER letters.
    """
    # A shorter name cannot provide a full starting subword: word_embedding
    # would return an empty list and indexing it would fail.
    seeds = [name for name in names if len(name) >= ORDER]
    if not seeds:
        raise ValueError(
            'need at least one name with {} or more letters'.format(ORDER))

    N = max_word_embedding()
    # compute transition matrix
    transition = transition_matrix(names)
    # get random start
    subword = random.choice(seeds)[:ORDER]
    txt = list(subword)
    # generate random name
    state = word_embedding(subword)[0]
    for _ in range(num_words):
        probability_row = transition[state, :]
        # Subwords only ever seen at the end of a name have an all-zero row;
        # np.random.choice rejects it because p must sum to 1, so stop here.
        if not probability_row.any():
            break
        state = np.random.choice(N, replace=True, p=probability_row)
        txt.append(subword_reverse_embedding(state)[-1])
    return ''.join(txt)

# Each part is generated from its own name list; the second argument is the
# number of generation steps performed after the ORDER-letter seed.
first_name = generate_name(first_names, 5)
family_name = generate_name(family_names, 8)
print('generated name : {} {}'.format(first_name, family_name))

generated name : centede hillegerne
