## Text Generation
Using Markov chains to generate text.

In [1]:
import pandas as pd

# Load the pickled corpus from the working directory (displayed below as a table
# with `transcript` and `full_names` columns).
# NOTE: unpickling can execute arbitrary code, so only read pickle files from a trusted source.
corpus_pd = pd.read_pickle('corpus.pkl')
# Show the loaded object as the cell output
corpus_pd

Unnamed: 0,transcript,full_names
ali,"Ladies and gentlemen, please welcome to the st...",Ali Wong
anthony,"Thank you. Thank you. Thank you, San Francisco...",Anthony Jeselnik
bert,[electronic music playing] [male announcer] La...,Bill Burr
bill,"[cheers and applause] All right, thank you! Th...",Bo Burnham
dave,Sticks & Stones is Dave Chappelle’s fifth Netf...,Dave Chappelle
hasan,[theme music: orchestral hip-hop] [crowd roars...,Hasan Minhaj
jim,[Car horn honks] [Audience cheering] [Announce...,Jim Jefferies
joe,[rock music playing] [audience cheering] [anno...,Joe Rogan
john,[organ music playing] Welcome to Radio City Mu...,John Mulaney
louis,Intro\nFade the music out. Let’s roll. Hold th...,Louis C.K.


In [3]:
# Look up the 'russell' row of the transcript column in a single explicit .loc call
russell_text = corpus_pd.loc['russell', 'transcript']
# Preview the first 100 characters of the transcript
russell_text[:100]

'-Yeah, Russell! Oi, Russell! Your mother is so fat. What the hell did you say to me? Do I look like '

# Build the Markov Chain

Each key is a word that appears in the corpus.

Each value is the list of words that immediately follow that key in the text. Repeats are kept, so a more frequent follower is more likely to be chosen.

In [4]:
from collections import defaultdict

def markov_chain(text):
    '''Build a first-order Markov chain from a string of text.

    The text is tokenized on any run of whitespace (spaces, tabs, newlines), so
    punctuation stays attached to its word and no empty-string tokens are produced.

    Args:
        text: The source text as a single string.

    Returns:
        A plain dict mapping each word to the list of words that immediately
        follow it in the text, in order of appearance. Duplicates are kept, so
        a follower that occurs more often appears more often in the list.
        The final word of the text only becomes a key if it also occurs
        earlier; empty or single-word text yields an empty dict.
    '''

    # Tokenize the text by word, though including punctuation.
    # split() with no argument collapses repeated whitespace and also breaks on
    # newlines; split(' ') would emit '' tokens and leave 'word\nword' glued together.
    words = text.split()

    # Initialize a default dictionary to hold all of the words and next words
    m_dict = defaultdict(list)

    # Pair every word with its successor and record the successor under the word
    for current_word, next_word in zip(words, words[1:]):
        m_dict[current_word].append(next_word)

    # Convert back to a plain dict so that looking up an unknown word raises
    # KeyError instead of silently inserting an empty entry
    return dict(m_dict)

In [5]:
# Build the word -> list-of-next-words chain from the selected transcript
russell_dict = markov_chain(russell_text)
# Display the resulting dictionary (the output can be very long)
russell_dict

{'-Yeah,': ['Russell!', 'bro.'],
 'Russell!': ['Oi,', 'Your'],
 'Oi,': ['Russell!'],
 'Your': ['mother',
  'mom',
  'mom',
  'good',
  'first',
  'son’s',
  'wife?',
  'parents'],
 'mother': ['is'],
 'is': ['so',
  'in',
  'like',
  'a',
  'fucking',
  'impressive.',
  'here.',
  'all',
  'more',
  'add',
  'gonna',
  'inflamed',
  'an',
  'hiring.”',
  'a',
  'be',
  'like',
  '’cause',
  'like',
  'like',
  'when',
  'that',
  'that?',
  'my',
  'my',
  'that?',
  'because…',
  'just',
  'fantastic.',
  'in',
  'half',
  'literally',
  '28.',
  'that',
  'what',
  'determination…',
  'very',
  'very',
  'everything',
  'your',
  'weird.',
  'as',
  'a',
  'race-based.',
  'I',
  'because',
  'the'],
 'so': ['fat.',
  'glad',
  'their',
  'scared!',
  'much',
  'mad,',
  'cheap,',
  'we',
  'they',
  'mad.',
  'I’m',
  'I',
  'slowly.',
  'slow',
  'the',
  'you',
  'violently',
  'hard',
  'hard',
  'bad.',
  'weird,',
  'he',
  'you',
  'casually.'],
 'fat.': ['What'],
 'What': ['th

# Create the text generator

In [14]:
import random

def generate_sentence(chain, count=10, rng=None):
    '''Generate a pseudo-random sentence by walking a Markov chain.

    Args:
        chain: Dictionary in the format of key = current word, value = list of
            next words.
        count: The number of words you would like to see in the generated
            sentence. Values below 1 still produce a single word.
        rng: Optional random.Random instance used for every random choice, so
            results can be reproduced. Defaults to the module-level `random`
            functions.

    Returns:
        The generated words joined by single spaces, with the first word passed
        through str.capitalize() and a trailing period appended.

    Raises:
        IndexError: If `chain` is empty (there is no word to start from).
    '''
    if rng is None:
        rng = random

    # Candidate starting words; also used to restart the walk at a dead end
    starts = list(chain.keys())

    # Capitalize the first word
    word1 = rng.choice(starts)
    sentence = word1.capitalize()

    # Generate the second word from the value list. Set the new word as the first word. Repeat.
    for _ in range(count - 1):
        followers = chain.get(word1)
        if followers:
            word2 = rng.choice(followers)
        else:
            # Dead end: this word has no recorded followers (e.g. it only occurs
            # as the very last word of the text), so restart from a random key
            # instead of raising KeyError.
            word2 = rng.choice(starts)
        word1 = word2
        sentence += ' ' + word2

    # End it with a period
    sentence += '.'
    return sentence

In [15]:
# Generate a sentence using the default length (count=10)
generate_sentence(russell_dict)

'Nickname to either of the air when you wanted to.'