# Setup

In [1]:
import re
import random
from collections import defaultdict

In [2]:
# Path of the plain-text corpus the models are built from.
TXT_FILE = 'data/pride_and_prejudice.txt'
# Characters stripped from both ends of every whitespace-separated token.
PUNCTUATION = '.;,-“’”:?—‘!()_'
# Matches a whole line holding only "CHAPTER <roman numeral>" (case-insensitive);
# meant to be used with re.MULTILINE so ^/$ anchor at line boundaries.
LINE_TO_EXCLUDE = r'(?i)^\s*CHAPTER\s*[IVXLCDM]+\s*$'
# Matches the whitespace character that follows a sentence-ending '.' or '?',
# except after patterns like "e.g." (\w\.\w.) or "Mr." ([A-Z][a-z]\.).
# Python patterns take no /.../gm delimiters: with them the leading '/' and
# trailing '/gm' were matched literally, so the text was never split.
SPLITTING_TO_SENTENCES_PATTERN = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'

# Bigram model

In [3]:
# create successor map
# Bigram model: each word maps to the list of words that directly follow it
# in the text. Duplicates are kept, so a uniform random.choice over the list
# picks successors proportionally to how often they occurred.
successor_map = defaultdict(list)
window = []

# Explicit encoding: the text contains typographic quotes and dashes (see
# PUNCTUATION), so decoding must not depend on the platform default codec.
# Assumes the corpus file is UTF-8 encoded; adjust if it is stored otherwise.
with open(TXT_FILE, encoding='utf-8') as f:
    text = f.read()

# Drop chapter-heading lines before tokenizing.
text = re.sub(LINE_TO_EXCLUDE, '', text, flags=re.MULTILINE | re.IGNORECASE)

# `window` is carried over from one sentence to the next, so pairs that span
# a sentence boundary are recorded as well.
for sentence in re.split(SPLITTING_TO_SENTENCES_PATTERN, text):
    for word in sentence.split():
        word = word.strip(PUNCTUATION).lower()
        if not word:
            # The token consisted only of punctuation; an empty string is
            # not a word and must not enter the model.
            continue
        window.append(word)

        if len(window) == 2:
            successor_map[window[0]].append(window[1])
            window.pop(0)

print(successor_map['affect'])

['them', 'concern', 'his', 'me']


In [4]:
# generate text
# Random walk over the bigram model: start from a seed word and repeatedly
# sample one of the words recorded as following the current word.
random.seed(15)  # fixed seed so the generated text is reproducible
N_WORDS = 12

word = 'you'
print(word, end=' ')
for _ in range(N_WORDS):
    # .get() avoids inserting an empty entry into the defaultdict on a miss.
    candidates = successor_map.get(word)
    if not candidates:
        # Dead end: no successor was recorded for this word, and
        # random.choice would raise IndexError on an empty list.
        break
    word = random.choice(candidates)
    print(word, end=' ')

you hold her approbation need not know your feelings she should be acceptable 

# N-gram model

In [5]:
# create successor map
# N-gram model: each tuple of N_GRAM - 1 consecutive words maps to the list
# of words that followed that tuple in the text (duplicates kept).
# NOTE: this rebinds `successor_map`, replacing the map built earlier in the
# notebook; afterwards the keys are tuples of words, not single words.
N_GRAM = 4
successor_map = defaultdict(list)
window = []

# Explicit encoding: the text contains typographic quotes and dashes (see
# PUNCTUATION), so decoding must not depend on the platform default codec.
# Assumes the corpus file is UTF-8 encoded; adjust if it is stored otherwise.
with open(TXT_FILE, encoding='utf-8') as f:
    text = f.read()

# Drop chapter-heading lines before tokenizing.
text = re.sub(LINE_TO_EXCLUDE, '', text, flags=re.MULTILINE | re.IGNORECASE)

# `window` is carried over from one sentence to the next, so n-grams that
# span a sentence boundary are recorded as well.
for sentence in re.split(SPLITTING_TO_SENTENCES_PATTERN, text):
    for word in sentence.split():
        word = word.strip(PUNCTUATION).lower()
        if not word:
            # The token consisted only of punctuation; an empty string is
            # not a word and must not enter the model.
            continue
        window.append(word)

        if len(window) == N_GRAM:
            # The first N_GRAM - 1 words form the context, the last is its successor.
            n_gram_key = tuple(window[:-1])
            successor_map[n_gram_key].append(window[-1])
            window.pop(0)

print(successor_map[('you', 'are', 'a')])

['young', 'great', 'very', 'good', 'good', 'gentleman’s']


In [6]:
# generate text
# Random walk over the n-gram model: the current context is a sliding list of
# words whose tuple is looked up in successor_map to sample the next word.
random.seed(15)  # fixed seed so the generated text is reproducible
N_WORDS = 15

# The seed must have as many words as the tuple keys of successor_map.
word_sequence = ['you', 'are', 'a']
result = word_sequence.copy()
for _ in range(N_WORDS):
    # .get() avoids inserting an empty entry into the defaultdict on a miss.
    candidates = successor_map.get(tuple(word_sequence))
    if not candidates:
        # Dead end: this context was never seen with a successor, and
        # random.choice would raise IndexError on an empty list.
        break
    next_word = random.choice(candidates)
    # Slide the context one word forward.
    word_sequence.pop(0)
    word_sequence.append(next_word)
    result.append(next_word)

print(' '.join(result))

you are a great deal nor did she give herself the trouble of talking or of listening much
