In [1]:
from collections import *

In [2]:
# Count table: any key looked up for the first time gets a fresh, empty Counter.
lm = defaultdict(Counter)
lm

defaultdict(collections.Counter, {})

In [3]:
# Read the entire text file into a single string.
with open('shakespeare_input.txt','r') as f:
    data = f.read()

In [4]:
# Preview the first 500 characters of the text.
data[:500]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor"

In [5]:
# history: the 4-character window starting at offset 0 of the text
hist = data[0:0+4]
hist

'Firs'

In [6]:
# char: the character at index 0+4, i.e. the one right after data[0:4]
char = data[0+4]
char

't'

In [7]:
# Record one observation: `char` was seen once after the history `hist`.
lm[hist][char] += 1
lm

defaultdict(collections.Counter, {'Firs': Counter({'t': 1})})

In [8]:
def train_char_lm(fname, order=4):
    """Train a character-level n-gram language model from a text file.

    The text is left-padded with ``order`` copies of "~" so that the first
    characters of the file also get a full-length history.

    Args:
        fname: Path of the text file to read.
        order: Number of preceding characters used as the history.

    Returns:
        A dict mapping each history string of length ``order`` to a list of
        ``(char, probability)`` pairs, where ``probability`` is the relative
        frequency with which ``char`` followed that history in the text.
    """
    # Context manager closes the file deterministically instead of leaving the
    # handle for the garbage collector.
    with open(fname) as f:
        data = f.read()
    lm = defaultdict(Counter)
    pad = "~" * order
    data = pad + data
    # Slide a window of `order` characters over the text and count which
    # character follows each window.
    for i in range(len(data)-order):
        history, char = data[i:i+order], data[i+order]
        lm[history][char]+=1
    def normalize(counter):
        # Convert raw counts into relative frequencies.
        s = float(sum(counter.values()))
        return [(c,cnt/s) for c,cnt in counter.items()]
    outlm = {hist:normalize(chars) for hist, chars in lm.items()}
    return outlm

In [9]:
# Rebinds `lm`: from here on it holds the value returned by train_char_lm,
# replacing the count table that was built by hand in the earlier cells.
lm = train_char_lm(fname="shakespeare_input.txt", order=4)

In [10]:
# Check what kind of object train_char_lm returned.
type(lm)

dict

In [11]:
# Show the first 20 keys (history strings) of the model.
[*lm.keys()][:20]

['~~~~',
 '~~~F',
 '~~Fi',
 '~Fir',
 'Firs',
 'irst',
 'rst ',
 'st C',
 't Ci',
 ' Cit',
 'Citi',
 'itiz',
 'tize',
 'izen',
 'zen:',
 'en:\n',
 'n:\nB',
 ':\nBe',
 '\nBef',
 'Befo']

In [12]:
# (char, probability) pairs stored for the history 'ello'.
lm['ello']

[('r', 0.059625212947189095),
 ('w', 0.817717206132879),
 ('u', 0.03747870528109029),
 (',', 0.027257240204429302),
 (' ', 0.013628620102214651),
 ('.', 0.0068143100511073255),
 ('?', 0.0068143100511073255),
 (':', 0.005110732538330494),
 ('n', 0.0017035775127768314),
 ("'", 0.017035775127768313),
 ('!', 0.0068143100511073255)]

In [13]:
# Transpose the (char, probability) pairs into two parallel tuples.
chars, values = [*zip(*lm['ello'])]
values

(0.059625212947189095,
 0.817717206132879,
 0.03747870528109029,
 0.027257240204429302,
 0.013628620102214651,
 0.0068143100511073255,
 0.0068143100511073255,
 0.005110732538330494,
 0.0017035775127768314,
 0.017035775127768313,
 0.0068143100511073255)

In [14]:
# Sanity check: the probabilities for one history should add up to 1
# (up to floating-point rounding).
sum(values)

1.0

In [15]:
# (char, probability) pairs stored for the history 'Firs'.
lm['Firs']

[('t', 1.0)]

In [16]:
# (char, probability) pairs stored for the history 'rst ' (note the trailing space).
lm['rst ']

[('C', 0.09550561797752809),
 ('f', 0.011235955056179775),
 ('i', 0.016853932584269662),
 ('t', 0.05377207062600321),
 ('u', 0.0016051364365971107),
 ('S', 0.16292134831460675),
 ('h', 0.019261637239165328),
 ('s', 0.03290529695024077),
 ('R', 0.0008025682182985554),
 ('b', 0.024879614767255216),
 ('c', 0.012841091492776886),
 ('O', 0.018459069020866775),
 ('w', 0.024077046548956663),
 ('a', 0.02247191011235955),
 ('m', 0.02247191011235955),
 ('n', 0.020064205457463884),
 ('I', 0.009630818619582664),
 ('L', 0.10674157303370786),
 ('M', 0.0593900481540931),
 ('l', 0.01043338683788122),
 ('o', 0.030497592295345103),
 ('H', 0.0040128410914927765),
 ('d', 0.015248796147672551),
 ('W', 0.033707865168539325),
 ('K', 0.008025682182985553),
 ('q', 0.0016051364365971107),
 ('G', 0.0898876404494382),
 ('g', 0.011235955056179775),
 ('k', 0.0040128410914927765),
 ('e', 0.0032102728731942215),
 ('y', 0.002407704654895666),
 ('r', 0.0072231139646869984),
 ('p', 0.00882825040128411),
 ('A', 0.0056179

In [17]:
from random import random

def generate_letter(lm, history, order):
    """Sample the next character given the most recent ``order`` characters.

    Args:
        lm: Mapping from a history string to a sequence of
            ``(char, probability)`` pairs.
        history: Text generated so far; only its last ``order`` characters
            are used as the lookup key.
        order: Length of the history window.

    Returns:
        One character drawn according to the stored probabilities, or
        ``None`` if the distribution for this history is empty.

    Raises:
        KeyError: if ``lm`` is a plain dict with no entry for the history.
    """
    # history[-0:] is the *whole* string, not "", so an order-0 model needs
    # the empty key spelled out explicitly.
    history = history[-order:] if order > 0 else ""
    dist = lm[history]
    x = random()
    c = None
    # Walk the distribution, subtracting each probability mass from x; the
    # character whose mass takes x to zero or below is the sample.
    for c, v in dist:
        x = x - v
        if x <= 0:
            return c
    # Floating-point rounding can leave the probabilities summing to slightly
    # less than 1, so x may still be positive here. Fall back to the last
    # character instead of implicitly returning None.
    return c

def generate_text(lm, order, nletters=1000):
    """Generate ``nletters`` characters by repeatedly sampling from ``lm``.

    Generation starts from a history of ``order`` "~" characters and each
    sampled character is appended to the history window for the next draw.

    Args:
        lm: Model passed through to ``generate_letter`` (history string ->
            sequence of ``(char, probability)`` pairs).
        order: Length of the history window.
        nletters: Number of characters to generate.

    Returns:
        The generated characters joined into a single string.
    """
    history = "~" * order
    out = []
    for _ in range(nletters):
        c = generate_letter(lm, history, order)
        # Keep only the last `order` characters. A slice with -0 would keep
        # the whole string, so order 0 is pinned to the empty history.
        history = (history + c)[-order:] if order > 0 else ""
        out.append(c)
    return "".join(out)

In [18]:
# Same lookup as earlier, repeated as the reference distribution for the
# step-by-step sampling walkthrough in the following cells.
lm['ello']

[('r', 0.059625212947189095),
 ('w', 0.817717206132879),
 ('u', 0.03747870528109029),
 (',', 0.027257240204429302),
 (' ', 0.013628620102214651),
 ('.', 0.0068143100511073255),
 ('?', 0.0068143100511073255),
 (':', 0.005110732538330494),
 ('n', 0.0017035775127768314),
 ("'", 0.017035775127768313),
 ('!', 0.0068143100511073255)]

In [19]:
# Draw one uniform sample in [0, 1) to trace the sampling loop by hand.
x = random()
x

0.5213106194809699

In [20]:
# Walk the whole distribution, subtracting each probability from x and printing
# the running value. generate_letter returns at the first step where x <= 0;
# this loop deliberately keeps going to show every step.
# NOTE: x is modified in place, so re-running this cell without first re-running
# the cell that draws x starts from the already-decremented value.
for c,v in lm['ello']:
    print("\nx initial: {:.3f}".format(x))
    x = x - v
    print("c: {}, x: {:.3f}, v: {:.3f}\n".format(c,x,v))


x initial: 0.521
c: r, x: 0.462, v: 0.060


x initial: 0.462
c: w, x: -0.356, v: 0.818


x initial: -0.356
c: u, x: -0.394, v: 0.037


x initial: -0.394
c: ,, x: -0.421, v: 0.027


x initial: -0.421
c:  , x: -0.434, v: 0.014


x initial: -0.434
c: ., x: -0.441, v: 0.007


x initial: -0.441
c: ?, x: -0.448, v: 0.007


x initial: -0.448
c: :, x: -0.453, v: 0.005


x initial: -0.453
c: n, x: -0.455, v: 0.002


x initial: -0.455
c: ', x: -0.472, v: 0.017


x initial: -0.472
c: !, x: -0.479, v: 0.007



In [21]:
def generate_text_order(input_text,order):
    """Train a model of the given order on a text file and print a sample.

    Args:
        input_text: Path of the text file to train on.
        order: History length used for both training and generation.
    """
    # Use the caller-supplied path; previously the argument was ignored and
    # the corpus file name was hard-coded here.
    lm = train_char_lm(input_text, order=order)
    print(generate_text(lm, order))

In [22]:
# Print a sample generated with an order-2 model.
generate_text_order('shakespeare_input.txt',order=2)

Firrome
Ford, mus he my on agapee ne.

QUESTAVING Clood,
Wit of and thein salf: the ead thearalser, grearientrey peres whin frie!--foree eared lood day, of Ming Rome antels on, forrompaithus anse terequill neve thereard-glive halry like wor not thelly live PAGO:
I dell be pats tog: timpeof Chave me
This ther oned, onbot afte wour wouse but goodesir lor to knothe there, re nathic she to thoselear the gome gotheir.

Dost I stied usippin'd,'
eve not be I'll annotheen this telf: age?

JOHN:
PORD:
Youncen ift?

PEDWARET:
GLO:
But havent
talf is fory?

My youl to din's hat dosed he pulcorless.

CAM:
Ant:
Havio,
PEMISTAVILIND:
If whicklenefuld tre win ould cand,
Ant. O my ou ous.

Garwark that loris'--

PETHUMBELISALVI:
Int hat thathy,
Thold Lones the wer, Peake eve do HUR:
The all hich thee thalf:
And hathome'staing mostere so
Spon to stend
There of yout were mend he is PATRICK:
God his' brin threfort ther! amucithe comis me goor fid evown, le Nerst.
For me thesing loved well ance and PARD P