In [1]:
from nltk import word_tokenize, sent_tokenize
from collections import Counter
import copy
import numpy as np

In [2]:
def read_file(filename, encoding="utf-8"):
    """Read a text file and split it into tokenized sentences.

    The whole file is read into memory, split into sentences with
    ``sent_tokenize``, and each sentence is split into tokens with
    ``word_tokenize``.

    Args:
        filename: path of the text file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        A list of sentences, each sentence being a list of token strings.
    """
    with open(filename, encoding=encoding) as file:
        data = file.read()

    # Tokenize after the file handle is closed; nothing below needs it open.
    return [word_tokenize(sent) for sent in sent_tokenize(data)]

In [3]:
# Load one corpus and tokenize it into sentences, each a list of tokens.
# Uncomment a different read_file(...) line to switch corpora.

# 2020 Democratic Party platform
# sequences = read_file("../data/democrat_platform_2020.txt")

# 2020 Republican Party platform
# sequences = read_file("../data/republican_platform_2020.txt")

# Pride and Prejudice (Jane Austen)
sequences = read_file("../data/stylometry/1342_pride_and_prejudice.txt")

# All of Shakespeare's plays
# sequences = read_file("../data/pg100_plays.txt")

# Upper bound on the number of sentences handed to each model below.
max_sequences = 10000

In [39]:
class NgramModel():
    """Count-based n-gram language model stored as a sparse dictionary.

    ``self.counts`` maps each conditioning context (the previous ``order``
    tokens joined by single spaces) to a ``Counter`` of the words observed
    immediately after that context.
    """

    def __init__(self, sequences, order, verbose=True):
        """Estimate n-gram counts from tokenized sequences.

        Args:
            sequences: iterable of token lists (one list per sentence).
            order: Markov order (order 1 = conditioning on previous 1 word;
                order 2 = previous 2 words, etc.). Order 0 uses the empty
                string as the only context.
            verbose: when True (the default), print every context/next-word
                pair of the first sequence.

        Raises:
            ValueError: if ``order`` is negative.
        """
        if order < 0:
            raise ValueError("order must be >= 0, got %r" % (order,))

        # For this exercise we're going to encode the LM as a sparse dictionary (trading less storage for more compute)
        # We'll store the LM as a dictionary with the conditioning context as keys; each value is a
        # Counter object that keeps track of the number of times we see a word following that context.
        self.counts = {}

        # Markov order (order 1 = conditioning on previous 1 word; order 2 = previous 2 words, etc.)
        self.order = order

        # Maps every word seen as a prediction target to an integer id.
        self.vocab = {"[END]": 0}

        for s_idx, tokens in enumerate(sequences):
            # We'll add [START] and [END] tokens to encode the beginning/end of sentences.
            # Building a new list leaves the caller's token list untouched.
            padded = ["[START]"] * order + list(tokens) + ["[END]"]

            for i in range(order, len(padded)):
                context = " ".join(padded[i - order:i])
                word = padded[i]

                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)

                # For just the first sentence, print the conditioning context + word
                if verbose and s_idx == 0:
                    print("Context: %s Next: %s" % (context.ljust(50), word))

                if context not in self.counts:
                    self.counts[context] = Counter()
                self.counts[context][word] += 1

    def _distribution(self, context):
        """Return ``(words, probs)`` for the words observed after ``context``.

        ``probs[i]`` is the count of ``words[i]`` after ``context`` divided by
        the total count of all words seen after ``context``.

        Raises:
            KeyError: if ``context`` was never observed.
        """
        if context not in self.counts:
            raise KeyError("context not observed in training data: %r" % (context,))

        followers = self.counts[context]
        total = sum(followers.values())
        words = list(followers)
        probs = [followers[word] / total for word in words]
        return words, probs

    def sample(self, context):
        """Return the next-word distribution for ``context`` as ``(vocab, dist)``.

        NOTE: despite its name this method returns the distribution itself
        rather than a drawn word; that return shape is kept because callers
        use it to inspect probabilities. Use ``sample_word`` to draw a word.

        Raises:
            KeyError: if ``context`` was never observed.
        """
        return self._distribution(context)

    def sample_word(self, context):
        """Draw one next word at random according to the counts for ``context``.

        Raises:
            KeyError: if ``context`` was never observed.
        """
        words, probs = self._distribution(context)
        # multinomial(1, ...) yields a one-hot vector; argmax recovers the drawn index.
        index = np.argmax(np.random.multinomial(1, pvals=probs))
        return words[index]

    def generate_sequence(self):
        """Sample words until ``[END]`` is drawn, printing each one on its own line."""
        generated = ["[START]"] * self.order
        word = None
        while word != "[END]":
            # With order 0 the only context is the empty string.
            context = " ".join(generated[-self.order:]) if self.order > 0 else ""
            # Must draw an actual word here; sample() returns the whole distribution.
            word = self.sample_word(context)
            print(word)
            generated.append(word)
    
    

In [5]:
# Order-1 model: each word is conditioned on the single preceding token.
ngram1=NgramModel(sequences[:max_sequences], order=1)

Context: [START]                                            Next: Chapter
Context: Chapter                                            Next: 1
Context: 1                                                  Next: It
Context: It                                                 Next: is
Context: is                                                 Next: a
Context: a                                                  Next: truth
Context: truth                                              Next: universally
Context: universally                                        Next: acknowledged
Context: acknowledged                                       Next: ,
Context: ,                                                  Next: that
Context: that                                               Next: a
Context: a                                                  Next: single
Context: single                                             Next: man
Context: man                                                Next: in
Cont

In [6]:
# Generate one sentence from the order-1 model; tokens are printed one per line.
ngram1.generate_sequence()

Since
writing
.
[END]


In [7]:
# Order-0 model: no conditioning context, so every word shares the empty-string context.
ngram0=NgramModel(sequences[:max_sequences], order=0)

Context:                                                    Next: Chapter
Context:                                                    Next: 1
Context:                                                    Next: It
Context:                                                    Next: is
Context:                                                    Next: a
Context:                                                    Next: truth
Context:                                                    Next: universally
Context:                                                    Next: acknowledged
Context:                                                    Next: ,
Context:                                                    Next: that
Context:                                                    Next: a
Context:                                                    Next: single
Context:                                                    Next: man
Context:                                                    Next: in
Cont

In [8]:
# Generate one sentence from the order-0 model; tokens are printed one per line.
ngram0.generate_sequence()

her
into
to
I
must
,
“
reputed
be
myself
spirit
instead
wish
after
the
eminent
in
good
of
the
felt
--
will
she
well
then
encouragement
I
was
what
brother
my
what
;
now
him
promise
interview
bequeathed
with
;
who
immediately
pay
very
affectionate
lively
is
Mrs.
really
,
Forster
me
injunction
warmth
went
!
she
you
marrying
on
but
so.
and
high
constant
own
by
endeavoured
disgusted
--
token
Mrs.
the
and
,
A
there
because
Darcy
”
,
prodigiously.
am
same
[END]


In [40]:
# Order-2 model: each word is conditioned on the two preceding tokens.
ngram2=NgramModel(sequences[:max_sequences], order=2)

Context: [START] [START]                                    Next: Chapter
Context: [START] Chapter                                    Next: 1
Context: Chapter 1                                          Next: It
Context: 1 It                                               Next: is
Context: It is                                              Next: a
Context: is a                                               Next: truth
Context: a truth                                            Next: universally
Context: truth universally                                  Next: acknowledged
Context: universally acknowledged                           Next: ,
Context: acknowledged ,                                     Next: that
Context: , that                                             Next: a
Context: that a                                             Next: single
Context: a single                                           Next: man
Context: single man                                         Next: in
Cont

In [10]:
# Generate one sentence from the order-2 model; tokens are printed one per line.
ngram2.generate_sequence()

The
next
variation
which
their
visit
to
proceed
from
the
object
of
her
brother
,
though
expecting
no
less
certain
is
it
you
mean
?
”
“
How
should
you
think
him
less
agreeable
man
I
know
,
that
Elizabeth
had
been
designed
for
him
,
except
the
professed
lover
of
her
intelligence
was
all
grateful
pleasure
,
unconnected
with
any
complacency
.
[END]


In [41]:
# Inspect the words observed after the context "Lady Lucas" and their relative frequencies (used for A2).
ngram2.sample("Lady Lucas")

(['are',
  '.',
  'quieted',
  'was',
  'herself',
  ')',
  'might',
  ',',
  'were',
  'began',
  'without',
  'could',
  'will',
  'has',
  'and'],
 [0.058823529411764705,
  0.058823529411764705,
  0.058823529411764705,
  0.11764705882352941,
  0.058823529411764705,
  0.058823529411764705,
  0.058823529411764705,
  0.11764705882352941,
  0.058823529411764705,
  0.058823529411764705,
  0.058823529411764705,
  0.058823529411764705,
  0.058823529411764705,
  0.058823529411764705,
  0.058823529411764705])

In [17]:
# NOTE: despite the variable name, this is an order-10 model (conditions on the previous 10 tokens).
ngram3=NgramModel(sequences[:max_sequences], order=10)

Context: [START] [START] [START] [START] [START] [START] [START] [START] [START] [START] Next: Chapter
Context: [START] [START] [START] [START] [START] [START] [START] [START] [START] Chapter Next: 1
Context: [START] [START] [START] [START] [START] [START] [START] [START] Chapter 1 Next: It
Context: [START] [START] [START] [START] [START] [START] [START] Chapter 1 It Next: is
Context: [START] [START] [START] [START] [START] [START] Chapter 1 It is Next: a
Context: [START] [START] [START] [START] [START] Chapter 1 It is a Next: truth
Context: [START] [START] [START] [START] Chapter 1 It is a truth Next: universally
Context: [START] [START] [START] Chapter 1 It is a truth universally Next: acknowledged
Context: [START] [START] Chapter 1 It is a truth universally acknowledged Next: ,
Context: [START] Chapter 1 It is a truth universally acknowledged , Next: that
Context: Chapter 1 It is a truth universally acknowledged , that Next: a
Context: 1 It is a truth universally acknowledged , that

In [18]:
# Generate one sentence from the order-10 model; tokens are printed one per line.
ngram3.generate_sequence()

At
present
I
have
not
room
to
do
them
justice.
”
“
Oh
!
[END]


A1. Explore sampling sequences from LMs of different orders above; what do you notice about the structure of the generated texts (and how they differ by orders)?  Explore LMs trained on different datasets as well.

The order-0 model generates each word independently, based entirely on its relative frequency, without considering context at all. The order-1 and order-2 models generate each word conditioned on the previous one or two tokens, respectively. Some local semantic/syntactic structure is preserved in the output of the order-2 model.

A2. In a second-order LM estimated from `1342_pride_and_prejudice.txt` above, what's $P(\textrm{are} | \textrm{Lady Lucas})$?

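In the second-order model, the context `Lady Lucas` is followed by 17 tokens in total, and `are` is one of them, so $P(\textrm{are} | \textrm{Lady Lucas}) = 1/17 \approx 0.059$ (see the output of `ngram2.sample("Lady Lucas")` above).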

A3. Keep increasing the order of LMs (well past 3); compare the text that's generated to the original dataset (in the files above); are the LMs simply memorizing the source material?

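For a small training sample like this, the generated outputs closely resemble the original text: at high orders most conditioning contexts have been observed with only one continuation, so the model largely reproduces sentences from the source material.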