In [14]:
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter

#open text file in read mode
file = open("hamlet_act1.txt", "r")

#read file to a string
hamlet = file.read()

file.close()
 
print(hamlet)

ACT 1
=====

Scene 1
[Enter Barnardo and Francisco, two sentinels.]


BARNARDO  Who's there?

FRANCISCO
Nay, answer me. Stand and unfold yourself.

BARNARDO  Long live the King!

FRANCISCO  Barnardo?

BARNARDO  He.

FRANCISCO
You come most carefully upon your hour.

BARNARDO
'Tis now struck twelve. Get thee to bed, Francisco.

FRANCISCO
For this relief much thanks. 'Tis bitter cold,
And I am sick at heart.

BARNARDO  Have you had quiet guard?

FRANCISCO  Not a mouse stirring.

BARNARDO  Well, good night.
If you do meet Horatio and Marcellus,
The rivals of my watch, bid them make haste.

[Enter Horatio and Marcellus.]


FRANCISCO
I think I hear them.--Stand ho! Who is there?

HORATIO  Friends to this ground.

MARCELLUS  And liegemen to the Dane.

FRANCISCO  Give you good night.

MARCELLUS
O farewell, honest soldier. Who hath relieved
you?

FRANCISCO
Barnardo hath my place. Give you good night.
[Francisco exits.]

MARCELLUS  Holla, Barnardo.

BARNARDO  Say, what, is Horatio there?

HORAT

In [15]:
#use regex to isolate each sentence
parsed = re.findall(r"""(?<![A-Z[])[A-Z'][A-Z]?[^A-Z][A-Za-z\s\n,();:'"-]*[.!?-](?!])""", hamlet)

print("There are", len(parsed), "sentences in this list.")

parsed

There are 544 sentences in this list.


["Who's there?",
 'Nay, answer me.',
 'Stand and unfold yourself.',
 'Long live the King!',
 'Barnardo?',
 'He.',
 'You come most carefully upon your hour.',
 "'Tis now struck twelve.",
 'Get thee to bed, Francisco.',
 'For this relief much thanks.',
 "'Tis bitter cold,\nAnd I am sick at heart.",
 'Have you had quiet guard?',
 'Not a mouse stirring.',
 'Well, good night.',
 'If you do meet Horatio and Marcellus,\nThe rivals of my watch, bid them make haste.',
 'I think I hear them.',
 'Stand ho!',
 'Who is there?',
 'Friends to this ground.',
 'And liegemen to the Dane.',
 'Give you good night.',
 'O farewell, honest soldier.',
 'Who hath relieved\nyou?',
 'Barnardo hath my place.',
 'Give you good night.',
 'Holla, Barnardo.',
 'Say, what, is Horatio there?',
 'A piece of him.',
 'Welcome, Horatio.',
 'Welcome, good Marcellus.',
 'What, has this thing appeared again tonight?',
 'I have seen nothing.',
 "Horatio says 'tis but our fantasy\nAnd will not let belief take hold of him\nTouch

In [16]:
#tokenize each sentence
tokens = []
#I used this tokenizer because it kept the contractions in tact.
tk = RegexpTokenizer(r"[\w']+")

for s in parsed:
    tokens.append(tk.tokenize(s))

tokens

[["Who's", 'there'],
 ['Nay', 'answer', 'me'],
 ['Stand', 'and', 'unfold', 'yourself'],
 ['Long', 'live', 'the', 'King'],
 ['Barnardo'],
 ['He'],
 ['You', 'come', 'most', 'carefully', 'upon', 'your', 'hour'],
 ["'Tis", 'now', 'struck', 'twelve'],
 ['Get', 'thee', 'to', 'bed', 'Francisco'],
 ['For', 'this', 'relief', 'much', 'thanks'],
 ["'Tis", 'bitter', 'cold', 'And', 'I', 'am', 'sick', 'at', 'heart'],
 ['Have', 'you', 'had', 'quiet', 'guard'],
 ['Not', 'a', 'mouse', 'stirring'],
 ['Well', 'good', 'night'],
 ['If',
  'you',
  'do',
  'meet',
  'Horatio',
  'and',
  'Marcellus',
  'The',
  'rivals',
  'of',
  'my',
  'watch',
  'bid',
  'them',
  'make',
  'haste'],
 ['I', 'think', 'I', 'hear', 'them'],
 ['Stand', 'ho'],
 ['Who', 'is', 'there'],
 ['Friends', 'to', 'this', 'ground'],
 ['And', 'liegemen', 'to', 'the', 'Dane'],
 ['Give', 'you', 'good', 'night'],
 ['O', 'farewell', 'honest', 'soldier'],
 ['Who', 'hath', 'relieved', 'you'],
 ['Barnardo', 'hath', 'my', 'place'],
 ['Give', 'y

In [17]:
#add bigrams from each sentence to a list of lists.
bigrams = []
for s in tokens:
    bigrams.append(list(nltk.bigrams(s)))

bigrams

[[("Who's", 'there')],
 [('Nay', 'answer'), ('answer', 'me')],
 [('Stand', 'and'), ('and', 'unfold'), ('unfold', 'yourself')],
 [('Long', 'live'), ('live', 'the'), ('the', 'King')],
 [],
 [],
 [('You', 'come'),
  ('come', 'most'),
  ('most', 'carefully'),
  ('carefully', 'upon'),
  ('upon', 'your'),
  ('your', 'hour')],
 [("'Tis", 'now'), ('now', 'struck'), ('struck', 'twelve')],
 [('Get', 'thee'), ('thee', 'to'), ('to', 'bed'), ('bed', 'Francisco')],
 [('For', 'this'), ('this', 'relief'), ('relief', 'much'), ('much', 'thanks')],
 [("'Tis", 'bitter'),
  ('bitter', 'cold'),
  ('cold', 'And'),
  ('And', 'I'),
  ('I', 'am'),
  ('am', 'sick'),
  ('sick', 'at'),
  ('at', 'heart')],
 [('Have', 'you'), ('you', 'had'), ('had', 'quiet'), ('quiet', 'guard')],
 [('Not', 'a'), ('a', 'mouse'), ('mouse', 'stirring')],
 [('Well', 'good'), ('good', 'night')],
 [('If', 'you'),
  ('you', 'do'),
  ('do', 'meet'),
  ('meet', 'Horatio'),
  ('Horatio', 'and'),
  ('and', 'Marcellus'),
  ('Marcellus', 'The'),

In [18]:
#tag each word within each bigram with a part of speech
tags = []
for r in bigrams:
    for b in r:
        tags.append(nltk.pos_tag(b))

tags

[[("Who's", 'IN'), ('there', 'EX')],
 [('Nay', 'NNP'), ('answer', 'NN')],
 [('answer', 'IN'), ('me', 'PRP')],
 [('Stand', 'NNP'), ('and', 'CC')],
 [('and', 'CC'), ('unfold', 'JJ')],
 [('unfold', 'JJ'), ('yourself', 'PRP')],
 [('Long', 'RB'), ('live', 'JJ')],
 [('live', 'CD'), ('the', 'DT')],
 [('the', 'DT'), ('King', 'NNP')],
 [('You', 'PRP'), ('come', 'VBP')],
 [('come', 'VB'), ('most', 'JJS')],
 [('most', 'RBS'), ('carefully', 'RB')],
 [('carefully', 'RB'), ('upon', 'IN')],
 [('upon', 'IN'), ('your', 'PRP$')],
 [('your', 'PRP$'), ('hour', 'NN')],
 [("'Tis", 'CD'), ('now', 'RB')],
 [('now', 'RB'), ('struck', 'VBP')],
 [('struck', 'NN'), ('twelve', 'NN')],
 [('Get', 'NNP'), ('thee', 'NN')],
 [('thee', 'NN'), ('to', 'TO')],
 [('to', 'TO'), ('bed', 'VB')],
 [('bed', 'NN'), ('Francisco', 'NNP')],
 [('For', 'IN'), ('this', 'DT')],
 [('this', 'DT'), ('relief', 'NN')],
 [('relief', 'NN'), ('much', 'JJ')],
 [('much', 'JJ'), ('thanks', 'NNS')],
 [("'Tis", 'CD'), ('bitter', 'NN')],
 [('bitter',

In [19]:
#identify bigrams with a noun or pronoun followed by a verb
n_v_bigrams = []

for t in tags:
    #use regex to match noun and pronoun pos tags
    x = re.findall(r'[PN][NR].?.?', t[0][1])
    #use regex to match verb pos tags
    y = re.findall(r'VB.?', t[1][1])
    #add bigrams with a noun or pronoun followed by a verb to a list
    if len(x) == 1 and len(y) == 1:
        n_v_bigrams.append([t[0][0], t[1][0]])

#print(len(n_v_bigrams))
n_v_bigrams

[['You', 'come'],
 ['I', 'am'],
 ['you', 'had'],
 ['mouse', 'stirring'],
 ['you', 'do'],
 ['them', 'make'],
 ['I', 'think'],
 ['I', 'hear'],
 ['you', 'good'],
 ['hath', 'relieved'],
 ['you', 'good'],
 ['thing', 'appeared'],
 ['I', 'have'],
 ['Horatio', 'says'],
 ['belief', 'take'],
 ['him', 'Touching'],
 ['I', 'have'],
 ['apparition', 'come'],
 ['we', 'have'],
 ['nights', 'seen'],
 ['us', 'hear'],
 ["that's", 'westward'],
 ['pole', 'Had'],
 ['Had', 'made'],
 ['it', 'burns'],
 ['it', 'comes'],
 ['It', 'harrows'],
 ['I', 'charge'],
 ['It', 'is'],
 ['it', 'stalks'],
 ['I', 'charge'],
 ['you', 'tremble'],
 ['he', 'had'],
 ['He', 'smote'],
 ['he', 'gone'],
 ['I', 'know'],
 ['Doth', 'make'],
 ['whisper', 'goes'],
 ['us', 'Was'],
 ['you', 'know'],
 ['Thereto', 'pricked'],
 ['pride', 'Dared'],
 ['world', 'esteemed'],
 ['Did', 'slay'],
 ['Did', 'forfeit'],
 ['lands', 'Which'],
 ['he', 'stood'],
 ['stood', 'seized'],
 ['competent', 'Was'],
 ['Was', 'gaged'],
 ['Fortinbras', 'Had'],
 ['he', 'been

In [20]:
#convert list of lists to tuple so I can use counter function
t = map(tuple,n_v_bigrams)

#Put all the words in lower case to ensure proper count of bigrams
t = [(x.lower(), y.lower()) for x,y in t]
c = Counter(t)

c.most_common(20)

[(('it', 'is'), 18),
 (('i', 'am'), 10),
 (('i', 'have'), 7),
 (('i', 'do'), 7),
 (('you', 'are'), 6),
 (('i', 'think'), 5),
 (('he', 'hath'), 5),
 (('you', 'have'), 5),
 (('we', 'have'), 4),
 (('you', 'know'), 4),
 (('it', 'be'), 4),
 (('i', 'pray'), 4),
 (('you', 'do'), 3),
 (('you', 'good'), 3),
 (('it', 'comes'), 3),
 (('i', 'charge'), 3),
 (('i', 'know'), 3),
 (('father', 'lost'), 3),
 (('we', 'do'), 3),
 (('it', 'was'), 3)]

In [21]:
x = c.most_common()

#function to return length of list containing the 10 most common bigrams, i.e. to determine if number 10 has any ties for frequency
def common(x):
    i = 10
    while i in range(10 ,len(x)):
        if x[i][1] == x[9][1]:
            i+=1
        else:
            return i

n = common(x)
#bigrams with the 10 highest frequencies
most_common = c.most_common(n)
most_common


[(('it', 'is'), 18),
 (('i', 'am'), 10),
 (('i', 'have'), 7),
 (('i', 'do'), 7),
 (('you', 'are'), 6),
 (('i', 'think'), 5),
 (('he', 'hath'), 5),
 (('you', 'have'), 5),
 (('we', 'have'), 4),
 (('you', 'know'), 4),
 (('it', 'be'), 4),
 (('i', 'pray'), 4)]

In [22]:
#write top bigrams to .txt file
f = open('bigrams.txt', 'w')
for t in most_common:
    line = ' '.join(str(x) for x in t)
    f.write(line + '\n')
f.close()