Exploring a text corpus

In [2]:
import nltk 
nltk.download('gutenberg',quiet=True) 
nltk.download('stopwords',quiet=True) 
from nltk.corpus import gutenberg, stopwords 
from nltk.collocations import * 
from nltk import FreqDist 
from nltk import word_tokenize 
import string 
import re

In [3]:
# Ask the Gutenberg corpus reader for the ids of the texts it provides.
file_ids = gutenberg.fileids() 
# Bare last expression so the notebook displays the list.
file_ids

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

To load the actual corpus, we need to pass the file id for Macbeth into `gutenberg.raw()`.

In [4]:
# Select Macbeth by its file id rather than by position (file_ids[-2]):
# a positional index silently picks a different text if the corpus
# listing ever changes in order or length.
macbeth_text = gutenberg.raw('shakespeare-macbeth.txt')
# Preview the first 1000 characters as a quick sanity check of the load.
print(macbeth_text[:1000])

[The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lightning. Enter three Witches.

  1. When shall we three meet againe?
In Thunder, Lightning, or in Raine?
  2. When the Hurley-burley's done,
When the Battaile's lost, and wonne

   3. That will be ere the set of Sunne

   1. Where the place?
  2. Vpon the Heath

   3. There to meet with Macbeth

   1. I come, Gray-Malkin

   All. Padock calls anon: faire is foule, and foule is faire,
Houer through the fogge and filthie ayre.

Exeunt.


Scena Secunda.

Alarum within. Enter King Malcome, Donalbaine, Lenox, with
attendants,
meeting a bleeding Captaine.

  King. What bloody man is that? he can report,
As seemeth by his plight, of the Reuolt
The newest state

   Mal. This is the Serieant,
Who like a good and hardie Souldier fought
'Gainst my Captiuitie: Haile braue friend;
Say to the King, the knowledge of the Broyle,
As thou didst leaue it

   Cap. Doubtfull it stood,
As two spent Swimmers, t

In the excerpt above, some words are hyphenated (e.g. "Hurley-burley's"). We will use basic regex tokenization, which splits hyphenated words into individual tokens.

In [5]:
# One token = a run of ASCII letters, optionally followed by an apostrophe
# and lowercase letters (so "Battaile's" stays whole). Hyphens and digits are
# not matched, so hyphenated words split into separate tokens.
# The pattern deliberately has no capturing group: NLTK's regexp tokenizer
# documents that patterns must use non-capturing parentheses only. Since the
# former outer "( ... )" wrapped the entire match, the tokens are unchanged.
pattern = r"[a-zA-Z]+(?:'[a-z]+)?"
macbeth_tokens_raw = nltk.regexp_tokenize(macbeth_text, pattern)

Use a list comprehension to convert every token to lower case.

In [6]:
# Case-fold every raw token so that e.g. "The" and "the" are counted together.
macbeth_tokens = [
    raw_token.lower()
    for raw_token in macbeth_tokens_raw
]

### Frequency distribution

In [7]:
# Count how many times each lowercased token occurs.
macbeth_freqdist = FreqDist(macbeth_tokens) 
# Display the 50 most frequent tokens together with their counts.
macbeth_freqdist.most_common(50)

[('the', 649),
 ('and', 545),
 ('to', 383),
 ('of', 338),
 ('i', 331),
 ('a', 241),
 ('that', 227),
 ('my', 203),
 ('you', 203),
 ('in', 199),
 ('is', 180),
 ('not', 165),
 ('it', 161),
 ('with', 153),
 ('his', 146),
 ('be', 137),
 ('macb', 137),
 ('your', 126),
 ('our', 123),
 ('haue', 122),
 ('but', 120),
 ('me', 113),
 ('he', 110),
 ('for', 109),
 ('what', 106),
 ('this', 104),
 ('all', 99),
 ('so', 96),
 ('him', 90),
 ('as', 89),
 ('thou', 87),
 ('we', 83),
 ('enter', 81),
 ('which', 80),
 ('are', 73),
 ('will', 72),
 ('they', 70),
 ('shall', 68),
 ('no', 67),
 ('then', 63),
 ('macbeth', 62),
 ('their', 62),
 ('thee', 61),
 ('vpon', 58),
 ('on', 58),
 ('macd', 58),
 ('from', 57),
 ('yet', 57),
 ('thy', 56),
 ('vs', 55)]

Print the first 1000 characters of the text to confirm that it loaded correctly.

In [8]:
# Re-read the raw Macbeth text (same text as the earlier load cell).
# The play is selected by its file id rather than by position (file_ids[-2]),
# which would silently pick another text if the corpus listing changed.
macbeth_text = gutenberg.raw('shakespeare-macbeth.txt')
# Preview the first 1000 characters to confirm the text loaded correctly.
print(macbeth_text[:1000])

[The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lightning. Enter three Witches.

  1. When shall we three meet againe?
In Thunder, Lightning, or in Raine?
  2. When the Hurley-burley's done,
When the Battaile's lost, and wonne

   3. That will be ere the set of Sunne

   1. Where the place?
  2. Vpon the Heath

   3. There to meet with Macbeth

   1. I come, Gray-Malkin

   All. Padock calls anon: faire is foule, and foule is faire,
Houer through the fogge and filthie ayre.

Exeunt.


Scena Secunda.

Alarum within. Enter King Malcome, Donalbaine, Lenox, with
attendants,
meeting a bleeding Captaine.

  King. What bloody man is that? he can report,
As seemeth by his plight, of the Reuolt
The newest state

   Mal. This is the Serieant,
Who like a good and hardie Souldier fought
'Gainst my Captiuitie: Haile braue friend;
Say to the King, the knowledge of the Broyle,
As thou didst leaue it

   Cap. Doubtfull it stood,
As two spent Swimmers, t

Note that this basic tokenization splits hyphenated words into individual tokens.

In [9]:
# Token = a run of ASCII letters with an optional apostrophe + lowercase
# suffix; hyphens are not matched, so hyphenated words become separate tokens.
# No capturing group is used: NLTK's regexp tokenizer documents that patterns
# must contain non-capturing parentheses only. The former outer "( ... )"
# wrapped the entire match, so the resulting tokens are unchanged.
pattern = r"[a-zA-Z]+(?:'[a-z]+)?"
macbeth_tokens_raw = nltk.regexp_tokenize(macbeth_text, pattern)

In [10]:
# Everything to drop before counting: NLTK's English stopwords, plus
# punctuation marks and the single digits '0'..'9'.
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
stopwords_list += list(string.digits)

# Testing membership against a list rescans the list for every token;
# a set gives the same answer with a constant-time lookup.
# stopwords_list itself is kept as a list for any code that relies on it.
stopwords_set = set(stopwords_list)

macbeth_words_stopped = [word for word in macbeth_tokens if word not in stopwords_set]

In [11]:
# Recount token frequencies, this time over the stopword-filtered tokens.
macbeth_stopped_freqdist = FreqDist(macbeth_words_stopped)
# Display the 50 most frequent remaining tokens together with their counts.
macbeth_stopped_freqdist.most_common(50)

[('macb', 137),
 ('haue', 122),
 ('thou', 87),
 ('enter', 81),
 ('shall', 68),
 ('macbeth', 62),
 ('thee', 61),
 ('vpon', 58),
 ('macd', 58),
 ('yet', 57),
 ('thy', 56),
 ('vs', 55),
 ('come', 54),
 ('king', 54),
 ('hath', 52),
 ('good', 49),
 ('rosse', 49),
 ('lady', 48),
 ('would', 47),
 ('time', 46),
 ('like', 43),
 ('say', 39),
 ('doe', 38),
 ('lord', 38),
 ('make', 38),
 ('tis', 37),
 ('must', 36),
 ('done', 35),
 ('selfe', 35),
 ('ile', 35),
 ('feare', 35),
 ('let', 35),
 ('man', 34),
 ('wife', 34),
 ('night', 34),
 ('banquo', 34),
 ('well', 33),
 ('know', 33),
 ('one', 32),
 ('great', 31),
 ('see', 31),
 ('may', 31),
 ('exeunt', 30),
 ('speake', 29),
 ('sir', 29),
 ('lenox', 28),
 ('mine', 26),
 ('vp', 26),
 ('th', 26),
 ('mal', 25)]

In [12]:
# Denominator for every share: total number of tokens left after
# stopword removal.
total_word_count = sum(macbeth_stopped_freqdist.values())
macbeth_top_50 = macbeth_stopped_freqdist.most_common(50)

print(f'{"Word":10} Normalized Frequency')
# most_common() yields (token, count) pairs; unpack them instead of
# indexing into a tuple misleadingly named "word".
for word, count in macbeth_top_50:
    normalized_frequency = count / total_word_count
    # Word padded to 10 columns; share centred in 20 columns, 4 significant digits.
    print(f'{word:10} {normalized_frequency:^20.4}')

Word       Normalized Frequency
macb             0.01354       
haue             0.01206       
thou             0.008601      
enter            0.008008      
shall            0.006723      
macbeth          0.00613       
thee             0.006031      
vpon             0.005734      
macd             0.005734      
yet              0.005635      
thy              0.005536      
vs               0.005437      
come             0.005339      
king             0.005339      
hath             0.005141      
good             0.004844      
rosse            0.004844      
lady             0.004745      
would            0.004647      
time             0.004548      
like             0.004251      
say              0.003856      
doe              0.003757      
lord             0.003757      
make             0.003757      
tis              0.003658      
must             0.003559      
done             0.00346       
selfe            0.00346       
ile              0.00346       
feare   

In [13]:
# Container of bigram association measures; its `pmi` measure is used for scoring below.
bigram_measures = nltk.collocations.BigramAssocMeasures() 

In [14]:
# Build a bigram finder over the stopword-filtered token list.
# NOTE(review): since stopwords were removed beforehand, tokens that were
# separated by a stopword in the play presumably count as adjacent here — confirm
# this is the intended notion of "bigram".
macbeth_finder = BigramCollocationFinder.from_words(macbeth_words_stopped)

In [17]:
# Score every bigram found by the finder with the PMI measure.
# NOTE(review): no frequency filter is applied to the finder in the cells shown,
# and the displayed top of the ranking consists of pairs that all share the same
# score (13.304...). PMI tends to favour very rare pairs — consider calling
# macbeth_finder.apply_freq_filter(...) before scoring; TODO confirm intent.
macbeth_scored = macbeth_finder.score_ngrams(bigram_measures.pmi)

In [19]:
# Display the first 50 (bigram, score) pairs of the PMI ranking.
macbeth_scored[:50]

[(('abiure', 'taints'), 13.304208699445645),
 (('abound', 'diuision'), 13.304208699445645),
 (('accounted', 'dangerous'), 13.304208699445645),
 (("accustom'd", 'action'), 13.304208699445645),
 (('acheron', 'meete'), 13.304208699445645),
 (('acts', 'thoght'), 13.304208699445645),
 (('actuall', 'performances'), 13.304208699445645),
 (('adders', 'forke'), 13.304208699445645),
 (("admir'd", 'disorder'), 13.304208699445645),
 (('affection', 'stanchlesse'), 13.304208699445645),
 (('afterwards', 'seale'), 13.304208699445645),
 (('agents', "prey's"), 13.304208699445645),
 (('alarme', 'excite'), 13.304208699445645),
 (("alarum'd", 'centinell'), 13.304208699445645),
 (('alter', 'fauor'), 13.304208699445645),
 (('among', 'guests'), 13.304208699445645),
 (('anger', 'blunt'), 13.304208699445645),
 (('anthonies', 'caesar'), 13.304208699445645),
 (("anticipat'st", 'dread'), 13.304208699445645),
 (('antidote', 'cleanse'), 13.304208699445645),
 (('appeare', 'flying'), 13.304208699445645),
 (('appease',