In [1]:
import re
import string
import os
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk

# Getting the string from the corpus

In [2]:
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
hamlet

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

In [3]:
input_str = ""
for word in hamlet[:500]:
    input_str = input_str + word + " "
input_str

"[ The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar

# Convert Text to Lowercase

In [4]:
input_str = input_str.lower()
input_str

"[ the tragedie of hamlet by william shakespeare 1599 ] actus primus . scoena prima . enter barnardo and francisco two centinels . barnardo . who ' s there ? fran . nay answer me : stand & vnfold your selfe bar . long liue the king fran . barnardo ? bar . he fran . you come most carefully vpon your houre bar . ' tis now strook twelue , get thee to bed francisco fran . for this releefe much thankes : ' tis bitter cold , and i am sicke at heart barn . haue you had quiet guard ? fran . not a mouse stirring barn . well , goodnight . if you do meet horatio and marcellus , the riuals of my watch , bid them make hast . enter horatio and marcellus . fran . i thinke i heare them . stand : who ' s there ? hor . friends to this ground mar . and leige - men to the dane fran . giue you good night mar . o farwel honest soldier , who hath relieu ' d you ? fra . barnardo ha ' s my place : giue you goodnight . exit fran . mar . holla barnardo bar . say , what is horatio there ? hor . a peece of him bar

# Removing Numbers

In [5]:
input_str = re.sub(r'\d+', '', input_str)
input_str

"[ the tragedie of hamlet by william shakespeare  ] actus primus . scoena prima . enter barnardo and francisco two centinels . barnardo . who ' s there ? fran . nay answer me : stand & vnfold your selfe bar . long liue the king fran . barnardo ? bar . he fran . you come most carefully vpon your houre bar . ' tis now strook twelue , get thee to bed francisco fran . for this releefe much thankes : ' tis bitter cold , and i am sicke at heart barn . haue you had quiet guard ? fran . not a mouse stirring barn . well , goodnight . if you do meet horatio and marcellus , the riuals of my watch , bid them make hast . enter horatio and marcellus . fran . i thinke i heare them . stand : who ' s there ? hor . friends to this ground mar . and leige - men to the dane fran . giue you good night mar . o farwel honest soldier , who hath relieu ' d you ? fra . barnardo ha ' s my place : giue you goodnight . exit fran . mar . holla barnardo bar . say , what is horatio there ? hor . a peece of him bar . w

# The following code removes this set of symbols # [!”$%&’()*+,-./:;<=>?@[]^_`{|}~]: 

In [6]:
table = str.maketrans({key: None for key in string.punctuation})
input_str = input_str.translate(table) 
input_str

' the tragedie of hamlet by william shakespeare   actus primus  scoena prima  enter barnardo and francisco two centinels  barnardo  who  s there  fran  nay answer me  stand  vnfold your selfe bar  long liue the king fran  barnardo  bar  he fran  you come most carefully vpon your houre bar   tis now strook twelue  get thee to bed francisco fran  for this releefe much thankes   tis bitter cold  and i am sicke at heart barn  haue you had quiet guard  fran  not a mouse stirring barn  well  goodnight  if you do meet horatio and marcellus  the riuals of my watch  bid them make hast  enter horatio and marcellus  fran  i thinke i heare them  stand  who  s there  hor  friends to this ground mar  and leige  men to the dane fran  giue you good night mar  o farwel honest soldier  who hath relieu  d you  fra  barnardo ha  s my place  giue you goodnight  exit fran  mar  holla barnardo bar  say  what is horatio there  hor  a peece of him bar  welcome horatio  welcome good marcellus mar  what  ha  s t

# Whitespaces Removal

In [7]:
input_str = input_str.strip()
input_str

'the tragedie of hamlet by william shakespeare   actus primus  scoena prima  enter barnardo and francisco two centinels  barnardo  who  s there  fran  nay answer me  stand  vnfold your selfe bar  long liue the king fran  barnardo  bar  he fran  you come most carefully vpon your houre bar   tis now strook twelue  get thee to bed francisco fran  for this releefe much thankes   tis bitter cold  and i am sicke at heart barn  haue you had quiet guard  fran  not a mouse stirring barn  well  goodnight  if you do meet horatio and marcellus  the riuals of my watch  bid them make hast  enter horatio and marcellus  fran  i thinke i heare them  stand  who  s there  hor  friends to this ground mar  and leige  men to the dane fran  giue you good night mar  o farwel honest soldier  who hath relieu  d you  fra  barnardo ha  s my place  giue you goodnight  exit fran  mar  holla barnardo bar  say  what is horatio there  hor  a peece of him bar  welcome horatio  welcome good marcellus mar  what  ha  s th

# TOKENIZATION
<b>Tokenization is the process of splitting the given text into smaller pieces called tokens. Words, numbers, punctuation marks, and others can be considered as tokens.</b>


In [8]:
input_str_tokens = word_tokenize(input_str)
input_str_tokens

['the',
 'tragedie',
 'of',
 'hamlet',
 'by',
 'william',
 'shakespeare',
 'actus',
 'primus',
 'scoena',
 'prima',
 'enter',
 'barnardo',
 'and',
 'francisco',
 'two',
 'centinels',
 'barnardo',
 'who',
 's',
 'there',
 'fran',
 'nay',
 'answer',
 'me',
 'stand',
 'vnfold',
 'your',
 'selfe',
 'bar',
 'long',
 'liue',
 'the',
 'king',
 'fran',
 'barnardo',
 'bar',
 'he',
 'fran',
 'you',
 'come',
 'most',
 'carefully',
 'vpon',
 'your',
 'houre',
 'bar',
 'tis',
 'now',
 'strook',
 'twelue',
 'get',
 'thee',
 'to',
 'bed',
 'francisco',
 'fran',
 'for',
 'this',
 'releefe',
 'much',
 'thankes',
 'tis',
 'bitter',
 'cold',
 'and',
 'i',
 'am',
 'sicke',
 'at',
 'heart',
 'barn',
 'haue',
 'you',
 'had',
 'quiet',
 'guard',
 'fran',
 'not',
 'a',
 'mouse',
 'stirring',
 'barn',
 'well',
 'goodnight',
 'if',
 'you',
 'do',
 'meet',
 'horatio',
 'and',
 'marcellus',
 'the',
 'riuals',
 'of',
 'my',
 'watch',
 'bid',
 'them',
 'make',
 'hast',
 'enter',
 'horatio',
 'and',
 'marcellus',
 '

In [9]:
len(input_str_tokens)

391

# REMOVE STOP WORDS
<b>Stop words are the most common words in a language like “the”, “a”, “on”, “is”, “all”. These words do not carry important meaning and are usually removed from texts.</b>

In [10]:
stop_words = set(stopwords.words('english'))
input_str_tokens = [i for i in input_str_tokens if not i in stop_words]
input_str_tokens

['tragedie',
 'hamlet',
 'william',
 'shakespeare',
 'actus',
 'primus',
 'scoena',
 'prima',
 'enter',
 'barnardo',
 'francisco',
 'two',
 'centinels',
 'barnardo',
 'fran',
 'nay',
 'answer',
 'stand',
 'vnfold',
 'selfe',
 'bar',
 'long',
 'liue',
 'king',
 'fran',
 'barnardo',
 'bar',
 'fran',
 'come',
 'carefully',
 'vpon',
 'houre',
 'bar',
 'tis',
 'strook',
 'twelue',
 'get',
 'thee',
 'bed',
 'francisco',
 'fran',
 'releefe',
 'much',
 'thankes',
 'tis',
 'bitter',
 'cold',
 'sicke',
 'heart',
 'barn',
 'haue',
 'quiet',
 'guard',
 'fran',
 'mouse',
 'stirring',
 'barn',
 'well',
 'goodnight',
 'meet',
 'horatio',
 'marcellus',
 'riuals',
 'watch',
 'bid',
 'make',
 'hast',
 'enter',
 'horatio',
 'marcellus',
 'fran',
 'thinke',
 'heare',
 'stand',
 'hor',
 'friends',
 'ground',
 'mar',
 'leige',
 'men',
 'dane',
 'fran',
 'giue',
 'good',
 'night',
 'mar',
 'farwel',
 'honest',
 'soldier',
 'hath',
 'relieu',
 'fra',
 'barnardo',
 'ha',
 'place',
 'giue',
 'goodnight',
 'exit

In [11]:
len(input_str_tokens)

238

# Stemming
<b>Stemming is a process of reducing words to their word stem, base or root form (for example, books — book, looked — look). </b>

In [12]:
stemmer= PorterStemmer()
for word in input_str_tokens:
    print(stemmer.stem(word))

tragedi
hamlet
william
shakespear
actu
primu
scoena
prima
enter
barnardo
francisco
two
centinel
barnardo
fran
nay
answer
stand
vnfold
self
bar
long
liue
king
fran
barnardo
bar
fran
come
care
vpon
hour
bar
ti
strook
twelu
get
thee
bed
francisco
fran
releef
much
thank
ti
bitter
cold
sick
heart
barn
haue
quiet
guard
fran
mous
stir
barn
well
goodnight
meet
horatio
marcellu
riual
watch
bid
make
hast
enter
horatio
marcellu
fran
think
hear
stand
hor
friend
ground
mar
leig
men
dane
fran
giue
good
night
mar
farwel
honest
soldier
hath
relieu
fra
barnardo
ha
place
giue
goodnight
exit
fran
mar
holla
barnardo
bar
say
horatio
hor
peec
bar
welcom
horatio
welcom
good
marcellu
mar
ha
thing
appear
again
night
bar
haue
seen
noth
mar
horatio
sai
ti
fantasi
let
beleef
take
hold
touch
dread
sight
twice
seen
vs
therefor
haue
intreat
along
vs
watch
minut
night
again
apparit
come
may
approu
eye
speak
hor
tush
tush
twill
appear
bar
sit
down
let
vs
again
assail
ear
fortifi
stori
two
night
haue
seen
hor
well
sit


# Lemmatization

<b>The aim of lemmatization, like stemming, is to reduce inflectional forms to a common base form. As opposed to stemming, lemmatization does not simply chop off inflections. Instead it uses lexical knowledge bases to get the correct base forms of words.</b>


In [13]:
lemmatizer=WordNetLemmatizer()
for word in input_str_tokens:
    print(lemmatizer.lemmatize(word))

tragedie
hamlet
william
shakespeare
actus
primus
scoena
prima
enter
barnardo
francisco
two
centinels
barnardo
fran
nay
answer
stand
vnfold
selfe
bar
long
liue
king
fran
barnardo
bar
fran
come
carefully
vpon
houre
bar
ti
strook
twelue
get
thee
bed
francisco
fran
releefe
much
thankes
ti
bitter
cold
sicke
heart
barn
haue
quiet
guard
fran
mouse
stirring
barn
well
goodnight
meet
horatio
marcellus
riuals
watch
bid
make
hast
enter
horatio
marcellus
fran
thinke
heare
stand
hor
friend
ground
mar
leige
men
dane
fran
giue
good
night
mar
farwel
honest
soldier
hath
relieu
fra
barnardo
ha
place
giue
goodnight
exit
fran
mar
holla
barnardo
bar
say
horatio
hor
peece
bar
welcome
horatio
welcome
good
marcellus
mar
ha
thing
appear
againe
night
bar
haue
seene
nothing
mar
horatio
say
ti
fantasie
let
beleefe
take
hold
touching
dreaded
sight
twice
seene
v
therefore
haue
intreated
along
v
watch
minute
night
againe
apparition
come
may
approue
eye
speake
hor
tush
tush
twill
appeare
bar
sit
downe
let
v
againe
ass

# Part of speech tagging (POS)

<b>Part-of-speech tagging aims to assign parts of speech to each word of a given text (such as nouns, verbs, adjectives, and others) based on its definition and its context.</b>


In [15]:
tagged = nltk.pos_tag(input_str_tokens)
tagged

[('tragedie', 'JJ'),
 ('hamlet', 'NN'),
 ('william', 'NN'),
 ('shakespeare', 'NN'),
 ('actus', 'NN'),
 ('primus', 'NN'),
 ('scoena', 'NN'),
 ('prima', 'NN'),
 ('enter', 'NN'),
 ('barnardo', 'VBP'),
 ('francisco', 'NN'),
 ('two', 'CD'),
 ('centinels', 'NNS'),
 ('barnardo', 'VBP'),
 ('fran', 'JJ'),
 ('nay', 'RB'),
 ('answer', 'VBP'),
 ('stand', 'VBP'),
 ('vnfold', 'JJ'),
 ('selfe', 'JJ'),
 ('bar', 'NN'),
 ('long', 'RB'),
 ('liue', 'JJ'),
 ('king', 'NN'),
 ('fran', 'NN'),
 ('barnardo', 'NN'),
 ('bar', 'NN'),
 ('fran', 'NN'),
 ('come', 'VBP'),
 ('carefully', 'RB'),
 ('vpon', 'JJ'),
 ('houre', 'NN'),
 ('bar', 'NN'),
 ('tis', 'NN'),
 ('strook', 'NN'),
 ('twelue', 'NN'),
 ('get', 'VB'),
 ('thee', 'JJ'),
 ('bed', 'NN'),
 ('francisco', 'NN'),
 ('fran', 'VBD'),
 ('releefe', 'RB'),
 ('much', 'JJ'),
 ('thankes', 'NNS'),
 ('tis', 'VBP'),
 ('bitter', 'JJ'),
 ('cold', 'JJ'),
 ('sicke', 'NN'),
 ('heart', 'NN'),
 ('barn', 'NN'),
 ('haue', 'NN'),
 ('quiet', 'JJ'),
 ('guard', 'NN'),
 ('fran', 'NN'),
 ('m

# Named entity recognition

<b>Named-entity recognition (NER) aims to find named entities in text and classify them into pre-defined categories (names of persons, locations, organizations, times, etc.).</b>


In [17]:
print(ne_chunk(pos_tag(input_str_tokens)))

(S
  tragedie/JJ
  hamlet/NN
  william/NN
  shakespeare/NN
  actus/NN
  primus/NN
  scoena/NN
  prima/NN
  enter/NN
  barnardo/VBP
  francisco/NN
  two/CD
  centinels/NNS
  barnardo/VBP
  fran/JJ
  nay/RB
  answer/VBP
  stand/VBP
  vnfold/JJ
  selfe/JJ
  bar/NN
  long/RB
  liue/JJ
  king/NN
  fran/NN
  barnardo/NN
  bar/NN
  fran/NN
  come/VBP
  carefully/RB
  vpon/JJ
  houre/NN
  bar/NN
  tis/NN
  strook/NN
  twelue/NN
  get/VB
  thee/JJ
  bed/NN
  francisco/NN
  fran/VBD
  releefe/RB
  much/JJ
  thankes/NNS
  tis/VBP
  bitter/JJ
  cold/JJ
  sicke/NN
  heart/NN
  barn/NN
  haue/NN
  quiet/JJ
  guard/NN
  fran/NN
  mouse/NN
  stirring/VBG
  barn/NN
  well/RB
  goodnight/JJ
  meet/NN
  horatio/NN
  marcellus/JJ
  riuals/NNS
  watch/VBP
  bid/NN
  make/VBP
  hast/NN
  enter/NN
  horatio/NN
  marcellus/NN
  fran/NN
  thinke/NN
  heare/NN
  stand/VBP
  hor/NN
  friends/NNS
  ground/NN
  mar/FW
  leige/FW
  men/NNS
  dane/VBP
  fran/JJ
  giue/NN
  good/JJ
  night/NN
  mar/NN
  farwel/NN
  h