### Load Packages

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK data packages needed by the cells below (same order as before)
for resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt', 'maxent_ne_chunker'):
    nltk.download(resource)

# English stop words, kept as a set for membership tests
stop_words = set(stopwords.words('english'))

# Kept as the final expression so the cell still echoes the download's return value
nltk.download("words")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Prepare Data

In [2]:
# Sample text used by the tagging cell below
txt = "personally led a rescue operation to free Indonesian crew members taken hostage by pirates in June 2020"

# Split the text into a list of sentences and show the result
tokenized = nltk.sent_tokenize(txt)
print(tokenized)

['personally led a rescue operation to free Indonesian crew members taken hostage by pirates in June 2020']


### String Tokenization And Parts of Speech Tagging

In [3]:
# Process the text one sentence at a time
for token in tokenized:
    # Split the sentence into individual word tokens
    wordsList = nltk.word_tokenize(token)
    # Drop stop words (is, on, and, ...); this is an exact-match, case-sensitive test
    wordsList = [w for w in wordsList if w not in stop_words]
    # Task 1: call NLTK's part-of-speech tagger and pass it wordsList
    # NOTE(review): tagging happens after stop-word removal, so the tagger sees
    # the sentence without those words; tag first if that context matters.
    tagged = nltk.pos_tag(wordsList)

    print(tagged)

[('personally', 'RB'), ('led', 'VBN'), ('rescue', 'NN'), ('operation', 'NN'), ('free', 'JJ'), ('Indonesian', 'NNP'), ('crew', 'NN'), ('members', 'NNS'), ('taken', 'VBN'), ('hostage', 'NN'), ('pirates', 'VBZ'), ('June', 'NNP'), ('2020', 'CD')]


Representations:

CC 	coordinating conjunction
CD 	cardinal number
DT 	determiner
EX 	existential there
FW 	foreign word
IN 	preposition/subordinating conjunction
JJ 	adjective (large)
JJR 	adjective, comparative (larger)
JJS 	adjective, superlative (largest)
LS 	list marker
MD 	modal (could, will)
NN 	noun, singular (cat, tree)
NNS 	noun plural (desks)
NNP 	proper noun, singular (sarah)
NNPS 	proper noun, plural (indians or americans)
PDT 	predeterminer (all, both, half)
POS 	possessive ending (parent's)
PRP 	personal pronoun (hers, herself, him, himself)
PRP$ 	possessive pronoun (her, his, mine, my, our)
RB 	adverb (occasionally, swiftly)
RBR 	adverb, comparative (better, faster)
RBS 	adverb, superlative (best, most)
RP 	particle (about)
TO 	infinitive marker (to)
UH 	interjection (goodbye)
VB 	verb (ask)
VBG 	verb gerund (judging)
VBD 	verb past tense (pleaded)
VBN 	verb past participle (reunified)
VBP 	verb, present tense, not 3rd person singular (wrap)
VBZ 	verb, present tense, 3rd person singular (bases)
WDT 	wh-determiner (that, what)
WP 	wh- pronoun (who)
WRB 	wh- adverb (how) 


### Named Entity Recognition

**Named Entity Recognition** (NER) is a standard NLP problem which involves spotting named entities (people, places, organizations etc.) from a chunk of text, and classifying them into a predefined set of categories.

Refer: https://www.nltk.org/book/ch07.html

In [4]:
# Example text to scan for named entities
sentence = "personally led a rescue operation to free Indonesian crew members taken hostage by pirates in June 2021"

# Handle the text one sentence at a time
for sent_text in nltk.sent_tokenize(sentence):
    # Tokenize, POS-tag, then chunk the tagged words
    tokens = nltk.word_tokenize(sent_text)
    tagged_tokens = nltk.pos_tag(tokens)
    for node in nltk.ne_chunk(tagged_tokens):
        # Only chunks recognised as named entities carry a `label` attribute
        if not hasattr(node, 'label'):
            continue
        entity_text = ' '.join(leaf[0] for leaf in node)
        print(node.label(), entity_text)

GPE Indonesian


***NE Type 	Examples***
ORGANIZATION 	Georgia-Pacific Corp., WHO
PERSON 	Eddy Bonte, President Obama
LOCATION 	Murray River, Mount Everest
DATE 	June, 2008-06-29
TIME 	two fifty a m, 1:30 p.m.
MONEY 	175 million Canadian Dollars, GBP 10.40
PERCENT 	twenty pct, 18.75 %
FACILITY 	Washington Monument, Stonehenge
GPE 	South East Asia, Midlothian

In [5]:
# Task 2 
# List a few named entities that you can think of.
# Check whether those entities are captured by the nltk library by passing a sentence containing each named entity to the code above.

In [6]:
def print_named_entities(text):
    """Print every named entity that NLTK's chunker finds in ``text``.

    The text is split into sentences; each sentence is word-tokenized,
    POS-tagged and passed to ``nltk.ne_chunk``. For every resulting chunk
    that carries a label, one line of the form ``<LABEL> <words>`` is printed.

    Args:
        text: The raw text (one or more sentences) to scan.
    """
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Only chunks recognised as named entities carry a `label` attribute
            if hasattr(chunk, 'label'):
                print(chunk.label(), ' '.join(c[0] for c in chunk))


# Two longer example paragraphs for named entity recognition
sentences = "Donald John Trump is an American media personality and businessman who served as the 45th president of the United States from 2017 to 2021. Born and raised in Queens, New York City, Trump attended Fordham University and the University of Pennsylvania, graduating with a bachelor's degree in 1968."
sentence1 = "Barack Hussein Obama Jr. is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, Obama was the first African-American president of the United States."

print("\nSentence 1:\n")
print_named_entities(sentences)

print("\nSentence 2:\n")
print_named_entities(sentence1)



Sentence 1:

PERSON Donald
PERSON John Trump
GPE American
GPE United States
GPE Born
GPE Queens
GPE New York City
PERSON Trump
ORGANIZATION Fordham University
ORGANIZATION University
GPE Pennsylvania

Sentence 2:

PERSON Barack
PERSON Hussein Obama
GPE American
GPE United States
ORGANIZATION Democratic Party
PERSON Obama
GPE United States


In [9]:
from nltk.parse.stanford import StanfordDependencyParser

# Download the parser from https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip,
# extract the archive, and point the two paths below at the extracted jars.
# Path to stanford-parser.jar
path_jar = "./stanford-parser.jar"
# Path to stanford-parser-4.2.0-models.jar
path_models_jar = "./stanford-parser-4.2.0-models.jar"

dep_parser = StanfordDependencyParser(path_to_jar=path_jar, path_to_models_jar=path_models_jar)
# raw_parse yields parses lazily; take the first one with the built-in next()
result = dep_parser.raw_parse("I saw an elephant in my sleep")
dependency = next(result)

# Print the parser's (head, relation, dependent) triples
print(list(dependency.triples()))

[(('saw', 'VBD'), 'nsubj', ('I', 'PRP')), (('saw', 'VBD'), 'obj', ('elephant', 'NN')), (('elephant', 'NN'), 'det', ('an', 'DT')), (('saw', 'VBD'), 'obl', ('sleep', 'NN')), (('sleep', 'NN'), 'case', ('in', 'IN')), (('sleep', 'NN'), 'nmod:poss', ('my', 'PRP$'))]
