### Load Packages

In [None]:
import nltk

# Fetch the NLTK resources used by the tagging / chunking cells below.
# Each call is a no-op (apart from a log line) when the package is already present.
for resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt', 'maxent_ne_chunker'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Word list resource, downloaded last; the call's return value is the cell's displayed result.
nltk.download("words")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kiran/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/kiran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kiran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/kiran/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/kiran/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Prepare Data

In [None]:
# Sample news text used by the tokenization and POS-tagging cells.
txt = (
    "Heavy rain continues to lash parts of western Maharashtra, "
    "Raigad and Thane districts with more than 60 barrages "
    "submerged in Kolhapur district"
)

# Split the text into a list of sentences and show the result.
tokenized = sent_tokenize(txt)
print(tokenized)

['Heavy rain continues to lash parts of western Maharashtra, Raigad and Thane districts with more than 60 barrages submerged in Kolhapur district']


### String Tokenization And Parts of Speech Tagging

In [None]:
# Passing sentence by sentence in the for loop
for i in tokenized:
    # Tokenizing the sentence
    wordsList = nltk.word_tokenize(i)
    # Removing stop words like is, on, and, ...
    # The membership test is case-sensitive: a token is dropped only if it
    # appears in stop_words exactly as written.
    wordsList = [w for w in wordsList if w not in stop_words]
    # Task 1: Parts of Speech tagging.
    # nltk.pos_tag takes the list of tokens and returns a list of
    # (token, tag) tuples; see the tag legend below the output.
    tagged = nltk.pos_tag(wordsList)

    print(tagged)

[('Heavy', 'NNP'), ('rain', 'NN'), ('continues', 'VBZ'), ('lash', 'JJ'), ('parts', 'NNS'), ('western', 'JJ'), ('Maharashtra', 'NNP'), (',', ','), ('Raigad', 'NNP'), ('Thane', 'NNP'), ('districts', 'NNS'), ('60', 'CD'), ('barrages', 'NNS'), ('submerged', 'VBN'), ('Kolhapur', 'NNP'), ('district', 'NN')]


Representations:

CC 	coordinating conjunction
CD 	cardinal number
DT 	determiner
EX 	existential there
FW 	foreign word
IN 	preposition/subordinating conjunction
JJ 	adjective (large)
JJR 	adjective, comparative (larger)
JJS 	adjective, superlative (largest)
LS 	list marker
MD 	modal (could, will)
NN 	noun, singular (cat, tree)
NNS 	noun plural (desks)
NNP 	proper noun, singular (sarah)
NNPS 	proper noun, plural (indians or americans)
PDT 	predeterminer (all, both, half)
POS 	possessive ending (parent's)
PRP 	personal pronoun (hers, herself, him, himself)
PRP$ 	possessive pronoun (her, his, mine, my, our)
RB 	adverb (occasionally, swiftly)
RBR 	adverb, comparative (greater)
RBS 	adverb, superlative (biggest)
RP 	particle (about)
TO 	infinitive marker (to)
UH 	interjection (goodbye)
VB 	verb (ask)
VBG 	verb gerund (judging)
VBD 	verb past tense (pleaded)
VBN 	verb past participle (reunified)
VBP 	verb, present tense not 3rd person singular (wrap)
VBZ 	verb, present tense with 3rd person singular (bases)
WDT 	wh-determiner (that, what)
WP 	wh- pronoun (who)
WRB 	wh- adverb (how) 


### Named Entity Recognition

**Named Entity Recognition** (NER) is a standard NLP problem which involves spotting named entities (people, places, organizations etc.) from a chunk of text, and classifying them into a predefined set of categories.

Refer: https://www.nltk.org/book/ch07.html

In [None]:
# Sample sentence
sentence = "Heavy rain continues to lash parts of western Maharashtra districts with more than 60 barrages submerged in Kolhapur district."

# Process the text one sentence at a time
for sent in nltk.sent_tokenize(sentence):
    # Tokenize, POS-tag, then chunk: the chunker works on (word, tag) pairs
    sent_tokens = nltk.word_tokenize(sent)
    sent_tagged = nltk.pos_tag(sent_tokens)
    for chunk in nltk.ne_chunk(sent_tagged):
        # Only chunks identified as a Named Entity carry a `label` attribute;
        # everything else is skipped
        if not hasattr(chunk, 'label'):
            continue
        # Join the words of the entity (first item of each leaf) into one string
        entity_words = [leaf[0] for leaf in chunk]
        print(chunk.label(), ' '.join(entity_words))

GPE Heavy
GPE Kolhapur


***NE Type 	Examples***
ORGANIZATION 	Georgia-Pacific Corp., WHO
PERSON 	Eddy Bonte, President Obama
LOCATION 	Murray River, Mount Everest
DATE 	June, 2008-06-29
TIME 	two fifty a m, 1:30 p.m.
MONEY 	175 million Canadian Dollars, GBP 10.40
PERCENT 	twenty pct, 18.75 %
FACILITY 	Washington Monument, Stonehenge
GPE 	South East Asia, Midlothian

In [None]:
# Task 2 
# List a few named entities that you can think of.
# Check whether those entities are captured by the nltk library by passing a sentence containing each named entity to the code above.

In [None]:
# Example for other Parsers

In [None]:
import os

from nltk.parse.stanford import StanfordDependencyParser
# NOTE(review): recent NLTK releases appear to mark StanfordDependencyParser as
# deprecated in favour of nltk.parse.corenlp.CoreNLPDependencyParser — confirm
# against the installed NLTK version.

# Download the parser from https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip
# and extract it. Point the STANFORD_PARSER_DIR environment variable at the
# extracted "stanford-parser-full-2020-11-17" folder; when the variable is not
# set, the original author's local path is used as the fallback.
stanford_parser_dir = os.environ.get(
    "STANFORD_PARSER_DIR",
    "/home/kiran/ta/nlp/nlp_assignment/Word2vec_MB/stanford-parser-4.2.0/stanford-parser-full-2020-11-17",
)
# Link stanford-parser.jar
path_jar = os.path.join(stanford_parser_dir, "stanford-parser.jar")
# Link stanford-parser-4.2.0-models.jar
path_models_jar = os.path.join(stanford_parser_dir, "stanford-parser-4.2.0-models.jar")

dep_parser = StanfordDependencyParser(path_to_jar = path_jar, path_to_models_jar = path_models_jar)

# raw_parse returns an iterator of parses; take the first one.
result = dep_parser.raw_parse("I saw an elephant in my sleep")
dependency = next(result)

# Print the results of the parser as (head, relation, dependent) triples
print(list(dependency.triples()))

[(('saw', 'VBD'), 'nsubj', ('I', 'PRP')),
 (('saw', 'VBD'), 'obj', ('elephant', 'NN')),
 (('elephant', 'NN'), 'det', ('an', 'DT')),
 (('saw', 'VBD'), 'obl', ('sleep', 'NN')),
 (('sleep', 'NN'), 'case', ('in', 'IN')),
 (('sleep', 'NN'), 'nmod:poss', ('my', 'PRP$'))]