<a href="https://colab.research.google.com/github/vijender412/Colab/blob/master/UseofStandfordNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stanford NLP Implementation with Python



References 


1.   https://www.analyticsvidhya.com/blog/2019/02/stanfordnlp-nlp-library-python/
2.   https://stanfordnlp.github.io/CoreNLP/
3.   https://textblob.readthedocs.io/en/dev/


---
Contact for any help @ Vijender Singh

# Setting up StanfordNLP in Python

In [2]:
!pip install stanfordnlp

Collecting stanfordnlp
[?25l  Downloading https://files.pythonhosted.org/packages/c6/53/86245cebb380fb5f7f5e16eccfe78afed8c3c2c7ef218331cbcafce2be18/stanfordnlp-0.1.2-py3-none-any.whl (135kB)
[K    100% |████████████████████████████████| 143kB 10.6MB/s 
Installing collected packages: stanfordnlp
Successfully installed stanfordnlp-0.1.2


# Importing StanfordNLP

In [0]:
import stanfordnlp

# Using StanfordNLP to Perform Basic NLP Tasks

In [4]:
stanfordnlp.download('en')   # This downloads the English models for the neural pipeline
nlp = stanfordnlp.Pipeline() # This sets up a default neural pipeline in English

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
Y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /root/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 1.96G/1.96G [01:10<00:00, 16.1MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.
Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources

# Dependency Extraction

In [28]:
my_str = "Hello my name is vijender. I am Swimming "
doc = nlp(my_str)
doc.sentences[0].print_dependencies()

('Hello', '5', 'discourse')
('my', '3', 'nmod:poss')
('name', '5', 'nsubj')
('is', '5', 'cop')
('vijender', '0', 'root')
('.', '5', 'punct')


#Tokenization

In [16]:
doc.sentences[0].print_tokens()

<Token index=1;words=[<Word index=1;text=Hello;lemma=hello;upos=INTJ;xpos=UH;feats=_;governor=5;dependency_relation=discourse>]>
<Token index=2;words=[<Word index=2;text=my;lemma=my;upos=PRON;xpos=PRP$;feats=Number=Sing|Person=1|Poss=Yes|PronType=Prs;governor=3;dependency_relation=nmod:poss>]>
<Token index=3;words=[<Word index=3;text=name;lemma=name;upos=NOUN;xpos=NN;feats=Number=Sing;governor=5;dependency_relation=nsubj>]>
<Token index=4;words=[<Word index=4;text=is;lemma=be;upos=AUX;xpos=VBZ;feats=Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin;governor=5;dependency_relation=cop>]>
<Token index=5;words=[<Word index=5;text=vijender;lemma=vijender;upos=NOUN;xpos=NN;feats=Number=Sing;governor=0;dependency_relation=root>]>
<Token index=6;words=[<Word index=6;text=.;lemma=.;upos=PUNCT;xpos=.;feats=_;governor=5;dependency_relation=punct>]>


# Lemmatization

In [17]:
import pandas as pd

#extract lemma
def extract_lemma(doc):
    parsed_text = {'word':[], 'lemma':[]}
    for sent in doc.sentences:
        for wrd in sent.words:
            #extract text and lemma
            parsed_text['word'].append(wrd.text)
            parsed_text['lemma'].append(wrd.lemma)
    #return a dataframe
    return pd.DataFrame(parsed_text)

#call the function on doc
extract_lemma(doc)

Unnamed: 0,lemma,word
0,hello,Hello
1,my,my
2,name,name
3,be,is
4,vijender,vijender
5,.,.
6,I,I
7,be,am
8,swim,Swimming


# Parts of Speech (PoS) Tagging

In [41]:
#dictionary that contains pos tags and their explanations
pos_dict = {
'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner',
'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
'FW': 'foreign word','IN':  'preposition/subordinating conjunction','JJ': 'adjective \'big\'',
'JJR': 'adjective, comparative \'bigger\'','JJS': 'adjective, superlative \'biggest\'',
'LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular \'desk\'',
'NNS': 'noun plural \'desks\'','NNP': 'proper noun, singular \'Harrison\'',
'NNPS': 'proper noun, plural \'Americans\'','PDT': 'predeterminer \'all the kids\'',
'POS': 'possessive ending parent\'s','PRP': 'personal pronoun I, he, she',
'PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better','RBS': 'adverb, superlative best',
'RP': 'particle give up','TO': 'to go \'to\' the store.','UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take','VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when','QF' : 'quantifier, bahut, thoda, kam (Hindi)','VM' : 'main verb',
'PSP' : 'postposition, common in indian langs','DEM' : 'demonstrative, common in indian langs'
}

#extract parts of speech
def extract_pos(doc):
    parsed_text = {'word':[], 'pos':[], 'exp':[]}
    for sent in doc.sentences:
        for wrd in sent.words:
            if wrd.pos in pos_dict.keys():
                pos_exp = pos_dict[wrd.pos]
            else:
                pos_exp = 'NA'
            parsed_text['word'].append(wrd.text)
            parsed_text['pos'].append(wrd.pos)
            parsed_text['exp'].append(pos_exp)
    #return a dataframe of pos and text
    return pd.DataFrame(parsed_text)

#extract pos
extract_pos(doc)

Unnamed: 0,exp,pos,word
0,interjection errrrrrrrm,UH,Hello
1,"possessive pronoun my, his, hers",PRP$,my
2,"noun, singular 'desk'",NN,name
3,"verb, 3rd person sing. present takes",VBZ,is
4,"noun, singular 'desk'",NN,vijender
5,,.,.
6,"personal pronoun I, he, she",PRP,I
7,"verb, sing. present, non-3d take",VBP,am
8,"noun, singular 'desk'",NN,Swimming


# **Processing text in Hindi (Devanagari Script)**

In [36]:
stanfordnlp.download('hi')

Using the default treebank "hi_hdtb" for language "hi".
Would you like to download the models for: hi_hdtb now? (Y/n)
Y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: hi_hdtb
Download location: /root/stanfordnlp_resources/hi_hdtb_models.zip


100%|██████████| 208M/208M [00:15<00:00, 13.0MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/hi_hdtb_models.zip
Extracting models file for: hi_hdtb
Cleaning up...Done.


In [0]:
my_str2 = """केंद्र की मोदी सरकार ने शुक्रवार को अपना अंतरिम बजट पेश किया. कार्यवाहक वित्त मंत्री पीयूष गोयल ने अपने बजट में किसान, मजदूर, करदाता, महिला वर्ग समेत हर किसी के लिए बंपर ऐलान किए. हालांकि, बजट के बाद भी टैक्स को लेकर काफी कन्फ्यूजन बना रहा. केंद्र सरकार के इस अंतरिम बजट क्या खास रहा और किसको क्या मिला, आसान भाषा में यहां समझें"""
hindi_doc = nlp(my_str2)

In [38]:
extract_pos(hindi_doc)

Unnamed: 0,exp,pos,word
0,"proper noun, singular 'Harrison'",NNP,केंद्र
1,"proper noun, singular 'Harrison'",NNP,की
2,"proper noun, singular 'Harrison'",NNP,मोदी
3,"proper noun, singular 'Harrison'",NNP,सरकार
4,preposition/subordinating conjunction,IN,ने
5,"proper noun, singular 'Harrison'",NNP,शुक्रवार
6,preposition/subordinating conjunction,IN,को
7,"proper noun, singular 'Harrison'",NNP,अपना
8,"proper noun, singular 'Harrison'",NNP,अंतरिम
9,"proper noun, singular 'Harrison'",NNP,बजट


# Sentiment Analysis using TextBlob

In [0]:
from textblob import TextBlob
def sentiment(message):
    # create TextBlob object of passed tweet text
    analysis = TextBlob(message)
    # set sentiment
    return (analysis.sentiment.polarity)
sentiment(my_str)

0.0

# Sentiment Analysis with Machine Learning Classifier

In [2]:
# Custom training
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
train = [
        ('I love samosa', 'pos'),
        ('Mumbai is not an amazing place!', 'neg'),
        ('Delhi is amazing place', 'pos'),
        ('I do not like this bar', 'neg'),
        ("I can't deal with this", 'neg'),
        ("My wife is horrible.", "neg")
]
cl = NaiveBayesClassifier(train)
# cl.classify("I feel amazing!")
'pos'
blob = TextBlob("The weather is good. But the pollution is horrible.", classifier=cl)
for s in blob.sentences:
  print(s)
  print(s.classify())


The weather is good.
pos
But the pollution is horrible.
neg


# Thanks Bye

In [9]:
temp = input("Enter your text: ")

Enter your text: i am feeling awesome


In [12]:
blob = TextBlob(temp, classifier=cl)
result = blob.classify()
# print(result)
if result=='pos':
  print("Great to hear that you have a good mood")
else:
  print("Sad to hear that you have a bad mood")

Great to hear that you have a good mood


# Text Blob with Sentiment Analysis - Beta

In [17]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [33]:
from textblob import TextBlob

# text = '''
# Hey my name is Vijender Singh. I am having a great day. I believe python is the best programming language. 
# '''
text = input("Enter your text: ")

Enter your text: Hey my name is Vijender Singh. I am having a great day. I believe python is the best programming language. 


In [34]:
blob = TextBlob(text)
print("Tags of text are" )
print(blob.tags)

Tags of text are
[('Hey', 'NNP'), ('my', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Vijender', 'NNP'), ('Singh', 'NNP'), ('I', 'PRP'), ('am', 'VBP'), ('having', 'VBG'), ('a', 'DT'), ('great', 'JJ'), ('day', 'NN'), ('I', 'PRP'), ('believe', 'VBP'), ('python', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('best', 'JJS'), ('programming', 'NN'), ('language', 'NN')]


In [35]:
print("Noun Phrases of text are" )
blob.noun_phrases   



Noun Phrases of text are


WordList(['hey', 'vijender singh', 'great day'])

In [36]:
print("Sentiment Analysis of each sentence is as follows " )
for sentence in blob.sentences:
    print(sentence,sentence.sentiment.polarity)


Sentiment Analysis of each sentence is as follows 
Hey my name is Vijender Singh. 0.0
I am having a great day. 0.8
I believe python is the best programming language. 1.0


In [37]:
blob.translate(to="hi") 

TextBlob("अरे मेरा नाम विजेंद्र सिंह है। मैं एक महान दिन रहा हूं। मेरा मानना ​​है कि अजगर सबसे अच्छी प्रोग्रामिंग भाषा है।")