## Natural Language Processing Basics

In [7]:
import nltk
import spacy
import numpy as np
import pandas as pd

# Load spaCy's medium English pipeline once and reuse it in every cell below.
# The model must be installed first: python -m spacy download en_core_web_md
# (the smaller en_core_web_sm can be substituted if vectors are not needed).
# The tagger, parser and entity recognizer ship enabled in this pipeline, so
# the legacy parse=/tag=/entity= flags are dropped: they are not accepted by
# the current spacy.load() signature and raise a TypeError there.
nlp = spacy.load('en_core_web_md')

In [18]:
def nltk_pos_tags(text):
    """Return the ``(token, tag)`` pairs NLTK assigns to ``text``.

    The text is tokenized with a plain ``str.split()`` (whitespace only) and
    the resulting tokens are handed to ``nltk.pos_tag``.
    """
    tokens = text.split()
    tagged = nltk.pos_tag(tokens)
    return tagged

def spacy_pos_tags(text):
    """Return ``(token, token.pos_, token.tag_)`` triples for ``text``.

    ``text`` is run through the module-level ``nlp`` pipeline. The first
    element of each triple is the token object itself, not its string form.
    """
    triples = []
    for token in nlp(text):
        triples.append((token, token.pos_, token.tag_))
    return triples

# Tag a sample sentence with the NLTK helper; the cell displays the returned
# list of (token, tag) pairs.
nltk_pos_tags('the quick brown fox jumped over the lazy dog')

[('the', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumped', 'VBD'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

In [19]:
# Run the same sentence through the spaCy helper; each entry carries the token
# plus its pos_ and tag_ labels, for comparison with the NLTK result.
spacy_pos_tags('the quick brown fox jumped over the lazy dog')

[(the, 'DET', 'DT'),
 (quick, 'ADJ', 'JJ'),
 (brown, 'ADJ', 'JJ'),
 (fox, 'NOUN', 'NN'),
 (jumped, 'VERB', 'VBD'),
 (over, 'ADP', 'IN'),
 (the, 'DET', 'DT'),
 (lazy, 'ADJ', 'JJ'),
 (dog, 'NOUN', 'NN')]

In [15]:
# dependency grammar: draw the dependency parse of a sample sentence inline

from spacy import displacy
displacy.render(
    nlp('the quick brown fox jumped over the lazy dog'),
    jupyter=True,
    # layout tweaks passed through to the renderer
    options=dict(distance=100, arrow_stroke=1.5, arrow_width=8),
)

In [20]:
# Compare both taggers on a sentence where "fair" is used as a noun;
# the three pieces are printed on separate lines, NLTK first.
print(nltk_pos_tags('They are going to the annual fair'),
      '----',
      spacy_pos_tags('They are going to the annual fair'),
      sep='\n')

[('They', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('the', 'DT'), ('annual', 'JJ'), ('fair', 'NN')]
----
[(They, 'PRON', 'PRP'), (are, 'VERB', 'VBP'), (going, 'VERB', 'VBG'), (to, 'ADP', 'IN'), (the, 'DET', 'DT'), (annual, 'ADJ', 'JJ'), (fair, 'NOUN', 'NN')]


In [22]:
# Same comparison on a sentence where "fair" is used as an adjective.
text = 'I hope the judgment is fair to all'
print(nltk_pos_tags(text), '----', spacy_pos_tags(text), sep='\n')

# draw the dependency parse of the same sentence
displacy.render(nlp(text), jupyter=True)

[('I', 'PRP'), ('hope', 'VBP'), ('the', 'DT'), ('judgment', 'NN'), ('is', 'VBZ'), ('fair', 'JJ'), ('to', 'TO'), ('all', 'DT')]
----
[(I, 'PRON', 'PRP'), (hope, 'VERB', 'VBP'), (the, 'DET', 'DT'), (judgment, 'NOUN', 'NN'), (is, 'VERB', 'VBZ'), (fair, 'ADJ', 'JJ'), (to, 'ADP', 'IN'), (all, 'DET', 'DT')]


In [42]:
from  nltk.corpus import brown

In [43]:
# Overview of the Brown corpus: category count, category names, then a preview
# of plain sentences ("adventure") and tagged sentences ("mystery"), each
# section preceded by a separator line.
print('Total Categories:', len(brown.categories()))
print('-------', brown.categories(), sep='\n')
print('-------', brown.sents(categories="adventure"), sep='\n')
print('-------', brown.tagged_sents(categories='mystery'), sep='\n')

Total Categories: 15
-------
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
-------
[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]
-------
[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]


In [49]:
# Collect the (word, tag) pairs from the "mystery" category whose tag contains
# 'NP' or 'NN' (substring match, so variants such as 'NNS', 'NP$' and 'NN-TL'
# are kept as well).
tagged_words = brown.tagged_words(categories='mystery')
nouns = [
    (word, tag)
    for word, tag in tagged_words
    if 'NP' in tag or 'NN' in tag
]
print(nouns[:20])

[('patients', 'NNS'), ('bus', 'NN'), ('morning', 'NN'), ('Hanover', 'NP'), ('interne', 'NN'), ('nurse', 'NN'), ('attendants', 'NNS'), ('charge', 'NN'), ('bus', 'NN'), ('window', 'NN'), ("Chicago's", 'NP$'), ('Side', 'NN-TL'), ('drone', 'NN'), ('voices', 'NNS'), ('odors', 'NNS'), ('patients', 'NNS'), ('ward', 'NN'), ('state', 'NN'), ('Illinois', 'NP'), ('hospital', 'NN')]


In [58]:
# count how often each noun occurs (tags are dropped, only the word is counted)
nouns_freq = nltk.FreqDist(word for word, _ in nouns)
# show the ten most frequent nouns with their counts
nouns_freq.most_common(10)

[('man', 106),
 ('time', 82),
 ('door', 80),
 ('car', 69),
 ('room', 65),
 ('Mr.', 63),
 ('way', 61),
 ('office', 50),
 ('eyes', 48),
 ('hand', 46)]

In [61]:
# Download the 'reuters' package into the local nltk_data directory so the
# corpus reader used in the following cells can find it.
nltk.download('reuters')

[nltk_data] Downloading package reuters to /Users/tusharm/nltk_data...


True

In [62]:
from nltk.corpus import reuters

# print every category label defined in the Reuters corpus
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [63]:
# Rebuild plain-text sentences for the 'housing' and 'income' categories by
# joining each sentence's tokens with single spaces.
sentences = [
    ' '.join(sentence_tokens)
    for sentence_tokens in reuters.sents(categories=['housing', 'income'])
]
sentences[:5]  # view the first 5 sentences

["YUGOSLAV ECONOMY WORSENED IN 1986 , BANK DATA SHOWS National Bank economic data for 1986 shows that Yugoslavia ' s trade deficit grew , the inflation rate rose , wages were sharply higher , the money supply expanded and the value of the dinar fell .",
 'The trade deficit for 1986 was 2 . 012 billion dlrs , 25 . 7 pct higher than in 1985 .',
 'The trend continued in the first three months of this year as exports dropped by 17 . 8 pct , in hard currency terms , to 2 . 124 billion dlrs .',
 'Yugoslavia this year started quoting trade figures in dinars based on current exchange rates , instead of dollars based on a fixed exchange rate of 264 . 53 dinars per dollar .',
 "Yugoslavia ' s balance of payments surplus with the convertible currency area fell to 245 mln dlrs in 1986 from 344 mln in 1985 ."]

In [66]:
# load the WordNet corpus reader
from nltk.corpus import wordnet as wn

word = 'economic' # our word of interest
# get every synset WordNet returns for the word
word_synsets = wn.synsets(word)
print(word_synsets)
print('----------')
# print the name, POS tag, definition and examples of each synset,
# with a blank line after every entry
for synset in word_synsets:
    print(('Synset Name: {name}\n'
           'POS Tag: {tag}\n'
           'Definition: {defn}\n'
           'Examples: {ex}\n').format(name=synset.name(),
                                      tag=synset.pos(),
                                      defn=synset.definition(),
                                      ex=synset.examples()))

[Synset('economic.a.01'), Synset('economic.a.02'), Synset('economic.s.03'), Synset('economic.s.04'), Synset('economic.s.05')]
----------
Synset Name: economic.a.01
POS Tag: a
Definition: of or relating to an economy, the system of production and management of material wealth
Examples: ['economic growth', 'aspects of social, political, and economical life']

Synset Name: economic.a.02
POS Tag: a
Definition: of or relating to the science of economics
Examples: ['economic theory']

Synset Name: economic.s.03
POS Tag: s
Definition: using the minimum of time or resources necessary for effectiveness
Examples: ['an economic use of home heating oil', 'a modern economical heating system', 'an economical use of her time']

Synset Name: economic.s.04
POS Tag: s
Definition: concerned with worldly necessities of life (especially money)
Examples: ['he wrote the book primarily for economic reasons', 'gave up the large house for economic reasons', 'in economic terms they are very privileged']

Synset 

In [73]:
# Download the 'indian' package into the local nltk_data directory; it is
# required by the nltk.corpus.indian reader used below.
nltk.download('indian')

[nltk_data] Downloading package indian to /Users/tusharm/nltk_data...
[nltk_data]   Unzipping corpora/indian.zip.


True

In [71]:
from nltk.corpus import indian

In [103]:
# Preview a 20-item slice of the corpus' tagged words; each item is a
# (word, tag) pair.
indian.tagged_words()[18000:18020]

[('उन्हें', 'PRP'),
 ('प्रदेश', 'NNC'),
 ('कांग्रेस', 'NN'),
 ('का', 'PREP'),
 ('अध्यक्ष', 'NN'),
 ('बनाया', 'VFM'),
 ('गया', 'VAUX'),
 ('था', 'VAUX'),
 ('।', 'PUNC'),
 ('कश्मीर', 'NNP'),
 ('में', 'PREP'),
 ('युद्ध', 'NNC'),
 ('विराम', 'NN'),
 ('की', 'PREP'),
 ('अवधि', 'NN'),
 ('३', 'QFNUM'),
 ('माह', 'NN'),
 ('बढ़ी', 'VFM'),
 ('नई', 'NNPC'),
 ('दिल्ली', 'NNP')]