# Module 2 (Python 3)

## Basic NLP Tasks with NLTK

In [1]:
import nltk

# Run nltk.download() only the first time (one-off download of the NLTK data).
# The star import loads text1..text9 and sent1..sent9 into the notebook
# namespace (see the banner printed below). FreqDist, used unqualified later,
# presumably also comes from this import - TODO confirm.
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### Counting vocabulary of words

In [2]:
text5  # Text object for the chat corpus loaded by nltk.book

<Text: Chat Corpus>

In [3]:
sent5  # sample sentence from nltk.book, stored as a list of token strings

['I',
 'have',
 'a',
 'problem',
 'with',
 'people',
 'PMing',
 'me',
 'to',
 'lol',
 'JOIN']

In [4]:
len(sent5)  # number of tokens in the sample sentence

11

In [5]:
len(text5)  # total number of tokens in the chat corpus, repeats included

45010

### Number of unique words

In [6]:
len(set(text5))  # number of distinct tokens (set() drops the repeats)

6066

In [7]:
# Show 10 of the unique tokens. A set is unordered, so these are not "the first"
# words of the text. The u'' prefix in the saved output marks a Unicode string
# (Python 2 style repr); it is not part of the token and has nothing to do with UTF-8.
list(set(text5))[:10]

[u'',
 u'raining',
 u'fawk',
 u'four',
 u'woods',
 u'hanging',
 u'Until',
 u'opener',
 u'lord',
 u'minigames']

### Frequency of words

In [8]:
# FreqDist is a class: it builds a frequency table mapping each token to its count.
dist = FreqDist(text5)
len(dist)  # number of distinct tokens; matches len(set(text5)) above

6066

In [9]:
vocab1 = dist.keys()  # the actual tokens (keys of the frequency table)
# In Python 3 dict.keys() returns an iterable view instead of a list, so it
# cannot be sliced directly; convert it to a list first.
list(vocab1)[:10]  # 10 of the tokens

[u'',
 u'raining',
 u'fawk',
 u'four',
 u'woods',
 u'hanging',
 u'Until',
 u'opener',
 u'lord',
 u'minigames']

In [10]:
dist['four']  # number of times the token 'four' occurs in text5

1

### Frequent words longer than a certain length

In [11]:
# Keep the tokens that are both long (more than 5 characters) and frequent
# (more than 100 occurrences in text5).
freqwords = [
    token
    for token in vocab1
    if len(token) > 5 and dist[token] > 100
]
freqwords

[u'ACTION']

### Normalization and stemming

In [12]:
# Normalization: split the phrase on single spaces and lowercase every piece so
# that differently-cased forms of a word compare equal.
input1 = "List listed lists listing listings"
words1 = [piece.lower() for piece in input1.split(' ')]
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [13]:
# Stemming: the Porter stemmer reduces each word to its stem; in the saved
# output all five forms collapse to 'list'.
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

[u'list', u'list', u'list', u'list', u'list']

### Lemmatization
Lemmatization is like stemming, but the resulting forms are valid, meaningful words.

In [14]:
# Words of the 'English-Latin1' file from NLTK's udhr corpus
# (the Universal Declaration of Human Rights, per the output below).
udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]  # first 20 words

[u'Universal',
 u'Declaration',
 u'of',
 u'Human',
 u'Rights',
 u'Preamble',
 u'Whereas',
 u'recognition',
 u'of',
 u'the',
 u'inherent',
 u'dignity',
 u'and',
 u'of',
 u'the',
 u'equal',
 u'and',
 u'inalienable',
 u'rights',
 u'of']

In [15]:
# This is still stemming, not lemmatization: several Porter stems in the output
# ('Univers', 'digniti', 'inalien') are not real words.
[porter.stem(t) for t in udhr[:20]]

[u'Univers',
 u'Declar',
 u'of',
 u'Human',
 u'Right',
 u'Preambl',
 u'Wherea',
 u'recognit',
 u'of',
 u'the',
 u'inher',
 u'digniti',
 u'and',
 u'of',
 u'the',
 u'equal',
 u'and',
 u'inalien',
 u'right',
 u'of']

In [16]:
# Lemmatization with WordNet: every result is a valid word. In the saved output
# only the lowercase 'rights' is reduced (to 'right'); the capitalized 'Rights'
# and all other words are returned unchanged.
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]

[u'Universal',
 u'Declaration',
 u'of',
 u'Human',
 u'Rights',
 u'Preamble',
 u'Whereas',
 u'recognition',
 u'of',
 u'the',
 u'inherent',
 u'dignity',
 u'and',
 u'of',
 u'the',
 u'equal',
 u'and',
 u'inalienable',
 u'right',
 u'of']

### Tokenization - splitting a sentence into words

In [1]:
# Naive tokenization: splitting on single spaces leaves punctuation attached
# ('bed.') and keeps the contraction "shouldn't" as one token.
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

### NLTK has an in-built tokenizer

In [18]:
# NLTK's tokenizer separates the final '.' and splits the contraction into
# 'should' + "n't" (see the output below).
nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

### Sentence splitting: NLTK has a built-in sentence splitter too

In [19]:
# The periods inside 'U.S.' and '$2.99' must not be treated as sentence ends;
# the saved output shows sent_tokenize finds exactly four sentences.
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)
len(sentences)  # number of sentences detected

4

In [20]:
sentences  # the detected sentences, one string per sentence

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

## Advanced NLP Tasks with NLTK

### POS tagging - Part-of-Speech tagging

In [21]:
# Print the tagset help entry for the tag 'MD' (modal auxiliary, per the output below).
nltk.help.upenn_tagset('MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [22]:
# Tokenize the earlier example sentence, then tag each token with its part of speech.
# NOTE(review): the saved output of this cell is
# "URLError: <urlopen error unknown url type: c>", so pos_tag did not run
# successfully when the notebook was saved - presumably the tagger model could
# not be loaded in that environment; confirm the NLTK install and tagger data, then re-run.
text13 = nltk.word_tokenize(text11)
nltk.pos_tag(text13)

URLError: <urlopen error unknown url type: c>

In [23]:
# Ambiguous sentence: 'Visiting' can be read as a verb or as a modifier of 'aunts'.
# NOTE(review): the saved output of this cell is
# "URLError: <urlopen error unknown url type: c>", so pos_tag did not run
# successfully when the notebook was saved - presumably the tagger model could
# not be loaded in that environment; confirm the NLTK install and tagger data, then re-run.
text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
nltk.pos_tag(text14)

URLError: <urlopen error unknown url type: c>

In [24]:
# Parsing sentence structure
# Tokenize the sentence, then parse it with a tiny hand-written context-free grammar.
text15 = nltk.word_tokenize("Alice loves Bob")
# Grammar: a sentence (S) is NP + VP, a VP is a verb followed by an NP, and the
# only terminals are the three words of the example sentence.
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

# Build a chart parser for the grammar and print every parse tree it returns
# (a single tree for this sentence, per the output below).
parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
for tree in trees:
    print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


In [26]:
# Sentence with an ambiguous prepositional phrase; it is parsed in the next cell.
text16 = nltk.word_tokenize("I saw the man with a telescope")
# Load a grammar from the file 'mygrammar.cfg'. The file is not part of this
# notebook; it must be somewhere nltk.data.load can find it - TODO confirm location.
grammar1 = nltk.data.load('mygrammar.cfg')
grammar1  # displays a summary (13 productions in the saved output)

<Grammar with 13 productions>

In [27]:
# Parse text16 with the loaded grammar. The saved output shows two trees: in one
# 'with a telescope' attaches to the verb phrase, in the other to 'the man'.
# Note: this rebinds the `parser` name used by the earlier grammar cell.
parser = nltk.ChartParser(grammar1)
trees = parser.parse_all(text16)
for tree in trees:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))


In [28]:
# Treebank corpus reader: take the first parsed sentence of the file
# 'wsj_0001.mrg' and print its tree.
from nltk.corpus import treebank
text17 = treebank.parsed_sents('wsj_0001.mrg')[0]
print(text17)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


### POS tagging and parsing ambiguity

In [30]:
# Garden-path sentence: 'man' is intended as the verb, which is hard for a tagger to get right.
# NOTE(review): the saved output of this cell is
# "URLError: <urlopen error unknown url type: c>", so pos_tag did not run
# successfully when the notebook was saved - presumably the tagger model could
# not be loaded in that environment; confirm the NLTK install and tagger data, then re-run.
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

URLError: <urlopen error unknown url type: c>

In [31]:
# Grammatically well-formed but semantically meaningless sentence.
# NOTE(review): the saved output of this cell is
# "URLError: <urlopen error unknown url type: c>", so pos_tag did not run
# successfully when the notebook was saved - presumably the tagger model could
# not be loaded in that environment; confirm the NLTK install and tagger data, then re-run.
text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")
nltk.pos_tag(text19)

URLError: <urlopen error unknown url type: c>