# Introduction to NLTK

In [None]:
import nltk
# If you have never downloaded the NLTK data, uncomment the following statement.
# nltk.download()
from nltk.book import *

In [None]:
# Display text7, one of the sample texts pulled in by the star import from nltk.book.
text7

In [None]:
# Display sent7, the sample sentence that nltk.book provides alongside text7.
sent7

In [None]:
# Length of the sample sentence (its number of tokens).
len(sent7)

In [None]:
# Length of the whole text: every token counts, repeats included.
len(text7)

In [None]:
# Ten of the distinct tokens in text7. A set has no defined order, so which
# ten are shown can differ from one run to the next.
list(set(text7))[:10]

## Frequency of words

In [None]:
# Build a frequency distribution over text7 (token -> number of occurrences)
# and display it.
dist = FreqDist(text7)
dist

In [None]:
# Number of distinct tokens recorded in the distribution (the vocabulary size).
len(dist)

In [None]:
# The vocabulary: the distinct tokens that dist has counted.
vocab1 = dist.keys()
vocab1

In [None]:
# How many times the token 'four' was counted in text7.
dist['four']

In [None]:
# Vocabulary entries that are both long (more than 5 characters) and
# common (counted more than 100 times in dist).
freqwords = [
    word
    for word in vocab1
    if len(word) > 5 and dist[word] > 100
]
freqwords

## Normalization and Stemming
Stemming: In linguistic morphology and information retrieval, stemming is the process for reducing inflected (or sometimes derived) words to their stem, base or root form—generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root. Algorithms for stemming have been studied in computer science since the 1960s. Many search engines treat words with the same stem as synonyms as a kind of query expansion, a process called conflation.

Stemming programs are commonly referred to as stemming algorithms or stemmers.

In [None]:
# Sample sentence used by the stemming and lemmatization cells.
input1 = "In linguistic morphology and information retrieval, stemming is the process for reducing inflected (or sometimes derived) words to their stem, base or root form—generally a written word form."

# Naive tokenization: lower-case the sentence and cut it at every single
# space. Punctuation stays glued to the neighbouring word (e.g. "retrieval,").
words1 = input1.lower().split(sep=' ')
words1

In [None]:
# Porter stemmer: reduce every token of words1 to its Porter stem.
porter = nltk.PorterStemmer()
[porter.stem(token) for token in words1]

In [None]:
# Snowball stemmer configured for English, applied to the same tokens so the
# output can be compared with the Porter result.
snowball = nltk.stem.SnowballStemmer('english')
[snowball.stem(token) for token in words1]

## Lemmatization
Lemmatisation: Lemmatisation (or lemmatization), in linguistics, is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.

In computational linguistics, lemmatisation is the algorithmic process of determining the lemma for a given word. Since the process may involve complex tasks such as understanding context and determining the part of speech of a word in a sentence (requiring, for example, knowledge of the grammar of a language) it can be a hard task to implement a lemmatiser for a new language.
 
In many languages, words appear in several inflected forms. For example, in English, the verb ‘to walk’ may appear as ‘walk’, ‘walked’, ‘walks’, ‘walking’. The base form, ‘walk’, that one might look up in a dictionary, is called the lemma for the word. The combination of the base form with the part of speech is often called the lexeme of the word.

Lemmatisation is closely related to stemming. The difference is that a stemmer operates on a single word without knowledge of the context, and therefore cannot discriminate between words which have different meanings depending on part of speech. However, stemmers are typically easier to implement and run faster, and the reduced accuracy may not matter for some applications.

In [None]:
# WordNet lemmatizer: maps a word to its dictionary form (lemma) when WordNet
# knows it; tokens it cannot find are returned unchanged. Called without a
# part-of-speech argument, so each token is looked up as a noun.
# The class is reached through the nltk.stem package (the same package the
# Snowball stemmer comes from) rather than the incidental `nltk.wordnet`
# attribute, which is not a documented entry point.
lemma = nltk.stem.WordNetLemmatizer()
[lemma.lemmatize(t) for t in words1]

In [None]:
# Load the word list of the 'English-Latin1' file from the udhr corpus and
# show its first 20 tokens.
udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]

In [None]:
# Run the first 20 UDHR tokens through a WordNet lemmatizer.
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(word) for word in udhr[:20]]

## Tokenization
NLTK has a built-in tokenizer.

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# NOTE(review): the apostrophe in "shouldn’t" is the typographic character
# U+2019, not the ASCII "'" — confirm the tokenizer splits the contraction
# the way this example intends.
text11 = "Children shouldn’t drink a sugary drink before bed."

# word_tokenize is the same function nltk re-exports as nltk.word_tokenize.
text13 = word_tokenize(text11)
text13

# Sentence splitting

In [None]:
# Split a short paragraph into sentences. The sample deliberately contains
# periods that do not end a sentence ("U.S.", "$2.99").
text12 ="this is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is !"
sentences = nltk.sent_tokenize(text12)
sentences

# POS Tagging
NLTK provides documentation for each tag, which can be queried using the tag, e.g. nltk.help.upenn_tagset('RB'), or a regular expression, e.g. nltk.help.upenn_tagset('NN.*')

In [None]:
# Look up the documentation for the 'MD' tag in the UPenn tag set.
nltk.help.upenn_tagset('MD')

In [None]:
# Part-of-speech tag the tokens in text13.
nltk.pos_tag(text13)

In [None]:
# Tokenize and tag a sentence with two readings ("visiting" can describe the
# aunts or the act of visiting them); the tagger returns one tag per token.
text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
nltk.pos_tag(text14)

In [None]:
# Tokenize the sentence that the grammar and parser cells below work on.
text15 = nltk.word_tokenize("Alice loves Bob")
text15

In [None]:
# A tiny context-free grammar: a sentence (S) is a noun phrase followed by a
# verb phrase, a verb phrase is a verb followed by a noun phrase, and the
# only terminals are 'Alice', 'Bob' and 'loves'.
grammar = nltk.CFG.fromstring(""" 
S -> NP VP
VP -> V NP
NP -> 'Alice'|'Bob'
V -> 'loves'
""")

In [None]:
# Parse text15 with a chart parser built from the grammar and print every
# parse tree it finds.
parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
for tree in trees:
    print(tree)

## Using WordNet in NLTK

In [None]:
from nltk.corpus import wordnet as wn
# Look up one WordNet synset per animal; the '.n.01' suffix selects the first
# noun sense of each word.
deer = wn.synset('deer.n.01')
elk = wn.synset('elk.n.01')
horse = wn.synset('horse.n.01')

In [None]:
# Path-based similarity between the deer and elk synsets.
deer.path_similarity(elk)

In [None]:
# Same measure for deer vs. horse, for comparison with the deer/elk score.
deer.path_similarity(horse)

## Use information content to find Lin similarity

In [None]:
from nltk.corpus import wordnet_ic
# Load the information-content data stored in 'ic-brown.dat'; the Lin
# similarity calls below take it as their second argument.
brown_ic = wordnet_ic.ic('ic-brown.dat')

In [None]:
# Lin similarity between deer and elk, computed with the Brown information content.
deer.lin_similarity(elk, brown_ic)

In [None]:
# Lin similarity between deer and horse, for comparison with the deer/elk score.
deer.lin_similarity(horse, brown_ic)

## Collocations and Distributional similarity

In [None]:
from nltk.collocations import BigramCollocationFinder

In [None]:
# Find collocations in a toy sentence: collect its bigrams, then rank them by
# pointwise mutual information (PMI) and keep the five best.
text = "obama says that obama says that the war is happening"
finder = BigramCollocationFinder.from_words(nltk.word_tokenize(text))

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(bigram_measures.pmi, 5)