In [31]:
Text = 'In computer science, artificial intelligence (AI), sometimes called machine intelligence, \
is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans. \
Colloquially, the term artificial intelligence is often used to describe machines (or computers) that mimic cognitive functions \
that humans associate with the human mind, such as learning and problem solving \
\
    A search can be designed to return every match on a line, if there are more than\
one, or just the first match. In the following examples we generally underline the\
exact part of the pattern that matches the regular expression and show only the first\
match. We’ll show regular expressions delimited by slashes but note that slashes are\
not part of the regular expressions.'

# Tokenization : List of words and characters

In [32]:
import nltk
import nltk.tokenize

In [33]:
tokens = nltk.tokenize.word_tokenize(Text)
print("Total tokens in given sentence : " + str(len(tokens)))
print("First 5 tokens:\n[")
for i in tokens[:5]:
    print("   " + i)
print("]")

Total tokens in given sentence : 133
First 5 tokens:
[
   In
   computer
   science
   ,
   artificial
]


# Bag of Words : Word frequency calculation

In [34]:
from nltk.probability import FreqDist
fdist = FreqDist()

In [35]:
for i in stemmed_words:
    fdist[i.lower()] += 1
fdist
fdist_top10 = fdist.most_common(10)
fdist_top10

[('the', 9),
 (',', 8),
 ('intellig', 5),
 ('.', 4),
 ('that', 4),
 ('in', 3),
 ('machin', 3),
 ('by', 3),
 ('to', 3),
 ('human', 3)]

# Bi / Tri / NGrams
Collection of consequent words.
* bi-grams  : 2 consequent words
* tri-grams : 3 consequent words
* n-grams   : n consequent words

In [36]:
from nltk import bigrams, trigrams, ngrams

In [37]:
# bigrams
bi_grams = list(bigrams(tokens))
tri_grams = list(trigrams(tokens))
n_grams = list(ngrams(tokens, 4))

In [38]:
# bigrams
print("BiGrams")
for i in bi_grams[:5]:
    print(i)

print("\nTriGrams")
# trigrams
for i in tri_grams[:5]:
    print(i)

print("\nNGrams")
# ngrams
for i in n_grams[:5]:
    print(i)

BiGrams
('In', 'computer')
('computer', 'science')
('science', ',')
(',', 'artificial')
('artificial', 'intelligence')

TriGrams
('In', 'computer', 'science')
('computer', 'science', ',')
('science', ',', 'artificial')
(',', 'artificial', 'intelligence')
('artificial', 'intelligence', '(')

NGrams
('In', 'computer', 'science', ',')
('computer', 'science', ',', 'artificial')
('science', ',', 'artificial', 'intelligence')
(',', 'artificial', 'intelligence', '(')
('artificial', 'intelligence', '(', 'AI')


# Stemming : getting root word or base form of word

In [39]:
from nltk.stem import PorterStemmer, ISRIStemmer, LancasterStemmer, RegexpStemmer, RSLPStemmer, SnowballStemmer
pst = PorterStemmer()
lst = LancasterStemmer()

In [40]:
stemmed_words = []
for i in tokens:
    stemmed_words.append(pst.stem(i))

for i in stemmed_words[:5]:
    print(i)

In
comput
scienc
,
artifici


In [41]:
lan_stemmed_words = []
for i in tokens:
    lan_stemmed_words.append(i.lower())

for i in stemmed_words[:5]:
    print(i)

In
comput
scienc
,
artifici


# Lemmatization : Alternative to Stemming.
* More powerful than just stemming.
* Unlike stemming, output of lemmatizer is a proper word (stemming provides root form, not the word)

In [42]:
from nltk import wordnet
from nltk import WordNetLemmatizer
word_lem = WordNetLemmatizer()

In [43]:
word_lem.lemmatize('giving')

'giving'

* Lemmatizer has just returned the same word as POST tags are not assigned
* If POS tags are not assigned, it assumes that all words are nouns

# POS Tags & Stop Words
* NLTK by default comes with stop words

In [44]:
from nltk.corpus import stopwords

In [45]:
for i in stopwords.words("english")[:5]:
    print(i)

i
me
my
myself
we


In [46]:
fdist_top10

[('the', 9),
 (',', 8),
 ('intellig', 5),
 ('.', 4),
 ('that', 4),
 ('in', 3),
 ('machin', 3),
 ('by', 3),
 ('to', 3),
 ('human', 3)]

* Most of the words are either punctuations or the stop words
* Lets remove punctuation first
* Then stop words

In [47]:
import re
punctuation = re.compile(r'[-.,?!:;()|0-9]')

In [48]:
post_punctuation = []
for words in tokens:
    word = punctuation.sub("", words)
    if len(word) > 0:
        post_punctuation.append(word)

In [49]:
words_5 = [x for x in post_punctuation[:5]]
words_5

['In', 'computer', 'science', 'artificial', 'intelligence']

## POS Tagging
* input : Tokens
* Output: word, POS Tag

In [50]:
from nltk import pos_tag

In [51]:
sent1 = "The dog is in the water"
sen1_tokens = nltk.tokenize.word_tokenize(sent1)

for i in sen1_tokens:
    print(pos_tag([i]))

[('The', 'DT')]
[('dog', 'NN')]
[('is', 'VBZ')]
[('in', 'IN')]
[('the', 'DT')]
[('water', 'NN')]


In [52]:
sent2 = "John is eating his food"
sen2_tokens = nltk.tokenize.word_tokenize(sent2)

for i in sen2_tokens:
    print(pos_tag([i]))

[('John', 'NNP')]
[('is', 'VBZ')]
[('eating', 'VBG')]
[('his', 'PRP$')]
[('food', 'NN')]


'is' and 'eating' are together but considered as verbs. Which is wrong.


# Named Entity recognition
There are 3 major types of identification
1. Non phrase identification : Deep parsing and pos tagging
2. Phrase classification : Identifies nouns into categories (google -> organization, sundar -> person)
3. Entity disambuiguation : Creates one more layer of validation
    * IBM Watson
    * Google graph

In [53]:
from nltk import ne_chunk

In [54]:
sentence = 'The US President stays in the white house'
sen_tokens = nltk.tokenize.word_tokenize(sentence)
sen_pos = nltk.pos_tag(sen_tokens)
sen_pos

[('The', 'DT'),
 ('US', 'NNP'),
 ('President', 'NNP'),
 ('stays', 'VBZ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('white', 'JJ'),
 ('house', 'NN')]

In [55]:
sen_ner = nltk.ne_chunk(sen_pos)

In [56]:
print(sen_ner)

(S
  The/DT
  (ORGANIZATION US/NNP)
  President/NNP
  stays/VBZ
  in/IN
  the/DT
  white/JJ
  house/NN)


# SYNTAX : Process, rules and principles that govern the structure of a sentence
* Syntactical structure of a sentence
* Tree like structure
* Defines which part of sentence comes after which part

## Ghost script
* ghost script is needed to visualize the tree
* download : https://www.ghostscript.com/download/gsdnld.html
* Install  : https://www.ghostscript.com/doc/9.27/Install.htm
    1. gunzip <file.tar.gz>
    2. tar -xvf <file.tar>
    3. cd <folder>
    4. ./configure
    5. make install