In [1]:
import nltk
# nltk.download("all")

### 1. Tokenizer
- word tokenizer
- sentence tokenizer

**Lexicon and corpora**
    - corpus (plural: corpora) : a body of text, e.g.
        - medical journals, presidential speeches, the English language
    - Lexicon : words and their meanings
        - ex. in investor language 'bull' : someone who is positive about the market
            - in everyday English 'bull' : a scary animal
        - for numbers it is words and their values


In [2]:
from nltk.tokenize import word_tokenize,sent_tokenize

# Sentence tokenization --------------------------
# Splitting on "full stop followed by a space" looks sufficient for the first
# text, but it would wrongly cut after abbreviations such as "Mr." in the second.

plain_text = "Hello there, how are you doing today? the weather is great and python is awesome. The sky is blue."
example_text = "Hello Mr. Smith, how are you doing today? the weather is great and python is awesome. The sky is blue."

print(sent_tokenize(plain_text))
print("--"*20)
# example_text (the "Mr. Smith" variant) is the text later cells tokenize by word.
print(sent_tokenize(example_text))

['Hello there, how are you doing today?', 'the weather is great and python is awesome.', 'The sky is blue.']
----------------------------------------
['Hello Mr. Smith, how are you doing today?', 'the weather is great and python is awesome.', 'The sky is blue.']


In [3]:
# By word -------------------
print(word_tokenize(example_text))
# By default punctuation marks such as ',', '?' and '.' come out as separate
# tokens (the period of the abbreviation 'Mr.' stays attached to it).

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'the', 'weather', 'is', 'great', 'and', 'python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'blue', '.']


In [4]:
# Print every token of example_text on its own line.
tokens = word_tokenize(example_text)
for token in tokens:
    print(token)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
the
weather
is
great
and
python
is
awesome
.
The
sky
is
blue
.


- there are advanced tokenizers where we can use unsupervised machine learning built in nltk.
- nltk by default works with english, but also works with other languages.

### 2. Stop words
a. words that do not add much meaning
b. words whose meaning is ambiguous

In [5]:
from nltk.corpus import stopwords

# Build the English stop-word list as a set so membership checks are cheap.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

# Sentence that the next cell filters.
example_sent = "This is an example showing off stop word filtration."

print(stop_words)

{'her', 'are', 've', 'hers', 'by', 'yours', 'him', 'then', 'those', 'the', 'more', 'when', 'if', 'so', "you'd", "haven't", 'below', 'same', 'aren', 'our', 'that', 'against', "doesn't", "she's", 'this', 'been', 'there', "you've", 'my', 'am', 'than', 'myself', 'few', 'y', 'did', 'of', 'who', 'while', 'through', 'd', 'himself', "isn't", "you'll", 'hadn', 'hasn', 'once', 'mustn', 'nor', 'won', 'it', 'have', 'having', "shan't", 'not', 'isn', 'here', 'as', 'were', "you're", 'all', 'yourself', 'only', 'to', 'because', 'on', 'itself', 'its', 'haven', 'themselves', 'doesn', 'herself', 'do', 'up', 'how', 'didn', 'i', 'own', 'down', 'most', 'other', 'being', 'you', 'at', "don't", 'why', 'theirs', 'which', 'and', 'ain', 'wouldn', 'o', 'can', "couldn't", "that'll", 'each', 'some', 'm', "it's", 'a', 'in', 'about', 'before', 'for', "mightn't", 'both', 'until', 'whom', 'we', 'with', 'ma', 's', "hadn't", "should've", 'what', 'out', 'such', "mustn't", 'does', "didn't", 'he', 'be', 'under', "hasn't", 'ju

In [6]:
# Tokenize, lowercase each token once, then keep only the non-stop-words.
words = word_tokenize(example_sent)
lowered_tokens = (token.lower() for token in words)
filtered_sent = [token for token in lowered_tokens if token not in stop_words]

In [7]:
# Display the filtered tokens; '.' remains because punctuation is not in stop_words.
filtered_sent

['example', 'showing', 'stop', 'word', 'filtration', '.']

### 3. Stemming
- we stem a word 
    - ex. riding becomes rid
    - ie. we have different variations of a word but meaning of word is unchanged
    - ex. I was taking a ride in the car
        - I was riding a car.

Porter Stemmer :
- since 1979
- there are many stemmers; the Porter stemmer is good but rule-based : one of the most gentle stemmers
- Snowball stemmer is Porter2, developed to support other languages : slightly better than the original
- Lancaster Stemmer is very aggressive; sometimes the stems get too small to be readable by humans.
    - It is good when we are having large datasets
    

In [8]:
from nltk.stem import PorterStemmer,LancasterStemmer,SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
# One instance of each normaliser; `wn` is used by the lemmatization cell further down.
ps = PorterStemmer()
sn = SnowballStemmer(language="english")
ls = LancasterStemmer()
wn = WordNetLemmatizer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly","pythony"]

# Stem the same word list with Porter, Snowball and Lancaster in turn,
# printing a dashed separator between (not before or after) the three runs.
for position, stemmer in enumerate((ps, sn, ls)):
    if position > 0:
        print("-"*20)
    for w in example_words:
        print(stemmer.stem(w))

python
python
python
python
pythonli
pythoni
--------------------
python
python
python
python
python
pythoni
--------------------
python
python
python
python
python
pythony


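- every stemmer reduces the first four words to `python`; only the last word (`pythony`) is never reduced to `python`, and Porter additionally turns `pythonly` into `pythonli`.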

In [9]:
# Compare the Porter stem with the WordNet lemma, token by token.
# lemmatize() is called without a part of speech here
# (NOTE(review): NLTK then treats the token as a noun by default - confirm).
new_text = "It is very important,to be pythonly while you are pythoning with pyhton. All pythoners have pythoned once."
words = word_tokenize(new_text)
for token in words:
    stemmed, lemma = ps.stem(token), wn.lemmatize(token)
    print(f"{stemmed}----{lemma}")

It----It
is----is
veri----very
import----important
,----,
to----to
be----be
pythonli----pythonly
while----while
you----you
are----are
python----pythoning
with----with
pyhton----pyhton
.----.
all----All
python----pythoners
have----have
python----pythoned
onc----once
.----.


- Nowadays, people prefer the WordNet lemmatizer
- WordNet finds synonyms using synsets.

**Stemming vs lemmatization**
- lemmatization looks for meaning, stemming does not 
    - ex. good is the lemma of better
    - we need the part of speech first, as normalization rules are different for different parts of speech.

### 4. Part of Speech Tagging
- labelling part of speech to each word

In [10]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# PunktSentenceTokenizer is an unsupervised ML sentence tokenizer.
# It ships pretrained, but it can also be trained on our own text.

# Train on the 2005 address ...
train_text = state_union.raw("2005-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# ... then split the 2006 address into sentences with the trained tokenizer.
sample_text = state_union.raw("2006-GWBush.txt")
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Print the word tokens and POS tags of the first entry in ``tokenized``.

    Reads the module-level ``tokenized`` sequence, prints the tokens of its
    first sentence, a dashed separator, and then the (token, tag) pairs.
    Any exception is caught and its message is printed instead of raised.
    """
    try:
        for sentence in tokenized:
            tokens = nltk.word_tokenize(sentence)
            print(tokens)
            print("-"*40)
            print(nltk.pos_tag(tokens))
            # Only the first sentence is shown in this demo.
            break
    except Exception as err:
        print(str(err))

process_content()

['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'January', '31', ',', '2006', 'THE', 'PRESIDENT', ':', 'Thank', 'you', 'all', '.']
----------------------------------------
[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]


- NNP:  Proper noun, singular
- PRP : Personal pronoun
- VB : verbs
- RB : Adverb

- pos tagging will create part of speech tagging of words

### 5. Chunking
- we did part of speech tags, we can have many nouns in a sentence and we need to find what applies to which noun.
- Noun-phrases : a noun with a bunch of modifiers around it (a kind of descriptive group of words surrounding the noun)
    - with regex 
    - or words that have an effect on the noun
    
- we use part of speech tags and regex to do chunking.

In [13]:
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

# Same setup as the POS-tagging cell, repeated so this cell is self-contained:
# train the Punkt sentence tokenizer on the 2005 address ...
train_text = state_union.raw("2005-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# ... and use it to split the 2006 address into sentences.
sample_text = state_union.raw("2006-GWBush.txt")
tokenized = custom_sent_tokenizer.tokenize(sample_text)
from IPython import display

def process_content():
    """Chunk the first sentence of the module-level ``tokenized`` and show the tree.

    The sentence is word-tokenized and POS-tagged, then parsed with a
    regular-expression chunk grammar. The resulting tree is printed and drawn.

    Failures are reported by printing the exception message (the same
    convention as the POS-tagging version of this function) instead of being
    silently replaced by an empty line.
    """
    try:
        # The grammar and parser do not depend on the sentence: build them once.
        # Chunk = any number of RB* tags, any number of VB* tags, one NNP,
        # and an optional trailing NN.
        chunkGram = r"""Chunk : {<RB.?>*<VB.?>*<NNP><NN>?} """
        chunkParser = nltk.RegexpParser(chunkGram)

        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            # NOTE(review): draw() is expected to open a GUI window; it may fail
            # on a headless kernel, in which case the error is printed below.
            chunked.draw()
            # Only the first sentence is demonstrated.
            break
    except Exception as e:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and print the cause rather than hiding it.
        print(str(e))

process_content()

(S
  (Chunk PRESIDENT/NNP)
  (Chunk GEORGE/NNP)
  (Chunk W./NNP)
  (Chunk BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP)
  (Chunk JOINT/NNP)
  (Chunk SESSION/NNP)
  OF/IN
  (Chunk THE/NNP)
  (Chunk CONGRESS/NNP)
  (Chunk ON/NNP)
  (Chunk THE/NNP)
  (Chunk STATE/NNP)
  OF/IN
  (Chunk THE/NNP)
  (Chunk UNION/NNP)
  (Chunk January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP)
  (Chunk PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)


## REGEX

In [65]:
import re

# Sample string searched by the regex cells below.
text = "abcd efg hi 12"

In [75]:
# IDENTIFIERS
# \d = any digit
# \D = anything but a digit
# \s = any whitespace character (space, tab, newline, ...)
# \S = anything but whitespace
# \w = any word character (letter, digit or underscore)
# \W = anything but a word character
# .  = any character, except for a new line
# \b = word boundary (zero-width position at the edge of a word)

# One or two consecutive digits.
obj = re.compile(r"\d{1,2}")
obj.findall(text)

['12']

In [105]:
### QUANTIFIERS #####
# *     : zero or more times
# ?     : zero or one time
# +     : one or more times
# {3}   : exactly three times
# {1,2} : between one and two times

# Runs of one or more word characters (digits count too, hence '12' in the result).
obj = re.compile(r"\w+")
obj.findall(text)

['abcd', 'efg', 'hi', '12']

In [100]:
# Raw string: "\w" in a normal string literal is an invalid escape sequence
# and triggers a warning; r"\w{1,2}" is the same pattern without the warning.
# Matches word characters greedily in groups of at most two.
obj = re.compile(r"\w{1,2}")
re.findall(obj,text)

['ab', 'cd', 'ef', 'g', 'hi', '12']

In [103]:
# .  : any single character except a line break
# \  : the escape character (e.g. \. matches a literal dot)

# Every character of text is returned as its own match.
obj = re.compile(".")
obj.findall(text)

['a', 'b', 'c', 'd', ' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2']

In [108]:
# Same "." pattern as the previous cell: one match per character of text.
obj = re.compile(".")
obj.findall(text)

['a', 'b', 'c', 'd', ' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2']

In [135]:
#### LOGIC ####
# $  = matches at the end of the string
# ^  = matches at the start of the string
# |  = either/or. Example: x|y matches either x or y
# [] = character set: matches any one of the listed characters (ranges such as a-z allowed)

# "ab" followed by either c or d
obj = re.compile("ab[cd]")
obj.findall(text)

['abc']

In [140]:
# ^ anchors the match to the start of text.
obj = re.compile("^abcd")
obj.findall(text)

['abcd']

In [145]:
# $ anchors the match to the end of text.
obj = re.compile("i 12$")
obj.findall(text)

['i 12']

In [160]:
# Character set of digits, upper/lower-case letters and the space character.
# Because of "?", the pattern may also match nothing, which is why the result
# ends with an empty string at the end of text.
obj = re.compile("[0-9 A-Z a-z]?")
obj.findall(text)

['a', 'b', 'c', 'd', ' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2', '']

In [156]:
# A leading ^ inside [] negates the set:
# match any single character other than a, b, c or d.
obj = re.compile("[^abcd]")
obj.findall(text)

[' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2']