In [1]:
import nltk
# nltk.download("all")

### 1. Tokenizer
- word tokenizer
- sentence tokenizer

**Lexicon and corpora**
    - Corpus (plural: corpora): a body of text, e.g.
        - medical journals, presidential speeches, the English language
    - Lexicon: words and their meanings
        - e.g. in investor language, 'bull' means someone who is positive (+ve) about the market
            - in everyday English, a 'bull' is a scary animal
        - for numbers, it is words and their values


In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

# By sentence --------------------------

example_text = "Hello there, how are you doing today? the weather is great and python is awesome. The sky is blue."
# A naive sentence splitter would cut at every full stop followed by a space,
# but that breaks on abbreviations such as "Mr. Smith", so we use sent_tokenize.

print(sent_tokenize(example_text))

print("--"*20)
# Same text with an abbreviation: the output shows that "Mr." does not end the sentence.
# Note: example_text is rebound here; the word-tokenizer cells below use this version.
example_text = "Hello Mr. Smith, how are you doing today? the weather is great and python is awesome. The sky is blue."
print(sent_tokenize(example_text))

['Hello there, how are you doing today?', 'the weather is great and python is awesome.', 'The sky is blue.']
----------------------------------------
['Hello Mr. Smith, how are you doing today?', 'the weather is great and python is awesome.', 'The sky is blue.']


In [6]:
# By word -------------------
print(word_tokenize(example_text))
# By default, punctuation marks are returned as separate tokens (',', '?', '.'),
# while the period of the abbreviation 'Mr.' stays attached to it.

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'the', 'weather', 'is', 'great', 'and', 'python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'blue', '.']


In [16]:
# Print every token of example_text on its own line.
tokens = word_tokenize(example_text)
for token in tokens:
    print(token)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
the
weather
is
great
and
python
is
awesome
.
The
sky
is
blue
.


- NLTK also includes more advanced tokenizers that are built on unsupervised machine learning.
- NLTK works with English by default, but it also supports other languages.

### 2. Stop words
a. words that do not add much meaning
b. words whose meaning is ambiguous

In [20]:
from nltk.corpus import stopwords

# Sentence used to demonstrate stop-word filtering.
example_sent = "This is an example showing off stop word filtration."
# English stop-word list, converted to a set so that membership checks are cheap.
stop_words = set(stopwords.words("english"))
# Show the stop words (a set is unordered, so the printed order can vary between runs).
print(stop_words)

{'our', 'at', "it's", 'any', 'into', 'why', 'same', "wasn't", "she's", 'was', 'couldn', 'if', 'ours', 'by', 'having', 'does', 't', "isn't", 'ma', 'off', 'once', "don't", 'needn', 'be', 'do', 'under', 'about', 'herself', 'who', 'whom', 'itself', 'it', 'most', 'not', 'and', "didn't", 'have', 'which', 'an', 'we', 'that', 'am', 'for', 'nor', 'what', 'll', 'didn', 'because', 'these', 'did', 'from', 'all', 'himself', 'while', 'weren', 'had', 'the', 'his', 'myself', 'doing', 'some', 's', 'has', 'to', "couldn't", 'against', "hasn't", 'wouldn', 'been', 'down', 'where', "should've", 'as', "mustn't", 'o', 'they', "haven't", 'hadn', 'hers', 'isn', 'him', 're', 'mustn', 'i', 'there', 'few', 'but', "that'll", 'again', 'when', 'don', "needn't", "you're", "you've", 'me', 'only', 'themselves', 'she', 'over', 'such', 'through', 'during', 'out', 'yours', 'were', 'yourself', 'just', "mightn't", 'this', 'so', 'on', 'is', "weren't", 'up', 'm', 'theirs', "wouldn't", 'wasn', 'a', 'doesn', 'being', 'very', 'ar

In [25]:
# Tokenize the sentence, lowercase each token once, and keep only the
# tokens that are not in the stop-word set.
words = word_tokenize(example_sent)
filtered_sent = [token for token in map(str.lower, words) if token not in stop_words]

In [26]:
# Display the filtered tokens; note that punctuation ('.') is kept because it is not a stop word.
filtered_sent

['example', 'showing', 'stop', 'word', 'filtration', '.']

### 3. Stemming
- we stem a word, i.e. reduce it to its root form
    - ex. riding becomes ride
    - i.e. a word has different variations, but its meaning is unchanged
    - ex. I was taking a ride in the car.
        - I was riding in the car.

Porter Stemmer:
- in use since 1979
- there are many stemmers; the Porter stemmer is a good one

In [28]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

# Stem every variant of the word and print one stem per line.
for stemmed in map(ps.stem, example_words):
    print(stemmed)

python
python
python
python
pythonli


- all of the words except the last one are reduced to the stem "python"; "pythonly" becomes "pythonli"

In [31]:
new_text = "It is very important to be pythonly while you are pythoning with pyhton. All pythoners have pythoned once."

# Tokenize the text, then print the stem of every token (punctuation included).
words = word_tokenize(new_text)
for stemmed in map(ps.stem, words):
    print(stemmed)

It
is
veri
import
to
be
pythonli
while
you
are
python
with
pyhton
.
all
python
have
python
onc
.


- Nowadays, people often prefer WordNet.
- WordNet can find the synonyms of a word using synsets.

### 4. Part of Speech Tagging
- labelling each word with its part of speech

In [37]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# PunktSentenceTokenizer is an unsupervised ML sentence tokenizer.
# It comes pretrained, but it can also be retrained on our own text.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Passing train_text to the constructor trains the tokenizer on that text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Split the 2006 text into sentences with the custom-trained tokenizer.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None, limit=1):
    """Word-tokenize and POS-tag sentences, printing the tokens and their tags.

    For each processed sentence this prints the list of tokens, a line of
    40 dashes, and the list of ``(word, tag)`` tuples from ``nltk.pos_tag``.

    Args:
        sentences: Iterable of sentence strings. Defaults to the module-level
            ``tokenized`` list, so a bare ``process_content()`` call behaves
            exactly as before.
        limit: Maximum number of sentences to process. The default of 1 keeps
            the original "first sentence only" demo; pass ``None`` to process
            every sentence.

    Returns:
        None. All results are printed.

    Any exception raised while processing is caught and only its message is
    printed (best-effort demo behaviour), so the notebook keeps running.
    """
    from itertools import islice

    try:
        # Resolve the default inside the try block so that a missing global
        # is reported the same way as any other failure.
        if sentences is None:
            sentences = tokenized

        for sent in islice(sentences, limit):
            words = nltk.word_tokenize(sent)
            print(words)
            print("-"*40)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))

# Run the demo: prints the tokens and POS tags for the first sentence of the sample text.
process_content()

['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'January', '31', ',', '2006', 'THE', 'PRESIDENT', ':', 'Thank', 'you', 'all', '.']
----------------------------------------
[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]


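- NNP: Proper noun, singular
- PRP: Personal pronoun

- POS tagging pairs every word with its part-of-speech tag, giving a list of (word, tag) tuples.
- Note: the first sentence here is an all-caps heading, which is likely why most of its words (even 'THE' and 'A') are tagged NNP.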

### 5. Chunking
- After part-of-speech tagging, a sentence can still contain many nouns, and we need to find which words apply to which noun.
- Noun phrases: a noun with a bunch of modifiers around it (a descriptive group of words surrounding the noun).
    - matched with regex
    
- We use part-of-speech tags and regex to do chunking.