In [22]:
import nltk
nltk.download("punkt")
nltk.download('state_union')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vivek.kumar5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\vivek.kumar5\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\state_union.zip.


True

### 1. Tokenizer
- word tokenizer
- sentence tokenizer

**Lexicon and corporas**
    - corpora : body of text . ex
        - medical journals, predential speeches, English language
    - Lexicon : words and their meanings
        - ex. inverstor language 'bull' : someone who is +ve about the market
            - english bull : scary animal
        - for numbers it is words and their values


In [20]:
from nltk.tokenize import word_tokenize,sent_tokenize

# By sentence --------------------------

example_text = "Hello there, how are you doing today? the weather is great and python is awesome. The sky is blue."
# for split by sentence, we can think that full stop followed by space will do
# we cannot split by sentense easily ex. Mr. Smith

print(sent_tokenize(example_text))

print("--"*20)
example_text = "Hello Mr. Smith, how are you doing today? the weather is great and python is awesome. The sky is blue."
print(sent_tokenize(example_text))

['Hello there, how are you doing today?', 'the weather is great and python is awesome.', 'The sky is blue.']
----------------------------------------
['Hello Mr. Smith, how are you doing today?', 'the weather is great and python is awesome.', 'The sky is blue.']


In [None]:
# By word -------------------
print(word_tokenize(example_text))
# by default puntuation is taken as a word.

In [None]:
for i in word_tokenize(example_text):
    print(i)

- there are advanced tokenizers where we can use unsupervised machine learning built in nltk.
- nltk by default works with english, but also works with other languages.

### 2. Stop words
a. words that do not add much meaning
b. whose meaning is ambigous

In [None]:
from nltk.corpus import stopwords

example_sent = "This is an example showing off stop word filtration."
stop_words = set(stopwords.words("english"))
print(stop_words)

In [None]:
words = word_tokenize(example_sent)
filtered_sent = [w.lower() for w in words if w.lower() not in stop_words]

In [None]:
filtered_sent

### 3. Stemming
- we stem a word 
    - ex. riding becomes rid
    - ie. we have different variations of a word but meaning of word is unchanged
    - ex. I was taking a ride in the car
        - I was riding a car.

Porter Stemmer :
- since 1979
- there are many stemmers, Porter stemmer is gud but rule based : one of the most gentle stemmers
- Snowball stemmer is Porter2, developed to support other languages : slightly better than 1st
- Lancaster Stemmer is very aggressive, sometimes the words get two small that barely readable by humans. 
    - It is good when we are having large datasets
    

In [None]:
from nltk.stem import PorterStemmer,LancasterStemmer,SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
ps = PorterStemmer()
sn = SnowballStemmer(language="english")
ls = LancasterStemmer()
wn = WordNetLemmatizer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly","pythony"]

for w in example_words:
    print(ps.stem(w))
print("-"*20)
for w in example_words:
    print(sn.stem(w))
print("-"*20)
for w in example_words:
    print(ls.stem(w))    

- except last one all of the words in stemmed

In [None]:
new_text = "It is very important,to be pythonly while you are pythoning with pyhton. All pythoners have pythoned once."
words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w)+"----"+ wn.lemmatize(w))

- Now a days, people prefer wordnet 
- Wordnet will find the synonym using synset.

** Stemming vs lemmatization **
- lemmatization looks for meaning, stemming does not 
    - ex. good is lemma of better
    - we need part of speech first as normalization rules are different for diff parts of speech.

### 4. Part of Speech Tagging
- labelling part of speech to each word

In [23]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# punktSentenceTokenizer is an unsupervised ML tokenizer
# it comes pretrained but we can also retrain

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# here we are training on train_text
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for sent in tokenized:
            words = nltk.word_tokenize(sent)
            print(words)
            print("-"*40)
            tagged = nltk.pos_tag(words)
            print(tagged)
            break;
    
    except Exception as e:
        print(str(e))

process_content()

[u'PRESIDENT', u'GEORGE', u'W.', u'BUSH', u"'S", u'ADDRESS', u'BEFORE', u'A', u'JOINT', u'SESSION', u'OF', u'THE', u'CONGRESS', u'ON', u'THE', u'STATE', u'OF', u'THE', u'UNION', u'January', u'31', u',', u'2006', u'THE', u'PRESIDENT', u':', u'Thank', u'you', u'all', u'.']
----------------------------------------

**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  Searched in:
    - 'C:\\Users\\vivek.kumar5/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\ProgramData\\Anaconda2\\nltk_data'
    - 'C:\\ProgramData\\Anaconda2\\share\\nltk_data'
    - 'C:\\ProgramData\\Anaconda2\\lib\\nltk_data'
    - 'C:\\Users\\vivek.kumar5\\AppData\\Roaming\\nltk_data'
**********************************************************************



- NNP:  Proper noun, singular
- PRP : Personal pronoun
- VB : verbs
- RB : Adverb

- pos tagging will create part of speech tagging of words

### 5. Chunking
- we did part of speech tags, we can have many nouns in a sentence and we need to find what applies to which noun.
- Noun-phrases : noun with a bunch of modifiers around. ( kind of descriptive group of words surrounding noun) 
    - with regex 
    - or words that have affect on the noun
    
- we use part of speech tags and regex to do chunking.

In [None]:
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)
from IPython import display

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk : {<RB.?>*<VB.?>*<NNP><NN>?} """
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
            break;
    except:
        print("")
        
process_content()        

## REGEX

In [6]:
text = "abcd efg hi 12"
import re

In [7]:
# IDENTIFIERS
# \d = any number
# \D = anything but a number
# \s = space
# \S = anything but a space
# \w = any letter
# \W = anything but a letter
# . = any character, except for a new line
# \b = space around whole words
obj = re.compile(r"\d{1,2}")
re.findall(obj,text)

['12']

In [8]:
### QUANTIFIERS #####
# * : zero or more times
# ? : Once or none
# + : one or more
# {3} : three times
# {1,2} : 1 to 2 count
obj = re.compile(r"\w+")
re.findall(obj,text)

['abcd', 'efg', 'hi', '12']

In [9]:
obj = re.compile("\w{1,2}")
re.findall(obj,text)

['ab', 'cd', 'ef', 'g', 'hi', '12']

In [10]:
# . : any character except line break
# \ is a special character
obj = re.compile(".")
re.findall(obj,text)

['a', 'b', 'c', 'd', ' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2']

In [11]:
obj = re.compile(".")
re.findall(obj,text)

['a', 'b', 'c', 'd', ' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2']

In [12]:
#### LOGIC ####
# $ = matches at the end of string
# ^ = matches start of a string
# | = matches either/or. Example x|y = will match either x or y
# [] = range, or "variance"

obj = re.compile("ab[cd]")
re.findall(obj,text)
# ab with c or d

['abc']

In [13]:
obj = re.compile("^abcd")
re.findall(obj,text)

['abcd']

In [14]:
obj = re.compile("i 12$")
re.findall(obj,text)

['i 12']

In [15]:
obj = re.compile("[0-9 A-Z a-z]?")
re.findall(obj,text)

['a', 'b', 'c', 'd', ' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2', '']

In [16]:
# match anything except abcd
obj = re.compile("[^abcd]")
re.findall(obj,text)

[' ', 'e', 'f', 'g', ' ', 'h', 'i', ' ', '1', '2']

In [18]:
# match anything except abcd
obj = re.compile("<ab>")
re.findall(obj,text)

[]