**NLTK is the mother of all NLP libraries. It is used for building Python programs that work with human language data, with relevant applications in statistical natural language processing.**

In [None]:
# Importing NLTK
import nltk

In [None]:
# Downloading the NLTK resources used in this notebook
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
# Recent NLTK releases load the Punkt tokenizer models from 'punkt_tab' rather
# than the pickled 'punkt' package. Fetching both keeps word_tokenize and
# sent_tokenize working on old and new installs. 'punkt' stays last so the
# value displayed by the cell is unchanged.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Lemmatization: map an inflected word back to its dictionary form (lemma)
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# pos="a" asks WordNet to treat the word as an adjective.
happier_lemma = lemmatizer.lemmatize("happier", pos="a")
print("happier:", happier_lemma)

happier: happy


In [None]:
# Stemming -- PorterStemmer: strip suffixes to reach a common stem
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()
# Both inflections are printed next to the stem the Porter algorithm yields.
for inflected in ("clapping", "clapped"):
    print(inflected + " :", porter_stemmer.stem(inflected))

clapping : clap
clapped : clap


In [None]:
# Word Normalization: fold the whole sentence to lowercase so that
# "Lamb" and "lamb" are treated as the same word downstream.
sentence = "Mary had a little Lamb."
normalized_sentence = str.lower(sentence)
print(normalized_sentence)

mary had a little lamb.


In [None]:
# Regex
import re
import string

# Replacing every run of digits with the empty string
sentence = "1000 ships reached the shores of atlantis and invaded the hinterland and established 17 some camps in and around the island."
words = re.sub(r'\d+', '', sentence)
print("REGEX TO SUB ALL THE DIGITS :", words)

# Removing punctuation: build a character class out of string.punctuation.
# re.escape makes characters such as ']' and '\' literal inside the class.
sentence = "Would you like to accompany us in the comming tour? Or, do you have any other plans running in parallel!?"
words = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)
# Print `words`: the substitution result is stored there, and the previously
# referenced name `result` was never defined (NameError).
print("REGEX FOR REMOVING PUNCTUATION :", words)

REGEX TO SUB ALL THE DIGITS :  ships reached the shores of atlantis and invaded the hinterland and established  some camps in and around the island.
REGEX FOR REMOVING PUNCTUATION : Would you like to accompany us in the comming tour Or do you have any other plans running in parallel


In [None]:
# Spaces
input_str = ' Alt rose to \n power and \r\t\n \n\n went back to the dungens where   \t he belonged.  '

# Remove every whitespace character (spaces, tabs, newlines) anywhere in the text.
print('Remove spaces using regex :', re.sub(r"\s+", "", input_str),"\n", sep='')
# Remove leading whitespace only ("landing" in the label means "leading").
print('Remove landing spaces using regex :', re.sub(r"^\s+", "", input_str),"\n", sep='')
# Remove trailing whitespace only.
print('Remove trailing spaces using regex :', re.sub(r"\s+$", "", input_str),"\n", sep='')
# Remove both leading and trailing whitespace. The label repeats the "landing"
# wording and is kept as-is so it still matches the recorded output.
print('Remove landing spaces using regex :', re.sub(r"^\s+|\s+$", "", input_str),"\n", sep='')

# Emails
# The pattern literal previously lacked its closing quote, which made the whole
# cell a SyntaxError. It is anchored with ^ and $, so it only matches when the
# entire string is a single email address; input_str is not one, so it is
# printed unchanged here.
email_pattern = r"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$"
print('EMAIL :', re.sub(email_pattern, "", input_str),"\n", sep='')

Remove spaces using regex :Altrosetopowerandwentbacktothedungenswherehebelonged.

Remove landing spaces using regex :Alt rose to 
 power and 	
 

 went back to the dungens where   	 he belonged.  

Remove trailing spaces using regex : Alt rose to 
 power and 	
 

 went back to the dungens where   	 he belonged.

Remove landing spaces using regex :Alt rose to 
 power and 	
 

 went back to the dungens where   	 he belonged.



In [None]:
# Word and sentence tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

sentence = "1000 ships reached the shores of atlantis and invaded the hinterland, and, established 17 some camps in and around the island."

# Word level: punctuation marks come out as tokens of their own.
tokens = word_tokenize(sentence)
print(tokens)

# Sentence level: the text above is a single sentence, so one item is returned.
sent_tokens = sent_tokenize(sentence)
print(sent_tokens)

['1000', 'ships', 'reached', 'the', 'shores', 'of', 'atlantis', 'and', 'invaded', 'the', 'hinterland', ',', 'and', ',', 'established', '17', 'some', 'camps', 'in', 'and', 'around', 'the', 'island', '.']
['1000 ships reached the shores of atlantis and invaded the hinterland, and, established 17 some camps in and around the island.']


In [None]:
# NLTK Stopwords
from nltk.corpus import stopwords

# Load the English stop-word list and keep it as a set for fast membership tests.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [None]:
# Removing the stop words from the tokenized sentence.
# The stop-word entries are lowercase, so each token is lowercased for the
# membership test only; this also drops capitalised stop words such as "The".
# Tokens that are kept retain their original casing.
filtered_words = [token for token in tokens if token.lower() not in stop_words]
filtered_words

['1000',
 'ships',
 'reached',
 'shores',
 'atlantis',
 'invaded',
 'hinterland',
 'established',
 '17',
 'camps',
 'around',
 'island',
 '.']

------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# N-GRAM: slide a window of n consecutive tokens across the sentence
from nltk import ngrams

sentence = "1000 ships reached the shores of atlantis and invaded the hinterland, and, established 17 some camps in and around the island."

n = 3

# Tokens come from a plain whitespace split, so punctuation stays attached
# to the neighbouring word (e.g. 'hinterland,').
whitespace_tokens = sentence.split()
n_grams = ngrams(whitespace_tokens, n)
for gram in n_grams:
    print(gram)

('1000', 'ships', 'reached')
('ships', 'reached', 'the')
('reached', 'the', 'shores')
('the', 'shores', 'of')
('shores', 'of', 'atlantis')
('of', 'atlantis', 'and')
('atlantis', 'and', 'invaded')
('and', 'invaded', 'the')
('invaded', 'the', 'hinterland,')
('the', 'hinterland,', 'and,')
('hinterland,', 'and,', 'established')
('and,', 'established', '17')
('established', '17', 'some')
('17', 'some', 'camps')
('some', 'camps', 'in')
('camps', 'in', 'and')
('in', 'and', 'around')
('and', 'around', 'the')
('around', 'the', 'island.')


**NLTK**, despite being so vast in functionality, does not support word vectors and is SLOW. It definitely cannot be used for production purposes.
