In [48]:
import nltk #Natural Language Tool Kit
from nltk.tokenize import sent_tokenize, word_tokenize

In [49]:
# Sample two-sentence text reused by the tokenization cells below.
EXAMPLE_TEXT = (
    "It is important to by very pythonly while you are pythoning with python. "
    "All pythoners have pythoned poorly at least once."
)
# Sentence-level tokenization: one list entry per sentence.
print(sent_tokenize(EXAMPLE_TEXT))

['It is important to by very pythonly while you are pythoning with python.', 'All pythoners have pythoned poorly at least once.']


In [50]:
print(word_tokenize(EXAMPLE_TEXT))  # Tokenize into words; each punctuation mark becomes its own token

# Most of the time word_tokenize is used rather than sent_tokenize.

['It', 'is', 'important', 'to', 'by', 'very', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'once', '.']


In [51]:
# Removing stop words: load NLTK's English stop-word list as a set and show it
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

{'out', 'until', 'down', "haven't", 'but', 'needn', 'mustn', 'hers', 'itself', 'there', 'she', "aren't", "should've", 'than', 'whom', 'my', 'don', 'll', 'i', 'our', 'how', 'y', 'the', 't', 'once', "needn't", 'yourself', "it's", 'now', 'them', 'myself', 'into', 'theirs', 'at', 'those', 'her', 'each', 'with', 'ma', "shan't", 'you', 'being', 'on', 'and', 'such', 'me', 'which', 'hasn', 'shouldn', 'we', 'your', 'against', "mustn't", 'o', 'are', 'himself', 'doing', 'these', 'all', 'were', 'too', "you've", 'he', 'further', 'why', "shouldn't", 'am', "hadn't", 'about', 're', 'very', 'this', 'couldn', 'should', 'then', 'other', 'has', 'in', 'same', 'if', "you're", 'is', 'mightn', 'who', "didn't", "doesn't", "you'll", 'did', 'ourselves', 'between', 'does', 'an', 'during', 'wouldn', 'their', 'have', 'had', 'again', 'can', 'ain', 'isn', 'for', 'him', "you'd", 'its', 'some', 'of', "couldn't", 'below', 'it', 'to', 'before', 'more', 'was', 'been', 'no', 'doesn', 'haven', 'won', 'under', 'd', "won't", 

In [52]:
# Tokenize the example text into words; these tokens are filtered against stop_words below
word_tokens = word_tokenize(EXAMPLE_TEXT)
print(word_tokens)

['It', 'is', 'important', 'to', 'by', 'very', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'at', 'least', 'once', '.']


In [53]:
# Keep only the tokens that are not stop words, then print what is left.
# NOTE: the membership test is case-sensitive, so capitalised tokens such as 'It' and 'All' are kept.
filtered_sentences = [token for token in word_tokens if token not in stop_words]
print(filtered_sentences)

['It', 'important', 'pythonly', 'pythoning', 'python', '.', 'All', 'pythoners', 'pythoned', 'poorly', 'least', '.']


In [54]:
# Stemming a list of words.
# Stemming reduces related word forms to a common root,
# e.g. 'pythoning' and 'pythoned' both come out as 'python'.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
for token in filtered_sentences:
    stemmed = ps.stem(token)
    print(stemmed)

It
import
pythonli
python
python
.
all
python
python
poorli
least
.


In [55]:
# Lemmatization: map each word to its dictionary form (lemma); a step beyond stemming,
# e.g. go/goes/went/gone/going -> go.

# Stemming is not very accurate, e.g. important -> import, pythonly -> pythonli,
# so it is used for initial analysis; lemmatization is preferred where accuracy matters.

# Lemmatizing the list of words.
# NOTE: lemmatize() is called without a POS argument here, and every token comes back unchanged in the output below.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

for token in filtered_sentences:
    lemma = lemmatizer.lemmatize(token)
    print(lemma)

It
important
pythonly
pythoning
python
.
All
pythoners
pythoned
poorly
least
.


In [56]:
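# One-time setup: uncomment to download all NLTK data packages (this includes the ones this notebook needs).
# nltk.download('all')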

In [57]:
nltk.help.upenn_tagset()  # Print each Penn Treebank part-of-speech (POS) tag with its description and examples

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [59]:
# POS tagging example: pos_tag takes a list of tokens and returns (token, tag) pairs
print(nltk.pos_tag(['feet']))  # a single word, tagged with no surrounding context
print(nltk.pos_tag(filtered_sentences))  # the stop-word-filtered tokens of the example text


[('feet', 'NNS')]
[('It', 'PRP'), ('important', 'JJ'), ('pythonly', 'RB'), ('pythoning', 'VBG'), ('python', 'NN'), ('.', '.'), ('All', 'DT'), ('pythoners', 'NNS'), ('pythoned', 'VBD'), ('poorly', 'RB'), ('least', 'JJS'), ('.', '.')]


In [60]:
# Lemmatization is clearly a better option than stemming when it is given the correct POS tag.
# Hence, let's lemmatize each word with its corresponding POS tag.
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, tagged on its own.

    The word is passed to ``nltk.pos_tag`` as a one-token list and the first
    character of the resulting tag (upper-cased) selects the constant:
    J -> ``wordnet.ADJ``, N -> ``wordnet.NOUN``, V -> ``wordnet.VERB``,
    R -> ``wordnet.ADV``.  Any other initial falls back to ``wordnet.NOUN``.

    Side effect: the tag initial is printed on every call, which is where the
    single-letter lines in this cell's output come from.
    """
    tagged_pairs = nltk.pos_tag([word])
    full_tag = tagged_pairs[0][1]  # tag of the first (only) (token, tag) pair
    initial = full_tag[0].upper()
    print(initial)

    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in wordnet_pos_by_initial:
        return wordnet_pos_by_initial[initial]
    # No mapping for this tag initial (e.g. determiners, prepositions): default to noun.
    return wordnet.NOUN
#--------------------------------------------------------------
# 1. Create the lemmatizer
lemmatizer = WordNetLemmatizer()
#------------------------------------------------------------
# 2. Optional: lemmatize a single word with its POS tag
# word = 'feet'
# print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
#-----------------------------------------------------------
# 3. Lemmatize every token of a sentence, passing each token's own POS tag
sentence = "The striped bats are hanging on their feet for best"
print([
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in nltk.word_tokenize(sentence)
])

D
V
N
V
V
I
P
N
I
J
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']
