
https://www.machinelearningplus.com/nlp/lemmatization-examples-python/


# Installation
First, activate your environment and install the NLTK and spaCy packages:

`!pip install nltk`

`!pip install spacy`

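`python -m spacy download en` (spaCy 3 and later no longer accept the `en` shortcut; use `python -m spacy download en_core_web_sm` there)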

In [2]:
import nltk as nltk

In [3]:
# Fetch the WordNet corpus into the local nltk_data directory; the call
# reports "already up-to-date" and returns True when it is present.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/kayvan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Applying bag-of-words to a toy dataset

In [17]:
# Two-sentence toy corpus; note that "think" occurs twice in the first sentence.
bards_words = [
    "The fool doth think think he is wise,",
    "but the wise man knows himself to be a fool",
]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Default settings: text is lowercased and only word tokens of two or more
# characters are kept (token_pattern r"(?u)\b\w\w+\b").
vect = CountVectorizer()

In [20]:
# Tokenize the corpus and build the vocabulary (exposed afterwards as vect.vocabulary_).
vect.fit(bards_words)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [21]:
# Number of distinct tokens kept from the two sentences.
print("Vocabulary size: {}".format(len(vect.vocabulary_)))

Vocabulary size: 13


In [22]:
# vocabulary_ is a dict mapping each token to its column index in the count matrix.
print("Vocabulary content:\n {}".format(vect.vocabulary_))

Vocabulary content:
 {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [23]:
# Encode each sentence as a vector of token counts over the learned vocabulary.
bag_of_words = vect.transform(bards_words)

In [24]:
# One row per sentence, one column per vocabulary index; .toarray() expands
# the transform result into a regular NumPy array for printing.
print(
    "Dense representation of bag_of_words:\n{}".format(bag_of_words.toarray())
)

Dense representation of bag_of_words:
[[0 0 1 1 1 0 1 0 0 1 2 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


# Tf-idf Vectorizer

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [25]:
# Tf-idf vectorizer with default settings.
vectorizer = TfidfVectorizer()

In [26]:
# Fit on the toy corpus and return its tf-idf matrix in a single step.
vectors = vectorizer.fit_transform(bards_words)

In [27]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases.
# .tolist() turns the returned NumPy array into plain Python strings, so
# feature_names is a list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [28]:
# Feature names are listed in column order of the tf-idf matrix.
print(feature_names)

['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']


In [38]:
# Use toarray() rather than todense(): todense() returns the legacy np.matrix
# type, while toarray() gives a regular ndarray (as used for bag_of_words
# earlier). The resulting DataFrame is the same.
dense_vec = vectors.toarray()
dense_list = dense_vec.tolist()
# One row per sentence, one column per vocabulary term.
tfidf_data = pd.DataFrame(dense_list, columns=feature_names)
tfidf_data

Unnamed: 0,be,but,doth,fool,he,himself,is,knows,man,the,think,to,wise
0,0.0,0.0,0.34262,0.243777,0.34262,0.0,0.34262,0.0,0.0,0.243777,0.68524,0.0,0.243777
1,0.364693,0.364693,0.0,0.259482,0.0,0.364693,0.0,0.364693,0.364693,0.259482,0.0,0.364693,0.259482


# Lemmatization

In [40]:
import spacy
# Record the library versions this notebook was run with.
print(
    "SpaCy version: {}".format(spacy.__version__),
    "nltk version: {}".format(nltk.__version__),
    sep="\n",
)

SpaCy version: 2.2.3
nltk version: 3.4.5


In [41]:
# Load spaCy's small English pipeline by its full package name. The 'en'
# shortcut only resolves on spaCy 2.x (shortcut links were removed in
# spaCy 3), whereas 'en_core_web_sm' loads on both.
en_nlp = spacy.load('en_core_web_sm')
# instantiate nltk's Porter stemmer
stemmer = nltk.stem.PorterStemmer()

# define function to compare lemmatization in spacy with stemming in nltk
def compare_normalization(doc):
    """Print spaCy lemmas and Porter stems for the same piece of text.

    The text is tokenized once with the module-level spaCy pipeline
    ``en_nlp``. Each token's lemma is printed first, followed by the
    result of running the module-level ``stemmer`` on the token's
    lowercased ``norm_`` form.

    Parameters
    ----------
    doc : str
        Text to analyse.

    Returns
    -------
    None
        The two token lists are written to stdout.
    """
    parsed = en_nlp(doc)

    # Lemmas come straight from spaCy's token attributes.
    lemmas = [tok.lemma_ for tok in parsed]
    print("Lemmatization:", lemmas, sep="\n")

    # Stem spaCy's tokens (not a separate tokenization) so both lists line up.
    stems = [stemmer.stem(tok.norm_.lower()) for tok in parsed]
    print("Stemming:", stems, sep="\n")

In [42]:
# Example sentence with irregular forms ("was", "worse") and "meeting" used
# both as a noun and as a verb.
compare_normalization(
    u"Our meeting today was worse than yesterday, "
    "I'm scared of meeting the clients tomorrow."
)

Lemmatization:
['-PRON-', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', '-PRON-', 'be', 'scared', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
Stemming:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', 'i', 'am', 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']


In [None]:
from nltk.stem import WordNetLemmatizer