In [1]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...


True

Tokenization example

In [3]:
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce commodo mauris id justo condimentum dignissim. Nullam placerat semper dapibus. Pellentesque ac risus nulla. Phasellus ut dapibus nunc, id aliquam dolor."

In [4]:
print(word_tokenize(text))

['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', '.', 'Fusce', 'commodo', 'mauris', 'id', 'justo', 'condimentum', 'dignissim', '.', 'Nullam', 'placerat', 'semper', 'dapibus', '.', 'Pellentesque', 'ac', 'risus', 'nulla', '.', 'Phasellus', 'ut', 'dapibus', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


In [5]:
print(sent_tokenize(text))

['Lorem ipsum dolor sit amet, consectetur adipiscing elit.', 'Fusce commodo mauris id justo condimentum dignissim.', 'Nullam placerat semper dapibus.', 'Pellentesque ac risus nulla.', 'Phasellus ut dapibus nunc, id aliquam dolor.']


POS Tagging

In [6]:
to_tag = word_tokenize(text)

In [7]:
print(pos_tag(to_tag))

[('Lorem', 'NNP'), ('ipsum', 'NN'), ('dolor', 'NN'), ('sit', 'NN'), ('amet', 'NN'), (',', ','), ('consectetur', 'NN'), ('adipiscing', 'VBG'), ('elit', 'NN'), ('.', '.'), ('Fusce', 'NNP'), ('commodo', 'JJ'), ('mauris', 'NN'), ('id', 'NN'), ('justo', 'NN'), ('condimentum', 'NN'), ('dignissim', 'NN'), ('.', '.'), ('Nullam', 'NNP'), ('placerat', 'VBZ'), ('semper', 'JJR'), ('dapibus', 'NN'), ('.', '.'), ('Pellentesque', 'NNP'), ('ac', 'JJ'), ('risus', 'NN'), ('nulla', 'NN'), ('.', '.'), ('Phasellus', 'CC'), ('ut', 'JJ'), ('dapibus', 'NN'), ('nunc', 'NN'), (',', ','), ('id', 'JJ'), ('aliquam', 'NN'), ('dolor', 'NN'), ('.', '.')]


Stopwords 

In [8]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{"you're", 's', 'to', 'between', 'below', 'why', 'doing', "doesn't", 'itself', "isn't", 'before', 'his', 'had', 'we', 'were', 'do', 'how', 'each', 'y', 'shan', 'wasn', 'because', 'ma', 'other', 'be', 'these', 'at', 'him', 'any', "needn't", 'than', 'don', "you've", 'this', 'as', 'who', 'm', 'them', 'your', 'weren', 'shouldn', 'very', 'are', 'some', 'she', "won't", 'and', 'most', 'mustn', 'needn', 'themselves', 'where', 'a', "aren't", "you'd", 'few', 'of', 'just', 'when', 'isn', 'against', 'her', 'from', 'then', 'under', 'after', "weren't", 'theirs', 'into', 'on', 've', 'ain', "couldn't", 'won', 'further', 'if', 't', 'was', 're', "mustn't", 'such', 'yourselves', "you'll", 'what', 'doesn', 'their', 'up', 'down', 'yourself', "should've", 'should', 'does', 'which', "she's", 'no', 'my', 'didn', 'only', 'did', 'whom', 'mightn', 'will', 'hadn', 'by', 'hasn', 'while', 'but', 'more', 'here', "don't", 'they', 'through', 'me', 'same', 'there', 'd', 'o', 'its', 'until', 'during', 'have', 'haven', "

In [9]:
to_clean = word_tokenize(text)

In [10]:
no_stopwords_text = []
for token in to_clean:
    if(token not in stop_words):
        no_stopwords_text.append(token)

print(no_stopwords_text)

['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', '.', 'Fusce', 'commodo', 'mauris', 'id', 'justo', 'condimentum', 'dignissim', '.', 'Nullam', 'placerat', 'semper', 'dapibus', '.', 'Pellentesque', 'ac', 'risus', 'nulla', '.', 'Phasellus', 'ut', 'dapibus', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


Stemming 

In [11]:
stemmer = PorterStemmer()

In [12]:
stemmed_words = []
for token in no_stopwords_text:
    stemmed_word = stemmer.stem(token)
    stemmed_words.append(stemmed_word)

In [13]:
print(stemmed_words)

['lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipisc', 'elit', '.', 'fusc', 'commodo', 'mauri', 'id', 'justo', 'condimentum', 'dignissim', '.', 'nullam', 'placerat', 'semper', 'dapibu', '.', 'pellentesqu', 'ac', 'risu', 'nulla', '.', 'phasellu', 'ut', 'dapibu', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


Lemmatization

In [14]:
lemmatizer = WordNetLemmatizer()

In [15]:
lemmatized_words = []
for token in no_stopwords_text:
    lemmatized = lemmatizer.lemmatize(token)  # Assuming you want to lemmatize verbs (you can change the 'pos' argument as needed)
    lemmatized_words.append(lemmatized)


In [16]:
print(lemmatized_words)

['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', '.', 'Fusce', 'commodo', 'mauris', 'id', 'justo', 'condimentum', 'dignissim', '.', 'Nullam', 'placerat', 'semper', 'dapibus', '.', 'Pellentesque', 'ac', 'risus', 'nulla', '.', 'Phasellus', 'ut', 'dapibus', 'nunc', ',', 'id', 'aliquam', 'dolor', '.']


TF-IDF Vectorization 


In [17]:
vectorizer = TfidfVectorizer()

In [18]:
corpus = [
    "I love to eat pizza",
    "Pizza is my favorite food",
    "I enjoy eating pizza with friends",
    "I like to have pizza for dinner",
    "Pizza toppings include cheese, pepperoni, and mushrooms"
]


In [19]:
vectorizer = TfidfVectorizer()

In [20]:
tfidf_matrix = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()

In [21]:

print(tfidf_matrix.toarray())

print(feature_names)

[[0.         0.         0.         0.58946308 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.58946308 0.         0.         0.
  0.28088232 0.4755751  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.48638585 0.48638585 0.         0.         0.         0.
  0.48638585 0.         0.         0.         0.48638585 0.
  0.23176546 0.         0.         0.        ]
 [0.         0.         0.         0.         0.48638585 0.48638585
  0.         0.         0.         0.48638585 0.         0.
  0.         0.         0.         0.         0.         0.
  0.23176546 0.         0.         0.48638585]
 [0.         0.         0.45277275 0.         0.         0.
  0.         0.         0.45277275 0.         0.45277275 0.
  0.         0.45277275 0.         0.         0.         0.
  0.21574864 0.36529421 0.         0.        ]
 [0.40073619 0.40073619 0.         0.         0.         0.
  0.         0.         