In [26]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [28]:
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
document = """
Natural Language Processing (NLP) is a sub-field of artificial intelligence (AI) that deals with the interaction 
between computers and human (natural) languages. The ultimate goal of NLP is to enable computers to read, 
understand, and derive meaning from human languages in a manner that is both valuable and useful.
"""

In [32]:
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(document)

In [34]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

In [36]:
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

In [38]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

In [40]:
print("Tokens:", tokens)
print("\nFiltered Tokens (Stopwords Removed):", filtered_tokens)
print("\nStemmed Tokens:", stemmed_tokens)
print("\nLemmatized Tokens:", lemmatized_tokens)


Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'sub-field', 'of', 'artificial', 'intelligence', '(', 'AI', ')', 'that', 'deals', 'with', 'the', 'interaction', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages.', 'The', 'ultimate', 'goal', 'of', 'NLP', 'is', 'to', 'enable', 'computers', 'to', 'read', ',', 'understand', ',', 'and', 'derive', 'meaning', 'from', 'human', 'languages', 'in', 'a', 'manner', 'that', 'is', 'both', 'valuable', 'and', 'useful', '.']

Filtered Tokens (Stopwords Removed): ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'sub-field', 'artificial', 'intelligence', '(', 'AI', ')', 'deals', 'interaction', 'computers', 'human', '(', 'natural', ')', 'languages.', 'ultimate', 'goal', 'NLP', 'enable', 'computers', 'read', ',', 'understand', ',', 'derive', 'meaning', 'human', 'languages', 'manner', 'valuable', 'useful', '.']

Stemmed Tokens: ['natur', 'languag', 'process', '(', 'nlp', ')', 'sub-field', 'artifici', 'inte

In [44]:
import pandas as pd

In [46]:
# Sample Document Corpus (Multiple Documents)
documents = [
    "Natural Language Processing (NLP) is a sub-field of artificial intelligence.",
    "NLP deals with the interaction between computers and human languages.",
    "The goal of NLP is to enable computers to understand and process human language."
]

# Calculate TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Get Feature Names (Words)
feature_names = vectorizer.get_feature_names_out()

# Display the TF-IDF values
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print("\nTF-IDF Representation:")
print(df_tfidf)



TF-IDF Representation:
        and  artificial   between  computers     deals    enable     field  \
0  0.000000    0.351711  0.000000   0.000000  0.000000  0.000000  0.351711   
1  0.274746    0.000000  0.361258   0.274746  0.361258  0.000000  0.000000   
2  0.215996    0.000000  0.000000   0.215996  0.000000  0.284008  0.000000   

       goal     human  intelligence  ...   natural       nlp        of  \
0  0.000000  0.000000      0.351711  ...  0.351711  0.207726  0.267485   
1  0.000000  0.274746      0.000000  ...  0.000000  0.213365  0.000000   
2  0.284008  0.215996      0.000000  ...  0.000000  0.167740  0.215996   

    process  processing       sub       the        to  understand      with  
0  0.000000    0.351711  0.351711  0.000000  0.000000    0.000000  0.000000  
1  0.000000    0.000000  0.000000  0.274746  0.000000    0.000000  0.361258  
2  0.284008    0.000000  0.000000  0.215996  0.568016    0.284008  0.000000  

[3 rows x 24 columns]
