In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
# Sample document
sample_document = "Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human natural languages."

In [3]:
# Tokenization
tokens = word_tokenize(sample_document)


In [4]:
len(tokens)

29

In [5]:
# Part-of-speech (POS) tagging
pos_tags = pos_tag(tokens)

In [6]:
pos_tags

[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('field', 'NN'),
 ('of', 'IN'),
 ('computer', 'NN'),
 ('science', 'NN'),
 (',', ','),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('and', 'CC'),
 ('computational', 'JJ'),
 ('linguistics', 'NNS'),
 ('concerned', 'VBN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('interactions', 'NNS'),
 ('between', 'IN'),
 ('computers', 'NNS'),
 ('and', 'CC'),
 ('human', 'JJ'),
 ('natural', 'JJ'),
 ('languages', 'NNS'),
 ('.', '.')]

In [7]:
# Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]


In [8]:
filtered_tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'field',
 'computer',
 'science',
 ',',
 'artificial',
 'intelligence',
 'computational',
 'linguistics',
 'concerned',
 'interactions',
 'computers',
 'human',
 'natural',
 'languages',
 '.']

In [9]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]


In [10]:
stemmed_tokens

['natur',
 'languag',
 'process',
 '(',
 'nlp',
 ')',
 'field',
 'comput',
 'scienc',
 ',',
 'artifici',
 'intellig',
 'comput',
 'linguist',
 'concern',
 'interact',
 'comput',
 'human',
 'natur',
 'languag',
 '.']

In [11]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]


In [12]:
lemmatized_tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'field',
 'computer',
 'science',
 ',',
 'artificial',
 'intelligence',
 'computational',
 'linguistics',
 'concerned',
 'interaction',
 'computer',
 'human',
 'natural',
 'language',
 '.']

In [13]:
# Create representation using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([sample_document])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


In [14]:
# Displaying the processed tokens
print("Tokens:", tokens)
print("POS Tags:", pos_tags)
print("Filtered Tokens (after stop words removal):", filtered_tokens)
print("Stemmed Tokens:", stemmed_tokens)
print("Lemmatized Tokens:", lemmatized_tokens)
print("\nTF-IDF Representation:")
print(tfidf_df)

Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages', '.']
POS Tags: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('computer', 'NN'), ('science', 'NN'), (',', ','), ('artificial', 'JJ'), ('intelligence', 'NN'), ('and', 'CC'), ('computational', 'JJ'), ('linguistics', 'NNS'), ('concerned', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('interactions', 'NNS'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('human', 'JJ'), ('natural', 'JJ'), ('languages', 'NNS'), ('.', '.')]
Filtered Tokens (after stop words removal): ['Natural', 'language', 'processing', '(', 'NLP', ')', 'field', 'computer', 'science', ',', 'artificial', 'intelligence',