# **Perform tokenization, stopword removal, stemming, and lemmatization on a sample dataset. Compare how these preprocessing steps impact the quality of text representation.**

# Importing Libraries



In [None]:
!pip install nltk pandas



In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on. Each call reports
# "already up-to-date" and does nothing further when the package is present.
nltk.download('punkt_tab')   # tokenizer data, needed by word_tokenize
nltk.download('stopwords')   # stopword lists, read via stopwords.words(...)
nltk.download('wordnet')     # WordNet data, needed by WordNetLemmatizer


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Loading the Dataset

In [None]:
# Read only the two columns this notebook uses.
df = pd.read_csv("Reviews.csv", usecols=["Text", "Score"])
# Discard rows with a missing value, then keep the first 3000 for speed.
df = df.dropna()
df = df.head(3000)
df.head()


Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [None]:
# Reload the same two columns with the pure-Python parser engine.
# NOTE: on_bad_lines="skip" drops malformed lines without raising, so the
# number of rows read may be lower than the number of lines in the file.
df = pd.read_csv(
    "Reviews.csv",
    usecols=["Text", "Score"],
    engine="python",
    on_bad_lines="skip",
)
# Discard rows with a missing value, then keep the first 3000 for speed.
df = df.dropna()
df = df.head(3000)
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


# Data Preprocessing

In [None]:
# Module-level helpers built once and reused by preprocess() for every document.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Stored as a set so the membership test in the stopword filter is a hash lookup.
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Run one document through every preprocessing stage.

    Args:
        text: The raw document string.

    Returns:
        A 4-tuple of lists ``(tokens, filtered, stemmed, lemmatized)``:
          * tokens     - ``word_tokenize`` output for the lowercased text
          * filtered   - the tokens that are purely alphabetic and are not
                         in ``stop_words``
          * stemmed    - ``filtered`` with ``stemmer.stem`` applied to each word
          * lemmatized - ``filtered`` with ``lemmatizer.lemmatize`` applied to
                         each word

    Note that stemming and lemmatization are both derived from ``filtered``;
    the lemmatizer is not fed the stemmed words.
    """
    # Stage 1: lowercase, then tokenize.
    raw_tokens = word_tokenize(text.lower())

    # Stage 2: keep alphabetic, non-stopword tokens only.
    content_words = []
    for token in raw_tokens:
        if token.isalpha() and token not in stop_words:
            content_words.append(token)

    # Stages 3 and 4: two independent normalizations of the same word list.
    stems = list(map(stemmer.stem, content_words))
    lemmas = list(map(lemmatizer.lemmatize, content_words))

    return raw_tokens, content_words, stems, lemmas


In [None]:
import nltk
nltk.download('punkt_tab')

# Run the pipeline once per review and split the resulting 4-tuple into
# columns. Calling preprocess() separately for each column repeated the
# tokenizing, stemming and lemmatizing work four times per row.
processed = df["Text"].apply(preprocess)
for position, column in enumerate(["tokens", "no_stopwords", "stemmed", "lemmatized"]):
    # .str.get(i) extracts element i from each row's tuple.
    df[column] = processed.str.get(position)

df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Score,Text,tokens,no_stopwords,stemmed,lemmatized
0,5,I have bought several of the Vitality canned d...,"[i, have, bought, several, of, the, vitality, ...","[bought, several, vitality, canned, dog, food,...","[bought, sever, vital, can, dog, food, product...","[bought, several, vitality, canned, dog, food,..."
1,1,Product arrived labeled as Jumbo Salted Peanut...,"[product, arrived, labeled, as, jumbo, salted,...","[product, arrived, labeled, jumbo, salted, pea...","[product, arriv, label, jumbo, salt, peanut, p...","[product, arrived, labeled, jumbo, salted, pea..."
2,4,This is a confection that has been around a fe...,"[this, is, a, confection, that, has, been, aro...","[confection, around, centuries, light, pillowy...","[confect, around, centuri, light, pillowi, cit...","[confection, around, century, light, pillowy, ..."
3,2,If you are looking for the secret ingredient i...,"[if, you, are, looking, for, the, secret, ingr...","[looking, secret, ingredient, robitussin, beli...","[look, secret, ingredi, robitussin, believ, fo...","[looking, secret, ingredient, robitussin, beli..."
4,5,Great taffy at a great price. There was a wid...,"[great, taffy, at, a, great, price, ., there, ...","[great, taffy, great, price, wide, assortment,...","[great, taffi, great, price, wide, assort, yum...","[great, taffy, great, price, wide, assortment,..."


# Comparing Representation Quality

In [None]:
def get_vocab_size(list_of_docs):
    """Count the distinct items across a collection of documents.

    Args:
        list_of_docs: An iterable of documents, each itself an iterable of
            hashable items (e.g. a list of token strings).

    Returns:
        The number of unique items found over all documents combined.
    """
    return len({item for document in list_of_docs for item in document})

# Ensure the preprocessing columns are present before measuring vocabulary size.
# They are rebuilt only when at least one is missing (e.g. the preprocessing
# cell was skipped), and in a single preprocess() pass per review instead of
# one pass per column. If preprocess() itself has been edited, re-run the
# preprocessing cell so the existing columns are refreshed.
stage_labels = {
    "Original Tokens": "tokens",
    "After Stopword Removal": "no_stopwords",
    "After Stemming": "stemmed",
    "After Lemmatization": "lemmatized",
}

if not set(stage_labels.values()).issubset(df.columns):
    processed = df["Text"].apply(preprocess)
    # Column order matches the order of the tuple returned by preprocess().
    for position, column in enumerate(stage_labels.values()):
        df[column] = processed.str.get(position)

# Number of distinct tokens remaining after each stage.
results = {
    label: get_vocab_size(df[column])
    for label, column in stage_labels.items()
}

pd.DataFrame(results, index=["Vocabulary Size"])

Unnamed: 0,Original Tokens,After Stopword Removal,After Stemming,After Lemmatization
Vocabulary Size,12282,9668,6847,8637
