In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...


True

In [3]:
data = {
    "review": [
        "The product quality is amazing and works perfectly",
        "I am not happy with this product, it stopped working",
        "Excellent performance and very easy to use",
        "Worst product ever, totally waste of money"
    ]
}

df = pd.DataFrame(data)
print(df)


                                              review
0  The product quality is amazing and works perfe...
1  I am not happy with this product, it stopped w...
2         Excellent performance and very easy to use
3         Worst product ever, totally waste of money


In [4]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords & non-alphabetic words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    
    return tokens

In [5]:
df['tokens'] = df['review'].apply(preprocess_text)

df['stemmed'] = df['tokens'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

print("\nStemming Output:")
print(df[['review', 'stemmed']])



Stemming Output:
                                              review  \
0  The product quality is amazing and works perfe...   
1  I am not happy with this product, it stopped w...   
2         Excellent performance and very easy to use   
3         Worst product ever, totally waste of money   

                                      stemmed  
0   [product, qualiti, amaz, work, perfectli]  
1                [happi, product, stop, work]  
2                 [excel, perform, easi, use]  
3  [worst, product, ever, total, wast, money]  


In [7]:
df['lemmatized'] = df['tokens'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

print("\nLemmatization Output:")
print(df[['review', 'lemmatized']])



Lemmatization Output:
                                              review  \
0  The product quality is amazing and works perfe...   
1  I am not happy with this product, it stopped w...   
2         Excellent performance and very easy to use   
3         Worst product ever, totally waste of money   

                                      lemmatized  
0   [product, quality, amazing, work, perfectly]  
1             [happy, product, stopped, working]  
2            [excellent, performance, easy, use]  
3  [worst, product, ever, totally, waste, money]  
