# **Text Preprocessing & Feature Extraction**

In [10]:
# Import required libraries
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Download necessary NLTK data (run once; packages already present are reported
# as up-to-date and not fetched again).
nltk.download('punkt')       # tokenizer models used by word_tokenize
# Newer NLTK releases load 'punkt_tab' instead of 'punkt' inside word_tokenize;
# without it a fresh environment raises LookupError at the first tokenize call.
# On older releases that do not know this package the call only reports an error.
nltk.download('punkt_tab')
nltk.download('stopwords')   # stopword lists read by stopwords.words('english')
nltk.download('wordnet')     # lexical database behind WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## **Preprocess a Sample Text Dataset**

In [12]:
# Sample dataset: three short sentences held in a single 'Text' column
data = {
    'Text': [
        "I loved the new Batman movie! It was amazing :)",
        "The food was terrible and the service was slow.",
        "Python is great for Natural Language Processing.",
    ],
}
df = pd.DataFrame(data)

In [23]:
# Preprocessing function
_NON_LETTERS = re.compile(r'[^a-z\s]')  # everything except a-z and whitespace
_STOP_WORDS_CACHE = {}                  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess(text):
    """Normalize a piece of text for downstream feature extraction.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    tokenize with NLTK's ``word_tokenize``, remove English stopwords, and
    join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str or None
        Raw input text. Missing values (``None`` or a float NaN, as found in
        DataFrame columns with gaps) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or nothing survives.
    """
    # Missing cell values would otherwise fail on .lower(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = text.lower()                          # Lowercase
    # Remove punctuation/numbers. NOTE: apostrophes are removed too, so a
    # contraction such as "don't" becomes "dont" and no longer matches the
    # apostrophe-containing entries of the stopword list.
    text = _NON_LETTERS.sub('', text)
    tokens = word_tokenize(text)                 # Tokenization
    stop_words = _english_stop_words()           # Stopwords (cached across calls)
    return " ".join(w for w in tokens if w not in stop_words)

In [24]:
# Clean every sentence and store the result alongside the original text
cleaned_text = df['Text'].apply(preprocess)
df['Cleaned_Text'] = cleaned_text
df

Unnamed: 0,Text,Cleaned_Text
0,I loved the new Batman movie! It was amazing :),loved new batman movie amazing
1,The food was terrible and the service was slow.,food terrible service slow
2,Python is great for Natural Language Processing.,python great natural language processing


## **Compare Stemming vs Lemmatization**

In [15]:
# One instance of each normalizer, reused by the stemming and lemmatization cells
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [16]:
# Example sentence with plural nouns, verb forms and a comparative adjective
text = "The cats are playing while the children are better at games."
# Tokenize with the English tokenizer (the default, spelled out here)
tokens = word_tokenize(text, language='english')

In [17]:
# Apply stemming: run the Porter stemmer over each token
# (it also lowercases, e.g. "The" -> "the" in the table below)
stemmed = list(map(stemmer.stem, tokens))

In [18]:
# Apply lemmatization, treating every token as a verb (pos='v').
# Verb forms are reduced ("are" -> "be", "playing" -> "play"), but tokens that
# need a different part of speech, such as "children" and "better", come back
# unchanged in the comparison table.
lemmatized = [lemmatizer.lemmatize(token, 'v') for token in tokens]


In [19]:
# Display comparison: one row per token, one column per normalization method
comparison = pd.DataFrame(
    dict(Original=tokens, Stemmed=stemmed, Lemmatized=lemmatized)
)
comparison

Unnamed: 0,Original,Stemmed,Lemmatized
0,The,the,The
1,cats,cat,cat
2,are,are,be
3,playing,play,play
4,while,while,while
5,the,the,the
6,children,children,children
7,are,are,be
8,better,better,better
9,at,at,at


## **Generate TF-IDF Features for a Text Corpus**

In [20]:
# Three short documents used to demonstrate TF-IDF weighting
corpus = [
    "I love Python and NLP",
    "Python is great for text mining",
    "I enjoy learning text processing with Python",
]

In [21]:
# Create TF-IDF model with the vectorizer's default settings, then learn the
# vocabulary from the corpus and transform it in one step.
# With these defaults the resulting vocabulary is lowercase and has no column
# for the one-letter word "I" (see the feature table below).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=corpus)

In [22]:
# Convert to DataFrame: one row per document, one column per vocabulary term
tfidf_df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df

Unnamed: 0,and,enjoy,for,great,is,learning,love,mining,nlp,processing,python,text,with
0,0.546454,0.0,0.0,0.0,0.0,0.0,0.546454,0.0,0.546454,0.0,0.322745,0.0,0.0
1,0.0,0.0,0.450504,0.450504,0.450504,0.0,0.0,0.450504,0.0,0.0,0.266075,0.34262,0.0
2,0.0,0.450504,0.0,0.0,0.0,0.450504,0.0,0.0,0.0,0.450504,0.266075,0.34262,0.450504
