In [8]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on. Only needed on first run;
# for a package that is already installed NLTK just reports it as up-to-date.
NLTK_RESOURCES = ('punkt', 'wordnet', 'omw-1.4', 'stopwords', 'punkt_tab')
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Module-level helpers shared by the preprocessing functions defined below
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ruthv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruthv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ruthv\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruthv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ruthv\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [None]:
# Handle missing values in the text
def handle_missing(text):
    """Return ``text`` as a stripped string, or ``""`` when it is missing.

    Args:
        text: A scalar cell value; anything ``pd.isna`` treats as missing
            (``None``, ``NaN``, ...) maps to the empty string.

    Returns:
        str: ``str(text)`` with characters that cannot be encoded as UTF-8
        dropped and surrounding whitespace removed.
    """
    if not pd.isna(text):
        # Round-trip through UTF-8, silently discarding unencodable characters
        utf8_bytes = str(text).encode('utf-8', 'ignore')
        return utf8_bytes.decode('utf-8').strip()
    return ""

In [None]:
# Clean the text
def clean_text(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase, drop URL-like tokens, drop ``<...>`` tags,
    drop every character that is not ``a-z`` or whitespace, then collapse
    whitespace runs and trim the ends.

    Removed spans are replaced with a space rather than with nothing, so
    that the words on either side stay separate tokens (``"hello,world"``
    becomes ``"hello world"``, not ``"helloworld"``). A side effect is that
    contractions split at the apostrophe (``"don't"`` -> ``"don t"``).

    Args:
        text (str): Text to clean. Must already be a string.

    Returns:
        str: The cleaned text; may be empty if nothing alphabetic remains.
    """
    text = text.lower()
    # \b keeps the pattern from firing in the middle of an ordinary word;
    # "http\S+" already covers "https...", so no separate branch is needed.
    text = re.sub(r"\b(?:http|www)\S+", " ", text)        # remove URLs
    text = re.sub(r"<.*?>", " ", text)                    # remove HTML tags
    text = re.sub(r"[^a-z\s]", " ", text)                 # keep only alphabets
    return re.sub(r"\s+", " ", text).strip()              # normalize whitespace

In [None]:
# Remove stopwords from the text
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The text is split with ``nltk.word_tokenize``; surviving tokens are
    re-joined with single spaces. Matching against the module-level
    ``stop_words`` set is exact, so the caller is responsible for casing.
    """
    kept = filter(lambda token: token not in stop_words, nltk.word_tokenize(text))
    return " ".join(kept)

In [None]:
# Lemmatize the text
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    Tokens come from ``nltk.word_tokenize`` and are passed one at a time to
    the module-level ``lemmatizer``. No part-of-speech tag is supplied, so
    the lemmatizer's default applies. Lemmas are re-joined with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(text))
    return " ".join(lemmas)

In [None]:
# Tokenize the text
def tokenize_text(text):
    """Return the tokens that ``nltk.word_tokenize`` produces for ``text``."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [None]:
# Load the dataset, discard rows without text, and coerce the remaining
# 'text' values to stripped strings
df = (
    pd.read_csv("./combined_data.csv")
    .dropna(subset=['text'])
    .assign(text=lambda frame: frame['text'].apply(handle_missing))
)

# Rows whose text is blank after stripping carry no content
df = df[df['text'].str.strip().ne("")]

# Preprocessing: clean -> remove stopwords -> lemmatize, applied per row
df['text'] = (
    df['text']
    .apply(clean_text)
    .apply(remove_stopwords)
    .apply(lemmatize_text)
)

# Keep a tokenized copy of the final text alongside the string form
df['tokens'] = df['text'].apply(tokenize_text)

In [None]:
# Write the processed DataFrame back to disk, leaving out the index column.
# NOTE(review): this is the same path the notebook reads its input from, so
# the original data is overwritten and a re-run starts from already-processed
# text - confirm that is intended before changing or relying on it.
df.to_csv(path_or_buf="./combined_data.csv", index=False)
print("✅ Preprocessing complete. Saved to combined_data.csv")

✅ Preprocessing complete. Saved to combined_data.csv
