In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Fetch the NLTK corpora used further down: the stopword lists read by
# stopwords.words("english") and the WordNet data for WordNetLemmatizer.
# The return value of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Small in-memory sample corpus. Entries in "text" and "label" are paired by
# position: the i-th label belongs to the i-th text.
data = dict(
    text=[
        "I love Machine Learning!!! It's amazing :)",
        "Deep Learning is powerful, but needs more data.",
        "NLP is fun. Cleaning text is important!",
    ],
    label=["positive", "neutral", "positive"],
)

In [3]:
# Tabular view of the sample corpus: one column per key of `data`.
df = pd.DataFrame(data=data)

In [13]:
# Shared preprocessing resources, created once at cell level and read as
# globals by preprocess_text.
lemmatizer = WordNetLemmatizer()
# Kept as a set so the per-token "in stop_words" check is a hash lookup.
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text, delete every character that is not
    ``a-z`` or whitespace, split on whitespace, drop tokens present in the
    module-level ``stop_words`` set, and lemmatize the remaining tokens with
    the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw input text. Non-string values (for example ``None`` or the float
        NaN that pandas uses for missing cells) are treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``""`` when no token
        survives.
    """
    # Missing or non-text cells would otherwise fail on .lower(); treat them
    # as empty documents so a whole-column apply does not abort.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase so the a-z filter and the stopword lookup below
    # are case-insensitive.
    text = text.lower()

    # Remove punctuation/numbers. Characters are deleted rather than replaced
    # by a space, so "it's" becomes "its" and "a,b" becomes "ab".
    text = re.sub(r"[^a-z\s]", "", text)

    # split() with no arguments already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no separate cleanup is needed.
    words = text.split()

    # Drop stopwords first, then lemmatize what remains.
    return " ".join(
        lemmatizer.lemmatize(w) for w in words if w not in stop_words
    )

# Run the cleaner over every raw text and keep the result next to the original.
df["clean_text"] = df["text"].map(preprocess_text)

In [9]:
# Fit the encoder on the label column and store the encoded values in a new
# column; `le` is kept so the mapping can be saved and reused later.
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(y=df["label"])

In [10]:
# Fit a TF-IDF vectorizer on the cleaned text and transform it in one step.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df["clean_text"])

# Densify the feature matrix and label each column with its feature name so
# the features can be inspected as an ordinary table.
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)


In [11]:
# Write the full frame to disk, without the row index.
df.to_csv(path_or_buf="cleaned_data.csv", index=False)

In [12]:
# Persist the fitted label encoder so the same label mapping can be reloaded.
# Only unpickle these files from a trusted location: pickle.load can execute
# arbitrary code.
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Persist the fitted TF-IDF vectorizer.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Write the TF-IDF feature table. The summary below lists this file, but it
# was previously never written anywhere in the notebook.
tfidf_df.to_csv("tfidf_features.csv", index=False)

print(" Saved Files:")
print("1) cleaned_data.csv")
print("2) tfidf_features.csv")
print("3) label_encoder.pkl")
print("4) tfidf_vectorizer.pkl")


 Saved Files:
1) cleaned_data.csv
2) tfidf_features.csv
3) label_encoder.pkl
4) tfidf_vectorizer.pkl
