In [1]:
import nltk
import re
import pandas as pd

# Fetch every NLTK resource the notebook relies on in one place, so that a
# fresh kernel has everything it needs after running this first cell.
# 'punkt_tab' is included because the tokenization step further down needs it
# (recent NLTK releases load tokenizer data from 'punkt_tab' rather than 'punkt').
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [2]:
# Toy corpus: four short sentences, each paired with a sentiment label.
texts = [
    "I love Natural Language Processing!",
    "Machine learning is AMAZING!!!",
    "NLP helps computers understand human language",
    "I love AI and Machine Learning",
]
labels = ["positive", "positive", "neutral", "positive"]

# Column-oriented mapping: one key per DataFrame column.
data = {"text": texts, "label": labels}

df = pd.DataFrame(data)
print(df)


                                            text     label
0            I love Natural Language Processing!  positive
1                 Machine learning is AMAZING!!!  positive
2  NLP helps computers understand human language   neutral
3                 I love AI and Machine Learning  positive


In [3]:
def clean_text(text):
    """Lower-case ``text`` and strip everything except ASCII letters and whitespace.

    Parameters
    ----------
    text : str or missing value
        Raw input text. Non-string values (e.g. ``None`` or ``NaN`` coming
        from a DataFrame column with missing entries) are treated as empty.

    Returns
    -------
    str
        The lower-cased text with every character other than ``a``-``z`` and
        whitespace removed. Whitespace is kept as-is (not collapsed).
    """
    # Missing cells in a pandas text column show up as None / float NaN;
    # calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # Digits, punctuation and non-ASCII letters are dropped, not replaced.
    return re.sub(r'[^a-z\s]', '', lowered)

# Run the cleaner over every raw sentence and keep the result next to the original.
cleaned_column = df["text"].map(clean_text)
df["cleaned_text"] = cleaned_column
print(df["cleaned_text"])


0               i love natural language processing
1                      machine learning is amazing
2    nlp helps computers understand human language
3                   i love ai and machine learning
Name: cleaned_text, dtype: object


In [5]:
import nltk

# Tokenizer data fetched right before first use of word_tokenize.
# NOTE(review): presumably required by the installed NLTK version — confirm.
nltk.download('punkt_tab')

# Shared, module-level resources read by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Tokenize ``text``, drop stop words, lemmatize, and re-join with spaces.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.
    The stop-word check is an exact membership test against ``stop_words``,
    so tokens are compared exactly as the tokenizer produced them.

    Returns the surviving lemmas as a single space-separated string.
    """
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return " ".join(kept_lemmas)

# Tokenize / filter / lemmatize the already-cleaned sentences.
processed_column = df["cleaned_text"].map(preprocess_text)
df["processed_text"] = processed_column
print(df["processed_text"])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


0               love natural language processing
1                       machine learning amazing
2    nlp help computer understand human language
3                       love ai machine learning
Name: processed_text, dtype: object


In [6]:
# Learn the label vocabulary first, then map each label to its integer code.
label_encoder = LabelEncoder()
label_encoder.fit(df["label"])
df["label_encoded"] = label_encoder.transform(df["label"])

# Show the original labels side by side with their codes.
label_columns = ["label", "label_encoded"]
print(df[label_columns])


      label  label_encoded
0  positive              1
1  positive              1
2   neutral              0
3  positive              1


In [8]:
# Fit a TF-IDF vocabulary on the processed sentences and vectorize them.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["processed_text"])

# Densify the matrix and label each column with its vocabulary term.
dense_weights = tfidf_matrix.toarray()
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(data=dense_weights, columns=feature_names)

print(tfidf_df)

         ai   amazing  computer      help     human  language  learning  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.437791  0.000000   
1  0.000000  0.667679  0.000000  0.000000  0.000000  0.000000  0.526405   
2  0.000000  0.000000  0.421765  0.421765  0.421765  0.332524  0.000000   
3  0.590819  0.000000  0.000000  0.000000  0.000000  0.000000  0.465809   

       love   machine   natural       nlp  processing  understand  
0  0.437791  0.000000  0.555283  0.000000    0.555283    0.000000  
1  0.000000  0.526405  0.000000  0.000000    0.000000    0.000000  
2  0.000000  0.000000  0.000000  0.421765    0.000000    0.421765  
3  0.465809  0.465809  0.000000  0.000000    0.000000    0.000000  


In [9]:
# Persist the working frame (all columns added so far) without the row index.
df.to_csv(path_or_buf="cleaned_text_data.csv", index=False)


In [10]:
# Persist the dense TF-IDF feature table without the row index.
tfidf_df.to_csv(path_or_buf="tfidf_features.csv", index=False)


In [11]:
# One row per class: the label name and its position in label_encoder.classes_.
mapping_rows = [
    (class_name, code)
    for code, class_name in enumerate(label_encoder.classes_)
]
label_mapping = pd.DataFrame(mapping_rows, columns=["Label", "Encoded_Value"])

label_mapping.to_csv(path_or_buf="label_encoding_mapping.csv", index=False)
