# Assignment 3

Name: Vivek Mule
Roll: 381072
PRN: 22420145

Perform text cleaning, perform lemmatization (any method), remove stop words (any method), label encoding. Create representations using TF-IDF. Save output.

In [1]:
!pip install nltk scikit-learn pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# imports

import nltk
import re
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# sample dataset

data = {
    "text": [
        "Using NLTK for text processing",
        "NLTK provides easy-to-use interfaces",
        "Pyton is great for NLP tasks",
        "NLP includes tokenization, stemming, and more"
    ],
    "label": ["positive", "positive", "neutral", "neutral"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,Using NLTK for text processing,positive
1,NLTK provides easy-to-use interfaces,positive
2,Pyton is great for NLP tasks,neutral
3,"NLP includes tokenization, stemming, and more",neutral


In [4]:
# Text cleaning function

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

df['clean_text'] = df['text'].apply(clean_text)
df

Unnamed: 0,text,label,clean_text
0,Using NLTK for text processing,positive,using nltk for text processing
1,NLTK provides easy-to-use interfaces,positive,nltk provides easytouse interfaces
2,Pyton is great for NLP tasks,neutral,pyton is great for nlp tasks
3,"NLP includes tokenization, stemming, and more",neutral,nlp includes tokenization stemming and more


In [5]:
# Stop word removal
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word not in stop_words])

df['no_stopwords'] = df['clean_text'].apply(remove_stopwords)
df

Unnamed: 0,text,label,clean_text,no_stopwords
0,Using NLTK for text processing,positive,using nltk for text processing,using nltk text processing
1,NLTK provides easy-to-use interfaces,positive,nltk provides easytouse interfaces,nltk provides easytouse interfaces
2,Pyton is great for NLP tasks,neutral,pyton is great for nlp tasks,pyton great nlp tasks
3,"NLP includes tokenization, stemming, and more",neutral,nlp includes tokenization stemming and more,nlp includes tokenization stemming


In [6]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    return " ".join([lemmatizer.lemmatize(word) for word in words])

df['lemmatized_text'] = df['no_stopwords'].apply(lemmatize_text)
df

Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized_text
0,Using NLTK for text processing,positive,using nltk for text processing,using nltk text processing,using nltk text processing
1,NLTK provides easy-to-use interfaces,positive,nltk provides easytouse interfaces,nltk provides easytouse interfaces,nltk provides easytouse interface
2,Pyton is great for NLP tasks,neutral,pyton is great for nlp tasks,pyton great nlp tasks,pyton great nlp task
3,"NLP includes tokenization, stemming, and more",neutral,nlp includes tokenization stemming and more,nlp includes tokenization stemming,nlp includes tokenization stemming


In [7]:
# Label Encoding
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

df[['label', 'encoded_label']]

Unnamed: 0,label,encoded_label
0,positive,1
1,positive,1
2,neutral,0
3,neutral,0


In [8]:
# TF-IDF Representation
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['lemmatized_text'])

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out()
)

tfidf_df

Unnamed: 0,easytouse,great,includes,interface,nlp,nltk,processing,provides,pyton,stemming,task,text,tokenization,using
0,0.0,0.0,0.0,0.0,0.0,0.414289,0.525473,0.0,0.0,0.0,0.0,0.525473,0.0,0.525473
1,0.525473,0.0,0.0,0.525473,0.0,0.414289,0.0,0.525473,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.525473,0.0,0.0,0.414289,0.0,0.0,0.0,0.525473,0.0,0.525473,0.0,0.0,0.0
3,0.0,0.0,0.525473,0.0,0.414289,0.0,0.0,0.0,0.0,0.525473,0.0,0.0,0.525473,0.0


In [9]:
# Combine Final Output
final_df = pd.concat([df, tfidf_df], axis=1)
final_df


Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized_text,encoded_label,easytouse,great,includes,interface,nlp,nltk,processing,provides,pyton,stemming,task,text.1,tokenization,using
0,Using NLTK for text processing,positive,using nltk for text processing,using nltk text processing,using nltk text processing,1,0.0,0.0,0.0,0.0,0.0,0.414289,0.525473,0.0,0.0,0.0,0.0,0.525473,0.0,0.525473
1,NLTK provides easy-to-use interfaces,positive,nltk provides easytouse interfaces,nltk provides easytouse interfaces,nltk provides easytouse interface,1,0.525473,0.0,0.0,0.525473,0.0,0.414289,0.0,0.525473,0.0,0.0,0.0,0.0,0.0,0.0
2,Pyton is great for NLP tasks,neutral,pyton is great for nlp tasks,pyton great nlp tasks,pyton great nlp task,0,0.0,0.525473,0.0,0.0,0.414289,0.0,0.0,0.0,0.525473,0.0,0.525473,0.0,0.0,0.0
3,"NLP includes tokenization, stemming, and more",neutral,nlp includes tokenization stemming and more,nlp includes tokenization stemming,nlp includes tokenization stemming,0,0.0,0.0,0.525473,0.0,0.414289,0.0,0.0,0.0,0.0,0.525473,0.0,0.0,0.525473,0.0


In [10]:
# Save Output to File
final_df.to_csv("assignment3_output.csv", index=False)
print("File saved successfully!")

File saved successfully!
