## Objective
Build an end-to-end NLP pipeline to preprocess raw English text using regex and standard NLP techniques,
then apply machine learning for sentiment classification.

# Install required libraries and Imports




In [None]:
!pip install nltk kagglehub

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
import kagglehub

# Download & Load Dataset

In [None]:
path = kagglehub.dataset_download("abdelmalekeladjelet/sentiment-analysis-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'sentiment-analysis-dataset' dataset.
Path to dataset files: /kaggle/input/sentiment-analysis-dataset


In [None]:
!mkdir Data
!cp -r /kaggle/input/sentiment-analysis-dataset /content/Data

mkdir: cannot create directory ‘Data’: File exists


In [None]:
English_df = pd.read_csv('/content/Data/sentiment-analysis-dataset/sentiment_data.csv',index_col=0)
English_df.head()

Unnamed: 0,Comment,Sentiment
0,lets forget apple pay required brand new iphon...,1
1,nz retailers don’t even contactless credit car...,0
2,forever acknowledge channel help lessons ideas...,2
3,whenever go place doesn’t take apple pay doesn...,0
4,apple pay convenient secure easy use used kore...,2


# NLTK Resources

In [None]:
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

#English Preprocessing
Text Normalization (Lowercase + Regex Cleaning)

In [None]:
English_df['Comment'] = English_df['Comment'].str.lower()

In [None]:
English_df

Unnamed: 0,Comment,Sentiment
0,lets forget apple pay required brand new iphon...,1
1,nz retailers don’t even contactless credit car...,0
2,forever acknowledge channel help lessons ideas...,2
3,whenever go place doesn’t take apple pay doesn...,0
4,apple pay convenient secure easy use used kore...,2
...,...,...
241921,crores paid neerav modi recovered congress lea...,0
241922,dear rss terrorist payal gawar modi killing pl...,0
241923,cover interaction forum left,1
241924,big project came india modi dream project happ...,1


In [None]:
clean_text = English_df['Comment'].astype(str).apply(
    lambda x: re.sub(r"[’'‘]", " ", x)
)

clean_text

Unnamed: 0,Comment
0,lets forget apple pay required brand new iphon...
1,nz retailers don t even contactless credit car...
2,forever acknowledge channel help lessons ideas...
3,whenever go place doesn t take apple pay doesn...
4,apple pay convenient secure easy use used kore...
...,...
241921,crores paid neerav modi recovered congress lea...
241922,dear rss terrorist payal gawar modi killing pl...
241923,cover interaction forum left
241924,big project came india modi dream project happ...


In [None]:
for i in range(5):
    print("ORIGINAL :", English_df['Comment'].iloc[i])
    print("CLEANED  :", clean_text.iloc[i])
    print("-" * 50)

ORIGINAL : lets forget apple pay required brand new iphone order use significant portion apples user base wasnt able use even wanted successive iphone incorporated technology older iphones replaced number people could use technology increased
CLEANED  : lets forget apple pay required brand new iphone order use significant portion apples user base wasnt able use even wanted successive iphone incorporated technology older iphones replaced number people could use technology increased
--------------------------------------------------
ORIGINAL : nz retailers don’t even contactless credit card machines like paywave support apple pay don’t like high fees come
CLEANED  : nz retailers don t even contactless credit card machines like paywave support apple pay don t like high fees come
--------------------------------------------------
ORIGINAL : forever acknowledge channel help lessons ideas explanations quite helpful youll sit comfort monitor account growth
CLEANED  : forever acknowledge chann

In [None]:
English_df['Comment'].str.contains(r'[^\w\s]').sum()


np.int64(22994)

In [None]:
clean_text.str.contains(r'[^\w\s]').sum()


np.int64(19884)

In [None]:
clean_text.str.contains(r'[A-Z]').sum()


np.int64(0)

In [None]:
import regex

clean_text = English_df['Comment'].astype(str).apply(
    lambda x: regex.sub(r"\p{P}+", " ", x)
)


In [None]:
clean_text = English_df['Comment'].astype(str).apply(lambda x: re.sub(r'\w*\d\w*', ' ', x))

In [None]:
import pandas as pd

num_with_numbers = clean_text.astype(str).str.contains(r'\d').sum()
print(f"Number of comments containing digits: {num_with_numbers}")


Number of comments containing digits: 0


In [None]:
all_words = set(cv.get_feature_names_out())
print("Number of features after stopword removal:", len(all_words))


Number of features after stopword removal: 127022


In [None]:
stop_words = set(stopwords.words('english'))

# Function to check which stopwords exist in a text
def check_stopwords(text):
    words = text.split()
    return [word for word in words if word in stop_words]

# Apply to dataframe
English_df['stopwords_found'] = clean_text.apply(check_stopwords)

# Check rows that still contain stopwords
rows_with_stopwords = English_df[English_df['stopwords_found'].str.len() > 0]

print(f"Number of comments with stopwords: {len(rows_with_stopwords)}")
rows_with_stopwords[['Comment', 'stopwords_found']].head(10)


Number of comments with stopwords: 0


Unnamed: 0,Comment,stopwords_found


# Tokenization & Lemmatization

In [None]:
def tokenize_text(text):
    text = str(text)
    # Split by non-word characters, keep only letters
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

# Apply tokenization
English_df['tokens'] = English_df['Comment'].apply(tokenize_text)


In [None]:
print(English_df[['Comment', 'tokens']].head())

                                             Comment  \
0  lets forget apple pay required brand new iphon...   
1  nz retailers don’t even contactless credit car...   
2  forever acknowledge channel help lessons ideas...   
3  whenever go place doesn’t take apple pay doesn...   
4  apple pay convenient secure easy use used kore...   

                                              tokens  
0  [lets, forget, apple, pay, required, brand, ne...  
1  [nz, retailers, don, t, even, contactless, cre...  
2  [forever, acknowledge, channel, help, lessons,...  
3  [whenever, go, place, doesn, t, take, apple, p...  
4  [apple, pay, convenient, secure, easy, use, us...  


In [None]:
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = str(text).lower()
    tokens = re.findall(r"\b\w+(?:'\w+)?\b", text)
    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

English_df['tokens'] = English_df['Comment'].apply(preprocess_text)

In [None]:
English_df['processed_text'] = English_df['tokens'].apply(lambda x: ' '.join(x))

In [None]:
print(English_df[['Comment', 'tokens', 'processed_text']].head())


                                             Comment  \
0  lets forget apple pay required brand new iphon...   
1  nz retailers don’t even contactless credit car...   
2  forever acknowledge channel help lessons ideas...   
3  whenever go place doesn’t take apple pay doesn...   
4  apple pay convenient secure easy use used kore...   

                                              tokens  \
0  [let, forget, apple, pay, required, brand, new...   
1  [nz, retailer, don, t, even, contactless, cred...   
2  [forever, acknowledge, channel, help, lesson, ...   
3  [whenever, go, place, doesn, t, take, apple, p...   
4  [apple, pay, convenient, secure, easy, use, us...   

                                      processed_text  
0  let forget apple pay required brand new iphone...  
1  nz retailer don t even contactless credit card...  
2  forever acknowledge channel help lesson idea e...  
3  whenever go place doesn t take apple pay doesn...  
4  apple pay convenient secure easy use used kore..

# Vectorization

In [None]:
X = English_df['processed_text']
y = English_df['Sentiment']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
cv = CountVectorizer(stop_words='english', ngram_range=(1,1))
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

print("Number of features:", len(cv.get_feature_names_out()))

Number of features: 127022


# Classification & Evaluation

In [None]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_cv, y_train)

In [None]:
y_pred = clf.predict(X_test_cv)

In [None]:
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7778722345476787

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.67      0.72     11023
           1       0.73      0.83      0.77     16594
           2       0.83      0.80      0.81     20612

    accuracy                           0.78     48229
   macro avg       0.78      0.76      0.77     48229
weighted avg       0.78      0.78      0.78     48229



## Conclusion
This notebook demonstrates a complete NLP preprocessing and sentiment analysis pipeline,
starting from raw text cleaning to feature extraction and machine learning classification.
