In [7]:
# 🧠 Task 2: Sentiment Analysis with TF-IDF + Logistic Regression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("data.csv")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vamsi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Preview
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()

                                              review  rating
0  It was nice produt. I like it's design a lot. ...       5
1  awesome sound....very pretty to see this nd th...       5
2  awesome sound quality. pros 7-8 hrs of battery...       4
3  I think it is such a good product not only as ...       5
4  awesome bass sound quality very good bettary l...       5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9976 entries, 0 to 9975
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  9976 non-null   object
 1   rating  9976 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 156.0+ KB
None


In [9]:
# Rename columns by name instead of by position, so a different column order
# in the CSV cannot silently swap the review text and the rating.
# rename() skips keys that are absent, which keeps this cell safe to re-run.
df = df.rename(columns={'review': 'Review', 'rating': 'Sentiment'})

In [10]:
# Clean text function
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalize one review string before vectorization.

    Steps, in order: lowercase the text, delete every run of digits, delete
    every character in ``string.punctuation``, split on whitespace, drop the
    tokens that are stopwords, and join the survivors with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : iterable of str, optional
        Tokens to drop. When omitted, NLTK's English stopword list is used
        (loaded once and cached, instead of being fetched and scanned for
        every single token).

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    else:
        stop_words = frozenset(stop_words)

    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    # split() already discards leading/trailing whitespace, so no strip() is needed.
    # NOTE(review): tokens have lost their apostrophes by this point, so any
    # stopword that contains one will presumably never match - confirm intent.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [11]:
# Apply cleaning
# Cast every review to str first so clean_text always receives a string,
# then store the cleaned text alongside the original column.
reviews_as_text = df['Review'].astype(str)
df['Cleaned_Review'] = reviews_as_text.apply(clean_text)

In [13]:
# TF-IDF Vectorization
# Keep the sparse matrix that fit_transform returns. Converting it with
# .toarray() materializes every zero of a (reviews x up-to-5000 terms) matrix,
# and the train/test split and LogisticRegression both accept sparse input.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Review'])

In [14]:
# Target
# Labels are the 'Sentiment' column, i.e. the column that was loaded as `rating`.
y = df.loc[:, 'Sentiment']

In [15]:
# Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [16]:
# Logistic Regression Model
# Build the classifier with scikit-learn's default settings, then fit it on
# the training portion of the split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [17]:
# Predictions
# One predicted label for every row of the held-out split.
y_pred = model.predict(X=X_test)

In [18]:
# Evaluation
# Every metric compares the held-out labels with the model's predictions;
# they are computed and printed one after another, in this order.
evaluation = (
    ("Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
)
for heading, metric in evaluation:
    print(heading, metric(y_test, y_pred))

Accuracy: 0.6472945891783567

Confusion Matrix:
 [[  73    1   11   13   43]
 [  16    1   11   11   20]
 [   5    0   22   48   81]
 [   7    0   10  113  338]
 [   5    0    8   76 1083]]

Classification Report:
               precision    recall  f1-score   support

           1       0.69      0.52      0.59       141
           2       0.50      0.02      0.03        59
           3       0.35      0.14      0.20       156
           4       0.43      0.24      0.31       468
           5       0.69      0.92      0.79      1172

    accuracy                           0.65      1996
   macro avg       0.53      0.37      0.39      1996
weighted avg       0.60      0.65      0.60      1996



In [19]:
# Predicting new example
sample = ["This product is amazing and works like a charm!"]
# New text must go through the same cleaning and the already-fitted vectorizer
# (transform, not fit_transform) so its features line up with the training data.
sample_clean = [clean_text(s) for s in sample]
sample_vec = tfidf.transform(sample_clean)  # sparse output is accepted by predict()
# The model is trained on the 'Sentiment' column, which holds the 1-5 ratings,
# so the prediction is a rating on that scale rather than a 0/1 flag.
print("\nSample Prediction:", model.predict(sample_vec))  # predicted rating (1-5)


Sample Prediction: [5]
