<a href="https://colab.research.google.com/github/varshi/Data-Projects/blob/main/Sentiment%20Analysis%20using%20NLP/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install & Import

!pip install -q scikit-learn pandas numpy matplotlib seaborn nltk transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
nltk.download('stopwords')
from nltk.corpus import stopwords


In [None]:
# Read the reviews CSV that was uploaded by hand into the Colab session.
DATA_PATH = "/content/sample_app_reviews (2).csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

In [None]:
# @title label

from matplotlib import pyplot as plt

# Line plot of the `label` column in row order; the returned Axes is reused
# to hide the top and right frame edges.
label_axes = df['label'].plot(kind='line', figsize=(8, 4), title='label')
for edge in ('top', 'right'):
    label_axes.spines[edge].set_visible(False)

In [None]:
# Clean Reviews
# English stop words from NLTK, held in a set for the membership tests done in clean_text.
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalise one review for bag-of-words vectorisation.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted, and the remaining words that appear in the
    module-level ``stop_words`` collection are dropped.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing cell, or ``None``) is treated
            as an empty review.

    Returns:
        The surviving words joined by single spaces; ``''`` when nothing
        survives or the input was not a string.
    """
    if not isinstance(text, str):
        return ''
    # `\s` (single backslash in a raw string) keeps whitespace so that words
    # stay separated; the previous `\\s` matched a literal backslash and the
    # letter "s" instead, which deleted every space and merged the review
    # into one token.
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = text.split()
    return ' '.join(w for w in words if w not in stop_words)

# Store the cleaned version of every review in a new column next to the raw text.
df['clean_review'] = df['Translated_Review'].map(clean_text)


In [None]:
# TF-IDF and PCA
# NOTE(review): the vectorizer and PCA are fitted on the full dataset, before
# the train/test split performed in the training cell, so statistics from the
# held-out rows leak into the features. Consider fitting on the training
# split only.
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(df['clean_review'])
y = df['label']

# PCA is given a dense array here. It is seeded like the classifiers
# (random_state=42) so the projection is reproducible when scikit-learn
# picks its randomized SVD solver.
pca = PCA(n_components=5, random_state=42)
X_reduced = pca.fit_transform(X.toarray())

# Plot cumulative explained variance against a 1-based component count so the
# x-axis reads "variance explained by the first k components".
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.title("PCA Explained Variance")
plt.xlabel("Components")
plt.ylabel("Variance")
plt.grid(True)
plt.show()


In [None]:
# Train Models
# Hold out 20% of the PCA-reduced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
)

# Two seeded tree ensembles of 100 estimators each.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Soft voting combines the two models through their predicted probabilities.
voters = [('rf', rf), ('gb', gb)]
ensemble = VotingClassifier(estimators=voters, voting='soft')
ensemble.fit(X_train, y_train)


In [None]:
# Evaluate
# Score the voting ensemble on the held-out split.
y_pred = ensemble.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Type I error: share of confusion-matrix row 0 that landed in column 1.
# NOTE(review): this reads as a false-positive rate only if the labels are
# binary and the first (sorted) class is the negative one — confirm.
row0_total = cm[0][0] + cm[0][1]
if row0_total:
    type1_error = cm[0][1] / row0_total
else:
    type1_error = 0

print(f"✅ Accuracy: {acc*100:.2f}%")
print(f"❗ Type I Error: {type1_error*100:.2f}%")
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Annotated heatmap of the confusion matrix (rows = actual, columns = predicted).
cm_axes = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_axes.set_title("Confusion Matrix")
cm_axes.set_xlabel("Predicted")
cm_axes.set_ylabel("Actual")
plt.show()


In [None]:
#RoBERTa Setup Cell:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

# The tokenizer and the classification model are loaded from the same
# Hugging Face checkpoint name.
ROBERTA_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT)

def roberta_predict(text):
    """Classify one review with the module-level RoBERTa ``tokenizer``/``model``.

    Args:
        text: The review text handed to the tokenizer.

    Returns:
        int: Index of the highest-scoring class
        (0 = negative, 1 = neutral, 2 = positive).
    """
    # Local import: the notebook's import cell does not import torch, but the
    # "pt" tensors requested below already require it to be installed.
    import torch

    # An explicit max_length makes truncation effective even if the
    # tokenizer's own configured limit is missing or larger than the model's
    # position embeddings (512 for RoBERTa-base, special tokens included).
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    scores = softmax(outputs.logits.detach().cpu().numpy()[0])
    return int(np.argmax(scores))  # 0 = neg, 1 = neutral, 2 = pos


In [None]:
#RoBERTa Evaluation Cell
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Only the first 100 rows are scored (RoBERTa is called once per review).
sample_reviews = df['Translated_Review'].iloc[:100].tolist()
sample_labels = df['label'].iloc[:100].tolist()

# Collapse the 3-way RoBERTa output to binary: positive (2) -> 1,
# negative or neutral -> 0.
# NOTE(review): assumes df['label'] uses 1 for positive and 0 otherwise — confirm.
roberta_preds = [1 if roberta_predict(r) == 2 else 0 for r in sample_reviews]

roberta_acc = accuracy_score(sample_labels, roberta_preds)

# Pin the class order so the matrix is always 2x2. Without `labels`, a sample
# in which only one class occurs yields a 1x1 matrix and the [0][1] lookup
# below raises IndexError.
roberta_cm = confusion_matrix(sample_labels, roberta_preds, labels=[0, 1])

# Type I error: actual-0 rows predicted as 1, over all actual-0 rows.
actual_negatives = roberta_cm[0][0] + roberta_cm[0][1]
roberta_type1 = roberta_cm[0][1] / actual_negatives if actual_negatives else 0

print(f"🤖 RoBERTa Accuracy: {roberta_acc*100:.2f}%")
print(f"❗ Type I Error: {roberta_type1*100:.2f}%")

sns.heatmap(roberta_cm, annot=True, fmt='d', cmap='Purples')
plt.title("RoBERTa Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
