### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import torch
from TextCleaner import TextCleaner
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from scipy.special import softmax
from nltk.tokenize import sent_tokenize
from collections import Counter
import re
from wordcloud import WordCloud
import warnings
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings("ignore")

# Fetch the NLTK resources requested by this notebook
# (tokenizer data, stop-word lists, WordNet).
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers, built once up front
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


### Getting Data

In [None]:
#### Apple iPhone XR (64GB) - Black
# NOTE(review): the heading above says iPhone XR, but the CSV file name refers
# to iPhone 11 reviews — confirm which product this dataset actually covers.

reviews_csv_path = 'Data/apple_iphone_11_reviews.csv'
data = pd.read_csv(reviews_csv_path)
data.head()

### Data Exploration and Cleaning

In [None]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Give the rating column a short name used throughout the notebook.
data.rename(columns={'Review rating out of 5': 'rating'}, inplace=True)

# helpful_count arrives as text with thousands separators (e.g. "1,234");
# strip the commas first, then convert to integers.
helpful_without_commas = data['helpful_count'].str.replace(',', '')
data['helpful_count'] = helpful_without_commas.astype(int)

data.head()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Remove exact duplicate rows, then any row with a missing value in any column.
# Both operations modify `data` in place.
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

In [None]:
# (rows, columns) remaining after de-duplication and NaN removal.
data.shape

In [None]:
# Bar chart of how many reviews were given each star rating.
sns.countplot(x='rating', data=data)
plt.xlabel("Ratings")
plt.ylabel("Count of Ratings")  # label typo fixed (was "Count ot Ratings")
plt.show()

In [None]:
### Engagement per star rating: helpful votes and comment counts
reviews_by_rating = data.groupby('rating')

# Helpful-vote statistics (plus the number of reviews) for each rating
helpful_rating_group = reviews_by_rating['helpful_count'].agg(**{
    'helpful_mean_amount': 'mean',
    'helpful_max_amount': 'max',
    'helpful_total_amount': 'sum',
    'count': 'count',
}).reset_index()

# Comment-count statistics for each rating
total_comments_rating_group = reviews_by_rating['total_comments'].agg(**{
    'total_comments_mean_amount': 'mean',
    'total_comments_max_amount': 'max',
    'total_comments_total_amount': 'sum',
}).reset_index()

# Put both summaries side by side, one row per rating
total_comments_rating_group = pd.merge(
    total_comments_rating_group,
    helpful_rating_group,
    how='left',
    on='rating',
)
total_comments_rating_group

In [None]:
# The ten reviews with the highest helpful-vote counts.
data.sort_values('helpful_count', ascending=False).head(10)

In [None]:
# Keep an untouched copy of the reviews before text cleaning; the sentence-level
# analysis later in the notebook reads from this snapshot.
raw_data = data.copy()

### Cleaning review text

In [None]:
# Domain terms handed to the cleaner as custom keywords.
custom_stopwords = [
    'iphone', 'apple', 'camera', 'product', 'amazon',
    'xr', 'x', 'phone', 'xs', 'authorize',
]
textCleaner = TextCleaner(
    remove_non_ascii=True,
    keep_currency_symbols=False,
    custom_keywords=custom_stopwords,
)

# Run the same cleaner over the review body and then the review title.
for text_field in ('review_text', 'review_title'):
    data[text_field] = data[text_field].apply(textCleaner.clean)

In [None]:
# Preview the frame after text cleaning.
data.head()

In [None]:
# Cleaning can make previously distinct rows identical, so de-duplicate again
# and drop rows with missing values. Note that dropna() does not remove rows
# whose text is an empty string.
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

### WordCloud

In [None]:
# Concatenate every cleaned review into one space-separated string
text = " ".join(data['review_text'])

# Render the word cloud on a white background, without axis decorations
wordcloud = WordCloud(background_color='white').generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Comparison table: starts with the review title, text and user rating;
# each sentiment method below adds its own score/label/rating columns.
data_reviews = data[['review_title','review_text','rating']].copy()
data_reviews

### Sentiment Analysis

#### Textblob

In [None]:
# Independent working copy for the TextBlob experiment.
data_textblob = data.copy()

In [None]:
# Score every review with TextBlob.
# polarity is in [-1, 1]; subjectivity is in [0, 1].
polarity = []
subjectivity = []
for review in data_textblob['review_text'].values:
    try:
        # Evaluate the sentiment once so both lists are always appended together.
        sentiment = TextBlob(review).sentiment
    except Exception:
        # Best-effort: text TextBlob cannot process is recorded as neutral (0, 0).
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        polarity.append(0)
        subjectivity.append(0)
    else:
        polarity.append(sentiment.polarity)
        subjectivity.append(sentiment.subjectivity)

data_textblob['polarity'] = polarity
data_textblob['subjectivity'] = subjectivity

In [None]:
# Five equal-width polarity bands over [-1, 1] map to ratings 1..5
bins = np.linspace(-1, 1, 6)
polarity_band = np.digitize(data_textblob['polarity'], bins, right=False)

# digitize yields 6 for a polarity of exactly 1.0; clamp everything into 1..5
data_textblob['updated_rating'] = polarity_band.clip(1, 5)

# Signed gap between the derived rating and what the user actually gave
rating_gap = data_textblob['updated_rating'] - data_textblob['rating']
data_textblob['diff'] = rating_gap
data_textblob['diff'].value_counts().sort_index()

In [None]:
# Record TextBlob's polarity and derived rating in the comparison table.
data_reviews[['text_blob_polarity','text_blob_rating']] = data_textblob[['polarity','updated_rating']].copy()

#### VADER

In [None]:
# Independent working copy for the VADER experiment.
data_vader = data.copy()

In [None]:
vader_analyzer = SentimentIntensityAnalyzer()


def vader_sentiment(text):
    """Classify *text* with VADER.

    Returns a two-element ``pd.Series``: the label ("Positive", "Negative" or
    "Neutral") followed by the VADER compound score. Scores of at least 0.05
    are Positive, scores of at most -0.05 are Negative, anything in between
    is Neutral.
    """
    compound = vader_analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return pd.Series(["Positive", compound])
    if compound <= -0.05:
        return pd.Series(["Negative", compound])
    return pd.Series(["Neutral", compound])

# Score every review; the two values returned per row become the
# Sentiment and Compound_Score columns.
data_vader[['Sentiment', 'Compound_Score']] = data_vader['review_text'].apply(vader_sentiment)

In [None]:
# How many reviews VADER placed in each sentiment class.
data_vader.Sentiment.value_counts()

In [None]:
# VADER's compound score is signed, so use five equal-width bands over [-1, 1].
# The edges are defined here explicitly: later cells rebind the global `bins`
# to [0, 1], so relying on whatever `bins` currently holds would give wrong
# ratings if this cell is re-run after them.
bins = np.linspace(-1, 1, 6)
data_vader['updated_rating'] = np.clip(np.digitize(data_vader.Compound_Score, bins, right=False), 1, 5)

# Signed gap between the derived rating and the user's rating
data_vader['diff'] = data_vader['updated_rating'] - data_vader['rating']
data_vader['diff'].value_counts().sort_index()

In [None]:
# Record VADER's score, label and derived rating in the comparison table.
data_reviews[['vader_score','vader_sentiment','vader_rating']] = data_vader[['Compound_Score','Sentiment','updated_rating']].copy()

#### RoBERTa

In [None]:
# Independent working copy for the RoBERTa experiment.
data_RoBERTa = data.copy()

In [None]:
# Pretrained checkpoint used for sequence classification; the tokenizer and
# model are loaded once and reused by analyze_sentiment().
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Label names indexed by the position of the winning logit.
# NOTE(review): assumes the checkpoint's class order is
# 0=Negative, 1=Neutral, 2=Positive — confirm against the model card.
labels = ['Negative', 'Neutral', 'Positive']

def analyze_sentiment(text, max_length=512):
    """Classify *text* with the RoBERTa sentiment model.

    Args:
        text: The review text to score.
        max_length: Maximum number of tokens fed to the model. It is passed
            explicitly because ``truncation=True`` alone only truncates when
            the tokenizer knows a maximum length; without one, long reviews
            reach the model untruncated and overflow its position embeddings.

    Returns:
        A ``(sentiment, confidence)`` tuple: the label from ``labels`` with
        the highest softmax probability, and that probability.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(**encoded)
    # Single input, so take the first (only) row of logits.
    scores = output.logits[0].numpy()
    probs = softmax(scores)
    best = probs.argmax()
    return labels[best], probs[best]

# Score every review; each (label, confidence) pair is expanded into the
# Sentiment and Compound_Score columns. Here Compound_Score holds the
# winning class probability, not a signed score.
data_RoBERTa[['Sentiment', 'Compound_Score']] = data_RoBERTa['review_text'].apply(lambda x: pd.Series(analyze_sentiment(x)))

In [None]:
# How many reviews RoBERTa placed in each sentiment class.
data_RoBERTa.Sentiment.value_counts()

In [None]:
# Compound_Score is the probability of the winning class, so on its own it
# says nothing about direction: binning it directly would give a confidently
# Negative review 5 stars. Fold the predicted label back in to get a signed
# score in [-1, 1] (+confidence for Positive, -confidence for Negative,
# 0 for Neutral), then bin it into five equal-width bands.
sentiment_sign = data_RoBERTa['Sentiment'].map({'Negative': -1, 'Neutral': 0, 'Positive': 1})
signed_score = sentiment_sign * data_RoBERTa['Compound_Score'].astype(float)

bins = np.linspace(-1, 1, 6)
data_RoBERTa['updated_rating'] = np.clip(np.digitize(signed_score, bins, right=False), 1, 5)

# Signed gap between the derived rating and the user's rating
data_RoBERTa['diff'] = data_RoBERTa['updated_rating'] - data_RoBERTa['rating']
data_RoBERTa['diff'].value_counts().sort_index()

In [None]:
# Record RoBERTa's confidence, label and derived rating in the comparison table.
data_reviews[['RoBERTa_score','RoBERTa_sentiment','RoBERTa_rating']] = data_RoBERTa[['Compound_Score','Sentiment','updated_rating']].copy()

#### Zero-Shot Classification

In [None]:
# Independent working copy for the zero-shot experiment.
data_zero = data.copy()

In [None]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Skip reviews whose cleaned text is empty or whitespace-only.
# .copy() makes the filtered frame independent, so the column assignments in
# the following cells write to a real frame rather than to a slice of the
# original (which triggers SettingWithCopyWarning and may not stick).
data_zero = data_zero[data_zero['review_text'].str.strip().ne("")].copy()

candidate_labels = ["Positive", "Negative", "Neutral"]

# One classifier call per review, in the frame's row order.
results = [classifier(text, candidate_labels) for text in data_zero.review_text]

In [None]:
# Keep the first label and its score from each result.
# NOTE(review): assumes the pipeline returns labels/scores sorted best-first,
# so index 0 is the top prediction — confirm against the pipeline docs.
data_zero['Predicted_Sentiment'] = [res['labels'][0] for res in results]
data_zero['Confidence_score'] = [res['scores'][0] for res in results]

In [None]:
# How many reviews the zero-shot classifier placed in each sentiment class.
data_zero.Predicted_Sentiment.value_counts()

In [None]:
# Confidence_score is the score of the winning label, so on its own it says
# nothing about direction: binning it directly would give a confidently
# Negative review 5 stars. Fold the predicted label back in to get a signed
# score in [-1, 1] (+confidence for Positive, -confidence for Negative,
# 0 for Neutral), then bin it into five equal-width bands.
sentiment_sign = data_zero['Predicted_Sentiment'].map({'Negative': -1, 'Neutral': 0, 'Positive': 1})
signed_score = sentiment_sign * data_zero['Confidence_score'].astype(float)

bins = np.linspace(-1, 1, 6)
data_zero['updated_rating'] = np.clip(np.digitize(signed_score, bins, right=False), 1, 5)

# Signed gap between the derived rating and the user's rating
data_zero['diff'] = data_zero['updated_rating'] - data_zero['rating']
data_zero['diff'].value_counts().sort_index()

In [None]:
# Record the zero-shot confidence, label and derived rating in the comparison
# table. data_zero had empty-text rows filtered out, so it may contain fewer
# rows than data_reviews.
data_reviews[['zeroShot_score','zeroShot_sentiment','zeroShot_rating']] = data_zero[['Confidence_score','Predicted_Sentiment','updated_rating']].copy()

### Comparison of Various Methods

In [None]:
# Rename the original rating so it reads clearly next to the model ratings.
data_reviews.rename(columns={'rating':'user_rating'},inplace=True)
data_reviews.head()

In [None]:
#### Sentiment from a 1-5 rating: below 3 -> Negative, exactly 3 -> Neutral, otherwise Positive
for rating_source in ('text_blob', 'user'):
    source_ratings = data_reviews[f'{rating_source}_rating']
    data_reviews[f'{rating_source}_sentiment'] = np.select(
        [source_ratings < 3, source_ratings == 3],
        ['Negative', 'Neutral'],
        default='Positive',
    )

In [None]:
# Per-model raw score column, display title and score name (parallel lists)
scores = ['Polarity', 'Compound Score', 'Compound Score', 'Confidence Score']
columns = ['text_blob_polarity','vader_score','RoBERTa_score','zeroShot_score']
titles = ['Text Blob', 'VADER', 'RoBERTa', 'Zero Shot']

# One panel per model on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()

# Histogram of each model's raw score
for panel, column, model_title, score_name in zip(axes, columns, titles, scores):
    panel.hist(data_reviews[column], bins=20, color='lightcoral', edgecolor='black')
    panel.set_title(f"{model_title}: {score_name}", fontsize=14, fontweight='bold')
    panel.set_xlabel('Score')
    panel.set_ylabel('Frequency')
    panel.grid(True, linestyle='--', alpha=0.5)

# Avoid overlapping titles/labels
plt.tight_layout()
plt.show()

In [None]:
# Per-model derived-rating column and display title (parallel lists)
columns = ['text_blob_rating', 'vader_rating', 'RoBERTa_rating', 'zeroShot_rating']
titles = ['Text Blob', 'VADER', 'RoBERTa', 'Zero Shot']

# One panel per model on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()

# Cross-tabulate the user's rating against each model's rating and draw it as a heatmap
for panel, column, model_title in zip(axes, columns, titles):
    conf_matrix = pd.crosstab(
        data_reviews['user_rating'],
        data_reviews[column],
        rownames=['Original'],
        colnames=[model_title],
    )
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Reds', ax=panel, linewidths=0.5)
    panel.set_title(f'Confusion Matrix of Original vs {model_title} Ratings')
    panel.set_xlabel(f'{model_title} Predicted', fontsize=10)
    panel.set_ylabel('True Rating', fontsize=10)

# Tidy the layout, then leave extra room between the panels
plt.tight_layout()
plt.subplots_adjust(wspace=0.2, hspace=0.4)
plt.show()

In [None]:
## Plotting Sentiment counts

sentiment_columns = ['user_sentiment', 'text_blob_sentiment', 'vader_sentiment', 'RoBERTa_sentiment', 'zeroShot_sentiment']
sentiment_order = ['Negative', 'Neutral', 'Positive']

# Build the table from a dict of per-column counts, then pin the rows to the
# three sentiment classes. Filling an empty DataFrame column by column would
# adopt the first column's index, silently dropping any class that happens to
# be absent from that first column.
sentiment_counts = pd.DataFrame(
    {col: data_reviews[col].value_counts() for col in sentiment_columns}
).reindex(sentiment_order)

# A class a source never produced is a count of zero, not a missing value.
sentiment_counts = sentiment_counts.fillna(0).astype(int)

sentiment_counts.plot(kind='bar', figsize=(12, 6), width=0.8)
plt.title('Sentiment Frequency per Model', fontsize=16)
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.legend(title='Source', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
## Plotting Rating counts

rating_columns = ['user_rating', 'text_blob_rating', 'vader_rating', 'RoBERTa_rating', 'zeroShot_rating']

# Build the table from a dict of per-column counts, then pin the rows to the
# ratings 1..5. Filling an empty DataFrame column by column would adopt the
# first column's index, silently dropping any rating value that happens to be
# absent from that first column.
rating_counts = pd.DataFrame(
    {col: data_reviews[col].value_counts() for col in rating_columns}
).reindex(range(1, 6))

# A rating a source never produced is a count of zero, not a missing value.
rating_counts = rating_counts.fillna(0).astype(int)

rating_counts.plot(kind='bar', figsize=(12, 6), width=0.8)
plt.title('Rating Frequency per Model (1–5)', fontsize=16)
plt.xlabel('Rating')
plt.ylabel('Count')
plt.legend(title='Source', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
### Mean rating per source (user vs. each model)

rating_sources = ['user_rating', 'text_blob_rating', 'vader_rating', 'RoBERTa_rating', 'zeroShot_rating']
avg_ratings = data_reviews[rating_sources].mean()

# One bar per source
plt.figure(figsize=(10, 6))
sns.barplot(x=avg_ratings.index, y=avg_ratings.values, palette='Set2', edgecolor='black')

plt.title('Average Rating by Model vs User Rating', fontsize=16)
plt.ylabel('Average Rating (1–5)')
plt.ylim(0, 5)
plt.grid(axis='y', linestyle='--', alpha=0.6)

# Print each mean just above its bar
for bar_position, mean_rating in enumerate(avg_ratings):
    plt.text(bar_position, mean_rating + 0.05, f"{mean_rating:.2f}", ha='center', fontsize=12)

plt.show()


In [None]:
# Missing values per column of the comparison table.
data_reviews.isna().sum()

In [None]:
# Show any rows that have no TextBlob polarity.
data_reviews[data_reviews.text_blob_polarity.isnull()]

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of each model's derived rating against the user rating.
mae = {}
for model in ['text_blob_rating', 'vader_rating', 'RoBERTa_rating', 'zeroShot_rating']:
    # Reviews that a model never scored (e.g. empty texts skipped by the
    # zero-shot step) are NaN in its column, and mean_absolute_error rejects
    # NaN input. Compare only rows where both ratings are present.
    scored = data_reviews[['user_rating', model]].dropna()
    if scored.empty:
        mae[model] = float('nan')
    else:
        mae[model] = mean_absolute_error(scored['user_rating'], scored[model])

plt.figure(figsize=(8, 5))
sns.barplot(x=list(mae.keys()), y=list(mae.values()), palette='Set3', edgecolor='black')
plt.title("Mean Absolute Error vs User Ratings", fontsize=16)
plt.ylabel("MAE")
# Print each error just above its bar
for i, val in enumerate(mae.values()):
    plt.text(i, val + 0.01, f"{val:.2f}", ha='center', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()


### Extracting top Positive/Negative Words using VADER with Zero Shot as base

In [None]:
def extract_pos_neg_words(text):
    """Split the words of *text* into strongly positive and strongly negative ones.

    The text is lower-cased and tokenised on word boundaries, and every token
    is scored individually with VADER. Tokens with a compound score of at
    least 0.5 are positive; tokens at or below -0.5 are negative; the rest
    are ignored. Repeated words are kept, in their original order.

    Returns:
        A ``(pos_words, neg_words)`` tuple of lists.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    scored_tokens = [
        (token, vader_analyzer.polarity_scores(token)['compound'])
        for token in tokens
    ]
    pos_words = [token for token, compound in scored_tokens if compound >= 0.5]
    neg_words = [token for token, compound in scored_tokens if compound <= -0.5]
    return pos_words, neg_words

In [None]:
# For each zero-shot sentiment group: its ten most frequent strongly-positive
# and strongly-negative words, as (word, count) pairs.
cluster_pos_words = {}
cluster_neg_words = {}

for cluster, group in data_reviews.groupby('zeroShot_sentiment'):
    pos_tally = Counter()
    neg_tally = Counter()
    for review in group['review_text']:
        review_pos, review_neg = extract_pos_neg_words(review)
        pos_tally.update(review_pos)
        neg_tally.update(review_neg)
    cluster_pos_words[cluster] = pos_tally.most_common(10)
    cluster_neg_words[cluster] = neg_tally.most_common(10)


In [None]:
def plot_pos_neg_words(pos_words, neg_words, title="Top Sentiment Words"):
    """Draw side-by-side horizontal bar charts of word frequencies.

    Args:
        pos_words: Sequence of ``(word, count)`` pairs for the left panel.
        neg_words: Sequence of ``(word, count)`` pairs for the right panel.
        title: Figure-level title.

    A panel whose word list is empty shows a placeholder message with its
    axes hidden instead of a chart.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # (axis, word/count pairs, bar colour, panel title, placeholder text)
    panels = (
        (axes[0], pos_words, 'green', "Positive Words", 'No Positive Words'),
        (axes[1], neg_words, 'red', "Negative Words", 'No Negative Words'),
    )
    for ax, word_counts, colour, panel_title, placeholder in panels:
        if not word_counts:
            ax.text(0.5, 0.5, placeholder, ha='center', va='center')
            ax.axis('off')
        else:
            words, counts = zip(*word_counts)
            ax.barh(words, counts, color=colour)
            ax.set_title(panel_title)
            # Most frequent word at the top
            ax.invert_yaxis()
        ax.set_xlabel("Frequency")

    fig.suptitle(title)
    plt.tight_layout()
    plt.show()

In [None]:
# Positive words from the reviews classified Positive next to negative words
# from the reviews classified Negative. .get(..., []) avoids a KeyError if the
# classifier produced no reviews for one of the classes; the plot function
# shows a placeholder for an empty list.
plot_pos_neg_words(
    cluster_pos_words.get('Positive', []),
    cluster_neg_words.get('Negative', []),
    title="Positive vs Negative Words",
)

### Top Positive and Negative Sentences

In [None]:
# How many sentences to print per polarity (the headings use the same number).
top_n = 10

# Use the uncleaned review text so the sentences read naturally.
text_column = raw_data['review_text']

scored_positive = []
scored_negative = []
for review in text_column.dropna():
    for sentence in sent_tokenize(review):
        score = vader_analyzer.polarity_scores(sentence)['compound']
        if score >= 0.7:
            scored_positive.append((score, sentence))
        elif score <= -0.3:
            scored_negative.append((score, sentence))

# Rank by strength so "top" really means the most positive / most negative,
# rather than whichever sentences happened to come first in the data.
scored_positive.sort(key=lambda pair: pair[0], reverse=True)
scored_negative.sort(key=lambda pair: pair[0])

positive_sentences = [sentence for _, sentence in scored_positive]
negative_sentences = [sentence for _, sentence in scored_negative]

print(f"🟢 Top {top_n} Positive Sentences:")
for s in positive_sentences[:top_n]:
    print("-", s)

print(f"\n🔴 Top {top_n} Negative Sentences:")
for s in negative_sentences[:top_n]:
    print("-", s)


### N-Grams Analysis

In [None]:
def get_ngrams(texts, n=2, top_k=15):
    """Return the *top_k* most frequent n-grams across *texts*.

    Args:
        texts: Iterable of documents (strings).
        n: Size of the n-grams to count (exactly n words each).
        top_k: How many of the most frequent n-grams to keep.

    Returns:
        A ``pd.Series`` of counts indexed by n-gram, sorted from most to
        least frequent. English stop words are removed before counting.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(texts)

    # Column sums give the corpus-wide count of each n-gram; flatten to 1-D
    totals = np.asarray(doc_term_matrix.sum(axis=0)).ravel()
    ngram_freq = pd.Series(totals, index=vectorizer.get_feature_names_out())

    ranked = ngram_freq.sort_values(ascending=False)
    return ranked.iloc[:top_k]

# Horizontal bar chart of the most frequent unigrams, bigrams and trigrams
for n in (1, 2, 3):
    top_ngrams = get_ngrams(data['review_text'], n=n)
    plt.figure(figsize=(10, 5))
    ngram_axis = top_ngrams.plot(kind='barh', title=f"Top {n}-grams", color='purple')
    # Most frequent n-gram at the top
    ngram_axis.invert_yaxis()
    ngram_axis.set_xlabel("Frequency")
    plt.tight_layout()
    plt.show()