# 🛒 Amazon Product Reviews — Sentiment Analysis
### Data Analysis & Visualization
**Author:** Abdul Wasay | [GitHub: theabdulwasay](https://github.com/theabdulwasay) | [LinkedIn: Abdul Wasay](https://www.linkedin.com/in/theabdulwasay)
📧 abdulwasaymalik757@gmail.com

---
**Dataset:** 1,465 Amazon India product listings with reviews, ratings & pricing  
**Goal:** Classify customer sentiment (Positive / Neutral / Negative), find keyword patterns, and visualize review insights.


## 1. 📦 Setup & Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# ── Dark theme ──────────────────────────────────────────────────
# Hex colours shared by every chart in this notebook.
BG = '#050a0f'     # figure background
SUR = '#0d1620'    # axes background
SUR2 = '#131e2c'   # legend background
BDR = '#1e3248'    # axes edges and grid lines
AC = '#00d4ff'     # accent for reference lines and highlights
AC2 = '#7c3aed'
AC3 = '#10b981'    # Positive sentiment
WARN = '#f59e0b'   # Neutral sentiment
RED = '#ef4444'    # Negative sentiment
TXT = '#e2eaf4'    # titles, labels and other text
MUTED = '#6b8aaa'  # tick labels

# Install the palette as matplotlib defaults for all later figures.
dark_rc = {
    'figure.facecolor': BG,
    'axes.facecolor': SUR,
    'axes.edgecolor': BDR,
    'axes.labelcolor': TXT,
    'xtick.color': MUTED,
    'ytick.color': MUTED,
    'text.color': TXT,
    'grid.color': BDR,
    'grid.linestyle': '--',
    'grid.alpha': 0.5,
    'font.family': 'monospace',
    'axes.spines.top': False,
    'axes.spines.right': False,
}
plt.rcParams.update(dark_rc)
print("✅ Setup complete!")

## 2. 📂 Load & Explore Data

In [None]:
# Read the raw listings and preview the columns this analysis relies on.
df = pd.read_csv('amazon.csv')
print(f"Shape: {df.shape}")
preview_cols = ['product_name', 'rating', 'review_title', 'review_content', 'category']
df[preview_cols].head()

In [None]:
# Quick stats
# Coerce ratings to numbers (unparseable entries become NaN), then keep only
# the rows that have a rating and both review text fields.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
required_cols = ['rating', 'review_content', 'review_title']
df = df.dropna(subset=required_cols)

# Review title and body are joined into a single text field.
title_text = df['review_title'].astype(str)
body_text = df['review_content'].astype(str)
df['combined_review'] = title_text + ' ' + body_text

rating_col = df['rating']
# Top-level category = the text before the first '|' in the category path.
top_level_categories = df['category'].apply(lambda x: str(x).split('|')[0])

print(f"Total products   : {len(df)}")
print(f"Average rating   : {rating_col.mean():.2f}")
print(f"Rating range     : {rating_col.min()} – {rating_col.max()}")
print(f"Categories       : {top_level_categories.nunique()}")
df.describe()

## 3. 🏷️ Sentiment Labeling
> Sentiment is inferred from each product's rating rather than from the review text itself: **≥4.0 = Positive**, **3.0–3.9 = Neutral**, **<3.0 = Negative**

In [None]:
def label_sentiment(r):
    """Map a numeric rating to a sentiment label.

    Returns 'Positive' for ratings of 4.0 or more, 'Neutral' for ratings of
    3.0 or more, and 'Negative' otherwise. A value that fails both
    comparisons (including NaN) therefore falls through to 'Negative'.
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    for floor, label in ((4.0, 'Positive'), (3.0, 'Neutral')):
        if r >= floor:
            return label
    return 'Negative'

# Tag every row with its rating-derived sentiment and its top-level category
# (the text before the first '|' in the category path).
df['sentiment'] = df['rating'].apply(label_sentiment)
df['category_main'] = [str(path).split('|')[0] for path in df['category']]

sentiment_counts = df['sentiment'].value_counts()
positive_share = (df['sentiment'] == 'Positive').mean() * 100

print("Sentiment Distribution:")
print(sentiment_counts)
print(f"\nPositive rate: {positive_share:.1f}%")

## 4. 📊 Visualization 1 — Sentiment Distribution (Donut Chart)

In [None]:
# Donut chart: share of products in each sentiment class.
sent_counts = df['sentiment'].value_counts()
labels_d = [f"{k}\n{v} products" for k, v in sent_counts.items()]
# value_counts() orders classes by frequency, not by name, so each wedge's
# colour is looked up from its label rather than taken from a fixed list.
sentiment_palette = {'Positive': AC3, 'Neutral': WARN, 'Negative': RED}
colors_donut = [sentiment_palette[k] for k in sent_counts.index]

fig, ax = plt.subplots(figsize=(9, 7))
wedges, texts, autotexts = ax.pie(
    sent_counts.values, labels=labels_d, autopct='%1.1f%%',
    colors=colors_donut, startangle=140, pctdistance=0.75,
    wedgeprops={'width': 0.55, 'edgecolor': BG, 'linewidth': 3})
# Percentages sit on the coloured wedges, so they use the background colour.
for t in autotexts:
    t.set_color(BG)
    t.set_fontsize(12)
    t.set_fontweight('bold')
for t in texts:
    t.set_color(TXT)
    t.set_fontsize(11)
ax.set_title('Sentiment Distribution\nAmazon Product Reviews',
             fontsize=15, fontweight='bold', color=TXT, pad=20)
# Filled circle in the hole of the donut, carrying the total row count.
centre = plt.Circle((0, 0), 0.35, color=SUR)
ax.add_patch(centre)
ax.text(0, 0.05, str(len(df)), ha='center', va='center',
        fontsize=20, fontweight='bold', color=AC, fontfamily='monospace')
ax.text(0, -0.12, 'reviews', ha='center', va='center', fontsize=9, color=MUTED)
plt.tight_layout()
plt.show()

## 5. 📊 Visualization 2 — Rating Distribution by Sentiment

In [None]:
# Overlaid rating histograms, one per sentiment class, plus the overall mean.
fig, ax = plt.subplots(figsize=(12, 6))
mean_rating = df['rating'].mean()
# One shared set of bin edges for all three classes: binning each subset
# separately would give every class its own bin width, making bar heights
# impossible to compare across classes.
bin_edges = np.histogram_bin_edges(df['rating'], bins=20)
for sent, color, label in zip(
    ['Positive', 'Neutral', 'Negative'],
    [AC3, WARN, RED],
    ['Positive (≥4.0)', 'Neutral (3.0–3.9)', 'Negative (<3.0)']):
    sub = df.loc[df['sentiment'] == sent, 'rating']
    ax.hist(sub, bins=bin_edges, alpha=0.75, color=color, label=label, edgecolor='none')

ax.axvline(mean_rating, color=AC, lw=2, linestyle='--',
           label=f'Mean Rating: {mean_rating:.2f}')
ax.set_xlabel('Rating Score', labelpad=10)
ax.set_ylabel('Number of Products', labelpad=10)
ax.set_title('Rating Distribution by Sentiment', fontsize=15, fontweight='bold', color=TXT, pad=18)
ax.legend(framealpha=0.2, facecolor=SUR2, edgecolor=BDR, labelcolor=TXT)
ax.grid(True, axis='y')
plt.tight_layout()
plt.show()

## 6. 📊 Visualization 3 — Sentiment by Product Category

In [None]:
# Stacked horizontal bars: sentiment counts for the largest product categories.
sentiment_order = ['Positive', 'Neutral', 'Negative']
cat_sent = (
    df.groupby(['category_main', 'sentiment']).size()
    .unstack(fill_value=0)
    # Guarantee all three columns exist even if a class never occurs.
    .reindex(columns=sentiment_order, fill_value=0)
)
cat_sent['total'] = cat_sent.sum(axis=1)
# Ascending sort + tail(10) keeps the ten LARGEST categories, with the biggest
# last so that barh (which draws its first row at the bottom) puts it on top.
# head(10) here would have selected the ten smallest categories instead.
cat_sent = cat_sent.sort_values('total', ascending=True).tail(10)

fig, ax = plt.subplots(figsize=(11, 7))
# Running left offset so each sentiment segment starts where the last ended.
bottom = np.zeros(len(cat_sent))
for col, color in zip(['Negative', 'Neutral', 'Positive'], [RED, WARN, AC3]):
    ax.barh(cat_sent.index, cat_sent[col], left=bottom, color=color,
            label=col, height=0.6, edgecolor='none')
    bottom += cat_sent[col].values
ax.set_xlabel('Number of Products', labelpad=10)
ax.set_title('Sentiment Breakdown by Product Category',
             fontsize=15, fontweight='bold', color=TXT, pad=18)
ax.legend(framealpha=0.2, facecolor=SUR2, edgecolor=BDR, labelcolor=TXT, loc='lower right')
ax.grid(True, axis='x')
plt.tight_layout()
plt.show()

## 7. 📊 Visualization 4 — Top Keywords: Positive vs Negative Reviews

In [None]:
# Words excluded from the keyword ranking: common English function words plus
# tokens that dominate this dataset without carrying sentiment (URL fragments,
# 'product', 'cable', ...). Entries shorter than three letters can never be
# produced by the tokenizer below; they are kept for completeness.
STOP = {'the','and','for','with','this','that','was','are','have','has','its',
        'not','but','my','it','is','in','to','of','a','i','you','very','so',
        'product','cable','buy','get','use','great','also','they','from','at',
        'on','as','be','an','or','we','our','all','one','can','by','https','media',
        'amazon','com','images'}

# Tokenizer: runs of three or more ASCII lowercase letters on word boundaries.
_WORD_RE = re.compile(r'\b[a-z]{3,}\b')


def top_words(text, n=15, stopwords=None):
    """Return the ``n`` most frequent non-stopword tokens in ``text``.

    The text is lowercased and split into runs of three or more ASCII
    letters; tokens found in the stopword collection are dropped.

    Args:
        text: String to analyse.
        n: Maximum number of (word, count) pairs to return.
        stopwords: Optional collection of words to exclude. Defaults to the
            module-level ``STOP`` set when omitted or ``None``.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count, as
        produced by ``Counter.most_common``.
    """
    excluded = STOP if stopwords is None else stopwords
    tokens = _WORD_RE.findall(text.lower())
    return Counter(tok for tok in tokens if tok not in excluded).most_common(n)

# Pool all review text per sentiment class and rank the keywords in each pool.
is_positive = df['sentiment'] == 'Positive'
is_negative = df['sentiment'] == 'Negative'
pos_text = ' '.join(df.loc[is_positive, 'combined_review'])
neg_text = ' '.join(df.loc[is_negative, 'combined_review'])
pos_top = top_words(pos_text)
neg_top = top_words(neg_text)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
fig.patch.set_facecolor(BG)

# One panel per sentiment class: (axes, ranked keywords, bar colour, title).
panels = (
    (ax1, pos_top, AC3, '🟢 Positive Review Keywords'),
    (ax2, neg_top, RED, '🔴 Negative Review Keywords'),
)
for ax, top, color, title in panels:
    ax.set_facecolor(SUR)
    # barh draws its first entry at the bottom, so the ranking is reversed to
    # put the most frequent word at the top of the panel.
    ranked = top[::-1]
    bar_labels = [word for word, _ in ranked]
    bar_values = [count for _, count in ranked]
    ax.barh(bar_labels, bar_values, color=color, alpha=0.85, height=0.65, edgecolor='none')
    ax.set_title(title, fontsize=13, fontweight='bold', color=TXT, pad=14)
    ax.set_xlabel('Frequency', color=TXT, labelpad=8)
    ax.grid(True, axis='x', color=BDR, linestyle='--', alpha=0.5)
    for spine in ax.spines.values():
        spine.set_edgecolor(BDR)

fig.suptitle('Keyword Analysis: What Customers Say',
             fontsize=15, fontweight='bold', color=TXT, y=1.02)
plt.tight_layout()
plt.show()

## 8. 📊 Visualization 5 — Rating vs Discount % Scatter

In [None]:
# Turn the discount column into numbers: drop the '%' sign and surrounding
# whitespace; anything that still fails to parse becomes NaN.
raw_discount = df['discount_percentage'].astype(str)
cleaned_discount = raw_discount.str.replace('%', '').str.strip()
df['discount_pct'] = pd.to_numeric(cleaned_discount, errors='coerce')

fig, ax = plt.subplots(figsize=(11, 6))
# One scatter layer per sentiment class so each gets its own colour and
# legend entry.
sentiment_colors = (('Positive', AC3), ('Neutral', WARN), ('Negative', RED))
for sent, color in sentiment_colors:
    in_class = df['sentiment'] == sent
    ax.scatter(df.loc[in_class, 'discount_pct'], df.loc[in_class, 'rating'],
               color=color, alpha=0.4, s=30, label=sent, edgecolors='none')
ax.set_xlabel('Discount Percentage (%)', labelpad=10)
ax.set_ylabel('Product Rating', labelpad=10)
ax.set_title('Product Rating vs Discount % (by Sentiment)',
             fontsize=15, fontweight='bold', color=TXT, pad=18)
ax.legend(framealpha=0.2, facecolor=SUR2, edgecolor=BDR, labelcolor=TXT)
ax.grid(True)
plt.tight_layout()
plt.show()

## 9. 📊 Visualization 6 — % Positive Reviews by Category

In [None]:
# Share of Positive-labelled products per top-level category (top 12 by rate).
# The mean of a boolean mask is the fraction of True values, so a grouped
# mean gives the positive rate directly without a per-group Python lambda.
is_positive = df['sentiment'].eq('Positive')
cat_pos_rate = (
    is_positive.groupby(df['category_main']).mean()
    .mul(100)
    .round(1)
    .sort_values(ascending=False)
    .head(12)
)

# Colour bands: at least 70% -> AC3, at least 50% -> WARN, otherwise RED.
colors_bar = [AC3 if v >= 70 else WARN if v >= 50 else RED for v in cat_pos_rate.values]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(cat_pos_rate.index, cat_pos_rate.values, color=colors_bar, height=0.6, edgecolor='none')
# barh draws its first row at the bottom; flip the axis so the ranking reads
# from the highest positive rate at the top downwards.
ax.invert_yaxis()
ax.axvline(70, color=AC, lw=1.5, linestyle='--', alpha=0.6, label='70% benchmark')
for bar, val in zip(bars, cat_pos_rate.values):
    ax.text(val + 0.5, bar.get_y() + bar.get_height()/2,
            f'{val}%', va='center', fontsize=9, color=TXT)
ax.set_xlabel('Positive Sentiment Rate (%)', labelpad=10)
ax.set_title('% Positive Reviews by Product Category',
             fontsize=15, fontweight='bold', color=TXT, pad=18)
# Extra room past 100 so the value labels are not clipped.
ax.set_xlim(0, 108)
ax.legend(framealpha=0.2, facecolor=SUR2, edgecolor=BDR, labelcolor=TXT)
ax.grid(True, axis='x')
plt.tight_layout()
plt.show()

## 10. 🔍 Key Findings

| # | Finding |
|---|---------|
| 📊 | **75.8% of products** have Positive sentiment (rating ≥ 4.0) |
| 😐 | **23.8% Neutral** (rating 3.0–3.9) — room for improvement |
| 😠 | Only **0.4% Negative** — Amazon India products are generally well-received |
| 💰 | Higher discounts do **not** guarantee higher ratings |
| 🔤 | Positive reviews emphasize: *charging, quality, durable, fast, value* |
| 🔴 | Negative reviews highlight: *stopped, return, broken, issue, defective* |
| 🛍️ | **Electronics & Computers** dominate the dataset by volume |

---
**Author:** Abdul Wasay  
📧 abdulwasaymalik757@gmail.com | 🐙 [github.com/theabdulwasay](https://github.com/theabdulwasay) | 💼 [linkedin.com/in/theabdulwasay](https://www.linkedin.com/in/theabdulwasay)
