# Amazon Review Sentiment Analysis - Data Exploration

This notebook explores the Amazon review dataset from Kaggle to understand its characteristics and prepare it for sentiment analysis.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from collections import Counter

# Set style for plots: seaborn's "whitegrid" theme, plus a 10x6 inch default
# size for figures that do not pass their own figsize
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

## Load Data

In [None]:
# Update this path to your data file
data_path = '../data/train.csv'

# Load the data.
# Only a missing file gets a friendly hint, and even then the error is
# re-raised: every later cell needs `df`, so swallowing the failure here would
# only resurface as a confusing NameError in the next cell. Any other read
# error (parsing, permissions, ...) propagates with its full traceback.
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error loading data: no file found at {data_path!r}")
    raise
else:
    print(f"Data loaded successfully! Shape: {df.shape}")

## Basic Data Exploration

In [None]:
# Display the first few rows (head() returns 5 rows by default); the bare
# expression is rendered as the cell output
df.head()

In [None]:
# Display information about the dataframe: columns, dtypes, non-null counts
# and memory usage
df.info()

In [None]:
# Display summary statistics (with mixed dtypes, describe() reports the
# numeric columns only unless told otherwise)
df.describe()

In [None]:
# Check for missing values: number of null entries in each column
df.isnull().sum()

## Data Visualization

### Rating Distribution

In [None]:
# Plot rating distribution
rating_counts = df['Score'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=rating_counts.index, y=rating_counts.values, ax=ax)
ax.set_title('Distribution of Ratings', fontsize=16)
ax.set_xlabel('Rating', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Add count labels on bars.
# Each label is anchored by its bottom edge just above its bar, and the y-axis
# gets 10% headroom. Text does not extend the axis limits on its own, so
# without the headroom the label of the tallest bar would be drawn above the
# plot area, into the title.
tallest = rating_counts.max()
for position, count in enumerate(rating_counts.values):
    ax.text(position, count + 0.01 * tallest, str(count),
            ha='center', va='bottom', fontsize=12)
ax.set_ylim(0, 1.1 * tallest)

fig.tight_layout()
plt.show()

### Text Length Distribution

In [None]:
# Calculate text length as the number of whitespace-separated words.
# Missing reviews are treated as empty text (0 words): converting NaN with
# str() yields the token 'nan', which would be counted as a one-word review.
df['text_length'] = df['Text'].fillna('').astype(str).str.split().str.len()

# Plot text length distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['text_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Review Text Length', fontsize=16)
ax.set_xlabel('Text Length (Number of Words)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_xlim(0, 500)  # Limit x-axis for better visualization
fig.tight_layout()
plt.show()

### Word Cloud for All Reviews

In [None]:
# Build a single corpus string from every non-missing review
all_text = ' '.join(df['Text'].dropna().astype(str))

# Configure the generator, then feed it the corpus (generate() returns the
# same WordCloud object, so the name keeps pointing at the rendered cloud)
wordcloud = WordCloud(
    width=800,
    height=600,
    background_color='white',
    max_words=100,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud = wordcloud.generate(all_text)

# Render the cloud as an image on a frameless axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of All Reviews', fontsize=16)
fig.tight_layout()
plt.show()

### Word Clouds by Rating

In [None]:
# Create word clouds for different ratings.
# Missing scores are excluded: they cannot be matched with == below and would
# hand WordCloud an empty corpus.
ratings = sorted(df['Score'].dropna().unique())

# Size the grid from the number of distinct ratings instead of assuming that
# they fit into 2x3 panels (same 18x5 inches per row as the original layout).
n_cols = 3
n_rows = max(1, (len(ratings) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, rating in zip(axes, ratings):
    ax.axis('off')
    ax.set_title(f'Rating {rating} Stars', fontsize=14)

    # Filter reviews by rating
    rating_text = ' '.join(df.loc[df['Score'] == rating, 'Text'].dropna().astype(str))
    if not rating_text.strip():
        # Nothing to draw; WordCloud.generate() raises on an empty corpus
        continue

    # Generate word cloud
    wordcloud = WordCloud(width=600, height=400, background_color='white',
                          max_words=50, contour_width=2,
                          contour_color='steelblue').generate(rating_text)

    # Display the word cloud
    ax.imshow(wordcloud, interpolation='bilinear')

# Hide every panel that did not receive a rating, not just the last one
for unused_ax in axes[len(ratings):]:
    unused_ax.axis('off')

fig.tight_layout()
plt.show()

### Most Common Words

In [None]:
# Tokenize and count words (lower-cased, split on whitespace).
# Counting review by review gives the same totals as joining the whole corpus
# first, but avoids holding a corpus-sized string, its lower-cased copy and a
# list of every token in memory at the same time.
word_counts = Counter()
for review in df['Text'].dropna().astype(str):
    word_counts.update(review.lower().split())

# Get most common words
most_common_words = word_counts.most_common(20)

# Create dataframe for plotting
df_words = pd.DataFrame(most_common_words, columns=['Word', 'Count'])

# Plot most common words
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Count', y='Word', data=df_words, ax=ax)
ax.set_title('Most Common Words in Reviews', fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Word', fontsize=12)
fig.tight_layout()
plt.show()

## Sentiment Analysis Preparation

In [None]:
# Create sentiment labels based on rating
# 1-2 stars: Negative, 3 stars: Neutral, 4-5 stars: Positive
def _score_to_sentiment(score):
    """Map a star rating to 'negative' / 'neutral' / 'positive'.

    Returns None for a missing score. NaN fails every comparison, so without
    this guard a missing rating would fall through to 'positive'.
    """
    if pd.isna(score):
        return None
    if score <= 2:
        return 'negative'
    if score == 3:
        return 'neutral'
    return 'positive'


df['sentiment'] = df['Score'].apply(_score_to_sentiment)

# Keep only clearly negative / positive reviews for binary classification
# (drops neutral reviews and rows without a usable score)
df_binary = df[df['sentiment'].isin(['negative', 'positive'])].copy()

# Convert sentiment to binary (0 for negative, 1 for positive)
df_binary['sentiment_binary'] = df_binary['sentiment'].map({'negative': 0, 'positive': 1})

print(f"Original dataset shape: {df.shape}")
print(f"Binary sentiment dataset shape: {df_binary.shape}")

### Sentiment Distribution

In [None]:
# Plot sentiment distribution
sentiment_counts = df_binary['sentiment'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax)
ax.set_title('Distribution of Sentiments', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Add count labels on bars.
# Each label is anchored by its bottom edge just above its bar, and the y-axis
# gets 10% headroom. Text does not extend the axis limits on its own, so
# without the headroom the label of the tallest bar would be drawn above the
# plot area, into the title.
tallest = sentiment_counts.max()
for position, count in enumerate(sentiment_counts.values):
    ax.text(position, count + 0.01 * tallest, str(count),
            ha='center', va='bottom', fontsize=12)
ax.set_ylim(0, 1.1 * tallest)

fig.tight_layout()
plt.show()

### Sentiment by Rating

In [None]:
# Create a crosstab of rating and sentiment
rating_sentiment = pd.crosstab(df_binary['Score'], df_binary['sentiment'])

# Normalize to get proportions (each rating row sums to 1)
rating_sentiment_prop = rating_sentiment.div(rating_sentiment.sum(axis=1), axis=0)

# Plot stacked bar chart.
# The axes is created explicitly and passed to pandas: DataFrame.plot() opens
# a new figure of its own when no `ax` is given, which would leave a separate
# plt.figure(...) behind as an empty figure and ignore its figsize.
fig, ax = plt.subplots(figsize=(10, 6))
rating_sentiment_prop.plot(kind='bar', stacked=True,
                           color=['#FF9999', '#66B2FF'], ax=ax)
ax.set_title('Sentiment Distribution by Rating', fontsize=16)
ax.set_xlabel('Rating', fontsize=12)
ax.set_ylabel('Proportion', fontsize=12)
ax.legend(title='Sentiment')
fig.tight_layout()
plt.show()

### Word Clouds by Sentiment

In [None]:
# Two side-by-side panels: positive reviews on the left, negative on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
left_ax, right_ax = axes

# Settings shared by both clouds
cloud_options = dict(
    width=600,
    height=400,
    background_color='white',
    max_words=100,
    contour_width=2,
    contour_color='steelblue',
)

# Positive reviews word cloud
is_positive = df_binary['sentiment'] == 'positive'
positive_text = ' '.join(df_binary.loc[is_positive, 'Text'].dropna().astype(str))
positive_wordcloud = WordCloud(**cloud_options).generate(positive_text)

left_ax.imshow(positive_wordcloud, interpolation='bilinear')
left_ax.axis('off')
left_ax.set_title('Positive Reviews', fontsize=16)

# Negative reviews word cloud
is_negative = df_binary['sentiment'] == 'negative'
negative_text = ' '.join(df_binary.loc[is_negative, 'Text'].dropna().astype(str))
negative_wordcloud = WordCloud(**cloud_options).generate(negative_text)

right_ax.imshow(negative_wordcloud, interpolation='bilinear')
right_ax.axis('off')
right_ax.set_title('Negative Reviews', fontsize=16)

fig.tight_layout()
plt.show()

## Text Preprocessing

In [None]:
# Import NLTK for text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK resources if needed.
# Maps the nltk.download() package id to the path nltk.data.find() checks.
# 'punkt_tab' is included because newer NLTK releases load the word_tokenize
# models from it instead of 'punkt'; without it word_tokenize raises
# LookupError there. NOTE(review): on an older NLTK that does not know
# 'punkt_tab' the download should simply report failure - confirm on the
# pinned version.
nltk_resources = {
    'stopwords': 'corpora/stopwords',
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'wordnet': 'corpora/wordnet',
}

for package_id, resource_path in nltk_resources.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id)

In [None]:
# Initialize NLTK tools (both are read as globals by preprocess_text)
lemmatizer = WordNetLemmatizer()
# A set keeps the per-token stopword membership test cheap
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Clean a single review and return it as a space-joined token string.

    Steps, in order: replace HTML tags with a space, replace every character
    that is not an ASCII letter with a space, lower-case, tokenize with
    ``word_tokenize``, drop tokens found in ``stop_words``, and lemmatize the
    rest with ``lemmatizer`` (called without a part-of-speech argument).

    Relies on the notebook-level ``stop_words`` and ``lemmatizer`` objects.

    Parameters
    ----------
    text : str or any
        Raw review text. Missing values (None / NaN) are treated as empty
        text; other non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    # A Text column with gaps hands NaN (a float) to this function; re.sub
    # would raise TypeError on it, so normalise the input first.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Remove HTML tags. Substitute a space rather than nothing so the words
    # on either side of a tag such as <br /> are not fused into one token.
    text = re.sub(r'<.*?>', ' ', text)

    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize what is left
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    # Join tokens back into text
    return ' '.join(lemmatized_tokens)

In [None]:
# Apply preprocessing to a sample of reviews.
# The requested size is capped at the number of available rows:
# DataFrame.sample() raises ValueError when asked for more rows than exist
# (sampling without replacement).
sample_size = 1000
df_sample = df_binary.sample(min(sample_size, len(df_binary)), random_state=42)
df_sample['Text_processed'] = df_sample['Text'].apply(preprocess_text)

# Display original and processed text for up to five sampled reviews
# (head() simply returns fewer rows when the sample is smaller than that)
for _, review in df_sample.head(5).iterrows():
    print(f"Original: {review['Text']}")
    print(f"Processed: {review['Text_processed']}")
    print()

## Save Processed Data

In [None]:
# Write the preprocessed sample to disk for model training; the path is named
# once so the file written and the message printed cannot drift apart
output_path = '../data/processed_reviews.csv'
df_sample.to_csv(output_path, index=False)
print("Processed data saved to " + output_path)