# Amazon Review Sentiment Analysis

This notebook performs sentiment analysis on Amazon review data from Kaggle using various machine learning models.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sys
sys.path.append('../src')
from data_preprocessing import TextPreprocessor, load_data, create_sentiment_labels
from model import SentimentModel
from visualization import plot_sentiment_distribution, plot_rating_distribution, plot_word_cloud

# Global plot appearance for the whole notebook: seaborn's "whitegrid" theme,
# then an explicit default figure size. The size is set after the theme call
# so that the explicit value is the one left in effect.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

## Load and Preprocess Data

In [None]:
# Update this path to your data file
data_path = '../data/train.csv'

# Load the data. This cell treats a None return from load_data as a failed
# load. Stop right here in that case: every later cell dereferences `df`, so
# continuing would only surface as an unrelated error further down.
df = load_data(data_path)
if df is None:
    raise RuntimeError("Failed to load data. Please check the data path.")

print(f"Data loaded successfully! Shape: {df.shape}")

In [None]:
# Create sentiment labels.
# create_sentiment_labels comes from the project's data_preprocessing module;
# it receives the loaded frame plus the 'Score' and 'Text' column names.
# NOTE(review): later cells read a 'sentiment_binary' column from the
# downstream frame - presumably it is added here; confirm in data_preprocessing.
df_labeled = create_sentiment_labels(df, 'Score', 'Text')
print(f"Dataset shape after creating sentiment labels: {df_labeled.shape}")

In [None]:
# Preprocess text.
# The preprocessor is kept in a notebook-level variable because the
# custom-review cell near the end reuses it via preprocessor.preprocess_text.
# NOTE(review): later cells read a 'Text_processed' column - presumably
# preprocess_dataframe derives it from 'Text'; confirm in data_preprocessing.
preprocessor = TextPreprocessor()
df_processed = preprocessor.preprocess_dataframe(df_labeled, 'Text')
print(f"Dataset shape after preprocessing: {df_processed.shape}")

In [None]:
# Preview the raw text next to its processed form and binary label.
# The trailing expression is what the notebook renders as the cell output.
preview_columns = ['Text', 'Text_processed', 'sentiment_binary']
df_processed[preview_columns].head()

## Data Visualization

In [None]:
# Plot sentiment distribution of the 'sentiment_binary' column in the
# processed frame. The helper's return value is kept in sentiment_fig;
# plt.show() renders the pending figure.
# NOTE(review): presumably the helper returns a matplotlib Figure - confirm
# in the visualization module.
sentiment_fig = plot_sentiment_distribution(df_processed, 'sentiment_binary')
plt.show()

In [None]:
# Plot rating distribution of the 'Score' column.
# NOTE(review): this reads df_labeled rather than df_processed, so if
# preprocessing removes any rows they would still be counted here - confirm
# that is intended.
rating_fig = plot_rating_distribution(df_labeled, 'Score')
plt.show()

In [None]:
# Word cloud for positive reviews: select the rows labeled 1, join their
# processed text into one space-separated corpus, and hand it to the helper.
is_positive = df_processed['sentiment_binary'] == 1
positive_text = ' '.join(df_processed.loc[is_positive, 'Text_processed'])
positive_wordcloud = plot_word_cloud(positive_text, "Word Cloud of Positive Reviews")
plt.show()

In [None]:
# Word cloud for negative reviews: select the rows labeled 0, join their
# processed text into one space-separated corpus, and hand it to the helper.
is_negative = df_processed['sentiment_binary'] == 0
negative_text = ' '.join(df_processed.loc[is_negative, 'Text_processed'])
negative_wordcloud = plot_word_cloud(negative_text, "Word Cloud of Negative Reviews")
plt.show()

## Model Training and Evaluation

### Logistic Regression

In [None]:
# Logistic Regression: build the model wrapper, let it produce the
# train/test split from the processed text and binary labels, then fit.
# The four split variables are reused unchanged by the other model cells.
lr_model = SentimentModel(model_type='logistic_regression')
lr_split = lr_model.prepare_data(df_processed, 'Text_processed', 'sentiment_binary')
X_train, X_test, y_train, y_test = lr_split
lr_model.train(X_train, y_train)

In [None]:
# Evaluate Logistic Regression model on X_test / y_test.
# lr_results is read later as lr_results['accuracy'] by the model
# comparison cell.
lr_results = lr_model.evaluate(X_test, y_test)

### Naive Bayes

In [None]:
# Initialize and train Naive Bayes model.
# Reuses the X_train / y_train split produced by lr_model.prepare_data in the
# Logistic Regression cell, so that cell must have been run first.
# NOTE(review): if prepare_data fits feature-extraction state on lr_model,
# confirm this separate SentimentModel instance can consume X_train as-is.
nb_model = SentimentModel(model_type='naive_bayes')
nb_model.train(X_train, y_train)

In [None]:
# Evaluate Naive Bayes model on X_test / y_test.
# nb_results is read later as nb_results['accuracy'] by the model
# comparison cell.
nb_results = nb_model.evaluate(X_test, y_test)

### Support Vector Machine

In [None]:
# Initialize and train SVM model.
# Reuses the X_train / y_train split produced by lr_model.prepare_data in the
# Logistic Regression cell, so that cell must have been run first.
# NOTE(review): if prepare_data fits feature-extraction state on lr_model,
# confirm this separate SentimentModel instance can consume X_train as-is.
svm_model = SentimentModel(model_type='svm')
svm_model.train(X_train, y_train)

In [None]:
# Evaluate SVM model on X_test / y_test.
# svm_results is read later as svm_results['accuracy'] by the model
# comparison cell.
svm_results = svm_model.evaluate(X_test, y_test)

### Random Forest

In [None]:
# Initialize and train Random Forest model.
# Reuses the X_train / y_train split produced by lr_model.prepare_data in the
# Logistic Regression cell, so that cell must have been run first.
# NOTE(review): if prepare_data fits feature-extraction state on lr_model,
# confirm this separate SentimentModel instance can consume X_train as-is.
rf_model = SentimentModel(model_type='random_forest')
rf_model.train(X_train, y_train)

In [None]:
# Evaluate Random Forest model on X_test / y_test.
# rf_results is read later as rf_results['accuracy'] by the model
# comparison cell.
rf_results = rf_model.evaluate(X_test, y_test)

## Model Comparison

In [None]:
# Compare model accuracies.
# Gather each model's evaluation results under its display name; the dict
# keeps insertion order, so names and accuracies stay aligned.
results_by_model = {
    'Logistic Regression': lr_results,
    'Naive Bayes': nb_results,
    'SVM': svm_results,
    'Random Forest': rf_results,
}
model_names = list(results_by_model)
accuracies = [result['accuracy'] for result in results_by_model.values()]

# Tabulate the scores and rank them, highest accuracy first
comparison_df = pd.DataFrame(
    {'Model': model_names, 'Accuracy': accuracies}
).sort_values('Accuracy', ascending=False)

# Horizontal bar chart, one bar per model
comparison_fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Accuracy', y='Model', data=comparison_df, ax=ax)
ax.set_title('Model Accuracy Comparison', fontsize=16)
ax.set_xlabel('Accuracy', fontsize=12)
ax.set_ylabel('Model', fontsize=12)

# Write each accuracy just to the right of its bar; bars are addressed by
# their row position in the sorted frame
for bar_index, acc in enumerate(comparison_df['Accuracy']):
    ax.text(acc + 0.01, bar_index, f"{acc:.4f}", ha='left', va='center', fontsize=12)

# Accuracy is a fraction, so pin the axis to the full 0-1 range
ax.set_xlim(0, 1.0)
plt.tight_layout()
plt.show()

## Test with Custom Reviews

In [None]:
# Model used for the demo predictions below; the final cell saves this same
# object. The choice is manual - it is not derived from the accuracy table.
best_model = lr_model  # Change this to the best performing model

# Hand-written sample reviews covering positive, negative and mixed wording
test_reviews = [
    "This product is amazing! I love it so much.",
    "Terrible product. Waste of money. Would not recommend.",
    "It's okay, not great but not terrible either.",
    "I would definitely buy this again. Highly recommended!",
    "Poor quality, broke after just one week of use."
]

# Clean each review with the same preprocessor instance used on the dataset
preprocessed_reviews = list(map(preprocessor.preprocess_text, test_reviews))

# Predict one review at a time.
# NOTE(review): assumes predict accepts a single preprocessed string and
# returns a value comparable to 1 - confirm in the model module.
predictions = list(map(best_model.predict, preprocessed_reviews))
sentiments = ['positive' if label == 1 else 'negative' for label in predictions]

# Pair every original review with its predicted sentiment
results_df = pd.DataFrame(
    list(zip(test_reviews, sentiments)),
    columns=['Review', 'Predicted Sentiment'],
)

# Trailing expression: rendered as the cell output
results_df

## Save the Best Model

In [None]:
# Save the best performing model
from pathlib import Path

model_path = '../models/best_sentiment_model.pkl'

# Create the target directory up front so the save does not fail on a fresh
# checkout where '../models' does not exist yet (harmless if it already
# exists, or if save_model would have created it itself).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

best_model.save_model(model_path)
print(f"Best model saved to {model_path}")