In [2]:
# Sentiment Modeling - Customer Feedback Analytics
# This notebook trains and evaluates multiple sentiment analysis models.

# Import libraries
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation and library warnings will not be shown. Consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

# Import custom modules
# '..' is resolved against the kernel's working directory, so the project
# import below presumably expects the notebook to be run from a folder one
# level below the project root -- TODO confirm.
import sys
sys.path.append('..')
from src.models.sentiment_analyzer import SentimentAnalyzer

print("Libraries loaded successfully!")

# 1. Load Preprocessed Data
# Read the preprocessed review table and report how many rows it holds.
reviews_path = '../data/processed/preprocessed_reviews.csv'
df = pd.read_csv(reviews_path)
print(f"Loaded {len(df)} preprocessed reviews")

# Check class distribution
# Tally the rows per value of the `true_sentiment` column before modeling.
sentiment_counts = df['true_sentiment'].value_counts()
print("\nSentiment distribution:")
print(sentiment_counts)

# 2. Prepare Data for Modeling
# The project's SentimentAnalyzer produces the four train/test pieces from the
# review table (presumably a train/test split -- the helper is defined in
# src/models/sentiment_analyzer.py and is not visible here).
analyzer = SentimentAnalyzer()
split_parts = analyzer.prepare_data(df)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Report each distinct training label with its count and share of the set.
print(f"\nClass distribution in training set:")
unique, counts = np.unique(y_train, return_counts=True)
for position in range(len(unique)):
    u = unique[position]
    c = counts[position]
    print(f"  Class {u}: {c} ({c/len(y_train)*100:.1f}%)")

# 3. Train Traditional ML Models
# Train the analyzer's TF-IDF based models; the result maps each model name to
# a record holding (at least) 'accuracy', 'report' and 'predictions'.
tfidf_results = analyzer.train_tfidf_models(X_train, X_test, y_train, y_test)

# Print the headline accuracy and the per-class report for every model.
for model_name in tfidf_results:
    results = tfidf_results[model_name]
    print(f"\n{model_name.upper()} Results:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print("\nClassification Report:")
    print(results['report'])

# Confusion matrices
# Draw one heatmap per trained model, side by side. The grid is sized from the
# number of models instead of a fixed two columns, so the cell keeps working
# when `tfidf_results` holds one model or more than two.
# NOTE(review): the tick labels assume the sorted label order produced by
# confusion_matrix is negative, neutral, positive -- TODO confirm against the
# label encoding used by analyzer.prepare_data.
class_names = ['Negative', 'Neutral', 'Positive']
n_panels = max(len(tfidf_results), 1)  # subplots() rejects a zero-column grid
# squeeze=False always yields a 2-D array, even for a single panel.
fig, axes_grid = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)
axes = axes_grid[0]
for idx, (model_name, results) in enumerate(tfidf_results.items()):
    cm = confusion_matrix(y_test, results['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                xticklabels=class_names,
                yticklabels=class_names)
    axes[idx].set_title(f'{model_name.upper()} Confusion Matrix')
    axes[idx].set_xlabel('Predicted')
    axes[idx].set_ylabel('Actual')
plt.tight_layout()
plt.show()

# 4. Feature Importance Analysis
# Plot the highest-importance TF-IDF terms of the random forest model.
TOP_N_FEATURES = 20

rf_model = analyzer.models['random_forest']
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load a vectorizer that this project wrote itself.
with open('../models/sentiment/tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
feature_names = vectorizer.get_feature_names_out()
importances = rf_model.feature_importances_

# The vectorizer comes from disk while the model comes from memory; if the
# pickle is stale the two disagree and the bars would carry the wrong terms.
if len(feature_names) != len(importances):
    raise ValueError(
        f"Vectorizer has {len(feature_names)} features but the random forest "
        f"reports {len(importances)} importances; the pickled vectorizer does "
        "not match the trained model."
    )

# argsort is ascending, so the tail holds the largest importances and barh
# draws the most important term at the top.
indices = np.argsort(importances)[-TOP_N_FEATURES:]
plt.figure(figsize=(10, 8))
plt.barh(range(len(indices)), importances[indices])
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Feature Importance')
# Use the real bar count so the title stays true for small vocabularies.
plt.title(f'Top {len(indices)} Most Important Features (Random Forest)')
plt.tight_layout()
plt.show()

# 5. Cross-Validation
# 5-fold accuracy of the random forest on the full review table.
#
# The vectorizer and the classifier are cross-validated together as one
# pipeline on the raw text, so TF-IDF vocabulary and IDF weights are learned
# from each training fold only. Fitting the vectorizer on the whole corpus
# first would leak held-out text statistics into training. cross_val_score
# clones the estimator for every fold, so the already-fitted `vectorizer` and
# `rf_model` objects are left untouched.
label_to_id = {'positive': 2, 'negative': 0, 'neutral': 1}
y_full = df['true_sentiment'].map(label_to_id)
if y_full.isna().any():
    # .map() turns labels missing from the mapping into NaN; fail with the
    # offending values instead of an opaque error further down.
    unknown_labels = sorted(df.loc[y_full.isna(), 'true_sentiment'].astype(str).unique())
    raise ValueError(f"Unmapped sentiment labels in 'true_sentiment': {unknown_labels}")
y_full = y_full.astype(int).values

cv_pipeline = make_pipeline(vectorizer, rf_model)
cv_scores = cross_val_score(cv_pipeline, df['cleaned_text'], y_full, cv=5, scoring='accuracy')
print("Cross-Validation Results (Random Forest):")
print(f"Scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# 6. Model Comparison
# Build the analyzer's summary table for the trained models and show it.
comparison_df = analyzer.compare_models(tfidf_results)
print("Model Comparison:")
print(comparison_df)

# Visualize model comparison
# Grouped bar chart: one group per model, one bar per metric column.
metrics = ['Accuracy', 'Negative Precision', 'Neutral Precision', 'Positive Precision']
models = comparison_df['Model'].values
width = 0.2
x = np.arange(len(models))

fig, ax = plt.subplots(figsize=(10, 6))
for i in range(len(metrics)):
    metric = metrics[i]
    values = comparison_df[metric].values
    # Shift each metric's bars right by one bar width within the group.
    ax.bar(x + i * width, values, width, label=metric)

ax.set_title('Model Performance Comparison')
ax.set_xlabel('Models')
ax.set_ylabel('Score')
# Centre the tick under the four-bar group before labelling it.
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(models)
ax.grid(True, alpha=0.3)
ax.legend()
plt.tight_layout()
plt.show()

# 7. Prediction Examples
# Hand-written reviews used to eyeball the random forest's predictions.
sample_reviews = [
    "This product is absolutely amazing! Best purchase ever!",
    "Terrible quality, broke after one day. Very disappointed.",
    "It's okay, nothing special but works as expected.",
    "DO NOT BUY! Complete waste of money!!!",
    "Love it! Exceeded all my expectations. Highly recommend.",
]
predictions, confidences = analyzer.predict_with_confidence(sample_reviews, 'random_forest')

header_rule = "=" * 60
row_rule = "-" * 60
print("Sample Predictions:")
print(header_rule)
# One entry per review: the text, then its predicted label and confidence.
for review, pred, conf in zip(sample_reviews, predictions, confidences):
    print(f"Review: {review}")
    print(f"Prediction: {pred} (Confidence: {conf:.2%})")
    print(row_rule)

# 8. Generate Final Predictions
# Score every review with the model named in the first comparison row and
# write the table, with two new columns, back to disk.
# NOTE(review): taking row 0 presumes compare_models returns its rows best
# first -- confirm, or select the row explicitly by metric.
# NOTE(review): `df` presumably still contains the rows used for training, so
# predictions on those rows are not an out-of-sample result -- verify.
top_row = comparison_df.iloc[0]
best_model = top_row['Model'].lower().replace(' ', '_')
print(f"Using best model: {best_model}")

all_texts = df['cleaned_text'].values
all_predictions, all_confidences = analyzer.predict_with_confidence(all_texts, best_model)
df['predicted_sentiment'] = all_predictions
df['confidence_score'] = all_confidences

output_path = '../data/processed/sentiment_predictions.csv'
df.to_csv(output_path, index=False)
print(f"\nPredictions saved to: {output_path}")

# 9. Error Analysis
# Summarise where the predicted label disagrees with the true label and print
# a few example reviews.
# NOTE(review): the comparison assumes `predicted_sentiment` uses the same
# label values as `true_sentiment`; if the model returns encoded integers every
# row would count as misclassified -- verify.
SAMPLE_SEED = 42   # fixed so a re-run shows the same example reviews
MAX_EXAMPLES = 5
SNIPPET_CHARS = 100

misclassified = df[df['true_sentiment'] != df['predicted_sentiment']]
# Guard the percentage against an empty table (would divide by zero).
error_rate = len(misclassified) / len(df) * 100 if len(df) else 0.0
print(f"Total misclassifications: {len(misclassified)} ({error_rate:.1f}%)")
misclass_patterns = pd.crosstab(misclassified['true_sentiment'], misclassified['predicted_sentiment'])
print("\nMisclassification Patterns:")
print(misclass_patterns)
print("\nSample Misclassifications:")
examples = misclassified.sample(min(MAX_EXAMPLES, len(misclassified)), random_state=SAMPLE_SEED)
for _, row in examples.iterrows():
    review_text = row['review_text']
    # A missing review is read from CSV as NaN (a float), which cannot be sliced.
    snippet = review_text[:SNIPPET_CHARS] if isinstance(review_text, str) else ''
    print(f"\nReview: {snippet}...")
    print(f"True: {row['true_sentiment']} | Predicted: {row['predicted_sentiment']} | Confidence: {row['confidence_score']:.2f}")


Libraries loaded successfully!
Loaded 5063 preprocessed reviews

Sentiment distribution:
true_sentiment
positive    2993
negative    1394
neutral      676
Name: count, dtype: int64


NameError: name 'analyzer' is not defined

In [2]:
%pip install nltk
%pip install textblob
%pip install spacy
%pip install textstat
%pip install gensim
%pip install torch
%pip install transformers
%pip install scikit-learn
%pip install bertopic
%pip install yake
%pip install wordcloud

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting numpy>=1.19.0
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into acco