This notebook provides a test walkthrough of the sentiment analysis pipeline.

## Setup and Imports

In [None]:
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report

# Add the parent directory to the import path so the `src.*` imports below
# resolve (assumes the kernel's working directory sits one level below the
# project root — TODO confirm). Guarded so that re-running this setup cell
# does not keep appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from src.data_loader import load_amazon_reviews
from src.preprocessor import prepare_review_data, TextPreprocessor
from src.tfidf_model import TFIDFSentimentAnalyzer
from src.llm_model import LLMSentimentAnalyzer
from src.visualizer import SentimentVisualizer
from src.summarizer import CustomerInsightSummarizer

# Display configuration: show every column of a frame (no column limit) and
# truncate cell text at 100 characters.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100


: 

## 1. Load and Explore Data

In [None]:
# Ask the loader for 1000 reviews from the Electronics category, then report
# the frame's shape and column names before previewing the first rows.
df = load_amazon_reviews(n_samples=1000, category='Electronics')

print(f"Dataset shape: {df.shape}")
column_names = df.columns.tolist()
print(f"\nColumns: {column_names}")
df.head()

In [None]:
# Tabulate how many reviews carry each rating value, ordered by rating.
print("Rating Distribution:")
rating_counts = df['rating'].value_counts().sort_index()
print(rating_counts)

# Draw the same counts as a bar chart; the Axes returned by .plot() is
# labelled directly instead of going through the pyplot state machine.
ax = rating_counts.plot(kind='bar', color='skyblue')
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

## 2. Text Preprocessing

In [None]:
# Run the project's preparation step over the loaded reviews.
# NOTE(review): this rebinds `df`, so re-running the cell feeds the
# already-prepared frame back through prepare_review_data — confirm that
# the function tolerates this.
df = prepare_review_data(df)

# Show one review before and after preprocessing, together with its label.
print("Original vs Preprocessed Text:\n")
sample_idx = 0
sample_row = df.iloc[sample_idx]
print(f"Original:\n{sample_row['full_text']}")
print(f"\nProcessed:\n{sample_row['processed_text']}")
print(f"\nSentiment Label: {sample_row['sentiment_label']}")

In [None]:
# Count the reviews in each sentiment class.
label_column = 'sentiment_label'
print("\nSentiment Distribution:")
label_counts = df[label_column].value_counts()
print(label_counts)

# Plot the same distribution with the project's visualizer. `viz` is kept as
# a notebook-level name because the dashboard cell later on reuses it.
viz = SentimentVisualizer()
viz.plot_sentiment_distribution(df, label_column)
plt.show()

## 3. TF-IDF Sentiment Analysis

In [None]:
# Build the TF-IDF analyzer, configured with classifier_type='logistic'.
tfidf_analyzer = TFIDFSentimentAnalyzer(classifier_type='logistic')

# Train on the preprocessed review texts and their sentiment labels.
# test_size=0.2 is passed straight through to train()
# (presumably the held-out fraction — confirm against the analyzer).
training_texts = df['processed_text'].tolist()
training_labels = df['sentiment_label'].tolist()
metrics = tfidf_analyzer.train(
    texts=training_texts,
    labels=training_labels,
    test_size=0.2,
)

# Report the metrics returned by train().
print("\n✅ TF-IDF Model Performance:")
print(f"Test Accuracy: {metrics['test_accuracy']:.2%}")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"F1 Score: {metrics['f1_score']:.4f}")

In [None]:
# Retrieve the analyzer's top features (n_features=10); the result is
# iterated as a mapping of sentiment name -> sequence of feature strings.
top_features = tfidf_analyzer.get_top_features(n_features=10)

print("\nTop Predictive Features:\n")
for sentiment_name, feature_terms in top_features.items():
    heading = sentiment_name.upper()
    joined_terms = ', '.join(feature_terms)
    print(f"{heading}: {joined_terms}")

In [None]:
# Plot confusion matrix
# Asks the trained TF-IDF analyzer to draw its confusion matrix, then renders
# the figure.
# NOTE(review): presumably this uses the evaluation split produced by the
# train() call in the previous section — confirm against the analyzer.
tfidf_analyzer.plot_confusion_matrix()
plt.show()

## 4. Test on New Reviews

In [None]:
# Hand-written reviews used to spot-check the trained TF-IDF model on text
# that is not part of the loaded dataset.
test_reviews = [
    "This product is absolutely amazing! Best purchase ever!",
    "Terrible quality. Broke after one day. Don't waste your money.",
    "It's okay, nothing special but does the job.",
    "Love it! Works perfectly and shipping was fast.",
]

# Run every review through TextPreprocessor before prediction.
preprocessor = TextPreprocessor()
processed_reviews = list(map(preprocessor.preprocess, test_reviews))

# Classify the preprocessed reviews.
predictions = tfidf_analyzer.predict(processed_reviews)

# Print each original review next to its predicted sentiment.
print("\nPredictions on New Reviews:\n")
for raw_review, predicted_label in zip(test_reviews, predictions):
    print(f"Review: {raw_review}")
    print(f"Sentiment: {predicted_label}\n")

## 5. LLM-Based Analysis (Optional - Takes Longer)

In [None]:
# Cap the LLM run at 200 reviews (or fewer if the frame is smaller) for
# speed; the fixed random_state keeps the sample reproducible across runs.
sample_size = min(200, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# Analyze the sampled reviews, reading text from the 'full_text' column.
llm_analyzer = LLMSentimentAnalyzer(model_name='distilbert')
df_sample = llm_analyzer.analyze_reviews(df_sample, text_column='full_text')

print("✅ LLM Analysis complete!")

In [None]:
# Compare LLM with actual labels.
# accuracy_score and classification_report are imported in the setup cell at
# the top of the notebook, so a missing scikit-learn install surfaces before
# the slow LLM step instead of after it.
llm_true_labels = df_sample['sentiment_label']
llm_predicted_labels = df_sample['llm_sentiment_normalized']

# Overall agreement between the LLM output and the dataset labels.
llm_accuracy = accuracy_score(llm_true_labels, llm_predicted_labels)

print(f"LLM Accuracy: {llm_accuracy:.2%}\n")
print("Classification Report:")
# Per-class breakdown computed on the same two columns.
print(classification_report(llm_true_labels, llm_predicted_labels))

## 6. Generate Customer Insights

In [None]:
# Generate insights over the prepared reviews, labelled with a single
# product_id and reading review text from the 'full_text' column.
summarizer = CustomerInsightSummarizer()
insight_options = {
    'product_id': 'Electronics Sample',
    'text_column': 'full_text',
}
insights = summarizer.generate_product_insights(df, **insight_options)

# Turn the insights into a text report and print it.
report = summarizer.create_txt_report(insights)
print(report)

## 7. Create Comprehensive Dashboard

In [None]:
# Create dashboard
# Reuses the `viz` SentimentVisualizer instance created in the sentiment
# distribution cell (section 2), so that cell must have been run first.
# The dashboard is built from the prepared frame, using 'sentiment_label'
# as the sentiment column.
viz.create_dashboard(df, sentiment_column='sentiment_label')
plt.show()

## Conclusion

This notebook demonstrates:
- Data loading and preprocessing
- TF-IDF sentiment classification
- LLM-based sentiment analysis
- Model comparison
- Insight generation
- Comprehensive visualizations

For the full pipeline, run `python quick_start.py`