# Advanced Sentiment Analysis

This notebook compares two sentiment analysis methods, VADER and TextBlob, on song lyrics: how often their labels agree, how strongly their scores correlate, and how sentiment varies by genre and artist.

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sentiment_analysis.data_utils import create_sample_data
from sentiment_analysis.preprocessing import preprocess_lyrics_dataframe
from sentiment_analysis.analyzer import SentimentAnalyzer
from sentiment_analysis.visualization import (
    plot_sentiment_distribution,
    plot_sentiment_comparison,
    plot_correlation_matrix
)

%matplotlib inline

## Load and Prepare Data

In [None]:
# Build the sample dataset and pass it straight through the lyrics preprocessing step.
df = preprocess_lyrics_dataframe(create_sample_data())
print(f"Loaded {len(df)} songs")

## Compare VADER and TextBlob

In [None]:
# Run the VADER analyzer on a copy of the lyrics frame, then rename its
# 'sentiment' and 'compound' columns so they carry a method prefix.
vader_analyzer = SentimentAnalyzer(method='vader')
df_vader = (
    vader_analyzer
    .analyze_dataframe(df.copy(), text_column='lyrics')
    .rename(columns={'sentiment': 'vader_sentiment', 'compound': 'vader_compound'})
)

# Same procedure with the TextBlob analyzer; only 'sentiment' is renamed,
# the 'polarity' column keeps its name and is read below.
textblob_analyzer = SentimentAnalyzer(method='textblob')
df_textblob = (
    textblob_analyzer
    .analyze_dataframe(df.copy(), text_column='lyrics')
    .rename(columns={'sentiment': 'textblob_sentiment'})
)

# Attach both methods' labels and scores to a fresh copy of the preprocessed
# frame (assign returns a new DataFrame; columns are aligned on the index).
df_combined = df.assign(
    vader_sentiment=df_vader['vader_sentiment'],
    vader_compound=df_vader['vader_compound'],
    textblob_sentiment=df_textblob['textblob_sentiment'],
    textblob_polarity=df_textblob['polarity'],
)

# Preview the side-by-side results for the first few songs
preview_columns = ['song_title', 'vader_sentiment', 'vader_compound', 'textblob_sentiment', 'textblob_polarity']
df_combined[preview_columns].head()

## Agreement Analysis

In [None]:
# Pull out the two label columns once so the comparisons below read cleanly
vader_labels = df_combined['vader_sentiment']
textblob_labels = df_combined['textblob_sentiment']

# Number of songs where both methods produced the same label, as a percentage of all songs
agreement = (vader_labels == textblob_labels).sum()
agreement_pct = (agreement / len(df_combined)) * 100

print(f"Agreement between VADER and TextBlob: {agreement_pct:.1f}%")
print(f"\nDisagreement cases:")

# Rows where the two labels differ, shown with the lyrics for manual inspection
disagreement = df_combined.loc[vader_labels != textblob_labels]
print(disagreement[['song_title', 'vader_sentiment', 'textblob_sentiment', 'lyrics']])

## Correlation Analysis

In [None]:
# Scatter the two continuous scores against each other, one point per song
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_combined['vader_compound'], df_combined['textblob_polarity'], alpha=0.6)
ax.set_xlabel('VADER Compound Score')
ax.set_ylabel('TextBlob Polarity')
ax.set_title('Correlation between VADER and TextBlob Sentiment Scores')

# Dashed reference lines at zero on both axes split the plot into quadrants
ax.axhline(y=0, color='r', linestyle='--', alpha=0.3)
ax.axvline(x=0, color='r', linestyle='--', alpha=0.3)
ax.grid(True, alpha=0.3)
plt.show()

# Correlation between the two score columns (Series.corr with its default method)
vader_scores = df_combined['vader_compound']
correlation = vader_scores.corr(df_combined['textblob_polarity'])
print(f"Correlation coefficient: {correlation:.3f}")

## Sentiment by Genre

In [None]:
# Only runs when the dataset carries a 'genre' column
if 'genre' in df_combined.columns:
    # Per-genre comparison of the VADER labels
    fig = plot_sentiment_comparison(
        df_combined,
        group_column='genre',
        sentiment_column='vader_sentiment',
        title='Sentiment Distribution by Genre',
    )
    plt.show()

    # Mean VADER compound score per genre, sorted ascending
    mean_compound_by_genre = df_combined.groupby('genre')['vader_compound'].mean()
    genre_sentiment = mean_compound_by_genre.sort_values()
    print("\nAverage Sentiment by Genre:")
    print(genre_sentiment)

## Sentiment by Artist

In [None]:
# Only runs when the dataset carries an 'artist' column
if 'artist' in df_combined.columns:
    # Per-artist comparison of the VADER labels
    artist_plot_options = {
        'group_column': 'artist',
        'sentiment_column': 'vader_sentiment',
        'title': 'Sentiment Distribution by Artist',
    }
    fig = plot_sentiment_comparison(df_combined, **artist_plot_options)
    plt.show()

## Feature Correlation with Sentiment

In [None]:
# Correlation matrix of both sentiment scores and the lyric word count;
# skipped when the frame has no 'word_count' column.
if 'word_count' in df_combined.columns:
    score_and_length_columns = ['vader_compound', 'textblob_polarity', 'word_count']
    fig = plot_correlation_matrix(df_combined, score_and_length_columns)
    plt.show()

## Summary and Insights

In [None]:
# Headline numbers for the whole run
print("=== Sentiment Analysis Summary ===")
song_count = len(df_combined)
print(f"\nTotal songs analyzed: {song_count}")

# Label counts per method
print(f"\nVADER Sentiment Distribution:")
vader_label_counts = df_combined['vader_sentiment'].value_counts()
print(vader_label_counts)

print(f"\nTextBlob Sentiment Distribution:")
textblob_label_counts = df_combined['textblob_sentiment'].value_counts()
print(textblob_label_counts)

# Mean continuous score per method
mean_vader_compound = df_combined['vader_compound'].mean()
print(f"\nAverage VADER compound score: {mean_vader_compound:.3f}")
mean_textblob_polarity = df_combined['textblob_polarity'].mean()
print(f"Average TextBlob polarity: {mean_textblob_polarity:.3f}")