# 02 - Feature Analysis

**Purpose:** Deep dive into audio features and their relationship with song popularity.

- Feature correlations
- Popularity tier comparisons
- Feature vs. popularity scatter plots
- Hit song profile analysis
- Top artists
- Temporal trends
- Duration impact

**Prerequisite:** Run [01_exploration.ipynb](01_exploration.ipynb) first to generate the cleaned dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# Read the cleaned dataset written by notebook 01 and parse the release
# dates into real datetimes.
cleaned_csv = Path("../data/processed/spotify_cleaned.csv")
df = pd.read_csv(cleaned_csv)
df['release_date'] = pd.to_datetime(df['release_date'])

# Audio feature columns analysed throughout this notebook.
audio_features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

track_count = len(df)
print(f"Loaded {track_count:,} tracks from cleaned dataset")

## 1. Feature Correlations with Popularity

Which audio characteristics are most associated with popular songs?

In [None]:
# Correlation of every audio feature with popularity, sorted ascending so
# the most negative feature comes first.
corr_matrix = df[[*audio_features, 'popularity']].corr()
correlations = corr_matrix['popularity'].drop('popularity').sort_values()

# Red bars for negative correlations, green for everything else.
bar_colors = []
for value in correlations:
    if value < 0:
        bar_colors.append('#e74c3c')
    else:
        bar_colors.append('#27ae60')

fig, ax = plt.subplots(figsize=(10, 6))
correlations.plot(kind='barh', color=bar_colors, ax=ax)
ax.set_xlabel('Correlation with Popularity')
ax.set_title('Which Audio Features Correlate with Popularity?')
# Thin vertical rule at zero separates negative from positive bars.
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
fig.tight_layout()
plt.show()

print("\nCorrelation values:")
print(correlations.round(3))

In [None]:
# Full pairwise correlation matrix of the audio features plus popularity,
# drawn as an annotated heatmap with the colour scale centred on zero.
heatmap_columns = [*audio_features, 'popularity']
pairwise_corr = df[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    pairwise_corr,
    annot=True,
    cmap='RdYlGn',
    center=0,
    fmt='.2f',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap: Audio Features & Popularity')
fig.tight_layout()
plt.show()

## 2. Popularity Tiers Comparison

How do audio features differ across low, medium, and high popularity songs?

In [None]:
# Mean of every audio feature within each popularity tier.
tier_avg = df.groupby('popularity_tier')[audio_features].mean()

# groupby returns the tiers in sorted-label order, which is not necessarily
# lowest-to-highest popularity. Reorder the rows by each tier's mean
# popularity so the red -> orange -> green palette below always runs from the
# least popular tier to the most popular one.
tier_order = (
    df.groupby('popularity_tier')['popularity']
    .mean()
    .sort_values()
    .index
)
tier_avg = tier_avg.loc[tier_order]

# NOTE(review): the palette has three colours, i.e. it assumes three tiers;
# confirm against the tiers created in notebook 01.
tier_colors = ['#e74c3c', '#f39c12', '#27ae60']

# One small bar chart per audio feature on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(14, 12))
axes = axes.flatten()

for ax, feature in zip(axes, audio_features):
    tier_avg[feature].plot(kind='bar', ax=ax, color=tier_colors)
    ax.set_title(feature.title())
    ax.set_xlabel('')
    ax.tick_params(axis='x', rotation=45)

# y slightly above 1 keeps the figure title clear of the top row of subplots.
plt.suptitle('Average Audio Features by Popularity Tier', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 3. Feature vs Popularity Scatter Plots

In [None]:
# Scatter of popularity against six selected audio features, each with a
# straight-line trend overlaid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

top_features = ['instrumentalness', 'loudness', 'danceability', 'energy', 'acousticness', 'valence']

for ax, feature in zip(axes, top_features):
    # Keep only rows where both values are present: np.polyfit cannot fit
    # through NaNs, and the scatter would not draw those points anyway.
    pair = df[[feature, 'popularity']].dropna()
    ax.scatter(pair[feature], pair['popularity'], alpha=0.3, s=15)

    # Degree-1 least-squares fit, drawn across the observed feature range.
    trend = np.poly1d(np.polyfit(pair[feature], pair['popularity'], 1))
    x_line = np.linspace(pair[feature].min(), pair[feature].max(), 100)
    ax.plot(x_line, trend(x_line), "r--", linewidth=2, label='Trend')

    ax.set_xlabel(feature.title())
    ax.set_ylabel('Popularity')
    ax.set_title(f'{feature.title()} vs Popularity')
    # Without a legend the 'Trend' label on the fitted line is never shown.
    ax.legend()

plt.tight_layout()
plt.show()

## 4. Hit Song Profile (Top 10%)

What makes a hit different from the rest?

In [None]:
# Split tracks at the 90th percentile of popularity. Tracks tied exactly at
# the threshold count as hits, so the hit group can be a little over 10%.
top_10_pct = df['popularity'].quantile(0.9)
hits = df[df['popularity'] >= top_10_pct]
non_hits = df[df['popularity'] < top_10_pct]

print(f"Top 10% threshold: popularity >= {top_10_pct}")
print(f"Number of 'hit' songs: {len(hits):,}")
print(f"Number of other songs: {len(non_hits):,}")

# Compute each group's feature means once and reuse them below.
hit_means = hits[audio_features].mean()
other_means = non_hits[audio_features].mean()
difference = hit_means - other_means

comparison = pd.DataFrame({
    'Hits (Top 10%)': hit_means,
    'Others': other_means,
    'Difference': difference,
    # Divide by the absolute baseline so the sign of 'Diff %' always matches
    # 'Difference', even for a feature whose mean is negative (e.g. loudness
    # if it is stored in dB -- verify against the dataset).
    'Diff %': difference / other_means.abs() * 100,
}).round(3)

print("\n" + "="*60)
print(comparison)

In [None]:
# Grouped bar chart: mean value of each audio feature for hits next to the
# mean for all other tracks.
positions = np.arange(len(audio_features))
bar_width = 0.35
half_width = bar_width / 2

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    positions - half_width,
    comparison['Hits (Top 10%)'],
    bar_width,
    label=f'Hits (n={len(hits):,})',
    color='#f1c40f',
)
ax.bar(
    positions + half_width,
    comparison['Others'],
    bar_width,
    label=f'Others (n={len(non_hits):,})',
    color='#95a5a6',
)

ax.set_xlabel('Audio Features')
ax.set_ylabel('Average Value')
ax.set_title('Audio Feature Comparison: Hits vs Non-Hits')
# One tick per feature, centred between its two bars.
ax.set_xticks(positions)
ax.set_xticklabels([name.title() for name in audio_features], rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

## 5. Top Artists Analysis

In [None]:
# Per-artist mean popularity and track count, rounded to two decimals.
artist_stats = (
    df.groupby('artist')
    .agg(
        avg_popularity=('popularity', 'mean'),
        song_count=('popularity', 'count'),
    )
    .round(2)
)

# Ignore artists with fewer than three tracks, then keep the 15 with the
# highest average popularity.
has_enough_songs = artist_stats['song_count'] >= 3
artist_stats = artist_stats.loc[has_enough_songs]
top_artists = artist_stats.sort_values('avg_popularity', ascending=False).head(15)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_artists.index, top_artists['avg_popularity'], color='steelblue')
ax.set_xlabel('Average Popularity')
ax.set_title('Top 15 Artists by Average Popularity (min 3 songs)')
# Flip the y-axis so the highest-ranked artist is at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 6. Trends Over Time

How have audio features and popularity changed over the years?

In [None]:
# Yearly means (rounded to two decimals) of popularity and selected features.
trend_columns = ['popularity', 'danceability', 'energy', 'loudness', 'acousticness']
yearly_stats = df.groupby('release_year')[trend_columns].mean().round(2)

# (column, axis label, line colour) for each panel, in row-major grid order.
panels = [
    ('popularity', 'Popularity', 'green'),
    ('danceability', 'Danceability', 'orange'),
    ('energy', 'Energy', 'red'),
    ('acousticness', 'Acousticness', 'purple'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, label, line_color) in zip(axes.flat, panels):
    ax.plot(yearly_stats.index, yearly_stats[column], marker='o', color=line_color)
    ax.set_title(f'Average {label} by Release Year')
    ax.set_xlabel('Year')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()

## 7. Duration Analysis

Is there an optimal song length for popularity?

In [None]:
# Left panel: raw duration vs popularity. Right panel: mean popularity per
# duration bucket, with the best bucket highlighted.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

#scatter plot
median_duration = df['duration_min'].median()
axes[0].scatter(df['duration_min'], df['popularity'], alpha=0.3, s=15)
axes[0].axvline(x=median_duration, color='red', linestyle='--',
                label=f'Median: {median_duration:.1f} min')
axes[0].set_xlabel('Duration (minutes)')
axes[0].set_ylabel('Popularity')
axes[0].set_title('Song Duration vs Popularity')
axes[0].legend()

#bar chart by duration bins
# The top edge is open-ended (np.inf) so tracks longer than 10 minutes land
# in '5+ min' instead of becoming NaN and silently dropping out of the chart.
duration_bins = pd.cut(df['duration_min'], bins=[0, 2, 3, 4, 5, np.inf],
                       labels=['<2 min', '2-3 min', '3-4 min', '4-5 min', '5+ min'])
# observed=False keeps every labelled bin on the x-axis (empty bins show as
# NaN) and makes the categorical-grouping behaviour explicit.
duration_popularity = df.groupby(duration_bins, observed=False)['popularity'].mean()

# Highlight the same bin that the summary line below reports as optimal.
best_bin = duration_popularity.idxmax()
colors = ['#e74c3c' if label == best_bin else '#3498db'
          for label in duration_popularity.index]
duration_popularity.plot(kind='bar', ax=axes[1], color=colors)
axes[1].set_xlabel('Duration Range')
axes[1].set_ylabel('Average Popularity')
axes[1].set_title('Average Popularity by Song Duration')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print(f"Optimal duration range: {best_bin} (avg popularity: {duration_popularity.max():.1f})")

---
**Next:** [03_final_report.ipynb](03_final_report.ipynb) - Executive summary with key findings