# 🎵 Spotify Data Analytics — Notebook

This notebook loads your cleaned Spotify streaming history from `data/processed/streaming_history_clean.csv`
and produces a few quick insights and charts (each chart is also saved as a PNG under `data/analytics/figures/`):

- Daily listening minutes (time series)
- 7-day rolling average of listening minutes
- Top artists (by total minutes) — bar chart
- Top tracks (by total minutes) — bar chart
- Monthly listening minutes (aggregate)

> **Note:** Keep your private raw export out of git; this notebook only reads the cleaned CSV produced by `src/process_spotify_data.py`.


In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Input CSV and output locations used by every cell below.
DATA_PROCESSED = Path('data/processed/streaming_history_clean.csv')
AN_DIR = Path('data/analytics')
FIG_DIR = AN_DIR / 'figures'
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a missing file slip through to read_csv.
if not DATA_PROCESSED.exists():
    raise FileNotFoundError(
        f"Could not find {DATA_PROCESSED}. Run the script: python src/process_spotify_data.py --input data/raw"
    )

# Load data
df = pd.read_csv(DATA_PROCESSED)

# Parse timestamps as UTC; values that cannot be parsed become NaT and the
# corresponding rows are dropped.
df['played_at'] = pd.to_datetime(df['played_at'], utc=True, errors='coerce')
# Explicit copy so the column assignments below operate on an independent frame.
df = df.dropna(subset=['played_at']).copy()

# Calendar day (in UTC) as a timezone-naive midnight timestamp. Done with
# vectorized datetime ops rather than a round trip through Python `date` objects.
df['date'] = df['played_at'].dt.tz_localize(None).dt.normalize()

# Durations that are missing or non-numeric count as 0 ms.
df['ms_played'] = pd.to_numeric(df['ms_played'], errors='coerce').fillna(0).astype('int64')

df.head()

## Quick sanity checks

In [None]:
# Headline numbers for a quick eyeball check of the loaded data:
# number of plays, total listening time (ms -> minutes) and the covered dates.
n_plays = len(df)
date_range = tuple(df['date'].agg(['min', 'max']))
total_minutes = df['ms_played'].sum() / 60000.0

print(f"Total plays: {n_plays:,}")
print(f"Total listening minutes: {total_minutes:,.2f}")
print(f"Date range: {date_range[0]} → {date_range[1]}")

## Daily listening minutes — time series

In [None]:
# Total listening time per calendar day, converted from milliseconds to
# minutes. The result has exactly two columns: 'date' and 'minutes'.
daily = (
    df.groupby('date', dropna=True)['ms_played']
      .sum()
      .div(60000.0)
      .rename('minutes')
      .reset_index()
      .sort_values('date')
)

# Line chart of the daily totals; also written to FIG_DIR as a PNG.
fig, ax = plt.subplots()
ax.plot(daily['date'], daily['minutes'])
ax.set_title('Listening Minutes by Day')
ax.set_xlabel('Date')
ax.set_ylabel('Minutes')
fig.tight_layout()
fig.savefig(FIG_DIR / 'daily_minutes_timeseries.png')
plt.show()

daily.tail(10)

## 7-day rolling average of daily minutes

In [None]:
# 7-day rolling mean of daily listening minutes.
#
# `daily` only has rows for days with at least one play, and
# `rolling(window=7)` counts rows, not calendar days. Without filling the gaps
# a "7-day" window could span weeks and would ignore days with no listening.
# Reindex to a continuous daily calendar first, treating missing days as 0 minutes.
daily_rolling = daily.copy()
if not daily_rolling.empty:  # asfreq needs a valid first/last date
    daily_rolling = (daily_rolling.set_index('date')
                                  .asfreq('D', fill_value=0.0)
                                  .reset_index())

# min_periods=1 keeps the first six days (partial windows) instead of NaN.
daily_rolling['roll7'] = daily_rolling['minutes'].rolling(window=7, min_periods=1).mean()

plt.figure()
plt.plot(daily_rolling['date'], daily_rolling['roll7'])
plt.title('7-Day Rolling Average of Listening Minutes')
plt.xlabel('Date')
plt.ylabel('Minutes (7-day avg)')
plt.tight_layout()
plt.savefig(FIG_DIR / 'daily_minutes_rolling7.png')
plt.show()

daily_rolling.tail(10)

## Top artists by total minutes

In [None]:
# The 15 artists with the most total playback time. Rows are ordered from
# most to least played; 'minutes' is 'ms_played' converted from milliseconds.
top_artists = (
    df.groupby('artist', dropna=True)['ms_played']
      .sum()
      .sort_values(ascending=False)
      .iloc[:15]
      .reset_index()
      .assign(minutes=lambda t: t['ms_played'] / 60000.0)
)

# Horizontal bars are drawn bottom-up, so reverse the order to put the
# top artist at the top of the chart.
fig, ax = plt.subplots()
ax.barh(top_artists['artist'].iloc[::-1], top_artists['minutes'].iloc[::-1])
ax.set_title('Top 15 Artists by Minutes')
ax.set_xlabel('Minutes')
ax.set_ylabel('Artist')
fig.tight_layout()
fig.savefig(FIG_DIR / 'top_artists_minutes.png')
plt.show()

top_artists.loc[:, ['artist', 'minutes']].head(15)

## Top tracks by total minutes

In [None]:
# The 15 (track, artist) pairs with the most total playback time, ordered
# from most to least played; 'minutes' is 'ms_played' converted from ms.
top_tracks = (
    df.groupby(['track', 'artist'], dropna=True)['ms_played']
      .sum()
      .sort_values(ascending=False)
      .iloc[:15]
      .reset_index()
      .assign(minutes=lambda t: t['ms_played'] / 60000.0)
)

# One "track — artist" label per row, aligned with top_tracks' index.
labels = pd.Series(
    [f"{track} — {artist}" for track, artist in zip(top_tracks['track'], top_tracks['artist'])],
    index=top_tracks.index,
)

# Reverse so the most-played track ends up at the top of the horizontal bars.
fig, ax = plt.subplots()
ax.barh(labels.iloc[::-1], top_tracks['minutes'].iloc[::-1])
ax.set_title('Top 15 Tracks by Minutes')
ax.set_xlabel('Minutes')
ax.set_ylabel('Track — Artist')
fig.tight_layout()
fig.savefig(FIG_DIR / 'top_tracks_minutes.png')
plt.show()

top_tracks.loc[:, ['track', 'artist', 'minutes']].head(15)

## Monthly listening minutes

In [None]:
# Total listening minutes per calendar month.
#
# Each date is truncated to the first day of its month. This goes through a
# monthly Period instead of `astype('datetime64[M]')`, because pandas >= 2.0
# only supports s/ms/us/ns datetime units and raises TypeError on that cast.
df['month'] = df['date'].dt.to_period('M').dt.to_timestamp()
monthly = df.groupby('month')['ms_played'].sum().reset_index()
monthly['minutes'] = monthly['ms_played'] / 60000.0  # ms -> minutes

plt.figure()
plt.plot(monthly['month'], monthly['minutes'])
plt.title('Monthly Listening Minutes')
plt.xlabel('Month')
plt.ylabel('Minutes')
plt.tight_layout()
plt.savefig(FIG_DIR / 'monthly_minutes.png')
plt.show()

monthly.tail(12)