# Spotify Streaming Analytics - Exploratory Data Analysis

## Project Overview
This notebook provides a comprehensive exploratory data analysis of Spotify streaming history data to replicate real-world Product Analyst workflows. We'll analyze user listening patterns, skip behavior, platform preferences, and generate actionable business insights.

**Dataset**: ~150k event-level streaming records (one row per track play)  
**Goal**: Extract insights about user engagement, retention, and behavioral patterns

## 1. Environment Setup and Data Loading

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sqlite3
from datetime import datetime, timedelta
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Notebook-wide display and plotting configuration.
# Render every column, and up to 100 rows, when a DataFrame is displayed.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Apply the 'seaborn-v0_8' matplotlib style sheet to all subsequent figures.
plt.style.use('seaborn-v0_8')

# Silence every warning category for the rest of the session so cell output
# stays readable.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Read the raw Spotify streaming-history export into a DataFrame.
data_path = '../data/spotify_history.csv'
df = pd.read_csv(data_path)

# Quick orientation: dimensions, column names and a preview of the first rows.
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
df.head()

In [None]:
# Dataset structure and basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n" + "="*50)
print("\nDataset Description:")
# Last expression of the cell: summary statistics for every column,
# numeric and non-numeric alike (include='all').
df.describe(include='all')

## 2. Data Quality Assessment and Cleaning

In [None]:
# Null audit: per-column null counts and their share of all rows.
print("Missing Values Analysis:")
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_df = pd.concat(
    [
        missing_values.rename('Missing Count'),
        missing_percent.rename('Missing Percentage'),
    ],
    axis=1,
)
# Only list the columns that actually contain nulls.
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

# Fully identical rows (every column equal to an earlier row).
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")

# Column dtypes as inferred by pandas when the CSV was read.
print("\nData Types:")
print(df.dtypes)

In [None]:
# Cardinality report: distinct non-null values per column, plus the most
# frequent values for low-cardinality columns.
print("Unique Value Counts:")
unique_counts = df.nunique()
for col, unique_count in unique_counts.items():
    print(f"{col}: {unique_count} unique values")

    # Columns with fewer than 20 distinct values are treated as categorical;
    # the track URI column is never summarised this way.
    is_low_cardinality = unique_count < 20 and col != 'spotify_track_uri'
    if is_low_cardinality:
        top_values = df[col].value_counts().head(5).to_dict()
        print(f"  Top values: {top_values}")
    print()

In [None]:
# Data cleaning and standardization
# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Drop rows lacking any of the fields the analysis depends on:
# track URI, timestamp and play duration.
initial_rows = len(df_clean)
df_clean = df_clean.dropna(subset=['spotify_track_uri', 'ts', 'ms_played'])
print(f"Removed {initial_rows - len(df_clean)} rows with missing critical values")

# Replace missing descriptive fields with an explicit 'Unknown' label.
categorical_cols = ['track_name', 'artist_name', 'album_name', 'reason_start', 'reason_end']
for col in categorical_cols:
    df_clean[col] = df_clean[col].fillna('Unknown')

# Convert boolean columns.
# pandas.read_csv may already have parsed TRUE/FALSE text into real booleans;
# mapping on the literal strings 'TRUE'/'FALSE' would then turn every value
# into NaN. Normalising through upper-cased text first handles both real
# booleans and string flags; anything else (including missing values) still
# becomes NaN, as before.
bool_lookup = {'TRUE': True, 'FALSE': False}
for col in ['shuffle', 'skipped']:
    as_text = df_clean[col].astype(str).str.strip().str.upper()
    df_clean[col] = as_text.map(bool_lookup)

print(f"\nCleaned dataset shape: {df_clean.shape}")
print(f"Missing values after cleaning: {df_clean.isnull().sum().sum()}")

## 3. Feature Engineering and Timestamp Processing

In [None]:
# Parse the raw `ts` strings (day-month-year hour:minute) into datetimes.
df_clean['timestamp'] = pd.to_datetime(df_clean['ts'], format='%d-%m-%Y %H:%M')

# Calendar features derived from the parsed timestamp.
stamp = df_clean['timestamp'].dt
df_clean = df_clean.assign(
    hour_of_day=stamp.hour,
    day_of_week=stamp.day_name(),
    day_of_week_num=stamp.dayofweek,
    month=stamp.month,
    year=stamp.year,
    date=stamp.date,
)

# Play duration in friendlier units (milliseconds -> seconds -> minutes).
df_clean = df_clean.assign(
    seconds_played=lambda frame: frame['ms_played'] / 1000,
    minutes_played=lambda frame: frame['seconds_played'] / 60,
)

# The data carries no track length, so the longest observed play of each
# track URI is used as an estimate of its length.
track_lengths = (
    df_clean.groupby('spotify_track_uri')['ms_played']
    .max()
    .rename('estimated_track_length_ms')
    .reset_index()
)
df_clean = df_clean.merge(track_lengths, on='spotify_track_uri', how='left')

# Share of the estimated length that was played; 0 when the estimate is not
# positive, which avoids dividing by zero.
length_ms = df_clean['estimated_track_length_ms']
raw_percent = np.where(
    length_ms > 0,
    (df_clean['ms_played'] / length_ms) * 100,
    0,
)

# Defensive cap at 100%. Because the estimate is the per-track maximum, the
# ratio is not expected to exceed 100 in the first place.
df_clean['percent_played'] = np.minimum(raw_percent, 100)

print("Feature engineering completed!")
new_columns = list(df_clean.columns[len(df.columns):])
print(f"New columns added: {new_columns}")
df_clean.head()

In [None]:
# Behavioural flags derived from the engineered columns.
# A play counts as a skip when ANY of these signals fires:
#   - the export's own `skipped` flag is True,
#   - less than 30% of the estimated track length was played,
#   - the play ended through the next/back button.
flagged_skipped = df_clean['skipped'] == True
barely_played = df_clean['percent_played'] < 30
ended_by_button = df_clean['reason_end'].isin(['nextbtn', 'backbtn'])
df_clean['is_skip'] = flagged_skipped | barely_played | ended_by_button

# Listening quality tiers by completion: >= 80% High, >= 50% Medium, else Low.
# np.select takes the first matching condition, so the order matters.
completion = df_clean['percent_played']
df_clean['listening_quality'] = np.select(
    [completion >= 80, completion >= 50],
    ['High', 'Medium'],
    default='Low',
)

# Time of day categories
def categorize_time(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Hours in [6, 12) are 'Morning', [12, 18) 'Afternoon' and [18, 22)
    'Evening'. Any other value falls through to 'Night'.
    """
    day_parts = (
        (6, 12, 'Morning'),
        (12, 18, 'Afternoon'),
        (18, 22, 'Evening'),
    )
    for start, stop, label in day_parts:
        if start <= hour < stop:
            return label
    return 'Night'

# Label every play with its part of day via categorize_time.
df_clean['time_of_day'] = df_clean['hour_of_day'].map(categorize_time)

# Summarise the features created in this cell.
skip_rate = df_clean['is_skip'].mean()
quality_counts = df_clean['listening_quality'].value_counts()
time_of_day_counts = df_clean['time_of_day'].value_counts()

print("Additional features created:")
print(f"Skip rate: {skip_rate:.1%}")
print(f"Listening quality distribution:\n{quality_counts}")
print(f"Time of day distribution:\n{time_of_day_counts}")

## 4. Exploratory Data Analysis - Listening Patterns

In [None]:
# Listening activity over time: a 2x2 grid (hour, weekday, part of day, month).
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_hour, ax_day), (ax_part, ax_month) = axes

# Top-left: number of plays per hour of day.
hourly_plays = df_clean.groupby('hour_of_day').size()
ax_hour.bar(hourly_plays.index, hourly_plays.values, color='skyblue')
ax_hour.set_title('Listening Activity by Hour of Day')
ax_hour.set_xlabel('Hour of Day')
ax_hour.set_ylabel('Number of Plays')

# Top-right: plays per weekday, forced into Monday..Sunday order
# (a weekday with no plays becomes NaN after the reindex).
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_plays = df_clean.groupby('day_of_week').size().reindex(day_order)
day_positions = range(len(daily_plays))
ax_day.bar(day_positions, daily_plays.values, color='lightcoral')
ax_day.set_title('Listening Activity by Day of Week')
ax_day.set_xlabel('Day of Week')
ax_day.set_ylabel('Number of Plays')
ax_day.set_xticks(day_positions)
ax_day.set_xticklabels([day[:3] for day in day_order], rotation=45)

# Bottom-left: share of plays per part of day.
time_plays = df_clean['time_of_day'].value_counts()
ax_part.pie(time_plays.values, labels=time_plays.index, autopct='%1.1f%%', startangle=90)
ax_part.set_title('Listening Distribution by Time of Day')

# Bottom-right: plays per calendar month number (all years pooled together).
monthly_plays = df_clean.groupby('month').size()
ax_month.plot(monthly_plays.index, monthly_plays.values, marker='o', color='green')
ax_month.set_title('Listening Activity by Month')
ax_month.set_xlabel('Month')
ax_month.set_ylabel('Number of Plays')
ax_month.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Headline numbers: the busiest hour and the busiest weekday.
peak_hour = hourly_plays.idxmax()
peak_day = daily_plays.idxmax()
peak_hour_plays = hourly_plays[peak_hour]
peak_day_plays = daily_plays[peak_day]
print(f"Peak listening hour: {peak_hour}:00 ({peak_hour_plays:,} plays)")
print(f"Peak listening day: {peak_day} ({peak_day_plays:,} plays)")

In [None]:
# Heatmap of listening activity: day of week (rows) vs hour of day (columns).
# The pivot is reindexed to the full 7 x 24 grid so that row i always
# corresponds to day_labels[i], even when a weekday or hour has no plays
# in the data; missing cells are filled with 0.
pivot_data = (
    df_clean.groupby(['day_of_week_num', 'hour_of_day'])
    .size()
    .unstack(fill_value=0)
    .reindex(index=range(7), columns=range(24), fill_value=0)
)
# Order matches pandas' dayofweek numbering (0 = Monday ... 6 = Sunday).
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_data,
           yticklabels=day_labels,
           cmap='YlOrRd',
           annot=False,
           fmt='d',
           cbar_kws={'label': 'Number of Plays'})
plt.title('Listening Activity Heatmap: Day of Week vs Hour of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.tight_layout()
plt.show()

## 5. Platform and Device Analysis

In [None]:
# Platform usage analysis
# One row per platform: play count, listening time, skip rate and completion.
platform_stats = df_clean.groupby('platform').agg(
    Total_Plays=('spotify_track_uri', 'count'),
    Total_Time_Sec=('seconds_played', 'sum'),
    Avg_Time_Sec=('seconds_played', 'mean'),
    Skip_Rate=('is_skip', 'mean'),
    Avg_Percent_Played=('percent_played', 'mean'),
)

# Derive the display columns from the UNROUNDED aggregates. Rounding the
# fractional skip rate to two decimals first would limit the percentage to
# whole percents despite the one-decimal rounding applied here.
platform_stats['Total_Time_Hours'] = (platform_stats['Total_Time_Sec'] / 3600).round(1)
platform_stats['Skip_Rate_Percent'] = (platform_stats['Skip_Rate'] * 100).round(1)

# Round the base aggregates for presentation only after the derived columns
# have been computed.
base_columns = ['Total_Plays', 'Total_Time_Sec', 'Avg_Time_Sec', 'Skip_Rate', 'Avg_Percent_Played']
platform_stats[base_columns] = platform_stats[base_columns].round(2)

print("Platform Comparison:")
print(platform_stats)

# Visualize platform metrics: one bar chart per metric in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

platforms = platform_stats.index
bar_colors = ['#1DB954', '#FF6B6B', '#4ECDC4']
# (axes, column to plot, title, y-axis label)
panels = [
    (axes[0, 0], 'Total_Plays', 'Total Plays by Platform', 'Number of Plays'),
    (axes[0, 1], 'Avg_Time_Sec', 'Average Listening Time by Platform', 'Seconds'),
    (axes[1, 0], 'Skip_Rate_Percent', 'Skip Rate by Platform', 'Skip Rate (%)'),
    (axes[1, 1], 'Avg_Percent_Played', 'Average Percent Played by Platform', 'Percent Played'),
]
for ax, column, title, ylabel in panels:
    ax.bar(platforms, platform_stats[column], color=bar_colors)
    ax.set_title(title)
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Platform usage patterns by time
# Plays per (platform, part of day), then each platform's row converted to
# percentages so the stacked bars all sum to 100.
platform_time = df_clean.groupby(['platform', 'time_of_day']).size().unstack(fill_value=0)
platform_time_pct = platform_time.div(platform_time.sum(axis=1), axis=0) * 100

# DataFrame.plot creates its own figure, so the size is passed to it directly.
# Calling plt.figure(figsize=...) beforehand would leave an empty extra figure
# and the requested size would not apply to the chart.
ax = platform_time_pct.plot(kind='bar', stacked=True, figsize=(12, 6),
                            color=['#FFE5B4', '#FFCC99', '#FF6B6B', '#4ECDC4'])
ax.set_title('Platform Usage Distribution by Time of Day (%)')
ax.set_xlabel('Platform')
ax.set_ylabel('Percentage of Usage')
# Legend is placed outside the plotting area, to the right of the bars.
ax.legend(title='Time of Day', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("Platform usage by time of day (%):\n")
print(platform_time_pct.round(1))

## 6. Skip Behavior Analysis

In [None]:
# Overall skip behavior analysis
skipped_mask = df_clean['is_skip']
skip_summary = {
    'Total Tracks': len(df_clean),
    'Skipped Tracks': skipped_mask.sum(),
    'Skip Rate': f"{skipped_mask.mean():.1%}",
    'Avg Percent Played (All)': f"{df_clean['percent_played'].mean():.1f}%",
    'Avg Percent Played (Non-Skip)': f"{df_clean.loc[~skipped_mask, 'percent_played'].mean():.1f}%",
    'Avg Percent Played (Skip)': f"{df_clean.loc[skipped_mask, 'percent_played'].mean():.1f}%"
}

print("Skip Behavior Summary:")
for key, value in skip_summary.items():
    print(f"{key}: {value}")

# Skip behavior by different factors (2x2 grid of charts).
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Skip rate by platform
skip_by_platform = df_clean.groupby('platform')['is_skip'].mean() * 100
axes[0, 0].bar(skip_by_platform.index, skip_by_platform.values, color='coral')
axes[0, 0].set_title('Skip Rate by Platform')
axes[0, 0].set_ylabel('Skip Rate (%)')

# Skip rate by time of day
skip_by_time = df_clean.groupby('time_of_day')['is_skip'].mean() * 100
axes[0, 1].bar(skip_by_time.index, skip_by_time.values, color='lightblue')
axes[0, 1].set_title('Skip Rate by Time of Day')
axes[0, 1].set_ylabel('Skip Rate (%)')
axes[0, 1].tick_params(axis='x', rotation=45)

# Skip rate by shuffle mode.
# The result is reindexed to [False, True] so the two fixed bar labels always
# line up with the right group; a shuffle state that never occurs in the data
# becomes NaN instead of causing a label/value length mismatch.
skip_by_shuffle = (
    df_clean.groupby('shuffle')['is_skip'].mean() * 100
).reindex([False, True])
axes[1, 0].bar(['No Shuffle', 'Shuffle'], skip_by_shuffle.values, color='lightgreen')
axes[1, 0].set_title('Skip Rate by Shuffle Mode')
axes[1, 0].set_ylabel('Skip Rate (%)')

# Skip rate by hour of day
skip_by_hour = df_clean.groupby('hour_of_day')['is_skip'].mean() * 100
axes[1, 1].plot(skip_by_hour.index, skip_by_hour.values, marker='o', color='purple')
axes[1, 1].set_title('Skip Rate by Hour of Day')
axes[1, 1].set_xlabel('Hour of Day')
axes[1, 1].set_ylabel('Skip Rate (%)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# How plays ended: frequency of each `reason_end` value.
print("Analysis of Reason End (Why tracks stopped):")
reason_end_counts = df_clean['reason_end'].value_counts()
print(reason_end_counts)

# Skip rate (%) per end reason, highest first.
# NOTE: `is_skip` is itself True for 'nextbtn' / 'backbtn' endings, so those
# two reasons are 100% by construction.
skip_by_reason_end = (
    df_clean.groupby('reason_end')['is_skip']
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

reason_fig, reason_ax = plt.subplots(figsize=(12, 6))
bar_positions = range(len(skip_by_reason_end))
reason_ax.bar(bar_positions, skip_by_reason_end.values, color='orange')
reason_ax.set_title('Skip Rate by Reason End')
reason_ax.set_xlabel('Reason End')
reason_ax.set_ylabel('Skip Rate (%)')
reason_ax.set_xticks(bar_positions)
reason_ax.set_xticklabels(skip_by_reason_end.index, rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\nSkip Rate by Reason End:")
for reason, rate in skip_by_reason_end.items():
    print(f"{reason}: {rate:.1f}%")

In [None]:
# Per-artist skip statistics: play count, number of skips and skip rate
# (rounded to three decimals).
artist_skip_stats = (
    df_clean.groupby('artist_name')
    .agg(
        Total_Plays=('spotify_track_uri', 'count'),
        Total_Skips=('is_skip', 'sum'),
        Skip_Rate=('is_skip', 'mean'),
    )
    .round(3)
)
# Ignore artists with fewer than 10 plays; their rates are too noisy to rank.
enough_plays = artist_skip_stats['Total_Plays'] >= 10
artist_skip_stats = artist_skip_stats[enough_plays]

# Ten artists with the highest skip rate.
most_skipped_artists = artist_skip_stats.sort_values('Skip_Rate', ascending=False).head(10)
print("Top 10 Artists with Highest Skip Rates (min 10 plays):")
print(most_skipped_artists)

# Ten artists with the lowest skip rate.
least_skipped_artists = artist_skip_stats.sort_values('Skip_Rate', ascending=True).head(10)
print("\nTop 10 Artists with Lowest Skip Rates (min 10 plays):")
print(least_skipped_artists)

## 7. Session Analysis and Sessionization

In [None]:
# Sessionization: consecutive plays belong to the same session unless they
# are separated by more than 30 minutes.
df_sessions = df_clean.sort_values('timestamp').copy()

# Gap between each play's timestamp and the previous one (NaT for the first row).
df_sessions['time_diff'] = df_sessions['timestamp'].diff()

# A new session starts at the very first play and after every gap > 30 minutes.
session_break_threshold = timedelta(minutes=30)
df_sessions['is_session_start'] = (
    (df_sessions['time_diff'] > session_break_threshold) |
    (df_sessions['time_diff'].isna())
)

# The running count of session starts yields a 1-based session id per row.
df_sessions['session_id'] = df_sessions['is_session_start'].cumsum()

session_count = df_sessions['session_id'].nunique()
print(f"Total number of sessions identified: {session_count}")
print(f"Average tracks per session: {len(df_sessions) / session_count:.1f}")


def _most_common(values):
    """Return the most frequent value, or the first value if no mode exists."""
    modes = values.mode()  # computed once; empty only when every value is NaN
    return modes.iloc[0] if len(modes) > 0 else values.iloc[0]


# Session-level metrics, one row per session id.
session_stats = df_sessions.groupby('session_id').agg(
    Session_Start=('timestamp', 'min'),
    Session_End=('timestamp', 'max'),
    Tracks_Count=('spotify_track_uri', 'count'),
    Total_Seconds=('seconds_played', 'sum'),
    Skip_Rate=('is_skip', 'mean'),
    Primary_Platform=('platform', _most_common),
)
# Only the listening time is rounded. Skip_Rate keeps full precision so the
# downstream mean, median and histogram are not quantised to whole percents.
session_stats['Total_Seconds'] = session_stats['Total_Seconds'].round(2)

# Session duration = span between the first and last play timestamp of the
# session, so a single-play session has a duration of 0 minutes.
session_stats['Session_Duration_Minutes'] = (
    (session_stats['Session_End'] - session_stats['Session_Start']).dt.total_seconds() / 60
).round(1)

session_stats['Total_Minutes'] = (session_stats['Total_Seconds'] / 60).round(1)

print("\nSession Statistics Summary:")
print(session_stats.describe())

In [None]:
# Session characteristics: three histograms with median markers, plus a pie
# chart of each session's primary platform.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_duration, ax_tracks), (ax_skip, ax_platform) = axes

durations = session_stats['Session_Duration_Minutes']
track_counts = session_stats['Tracks_Count']
skip_rates = session_stats['Skip_Rate']

# Top-left: session duration distribution.
median_duration = durations.median()
ax_duration.hist(durations, bins=50, color='skyblue', alpha=0.7)
ax_duration.set_title('Distribution of Session Durations')
ax_duration.set_xlabel('Session Duration (Minutes)')
ax_duration.set_ylabel('Frequency')
ax_duration.axvline(median_duration, color='red', linestyle='--',
                    label=f'Median: {median_duration:.1f} min')
ax_duration.legend()

# Top-right: tracks per session distribution.
median_tracks = track_counts.median()
ax_tracks.hist(track_counts, bins=30, color='lightgreen', alpha=0.7)
ax_tracks.set_title('Distribution of Tracks per Session')
ax_tracks.set_xlabel('Number of Tracks')
ax_tracks.set_ylabel('Frequency')
ax_tracks.axvline(median_tracks, color='red', linestyle='--',
                  label=f'Median: {median_tracks:.0f} tracks')
ax_tracks.legend()

# Bottom-left: per-session skip rate, shown in percent.
median_skip_pct = skip_rates.median() * 100
ax_skip.hist(skip_rates * 100, bins=30, color='coral', alpha=0.7)
ax_skip.set_title('Distribution of Session Skip Rates')
ax_skip.set_xlabel('Skip Rate (%)')
ax_skip.set_ylabel('Frequency')
ax_skip.axvline(median_skip_pct, color='red', linestyle='--',
                label=f'Median: {median_skip_pct:.1f}%')
ax_skip.legend()

# Bottom-right: share of sessions per primary platform.
platform_sessions = session_stats['Primary_Platform'].value_counts()
ax_platform.pie(platform_sessions.values, labels=platform_sessions.index, autopct='%1.1f%%')
ax_platform.set_title('Primary Platform Distribution in Sessions')

plt.tight_layout()
plt.show()

# Headline session numbers.
print("\nSession Insights:")
print(f"Average session duration: {durations.mean():.1f} minutes")
print(f"Median session duration: {durations.median():.1f} minutes")
print(f"Average tracks per session: {track_counts.mean():.1f}")
print(f"Average session skip rate: {skip_rates.mean():.1%}")

## 8. Top Content Analysis

In [None]:
# Per-track statistics, keyed by (track name, artist name), with every
# aggregate rounded to two decimals.
track_stats = (
    df_clean.groupby(['track_name', 'artist_name'])
    .agg(
        Play_Count=('spotify_track_uri', 'count'),
        Total_Seconds=('seconds_played', 'sum'),
        Skip_Rate=('is_skip', 'mean'),
        Avg_Percent_Played=('percent_played', 'mean'),
    )
    .round(2)
)
track_stats['Total_Minutes'] = (track_stats['Total_Seconds'] / 60).round(1)

# Columns shown in both rankings below.
summary_columns = ['Play_Count', 'Total_Minutes', 'Skip_Rate']

# Ranking 1: most-played tracks.
top_tracks_plays = track_stats.sort_values('Play_Count', ascending=False).head(15)
print("Top 15 Tracks by Play Count:")
print(top_tracks_plays[summary_columns])

# Ranking 2: tracks with the most total listening time.
top_tracks_time = track_stats.sort_values('Total_Minutes', ascending=False).head(15)
print("\nTop 15 Tracks by Total Listening Time:")
print(top_tracks_time[summary_columns])

In [None]:
# Per-artist statistics: plays, distinct track names, listening time,
# skip rate and average completion (all rounded to two decimals).
artist_stats = (
    df_clean.groupby('artist_name')
    .agg(
        Total_Plays=('spotify_track_uri', 'count'),
        Unique_Tracks=('track_name', 'nunique'),
        Total_Seconds=('seconds_played', 'sum'),
        Skip_Rate=('is_skip', 'mean'),
        Avg_Percent_Played=('percent_played', 'mean'),
    )
    .round(2)
)
artist_stats['Total_Hours'] = (artist_stats['Total_Seconds'] / 3600).round(1)

# Fifteen most-played artists.
top_artists_plays = artist_stats.sort_values('Total_Plays', ascending=False).head(15)
print("Top 15 Artists by Play Count:")
print(top_artists_plays[['Total_Plays', 'Unique_Tracks', 'Total_Hours', 'Skip_Rate']])


def _shorten(name, limit=20):
    """Truncate labels longer than `limit` characters and append '...'."""
    return name[:limit] + '...' if len(name) > limit else name


# Two horizontal bar charts side by side: top 10 by plays, top 10 by hours.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
ax_plays, ax_hours = axes

# Left: top 10 artists by plays (largest at the top after inverting the axis).
top_10_artists = top_artists_plays.head(10)
play_positions = range(len(top_10_artists))
ax_plays.barh(play_positions, top_10_artists['Total_Plays'], color='lightblue')
ax_plays.set_yticks(play_positions)
ax_plays.set_yticklabels([_shorten(name) for name in top_10_artists.index])
ax_plays.set_xlabel('Total Plays')
ax_plays.set_title('Top 10 Artists by Play Count')
ax_plays.invert_yaxis()

# Right: top 10 artists by listening time.
top_10_time = artist_stats.sort_values('Total_Hours', ascending=False).head(10)
time_positions = range(len(top_10_time))
ax_hours.barh(time_positions, top_10_time['Total_Hours'], color='lightcoral')
ax_hours.set_yticks(time_positions)
ax_hours.set_yticklabels([_shorten(name) for name in top_10_time.index])
ax_hours.set_xlabel('Total Hours')
ax_hours.set_title('Top 10 Artists by Listening Time')
ax_hours.invert_yaxis()

plt.tight_layout()
plt.show()

## 9. Key Performance Indicators (KPIs)

In [None]:
# Calculate key business metrics
total_listening_hours = df_clean['seconds_played'].sum() / 3600
unique_tracks = df_clean['spotify_track_uri'].nunique()
unique_artists = df_clean['artist_name'].nunique()

# Average number of sessions per day. `Session_Start` lives on the
# session-level table `session_stats` (one row per session), not on
# `df_sessions`, so the sessions are counted there. Each session is
# attributed to the calendar date on which it starts, and the mean is taken
# over dates that have at least one session.
sessions_per_day = session_stats.groupby(session_stats['Session_Start'].dt.date).size()
daily_active_sessions = sessions_per_day.mean()

avg_session_length = session_stats['Session_Duration_Minutes'].mean()
avg_tracks_per_session = session_stats['Tracks_Count'].mean()
overall_skip_rate = df_clean['is_skip'].mean()
avg_percent_played = df_clean['percent_played'].mean()

# Share of plays per platform.
# NOTE(review): these comparisons are exact, case-sensitive matches - confirm
# the labels agree with the values actually present in the `platform` column.
web_share = (df_clean['platform'] == 'web player').mean()
ios_share = (df_clean['platform'] == 'iOS').mean()
android_share = (df_clean['platform'] == 'Android').mean()

# Create KPI dashboard (label -> pre-formatted value).
kpis = {
    'Total Listening Hours': f"{total_listening_hours:,.1f}",
    'Total Tracks Played': f"{len(df_clean):,}",
    'Unique Tracks': f"{unique_tracks:,}",
    'Unique Artists': f"{unique_artists:,}",
    'Total Sessions': f"{df_sessions['session_id'].nunique():,}",
    'Avg Daily Sessions': f"{daily_active_sessions:.1f}",
    'Avg Session Length (min)': f"{avg_session_length:.1f}",
    'Avg Tracks per Session': f"{avg_tracks_per_session:.1f}",
    'Overall Skip Rate': f"{overall_skip_rate:.1%}",
    'Avg Percent Played': f"{avg_percent_played:.1f}%",
    'Platform Distribution': f"Web: {web_share:.1%}, iOS: {ios_share:.1%}, Android: {android_share:.1%}"
}

print("🎵 SPOTIFY STREAMING ANALYTICS - KEY PERFORMANCE INDICATORS 🎵\n")
print("=" * 60)

# Labels are left-aligned in a 25-character column.
for kpi, value in kpis.items():
    print(f"{kpi:<25}: {value}")

print("\n" + "=" * 60)

## 10. Data Export for Further Analysis

In [None]:
# Persist the processed tables so later analyses can start from them.

# 1) Play-level cleaned dataset (row index omitted).
df_clean.to_csv('../data/spotify_cleaned.csv', index=False)
print("Cleaned dataset saved to: ../data/spotify_cleaned.csv")

# 2) Play-level rows tagged with their session id, restricted to the columns
#    listed below (row index omitted).
session_columns = [
    'session_id', 'timestamp', 'spotify_track_uri', 'platform',
    'seconds_played', 'is_skip', 'time_of_day', 'day_of_week',
]
session_export = df_sessions[session_columns]
session_export.to_csv('../data/spotify_sessions.csv', index=False)
print("Session data saved to: ../data/spotify_sessions.csv")

# 3) One row per session; the index (session id) is written as a column.
session_stats.to_csv('../data/session_statistics.csv')
print("Session statistics saved to: ../data/session_statistics.csv")

# 4) Top-15 rankings by play count; the index carries the track/artist keys.
top_tracks_plays.to_csv('../data/top_tracks.csv')
top_artists_plays.to_csv('../data/top_artists.csv')
print("Top content analysis saved to: ../data/top_tracks.csv and ../data/top_artists.csv")

print("\n✅ All datasets exported successfully for further analysis!")