EDA on Spotify Songs Dataset

## Import necessary libraries:

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
from statsmodels.graphics.gofplots import qqplot
from sklearn.preprocessing import MinMaxScaler

In [41]:
import json
from IPython.display import display, HTML
import ipywidgets as widgets

## 1. Loading the dataset

In [15]:
# Read the Spotify 2023 CSV from the hard-coded path below, decoding it as latin-1,
# then preview the first rows.
csv_path = '/content/drive/MyDrive/ProjectsData/spotify-2023.csv'
data = pd.read_csv(csv_path, encoding='latin-1')
data.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [16]:
# Show the (rows, columns) dimensions of the loaded frame
data.shape

(953, 24)

In [17]:
# List the column labels of the loaded frame
data.columns

Index(['track_name', 'artist(s)_name', 'artist_count', 'released_year',
       'released_month', 'released_day', 'in_spotify_playlists',
       'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts',
       'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm',
       'key', 'mode', 'danceability_%', 'valence_%', 'energy_%',
       'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%'],
      dtype='object')

In [18]:
# Print per-column non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

In [19]:
# Missing values per column, expressed as a percentage of the total row count
data.isna().sum().div(len(data)).mul(100)

Unnamed: 0,0
track_name,0.0
artist(s)_name,0.0
artist_count,0.0
released_year,0.0
released_month,0.0
released_day,0.0
in_spotify_playlists,0.0
in_spotify_charts,0.0
streams,0.0
in_apple_playlists,0.0


In [20]:
# Count rows that are exact duplicates (across all columns) of an earlier row
data.duplicated().sum()

0

## 2. Feature Engineering

In [21]:
# Convert the count columns that were read as text into floats.
# `in_deezer_playlists` and `in_shazam_charts` use "," as a thousands separator.
# Casting to str first keeps this cell re-runnable: once the columns are float,
# a bare `.str.replace` raises AttributeError. Missing values become the string
# "nan", which `astype(float)` turns back into NaN.
for comma_col in ['in_deezer_playlists', 'in_shazam_charts']:
    data[comma_col] = data[comma_col].astype(str).str.replace(",", "", regex=False).astype(float)
# `streams` holds a non-numeric entry (953 -> 952 non-null after this step);
# errors='coerce' turns it into NaN instead of raising.
data['streams'] = pd.to_numeric(data['streams'],errors='coerce')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   track_name            953 non-null    object 
 1   artist(s)_name        953 non-null    object 
 2   artist_count          953 non-null    int64  
 3   released_year         953 non-null    int64  
 4   released_month        953 non-null    int64  
 5   released_day          953 non-null    int64  
 6   in_spotify_playlists  953 non-null    int64  
 7   in_spotify_charts     953 non-null    int64  
 8   streams               952 non-null    float64
 9   in_apple_playlists    953 non-null    int64  
 10  in_apple_charts       953 non-null    int64  
 11  in_deezer_playlists   953 non-null    float64
 12  in_deezer_charts      953 non-null    int64  
 13  in_shazam_charts      903 non-null    float64
 14  bpm                   953 non-null    int64  
 15  key                   8

In [22]:
# Number of distinct values in each column
data.nunique()

Unnamed: 0,0
track_name,943
artist(s)_name,645
artist_count,8
released_year,50
released_month,12
released_day,31
in_spotify_playlists,879
in_spotify_charts,82
streams,948
in_apple_playlists,234


In [23]:
# Work on a copy so the engineered columns added below do not alter `data`
df = data.copy()

#### Creating a new column to get released_date by combining released_year, released_month, released_day.

In [24]:
# Build a three-column frame named 'year', 'month' and 'day' from the
# release-date component columns of df (df itself is left untouched).
date_part_names = {
    'released_year': 'year',
    'released_month': 'month',
    'released_day': 'day',
}
released_date = df[list(date_part_names)].rename(columns = date_part_names)
released_date.head()

Unnamed: 0,year,month,day
0,2023,7,14
1,2023,3,23
2,2023,6,30
3,2019,8,23
4,2023,5,18


In [25]:
# Assemble one datetime per row from the 'year', 'month' and 'day' columns of released_date
df['released_date'] = pd.to_datetime(released_date)
df.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,released_date
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703.0,43,...,B,Major,80,89,83,31,0,8,4,2023-07-14
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286.0,48,...,C#,Major,71,61,74,7,0,10,4,2023-03-23
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974.0,94,...,F,Major,51,32,53,17,0,31,6,2023-06-30
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817.0,116,...,A,Major,55,58,72,11,0,11,15,2019-08-23
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322.0,84,...,A,Minor,65,23,80,14,63,11,6,2023-05-18


Creating a new column for total_playlists by combining playlists from Spotify, Apple and Deezer

In [26]:
# Row-wise total of the Spotify, Apple and Deezer playlist counts
playlist_total = (
    df['in_spotify_playlists']
    .add(df['in_apple_playlists'])
    .add(df['in_deezer_playlists'])
)
df['total_playlists'] = playlist_total
df.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,released_date,total_playlists
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703.0,43,...,Major,80,89,83,31,0,8,4,2023-07-14,641.0
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286.0,48,...,Major,71,61,74,7,0,10,4,2023-03-23,1580.0
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974.0,94,...,Major,51,32,53,17,0,31,6,2023-06-30,1582.0
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817.0,116,...,Major,55,58,72,11,0,11,15,2019-08-23,8099.0
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322.0,84,...,Minor,65,23,80,14,63,11,6,2023-05-18,3304.0


Creating a new column for total_charts by combining charts from Spotify, Apple, Deezer and Shazam

In [27]:
# Row-wise total of the chart counts across Spotify, Apple, Deezer and Shazam.
# `in_shazam_charts` is missing for some rows (903 of 953 non-null); adding the
# columns with `+` would make the total NaN for every such row. Summing with
# skipna (the default) totals the platforms that are present instead;
# min_count=1 keeps the result NaN only if all four values are missing.
chart_cols = ['in_spotify_charts', 'in_apple_charts', 'in_deezer_charts', 'in_shazam_charts']
df['total_charts'] = df[chart_cols].sum(axis = 1, min_count = 1)
df.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,released_date,total_playlists,total_charts
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703.0,43,...,80,89,83,31,0,8,4,2023-07-14,641.0,1246.0
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286.0,48,...,71,61,74,7,0,10,4,2023-03-23,1580.0,570.0
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974.0,94,...,51,32,53,17,0,31,6,2023-06-30,1582.0,1283.0
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817.0,116,...,55,58,72,11,0,11,15,2019-08-23,8099.0,867.0
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322.0,84,...,65,23,80,14,63,11,6,2023-05-18,3304.0,623.0


Separating categorical and numerical columns

In [28]:
# Split the column labels by dtype: object-typed columns are treated as
# categorical, every numeric dtype as numerical.
cat_cols = df.select_dtypes(include = 'object').columns
num_cols = df.select_dtypes(include = 'number').columns
(cat_cols, num_cols)

(Index(['track_name', 'artist(s)_name', 'key', 'mode'], dtype='object'),
 Index(['artist_count', 'released_year', 'released_month', 'released_day',
        'in_spotify_playlists', 'in_spotify_charts', 'streams',
        'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists',
        'in_deezer_charts', 'in_shazam_charts', 'bpm', 'danceability_%',
        'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%',
        'liveness_%', 'speechiness_%', 'total_playlists', 'total_charts'],
       dtype='object'))

# 3. Univariate Analysis

Univariate Analysis - Numerical Columns

In [42]:
# One two-panel figure per numeric column: histogram on the left, box plot on the right.
for col in num_cols:
    col_values = df[col]
    panel_titles = [f"Histogram of {col}", f"Boxplot of {col}"]
    figure_title = {'text': f"Univariate Analysis on {col}", 'xanchor': 'center', 'x': 0.5 }

    fig = make_subplots(rows = 1, cols = 2, subplot_titles = panel_titles)
    fig.add_trace(go.Histogram(x = col_values, name = col), row = 1, col = 1)
    fig.add_trace(go.Box(x = col_values, name = col), row = 1, col = 2)
    fig.update_layout(showlegend = False, title = figure_title)
    fig.show()

Univariate Analysis on Categorical Columns

In [43]:
# Univariate view of the `key` column: how often each key occurs.
df_key_val = df['key'].value_counts()

key_bar = go.Bar(x = df_key_val.index, y = df_key_val.values, name = "Key")
# NOTE(review): `key` is a categorical (object) column, so this box trace has no
# numeric distribution to summarise - confirm whether the panel is wanted.
key_box = go.Box(x = df['key'], name = "Key")
key_title = {'text': "Univariate Analysis on Key", 'xanchor': 'center', 'x': 0.5 }

fig = make_subplots(rows = 1, cols = 2, subplot_titles = ["Barplot of Key", "Boxplot of key"])
fig.add_trace(key_bar, row = 1, col = 1)
fig.add_trace(key_box, row = 1, col = 2)
fig.update_layout(showlegend = False, title = key_title)
fig.show()

In [44]:
# Univariate view of the artist name column: number of songs per artist entry.
df_artist_val = df['artist(s)_name'].value_counts()
# Only the 11 most frequent artist entries are shown in the bar chart.
top_artists = df_artist_val.head(11)

artist_bar = go.Bar(x = top_artists.index, y = top_artists.values, name = "Artist Name")
artist_box = go.Box(x = df['artist(s)_name'], name = "artist(s)_name")
artist_title = {'text': "Univariate Analysis on artist(s)_name", 'xanchor': 'center', 'x': 0.5 }

fig = make_subplots(rows = 1, cols = 2, subplot_titles = ["Barplot of Artist Name", "Boxplot of Artist Name"])
fig.add_trace(artist_bar, row = 1, col = 1)
fig.add_trace(artist_box, row = 1, col = 2)
fig.update_layout(showlegend = False, title = artist_title)
# Tick labels are hidden on the box-plot panel because of space constraints
fig.update_xaxes(showticklabels=False, row=1, col=2)
fig.show()

Univariate Analysis on released_date

In [45]:
# Univariate view of the released_date column: number of songs released on
# each date, ordered chronologically.
df_released_date_val = df['released_date'].value_counts().sort_index()

release_trace = go.Scatter(
    x = df_released_date_val.index,
    y = df_released_date_val.values,
    name = "Released Date",
    mode = "lines+markers",
)
release_title = {'text': "Univariate Analysis on released_date", 'xanchor': 'center', 'x': 0.5 }

fig = make_subplots(subplot_titles = ["Scatterplot on Released_date"])
fig.add_trace(release_trace, row = 1, col = 1)
fig.update_layout(showlegend = False, title = release_title)
fig.show()

# 4. Bivariate Analysis

Pairplot

In [35]:
# Columns shown in the main pairplot
pairplot_cols = [
    'artist_count',
    'streams',
    'bpm',
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
    'total_playlists',
    'total_charts',
]
# Per-platform playlist counts plus their total
pairplot_cols_playlists = [
    'in_spotify_playlists',
    'in_apple_playlists',
    'in_deezer_playlists',
    'total_playlists',
]
# Per-platform chart counts plus their total
pairplot_cols_charts = [
    'in_spotify_charts',
    'in_apple_charts',
    'in_deezer_charts',
    'in_shazam_charts',
    'total_charts',
]


In [36]:
# Scatter-plot matrix over pairplot_cols: one dimension per column, labelled with its name.
splom_dimensions = [dict(label = col, values = df[col]) for col in pairplot_cols]
splom_trace = go.Splom(dimensions = splom_dimensions)

fig = go.Figure(data = splom_trace)
fig.update_layout(
    showlegend = False,
    title = {'text': "Pairplot", 'xanchor': 'center', 'x': 0.5 },
    width = 1400,
    height = 1000,
    xaxis=dict(tickangle=90),
)
fig.show()

This code creates an interactive D3.js bubble chart visualization for the Spotify Most Streamed Songs 2023 dataset. Here's a 5-point description for documentation:
• Data Preparation: Extracts relevant columns from the DataFrame and converts them to JSON format for D3.js.
• SVG Setup: Creates an SVG element with specified dimensions to host the bubble chart.
• Bubble Creation: Generates circles (bubbles) for each song, with size based on stream count and color based on artist name.
• Interactivity: Implements mouseover tooltips displaying detailed song information and enables dragging of bubbles.
• Force Simulation: Applies a force-directed layout to position the bubbles, creating an organic, interactive visualization of the dataset.

In [51]:
# Render a D3.js (v5) force-layout bubble chart of the songs in the cell output.
from IPython.display import HTML
import json

# Serialise the columns the chart reads into a JSON array of records for the script below
chart_data = df[['artist(s)_name', 'track_name', 'streams', 'energy_%', 'danceability_%']].to_dict('records')
json_data = json.dumps(chart_data)

# NOTE: html_content is an f-string, so {json_data} is substituted right here and the
# doubled braces {{ }} become single literal braces in the JavaScript. The finished
# string therefore no longer contains the text "{json_data}".
html_content = f"""
<div id="bubble-chart"></div>
<script src="https://d3js.org/d3.v5.min.js"></script>
<script>
    const width = 800;
    const height = 600;
    const svg = d3.select("#bubble-chart")
        .append("svg")
        .attr("width", width)
        .attr("height", height);

    const color = d3.scaleOrdinal(d3.schemeCategory10);
    const simulation = d3.forceSimulation()
        .force("center", d3.forceCenter(width / 2, height / 2))
        .force("charge", d3.forceManyBody().strength(5))
        .force("collide", d3.forceCollide().radius(d => Math.sqrt(d.streams) / 500 + 5));

    const data = {json_data};

    const nodes = svg.selectAll("circle")
        .data(data)
        .enter()
        .append("circle")
        .attr("r", d => Math.sqrt(d.streams) / 500 + 5)
        .attr("fill", d => color(d['artist(s)_name']))
        .call(d3.drag()
            .on("start", dragstarted)
            .on("drag", dragged)
            .on("end", dragended));

    const tooltip = d3.select("body").append("div")
        .attr("class", "tooltip")
        .style("opacity", 0)
        .style("position", "absolute")
        .style("background-color", "white")
        .style("border", "solid")
        .style("border-width", "1px")
        .style("border-radius", "5px")
        .style("padding", "10px");

    nodes.on("mouseover", function(d) {{
        tooltip.transition()
            .duration(200)
            .style("opacity", .9);
        tooltip.html(`Artist: ${{d['artist(s)_name']}}<br/>
                      Track: ${{d.track_name}}<br/>
                      Streams: ${{d.streams}}<br/>
                      Energy: ${{d['energy_%']}}%<br/>
                      Danceability: ${{d['danceability_%']}}%`)
            .style("left", (d3.event.pageX + 10) + "px")
            .style("top", (d3.event.pageY - 28) + "px");
    }})
    .on("mouseout", function(d) {{
        tooltip.transition()
            .duration(500)
            .style("opacity", 0);
    }});

    simulation.nodes(data)
        .on("tick", ticked);

    function ticked() {{
        nodes
            .attr("cx", d => d.x)
            .attr("cy", d => d.y);
    }}

    function dragstarted(d) {{
        if (!d3.event.active) simulation.alphaTarget(0.3).restart();
        d.fx = d.x;
        d.fy = d.y;
    }}

    function dragged(d) {{
        d.fx = d3.event.x;
        d.fy = d3.event.y;
    }}

    function dragended(d) {{
        if (!d3.event.active) simulation.alphaTarget(0);
        d.fx = null;
        d.fy = null;
    }}
</script>
"""

# Show the assembled HTML/JavaScript in the cell output
display(HTML(html_content))

# 5. Normalization and Standardization

In [34]:
# Min-max scale the numeric columns on a copy of df.
# The release year/month/day columns are excluded and keep their original values.
date_part_cols = ['released_year', 'released_month', 'released_day']
num_cols_specific = num_cols.drop(date_part_cols)

df_norm = df.copy()
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_norm[num_cols_specific])
df_norm[num_cols_specific] = scaled_values
df_norm.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,released_date,total_playlists,total_charts
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",0.142857,2023,7,14,0.009874,1.0,0.03817,0.063988,...,0.780822,0.913978,0.840909,0.319588,0.0,0.053191,0.032258,2023-07-14,0.009698,0.744325
1,LALA,Myke Towers,0.0,2023,3,23,0.027295,0.326531,0.036101,0.071429,...,0.657534,0.612903,0.738636,0.072165,0.0,0.074468,0.032258,2023-03-23,0.024701,0.340502
2,vampire,Olivia Rodrigo,0.0,2023,6,30,0.025838,0.768707,0.037798,0.139881,...,0.383562,0.301075,0.5,0.175258,0.0,0.297872,0.064516,2023-06-30,0.024733,0.766428
3,Cruel Summer,Taylor Swift,0.0,2019,8,23,0.148051,0.680272,0.216215,0.172619,...,0.438356,0.580645,0.715909,0.113402,0.0,0.085106,0.209677,2019-08-23,0.128857,0.517921
4,WHERE SHE GOES,Bad Bunny,0.0,2023,5,18,0.058676,0.340136,0.081869,0.125,...,0.575342,0.204301,0.806818,0.14433,0.692308,0.085106,0.064516,2023-05-18,0.052246,0.372162


# 6. Hypothesis Testing

Check whether there is any significant difference between streams and danceability_%.

Assumptions of T-Test:

*   Normality -> Data should be normally distributed
*   Homogeneity of Variance -> Two sample variance should match
*   Independence of Observations -> Observations should be independent of each other
*   Continuous Data



Using Shapiro-Wilk Test:
*   Setup Null and Alternate Hypothesis
*   Perform Shapiro-Wilk test and compute p-value
*   Compare p_value with significance level

#### ==> STEP 1): Setup Null and Alternate Hypothesis

*   Null Hypothesis (H0): Data is normally distributed

*   Alternate Hypothesis (Ha) : Data is not normally distributed




#### ==> STEP 2): Perform Shapiro-Wilk test and compute p-value

In [37]:
# Seed NumPy's global random state so the 200-row subsets drawn below, and hence
# the test statistics, are reproducible. (`np.random.sample(40)` only draws 40
# random floats and discards them; it does not seed anything.)
np.random.seed(40)
# Drop missing values before sampling: `streams` contains a NaN after numeric
# coercion, and a NaN in the sample would make the Shapiro-Wilk result unusable.
streams_subset = df_norm['streams'].dropna().sample(200)
danceability_subset = df_norm['danceability_%'].dropna().sample(200)
# Shapiro-Wilk normality test on each subset; returns (test statistic, p-value)
streams_test_stat, streams_p_value = stats.shapiro(streams_subset)
danceability_test_stat, danceability_p_value = stats.shapiro(danceability_subset)
print(f"Test Statistics for Streams: {streams_test_stat} and P-Value for Streams: {streams_p_value}")
print(f"Test Statistics for Danceability: {danceability_test_stat} and P-Value for Danceability: {danceability_p_value}")

Test Statistics for Streams: 0.7370015494397091 and P-Value for Streams: 1.5442194259097167e-17
Test Statistics for Danceability: 0.9634637748684126 and P-Value for Danceability: 4.776995707160958e-05


#### ==>STEP 3): Compare P_Value with Significance Level

In [38]:
# Assume significance level as 5%
alpha = 0.05
# Compare each Shapiro-Wilk p-value with the significance level.
# H0 is "data is normally distributed": p < alpha rejects H0, otherwise we fail
# to reject H0 (the hypothesis we fail to reject is H0, not Ha).
for label, p_value in (("Streams", streams_p_value), ("Danceability", danceability_p_value)):
    if p_value < alpha:
        print(f"Reject Null Hypothesis(H0) for {label} data. {label} data is not normally distributed.")
    else:
        print(f"Failed to Reject Null Hypothesis (H0) for {label} data. {label} data is normally distributed")

Reject Null Hypothesis(H0) for Streams data. Streams data is not normally distributed.
Reject Null Hypothesis(H0) for Danceability data. Danceability data is not normally distributed.


# 7. Conclusions and Insights




*   **Data Distribution:** Both streams and danceability_% are not normally distributed, as revealed by the Shapiro-Wilk test.
*   **Correlation Analysis:** A moderate positive correlation exists between streams and in_spotify_playlists (0.56), suggesting that playlist inclusion may influence stream counts.
*   **Top Artists:** Ed Sheeran, The Weeknd, and Taylor Swift dominate the most streamed songs list, indicating their significant popularity on Spotify.
*   **Temporal Trends:** Songs released in recent years (2019-2023) tend to have higher stream counts, possibly due to recency bias or changes in Spotify's user base.
*   **Genre Impact:** Pop and dance-pop genres are prevalent among the most streamed songs, suggesting these genres are particularly popular on Spotify.

## Here I have created an interactive visualization using ipywidgets and D3.js. It defines three sliders for
*   minimum streams,
*   energy, and
*   danceability.

When these sliders are adjusted, the update_chart function filters the DataFrame based on the selected criteria.

The filtered data is then converted to JSON and used to update a D3.js visualization that's already defined in the html_content variable.

This allows the dataset to be explored dynamically by adjusting the minimum thresholds for streams, energy, and danceability.

In [50]:

import re

import ipywidgets as widgets

@widgets.interact(
    min_streams=widgets.IntSlider(min=0, max=int(df['streams'].max()), step=1000000, value=0, description='Min Streams:'),
    min_energy=widgets.IntSlider(min=0, max=100, step=5, value=0, description='Min Energy %:'),
    min_danceability=widgets.IntSlider(min=0, max=100, step=5, value=0, description='Min Danceability %:')
)
def update_chart(min_streams, min_energy, min_danceability):
    """Redraw the D3 bubble chart for the songs that meet the slider thresholds.

    Parameters
    ----------
    min_streams : int
        Keep songs whose `streams` value is at least this number.
    min_energy : int
        Keep songs whose `energy_%` is at least this value.
    min_danceability : int
        Keep songs whose `danceability_%` is at least this value.

    Reads the notebook globals `df` and `html_content` and displays the chart
    HTML with the filtered records embedded.
    """
    filtered_df = df[
        (df['streams'] >= min_streams) &
        (df['energy_%'] >= min_energy) &
        (df['danceability_%'] >= min_danceability)
    ]

    chart_data = filtered_df[['artist(s)_name', 'track_name', 'streams', 'energy_%', 'danceability_%']].to_dict('records')
    json_data = json.dumps(chart_data)

    # html_content was built as an f-string, so it never contains the literal
    # text '{json_data}' and replacing that text changes nothing. Swap the
    # already-embedded `const data = ...;` line for the filtered records instead.
    # A function is used as the replacement so backslashes in the JSON are
    # inserted verbatim rather than interpreted as regex escapes.
    updated_html_content = re.sub(
        r"(?m)^([ \t]*const data = ).*;$",
        lambda match: match.group(1) + json_data + ";",
        html_content,
        count=1,
    )

    display(HTML(updated_html_content))

interactive(children=(IntSlider(value=0, description='Min Streams:', max=3703895074, step=1000000), IntSlider(…