<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Spotify_logo_with_text.svg/800px-Spotify_logo_with_text.svg.png" alt="Spotify logo" width="350" height="100">


# <font color='#1DB954'>**About dataset**</font>



This dataset contains audio statistics of the top 2000 tracks on Spotify. The data contains about 15 columns, each describing the track and its qualities.
Songs released from 1956 to 2019 are included from some notable and famous artists like Queen, The Beatles, Guns N' Roses, etc.


---


*http://sortyourmusic.playlistmachinery.com/ by https://www.kaggle.com/plamere uses the Spotify API to extract the audio features from the tracks given the Spotify Playlist URI. This data contains audio features like Danceability, BPM, Liveness, Valence (Positivity) and many more.*

In [None]:
%%html
<marquee style='width: 50%; color: #1DB954; font-size: 20px' scrollamount="12"><b>🎧 This project involves data analysis based on audio statistics of the top 2000 tracks on Spotify 🎧</b></marquee>

# <font color='#1DB954'>**Pandas** </font>

## *Some general information about the dataset.*



In [None]:
import pandas as pd

In [None]:
spotify_2000 = pd.read_csv('Spotify-2000.csv')
spotify_2000

In [None]:
spotify_2000.info()

In [None]:
spotify_2000.shape

In [None]:
spotify_2000.columns

In [None]:
spotify_2000.describe()

## *Data extraction, sorting, and grouping.*

In [None]:
sorted(spotify_2000['Artist'].unique()) #unique artists, sorted

In [None]:
spotify_2000['Artist'].nunique() #number of unique

In [None]:
spotify_2000.loc[100] # entry 101 in the dataset

In [None]:
spotify_2000.head(5) #first 5 entries


In [None]:
spotify_2000.tail(5) #last 5 entries

In [None]:
spotify_2000.count() # number of non-null entries in each column

In [None]:
# Highest popularity score, then every track that reaches it (ties are all kept).
max_popularity = spotify_2000['Popularity'].max()
max_popularity_record = spotify_2000.loc[
    spotify_2000['Popularity'].eq(max_popularity),
    ['Title', 'Artist', 'Year', 'Popularity'],
]
max_popularity_record

In [None]:
# Lowest popularity score, then every track that has it (ties are all kept).
min_popularity = spotify_2000['Popularity'].min()
min_popularity_record = spotify_2000.loc[
    spotify_2000['Popularity'].eq(min_popularity),
    ['Title', 'Artist', 'Year', 'Popularity'],
]
min_popularity_record

In [None]:
later_2000 = spotify_2000[spotify_2000['Year']  > 2000] # songs released after 2000 (strict '>', so the year 2000 itself is excluded)
later_2000

In [None]:
ealier_2000 = spotify_2000[spotify_2000['Year']  < 2000] # songs released before 2000 (strict '<', so the year 2000 itself is excluded)
ealier_2000

In [None]:
# Highest 'Energy' value, then every track that reaches it (ties are all kept).
max_energy = spotify_2000['Energy'].max()
max_energetic_songs = spotify_2000.loc[
    spotify_2000['Energy'].eq(max_energy),
    ['Title', 'Artist', 'Year', 'Popularity'],
]
max_energetic_songs

In [None]:
# Highest tempo in the dataset, then every track played at that tempo.
max_bpm = spotify_2000['Beats Per Minute (BPM)'].max()
max_bpm_songs = spotify_2000.loc[
    spotify_2000['Beats Per Minute (BPM)'].eq(max_bpm),
    ['Title', 'Artist', 'Year', 'Beats Per Minute (BPM)', 'Popularity'],
]
max_bpm_songs

In [None]:
# Largest value in 'Length (Duration)', then every track that has it.
max_length = spotify_2000['Length (Duration)'].max()
max_length_record = spotify_2000.loc[
    spotify_2000['Length (Duration)'].eq(max_length),
    ['Title', 'Artist', 'Year', 'Length (Duration)', 'Popularity'],
]
max_length_record

In [None]:
# Smallest value in 'Length (Duration)', then every track that has it.
min_length = spotify_2000['Length (Duration)'].min()
min_length_record = spotify_2000.loc[
    spotify_2000['Length (Duration)'].eq(min_length),
    ['Title', 'Artist', 'Year', 'Length (Duration)', 'Popularity'],
]
min_length_record

In [None]:
def combine_songs_and_artists(group):
    """Return the tracks of one genre group as a list of [title, artist] pairs.

    Parameters
    ----------
    group : DataFrame
        Rows belonging to a single genre; must contain 'Title' and 'Artist' columns.

    Returns
    -------
    list of list
        One ``[title, artist]`` pair per row, in the row order of ``group``.
    """
    return group[['Title', 'Artist']].values.tolist()

# Select the needed columns before apply(): letting apply() operate on the grouping
# column ('Top Genre') is deprecated in recent pandas and emits a warning.
result = spotify_2000.groupby('Top Genre')[['Title', 'Artist']].apply(combine_songs_and_artists)

# Print every genre followed by its tracks, one "Artist - Title" per line.
for genre, songs in result.items():
    if len(songs) > 0:
        genre_songs = '\n'.join(f"{artist} - {title}" for title, artist in songs)
        print(f'{genre}:\n{genre_songs}\n')

In [None]:
# duplicated() alone shows a (truncated) True/False value per row, which reveals nothing;
# summing the flags gives the number of fully duplicated rows.
spotify_2000.duplicated().sum()

In [None]:
# isnull() alone shows a (truncated) True/False frame; summing per column
# gives the number of missing values in each column.
spotify_2000.isnull().sum()

## *Editing the dataset (with a copy of the dataset)*

In [None]:
copy_of_spotify_2000 = spotify_2000.copy()
copy_of_spotify_2000

In [None]:
# Rename column 'Year' to 'Year of recording'. rename() returns a new DataFrame,
# so copy_of_spotify_2000 keeps its original column name and the renamed
# version is stored separately in copy_of_spotify_2000_2.
copy_of_spotify_2000_2 = copy_of_spotify_2000.rename(columns={'Year': 'Year of recording'})
copy_of_spotify_2000_2

In [None]:
# Delete column 'Valence' from the working copy.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second run no longer raises KeyError because
# the column is already gone.
copy_of_spotify_2000 = copy_of_spotify_2000.drop(columns=['Valence'], errors='ignore')
copy_of_spotify_2000

In [None]:
# Overwrite 'Top Genre' with 'rock' for the row whose index label is 2
# (.at is label-based, so this is the row labelled 2, not "the second row").
copy_of_spotify_2000_2.at[2, 'Top Genre'] = 'rock'
copy_of_spotify_2000_2

# <font color='#1DB954'>**Matplotlib, Seaborn, Plotly**</font>

## *Imports*

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

## *Plotly*

In [None]:
fig = px.sunburst(spotify_2000, path=['Top Genre', 'Artist'], values='Popularity', title='Popularity by Genre and Artist (Plotly)')
fig.show()

In [None]:
fig = px.bar(spotify_2000, x='Artist', y='Popularity', title='Popularity by Artist (Plotly)', labels={'Artist': 'Artist', 'Popularity': 'Popularity'})
fig.show()

In [None]:
fig = px.scatter(spotify_2000, x='Energy', y='Danceability', title='Energy vs Danceability (Plotly)', labels={'Energy': 'Energy', 'Danceability': 'Danceability'})
fig.show()


In [None]:
fig = px.histogram(spotify_2000, x='Beats Per Minute (BPM)', title='Distribution of Beats Per Minute (BPM)', labels={'Beats Per Minute (BPM)': 'BPM', 'count': 'Frequency'})
fig.show()


In [None]:
# A line chart needs one y value per x value, in x order. The raw table has many
# tracks per year and its rows are not guaranteed to be ordered by year, so
# plotting it directly makes the line jump back and forth. Aggregate to the mean
# popularity per year first (groupby returns the years in ascending order).
popularity_by_year = spotify_2000.groupby('Year', as_index=False)['Popularity'].mean()
fig = px.line(popularity_by_year, x='Year', y='Popularity', title='Popularity over Years (Plotly)', labels={'Year': 'Year', 'Popularity': 'Average Popularity'})
fig.show()


In [None]:
fig = px.scatter_3d(spotify_2000, x='Energy', y='Danceability', z='Popularity', title='Energy, Danceability, and Popularity (3D Scatter) (Plotly)')
fig.show()

In [None]:
# Adds a 'Primary Genre' column to spotify_2000 itself: the part of 'Top Genre'
# before the first comma (equal to 'Top Genre' when it contains no comma).
spotify_2000['Primary Genre'] = spotify_2000['Top Genre'].str.split(',').str[0]
# Mean popularity for every (Year, Primary Genre) pair, flattened back to columns.
genre_popularity = spotify_2000.groupby(['Year', 'Primary Genre'])['Popularity'].mean().reset_index()
# One line per primary genre.
fig = px.line(genre_popularity, x='Year', y='Popularity', color='Primary Genre', title='Average Popularity by Genre over Years (Plotly)')
fig.show()

In [None]:
spotify_2000['Primary Genre'] = spotify_2000['Top Genre'].str.split(',').str[0]
artist_genre_popularity = spotify_2000.groupby(['Artist', 'Primary Genre'])['Popularity'].mean().reset_index()
fig = px.sunburst(artist_genre_popularity, path=['Primary Genre', 'Artist'], values='Popularity', title='Popularity by Artist and Genre (Plotly)')
fig.show()


## *Seaborn*

In [None]:
# value_counts() counts rows per artist, i.e. how many tracks each artist has in
# the dataset. It does not use the 'Popularity' column, so the title describes
# the ranking as "by number of tracks" rather than "most popular".
top_artists = spotify_2000['Artist'].value_counts().head(10)
plt.figure(figsize=(8, 4))
sns.barplot(x=top_artists.values, y=top_artists.index, palette='viridis')
plt.xlabel('Count')
plt.ylabel('Artist')
plt.title('Top 10 Artists by Number of Tracks (Seaborn)')
plt.show()

In [None]:
# Histogram of track tempo with a kernel density estimate drawn on top.
ax = sns.histplot(data=spotify_2000['Beats Per Minute (BPM)'], color='purple', kde=True)
ax.set(
    xlabel='BPM',
    ylabel='Frequency',
    title='Distribution of Beats Per Minute (BPM) (Seaborn)',
)
ax.figure.set_size_inches(6, 4)
plt.show()

In [None]:
# One point per track: energy on the x axis, danceability on the y axis.
ax = sns.scatterplot(data=spotify_2000, x='Energy', y='Danceability', color='green')
ax.set(
    xlabel='Energy',
    ylabel='Danceability',
    title='Energy vs Danceability (Seaborn)',
)
ax.figure.set_size_inches(6, 4)
plt.show()

In [None]:
# Popularity against release year, drawn with seaborn's lineplot.
ax = sns.lineplot(data=spotify_2000, x='Year', y='Popularity', color='blue')
ax.set(
    xlabel='Year',
    ylabel='Popularity',
    title='Popularity over Years (Seaborn)',
)
ax.figure.set_size_inches(6, 4)
plt.show()

In [None]:
# Pairwise scatter plots (and per-column histograms) for three numeric features.
numeric_features = spotify_2000[['Beats Per Minute (BPM)', 'Danceability', 'Loudness (dB)']]
pair_grid = sns.pairplot(numeric_features)
# pairplot draws a grid of subplots; plt.title() would only title the last one.
# Use a figure-level title instead, nudged upward so it does not overlap the top row.
pair_grid.figure.suptitle('Pair Plot of Numerical Features (Seaborn)', y=1.02)
plt.show()


In [None]:
plt.figure(figsize=(30, 8))
sns.boxplot(data=spotify_2000, x='Top Genre', y='Energy', palette='Set2')
plt.xticks(rotation=90)
plt.xlabel('Top Genre')
plt.ylabel('Energy')
plt.title('Energy Distribution by Genre (Seaborn)')
plt.show()

In [None]:
# A split violin compares two groups side by side. Using the raw 'Year' column as
# hue yields dozens of levels, which older seaborn versions reject for split=True
# and which is unreadable in any version. Bucket the years into two release
# periods instead. assign() works on a copy, so spotify_2000 is not modified.
period_order = ['Before 2000', '2000 and later']
violin_data = spotify_2000.assign(
    **{'Release period': spotify_2000['Year'].lt(2000).map({True: period_order[0], False: period_order[1]})}
)

plt.figure(figsize=(35, 8))
sns.violinplot(data=violin_data, x='Top Genre', y='Popularity', hue='Release period', hue_order=period_order, split=True, palette='husl')
plt.xticks(rotation=90)
plt.xlabel('Top Genre')
plt.ylabel('Popularity')
plt.title('Popularity Distribution by Genre and Year (Seaborn)')
plt.legend(title='Release period')
plt.show()

## *Matplotlib*

In [None]:
spotify_2000['Index'].plot(figsize=(4, 1), color='orangered')

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(spotify_2000['Acousticness'], spotify_2000['Popularity'], c=spotify_2000['Energy'], cmap='coolwarm', alpha=0.6)
plt.colorbar(label='Energy')
plt.xlabel('Acousticness')
plt.ylabel('Popularity')
plt.title('Acousticness vs Popularity (Matplotlib)')
plt.show()

In [None]:
# value_counts() counts tracks per artist; the 'Popularity' column is not used,
# so the title describes the ranking as "by number of tracks".
top_artists = spotify_2000['Artist'].value_counts().head(10)

plt.figure(figsize=(5, 5))
# Percentages are each artist's share of the tracks of these ten artists only.
plt.pie(top_artists.values, labels=top_artists.index, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Top 10 Artists by Number of Tracks (Matplotlib)')
plt.show()


In [None]:
plt.gcf().set_size_inches(6, 4)
plt.hist(spotify_2000['Beats Per Minute (BPM)'], color='wheat')
plt.xlabel('BPM')
plt.ylabel('Frequency')
plt.title('Distribution of Beats Per Minute (BPM)', color='darkorange')
plt.gca().set_box_aspect(0.5)
plt.show()

In [None]:
plt.scatter(spotify_2000['Energy'], spotify_2000['Danceability'], color='lightgreen')
plt.xlabel('Energy')
plt.ylabel('Danceability')
plt.title('Energy vs Danceability')
plt.gcf().set_size_inches(7, 5)
plt.show()

In [None]:
plt.boxplot(spotify_2000['Popularity'])
plt.ylabel('Popularity')
plt.title('Distribution of Popularity')
plt.gcf().set_size_inches(6, 4)
plt.show()

In [None]:
# plt.plot connects points in row order. With many tracks per year and rows that
# are not guaranteed to be ordered by year, the raw data produces a zig-zag.
# Plot the mean popularity per year instead (groupby returns the years sorted).
mean_popularity_by_year = spotify_2000.groupby('Year')['Popularity'].mean()
plt.plot(mean_popularity_by_year.index, mean_popularity_by_year.values, color='orange')
plt.xlabel('Year')
plt.ylabel('Average Popularity')
plt.title('Popularity over Years')
plt.gcf().set_size_inches(6, 4)
plt.show()

In [None]:
# Passing the raw columns draws one bar per track, all stacked on top of each
# other at the same year, so only the tallest is visible. Draw a single bar per
# year showing the mean popularity of that year's tracks.
mean_popularity_per_year = spotify_2000.groupby('Year')['Popularity'].mean()
plt.bar(mean_popularity_per_year.index, mean_popularity_per_year.values, color='skyblue')
plt.xlabel('Year')
plt.ylabel('Average Popularity')
plt.title('Popularity by Year (Matplotlib)')
plt.show()