In [2]:
#Libraries import
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [3]:
df = pd.read_csv("data.csv")


In [4]:
#Removing unnecessary columns
df = df.drop(['mode' , 'danceability_%' , 'cover_url' , 'valence_%' , 'energy_%' , 'acousticness_%' , 'instrumentalness_%' ,  'liveness_%' , 'speechiness_%' , 'released_year' ,  'released_month' , 'released_day'] , axis = 1)


In [5]:
df.head(10) #Output first 10 lines

Unnamed: 0,track_name,artist(s)_name,artist_count,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,key
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,553,147,141381703,43,263,45,10,826,125,B
1,LALA,Myke Towers,1,1474,48,133716286,48,126,58,14,382,92,C#
2,vampire,Olivia Rodrigo,1,1397,113,140003974,94,207,91,14,949,138,F
3,Cruel Summer,Taylor Swift,1,7858,100,800840817,116,207,125,12,548,170,A
4,WHERE SHE GOES,Bad Bunny,1,3133,50,303236322,84,133,87,15,425,144,A
5,Sprinter,"Dave, Central Cee",2,2186,91,183706234,67,213,88,17,946,141,C#
6,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2,3090,50,725980112,34,222,43,13,418,148,F
7,Columbia,Quevedo,1,714,43,58149378,25,89,30,13,194,100,F
8,fukumean,Gunna,1,1096,83,95217315,60,210,48,11,953,130,C#
9,La Bebe - Remix,"Peso Pluma, Yng Lvcas",2,2953,44,553634067,49,110,66,13,339,170,D


# Data Description:
  - **Track name:** this column responsible for the names of the tracks in the dataset.
  - **Artist(s) name:** column which display artist(-s) name of the songs.
  - **Artist count:** this column shows how many artist(-s) are involved in the song.
  - **In spotify playlists:** number of playlists that contain a song.
  - **In spotify charts:** the presence of a track in Spotify charts (e.g. top 50, top 100, etc.).
  - **Streams:** total number of plays of the track on Spotify.
  - **In apple playlists:** number of playlists on Apple Music that contain a track.
  - **In apple charts:** the presence of a track in Apple charts.
  - **In deezer playlists:** number of playlists on Deezer that contain a track.
  - **In deezer charts:** the presence of a track in Deezer charts.
  - **In shazam charts:** the presence of a track in Shazam charts.

In [6]:
df.info() #General dataset information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   in_spotify_playlists  953 non-null    int64 
 4   in_spotify_charts     953 non-null    int64 
 5   streams               953 non-null    object
 6   in_apple_playlists    953 non-null    int64 
 7   in_apple_charts       953 non-null    int64 
 8   in_deezer_playlists   953 non-null    object
 9   in_deezer_charts      953 non-null    int64 
 10  in_shazam_charts      903 non-null    object
 11  bpm                   953 non-null    int64 
 12  key                   858 non-null    object
dtypes: int64(7), object(6)
memory usage: 96.9+ KB


# Analyze data types for each column
- Column 'track_name'(dtype object): There won't be any errors here, but I can check for duplicate values to make sure each **track_name** is unique
  
- Column 'artist(s)_name'(dtype object): Also to check whether the same artists appear multiple times in the dataset and identify the most popular artists
  
- Column 'artist_count'(dtype int): Check if there are songs with multiple artists and determine how often this happens
  
- Column 'in_spotify_playlists'(dtype int): Evaluate the distribution of songs by the number of playlists. There may be a sharp gap between popular and less popular tracks
  
- Column 'in_spotify_charts'(dtype int): Check if there are any tracks that did not chart at all (value = 0)
  
- Column 'streams'(dtype object): First of all we need to convert **streams** to an float value, because errors may occur while filtering data, if there are missing (NaN) values in the data, they can be handled by filling them with the mean or by deleting the corresponding rows
 
- Column 'in_apple_playlists' (dtype int): Similar to Spotify, you can plot the distribution of the number of playlists
 
- Column 'in_apple_charts' (dtype int): Compare the number of songs that hit the Apple and Spotify charts and spot the differences
  
- Column 'in_deezer_playlists' (dtype object): Strange data type (should be int, but object is listed). Again, the column needs to be converted to int values.
  
- Column 'in_deezer_charts' (dtype int): Similar to Apple and Spotify, you can see how many songs are in Deezer charts
  
- Column 'in_shazam_charts' (dtype object): Similar situation with the data type. Need to be converted to int and check if there are tracks popular only in Shazam
  
- Column 'bpm' (dtype int): Can plot a histogram by using matplotlib to see the distribution of song tempos. This can show which tempos are most popular.
  
- Column 'key' (dtype object): 858 non-empty values, so 95 rows have gaps. Need to decide how to deal with them (fill with the most frequent value or delete rows). Also analyze which sentiments are used most often
  

In [15]:
df['streams'] = pd.to_numeric(df['streams'] , errors = 'coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'] , errors = 'coerce')
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'] , errors = 'coerce')

In [7]:
df.describe()

Unnamed: 0,artist_count,in_spotify_playlists,in_spotify_charts,in_apple_playlists,in_apple_charts,in_deezer_charts,bpm
count,953.0,953.0,953.0,953.0,953.0,953.0,953.0
mean,1.556139,5200.124869,12.009444,67.812172,51.908709,2.666317,122.540399
std,0.893044,7897.60899,19.575992,86.441493,50.630241,6.035599,28.057802
min,1.0,31.0,0.0,0.0,0.0,0.0,65.0
25%,1.0,875.0,0.0,13.0,7.0,0.0,100.0
50%,1.0,2224.0,3.0,34.0,38.0,0.0,121.0
75%,2.0,5542.0,16.0,88.0,87.0,2.0,140.0
max,8.0,52898.0,147.0,672.0,275.0,58.0,206.0


In [8]:
(df.isnull().sum()) #Checking for missing values

track_name               0
artist(s)_name           0
artist_count             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  0
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists      0
in_deezer_charts         0
in_shazam_charts        50
bpm                      0
key                     95
dtype: int64

In [16]:
print(df['track_name'].duplicated().sum())  #Cheking for duplicates in track name column
df = df.drop_duplicates(subset='track_name')

10


In [20]:
print(df.duplicated(subset=['track_name', 'artist(s)_name']).sum()) #Checking whether the same artists and same song appear multiple times in the dataset

0


In [10]:
#Output of the 10 most popular artists in dataset
df['artist(s)_name'].value_counts()


artist(s)_name
Taylor Swift                 34
The Weeknd                   22
Bad Bunny                    19
SZA                          19
Harry Styles                 17
                             ..
Karol G, Ovy On The Drums     1
Coolio, L.V.                  1
Kordhell                      1
Kenia OS                      1
Feid, Sech, Jhayco            1
Name: count, Length: 645, dtype: int64

In [11]:
#Finding the most popular song in dataset
df.loc[df['streams'].idxmax(), ['track_name' , 'streams']]


track_name                  Love Grows (Where My Rosemary Goes)
streams       BPM110KeyAModeMajorDanceability53Valence75Ener...
Name: 574, dtype: object

In [12]:
#Finding the most unpopular song in dataset
df.loc[df['streams'].idxmin(), ['track_name' ,'streams']]

track_name    Arcï¿½ï¿½ngel: Bzrp Music Sessions, Vol
streams                                     100409613
Name: 301, dtype: object

In [14]:
#Output of the 10  most popular songs in dataset
df.sort_values(by='streams' , ascending = False).head(10)[['track_name' ,'streams']]

Unnamed: 0,track_name,streams
574,Love Grows (Where My Rosemary Goes),BPM110KeyAModeMajorDanceability53Valence75Ener...
33,Anti-Hero,999748277
625,Arcade,991336132
253,Glimpse of Us,988515741
455,Seek & Destroy,98709329
98,Summertime Sadness,983637508
891,"Come Back Home - From ""Purple Hearts""",97610446
427,Where Are You Now,972509632
322,I Love You So,972164968
130,Queencard,96273746
