In [10]:
import pandas as pd # Importing the required module

url = 'https://github.com/syahelrusfi21/Spotify-Dashboard/raw/main/Most%20Streamed%20Spotify%20Songs%202024.csv' # URL of the data

# Trying to read file with encoding 'latin1'
try:
    df = pd.read_csv(url, encoding='latin1')
except UnicodeDecodeError as e:
    print(f"Error reading the file with encoding 'latin1': {e}")

print(df.shape) # Print the shape of data
df.head(5) # See the first five rows of data

(4600, 29)


Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,...,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,...,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,...,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,...,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,...,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,...,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


In [11]:
df.info() # The information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 

In [12]:
# Only takes the required columns
data = df[['Track', 'Album Name', 'Artist', 'Release Date', 'All Time Rank', 'Spotify Streams', 'Spotify Popularity']]
data.head(5) # See the first five rows of data

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Spotify Streams,Spotify Popularity
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,1,390470936,92.0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,2,323703884,92.0
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,3,601309283,92.0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,4,2031280633,85.0
4,Houdini,Houdini,Eminem,5/31/2024,5,107034922,88.0


In [13]:
data.info() # The information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Track               4600 non-null   object 
 1   Album Name          4600 non-null   object 
 2   Artist              4595 non-null   object 
 3   Release Date        4600 non-null   object 
 4   All Time Rank       4600 non-null   object 
 5   Spotify Streams     4487 non-null   object 
 6   Spotify Popularity  3796 non-null   float64
dtypes: float64(1), object(6)
memory usage: 251.7+ KB


In [14]:
data.isnull().sum() # Missing values checking

Track                   0
Album Name              0
Artist                  5
Release Date            0
All Time Rank           0
Spotify Streams       113
Spotify Popularity    804
dtype: int64

In [15]:
# Fill the missing value in the 'Artist' with 'Unknown'
data.loc[:,'Artist'] = data['Artist'].fillna('Unknown')

# Fill the missing value in the 'Spotify Streams' with 0
data.loc[:,'Spotify Streams'] = data['Spotify Streams'].fillna(0)

# Fill the missing value in the 'Spotify Popularity' with 0
data.loc[:,'Spotify Popularity'] = data['Spotify Popularity'].fillna(0)

In [16]:
data.isnull().sum() # Checking whether the missing values have been handled

Track                 0
Album Name            0
Artist                0
Release Date          0
All Time Rank         0
Spotify Streams       0
Spotify Popularity    0
dtype: int64

In [17]:
data[data.duplicated()] # Duplication checking

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Spotify Streams,Spotify Popularity
2450,Tennessee Orange,Tennessee Orange,Megan Moroney,9/2/2022,2424,227893586,73.0
3450,Dembow,Dembow,Danny Ocean,12/8/2017,3441,579189526,65.0


In [18]:
# Handling duplicated data
cleaned_data = data.drop_duplicates()
print(cleaned_data.shape)
cleaned_data.head(5) # See the first five rows of data

(4598, 7)


Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Spotify Streams,Spotify Popularity
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,1,390470936,92.0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,2,323703884,92.0
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,3,601309283,92.0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,4,2031280633,85.0
4,Houdini,Houdini,Eminem,5/31/2024,5,107034922,88.0


In [19]:
cleaned_data.info() # The information of cleaned_data

<class 'pandas.core.frame.DataFrame'>
Index: 4598 entries, 0 to 4599
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Track               4598 non-null   object 
 1   Album Name          4598 non-null   object 
 2   Artist              4598 non-null   object 
 3   Release Date        4598 non-null   object 
 4   All Time Rank       4598 non-null   object 
 5   Spotify Streams     4598 non-null   object 
 6   Spotify Popularity  4598 non-null   float64
dtypes: float64(1), object(6)
memory usage: 416.4+ KB


In [20]:
# Convert 'All Time Rank' and 'Spotify Streams' to integers
cleaned_data['All Time Rank'] = cleaned_data['All Time Rank'].astype(str).str.replace(',', '', regex=True).astype(int)
cleaned_data['Spotify Streams'] = cleaned_data['Spotify Streams'].astype(str).str.replace(',', '', regex=True).astype(int)
cleaned_data.info() # Check the updated data types

<class 'pandas.core.frame.DataFrame'>
Index: 4598 entries, 0 to 4599
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Track               4598 non-null   object 
 1   Album Name          4598 non-null   object 
 2   Artist              4598 non-null   object 
 3   Release Date        4598 non-null   object 
 4   All Time Rank       4598 non-null   int64  
 5   Spotify Streams     4598 non-null   int64  
 6   Spotify Popularity  4598 non-null   float64
dtypes: float64(1), int64(2), object(4)
memory usage: 416.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['All Time Rank'] = cleaned_data['All Time Rank'].astype(str).str.replace(',', '', regex=True).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['Spotify Streams'] = cleaned_data['Spotify Streams'].astype(str).str.replace(',', '', regex=True).astype(int)


In [21]:
cleaned_data.head(5) # See the first five rows of data

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Spotify Streams,Spotify Popularity
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,1,390470936,92.0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,2,323703884,92.0
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,3,601309283,92.0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,4,2031280633,85.0
4,Houdini,Houdini,Eminem,5/31/2024,5,107034922,88.0


In [22]:
# Saving the data into CSV file
cleaned_data.to_csv('cleaned_data.csv', index=False)
print("DataFrame have been saved to 'cleaned_data.csv'")

DataFrame have been saved to 'cleaned_data.csv'
