In [1]:
import pandas as pd
# Read the raw export with an explicit ISO-8859-1 encoding and comma separator.
data = pd.read_csv(
    'Most Streamed Spotify Songs 2024.csv',
    sep=',',
    encoding='ISO-8859-1',
)

# Show the first five rows as a quick sanity check of the load.
print(data.head(5))

                        Track                    Album Name          Artist  \
0         MILLION DOLLAR BABY  Million Dollar Baby - Single   Tommy Richman   
1                 Not Like Us                   Not Like Us  Kendrick Lamar   
2  i like the way you kiss me    I like the way you kiss me         Artemas   
3                     Flowers              Flowers - Single     Miley Cyrus   
4                     Houdini                       Houdini          Eminem   

  Release Date          ISRC All Time Rank  Track Score Spotify Streams  \
0    4/26/2024  QM24S2402528             1        725.4     390,470,936   
1     5/4/2024  USUG12400910             2        545.9     323,703,884   
2    3/19/2024  QZJ842400387             3        538.4     601,309,283   
3    1/12/2023  USSM12209777             4        444.9   2,031,280,633   
4    5/31/2024  USUG12403398             5        423.3     107,034,922   

  Spotify Playlist Count Spotify Playlist Reach  ...  SiriusXM Spins  \
0 

In [2]:
import pandas as pd
from datetime import datetime
import numpy as np

# Read the raw CSV export using the ISO-8859-1 encoding.
data = pd.read_csv('Most Streamed Spotify Songs 2024.csv', encoding='ISO-8859-1')

# Parse release dates; values that cannot be parsed are coerced to NaT.
data['Release Date'] = pd.to_datetime(data['Release Date'], errors='coerce')

# Strip thousands separators from the metric columns and cast them to float.
numeric_cols = ['Spotify Streams', 'Spotify Playlist Count', 'Spotify Playlist Reach', 'Track Score']
data = data.assign(**{
    name: data[name].replace(',', '', regex=True).astype(float)
    for name in numeric_cols
})

### Hypothesis 1: Genres and Popularity
# Only testable when the dataset carries a 'Genre' column.

if 'Genre' not in data.columns:
    print("No 'Genre' column found in the dataset to test Hypothesis 1.")
else:
    mean_score_by_genre = data.groupby('Genre')['Track Score'].mean()
    genre_popularity = mean_score_by_genre.sort_values(ascending=False)
    print("Average Track Score by Genre:")
    print(genre_popularity)

### Hypothesis 2: Popular Artists and Track Score
# Mean Track Score per artist, highest first; only the top ten are shown.
mean_score_by_artist = data.groupby('Artist')['Track Score'].mean()
artist_popularity = mean_score_by_artist.sort_values(ascending=False)
print("\nAverage Track Score by Artist (Top 10):")
print(artist_popularity.head(10))

### Hypothesis 3: Streams in First Week and Track Score
# No first-week stream column is used here; total 'Spotify Streams' serves as
# the proxy and is correlated against 'Track Score'.

streams = data['Spotify Streams']
correlation = streams.corr(data['Track Score'])
print(f"\nCorrelation between Spotify Streams and Track Score: {correlation:.2f}")

### Hypothesis 4: Recency of Release and Track Score
# "Recent" means released within one year of the newest release date in the data.
latest_date = data['Release Date'].max()
one_year_ago = latest_date - pd.DateOffset(years=1)

# Flag recent tracks (1) versus older ones (0), then compare mean Track Score.
is_recent = data['Release Date'] >= one_year_ago
data['Is_Recent'] = is_recent.astype(int)
recency_analysis = data.groupby('Is_Recent')['Track Score'].mean()

print("\nAverage Track Score for Recent vs. Older Songs:")
print(recency_analysis)

# Persist the recency comparison alongside the notebook.
recency_analysis.to_csv('Recency_vs_Track_Score.csv', index=True)


No 'Genre' column found in the dataset to test Hypothesis 1.

Average Track Score by Artist (Top 10):
Artist
Tommy Richman            399.25
Kids With Buns           301.60
Artemas                  290.70
official sound studio    277.50
WZ Beat                  268.10
Kenya Grace              233.50
sped up 8282             211.20
Mae Stephens             203.60
Mvua                     200.30
Maian                    194.30
Name: Track Score, dtype: float64

Correlation between Spotify Streams and Track Score: 0.25

Average Track Score for Recent vs. Older Songs:
Is_Recent
0    38.907961
1    48.846137
Name: Track Score, dtype: float64


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('Most Streamed Spotify Songs 2024.csv', encoding='ISO-8859-1')

# Convert 'Release Date' to datetime format; unparseable values become NaT
data['Release Date'] = pd.to_datetime(data['Release Date'], errors='coerce')

# Convert numerical columns from strings with commas to floats for calculations
numeric_cols = ['Spotify Streams', 'Spotify Playlist Count', 'Spotify Playlist Reach',
                'Track Score', 'YouTube Views', 'AirPlay Spins', 'SiriusXM Spins']

for col in numeric_cols:
    data[col] = data[col].replace(',', '', regex=True).astype(float)

# 1. Handle Missing Values
# Drop rows with missing 'Track', 'Artist', or 'Track Score' as they are crucial.
# The index is rebuilt so that positionally-created frames (e.g. the one-hot
# output below) line up with the surviving rows.
data = data.dropna(subset=['Track', 'Artist', 'Track Score']).reset_index(drop=True)

# Impute missing values for columns that can tolerate it, using median for numerical columns
imputer = SimpleImputer(strategy='median')
data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# 2. Categorical Encoding
# 'Genre' is optional. Everything that touches it lives inside this guard so
# that a dataset without the column never gains a placeholder 'Genre' column.
if 'Genre' in data.columns:
    # Fill categorical missing values with a placeholder
    data['Genre'] = data['Genre'].fillna('Unknown')

    # One-hot encode the (non-ordinal) genre labels
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    genre_encoded = ohe.fit_transform(data[['Genre']])
    # Reuse data's index so the join matches rows by label, not by accident
    genre_df = pd.DataFrame(
        genre_encoded,
        columns=ohe.get_feature_names_out(['Genre']),
        index=data.index,
    )
    data = data.join(genre_df).drop('Genre', axis=1)

# OrdinalEncoder can be used if you have an ordinal column (e.g., 'Popularity Rank')

# 3. Feature Engineering
# Create "new" features based on existing ones:
# Example: Relative Popularity and Playlist Reach per Playlist Count

data['Relative Popularity'] = data['Spotify Streams'] / data['Spotify Playlist Reach']
data['Reach per Playlist Count'] = data['Spotify Playlist Reach'] / data['Spotify Playlist Count']

# A zero denominator yields +/-inf, which StandardScaler rejects; mark those
# ratios as missing instead (the scaler ignores NaN when fitting).
ratio_cols = ['Relative Popularity', 'Reach per Playlist Count']
data[ratio_cols] = data[ratio_cols].replace([np.inf, -np.inf], np.nan)

# Binary feature for Recency: Was the track released within the last year?
# Rows whose release date is NaT compare False and are therefore flagged 0.
latest_date = data['Release Date'].max()
one_year_ago = latest_date - pd.DateOffset(years=1)
data['Is_Recent'] = (data['Release Date'] >= one_year_ago).astype(int)

# 4. Handle Outliers
# Cap extreme outliers using the 1st and 99th percentile.
# NOTE(review): this also caps 'Track Score', and it runs after the ratio
# features were derived from the uncapped values - confirm both are intended.
for col in numeric_cols:
    q_low = data[col].quantile(0.01)
    q_high = data[col].quantile(0.99)
    data[col] = data[col].clip(lower=q_low, upper=q_high)

# 5. Feature Scaling
# Scale numerical features using StandardScaler
scaler = StandardScaler()
scaled_columns = ['Track Score', 'Spotify Streams', 'Spotify Playlist Count',
                  'Spotify Playlist Reach', 'Relative Popularity', 'Reach per Playlist Count']
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

# Final Dataset Preview
print(data.head())

# Save the cleaned dataset
data.to_csv('Cleaned_Most_Streamed_Spotify_Songs.csv', index=False)


                        Track                    Album Name          Artist  \
0         MILLION DOLLAR BABY  Million Dollar Baby - Single   Tommy Richman   
1                 Not Like Us                   Not Like Us  Kendrick Lamar   
2  i like the way you kiss me    I like the way you kiss me         Artemas   
3                     Flowers              Flowers - Single     Miley Cyrus   
4                     Houdini                       Houdini          Eminem   

  Release Date          ISRC All Time Rank  Track Score  Spotify Streams  \
0   2024-04-26  QM24S2402528             1     5.269555        -0.091745   
1   2024-05-04  USUG12400910             2     5.269555        -0.223053   
2   2024-03-19  QZJ842400387             3     5.269555         0.322903   
3   2023-01-12  USSM12209777             4     5.269555         3.135176   
4   2024-05-31  USUG12403398             5     5.269555        -0.649168   

   Spotify Playlist Count  Spotify Playlist Reach  ...  Amazon Playl

In [10]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('Most Streamed Spotify Songs 2024.csv', encoding='ISO-8859-1')

# Convert 'Release Date' to datetime; unparseable values become NaT
data['Release Date'] = pd.to_datetime(data['Release Date'], errors='coerce')

# Convert comma-formatted text columns to numeric where possible.
# The comma-stripped values are written back only when the whole column parses
# as numeric, so commas inside genuine text columns are left untouched.
for col in data.select_dtypes(include='object').columns:
    stripped = data[col].str.replace(',', '', regex=False)
    try:
        data[col] = pd.to_numeric(stripped)
    except (ValueError, TypeError):
        continue  # Not a numeric column: keep the original text as-is

# 1. Handle Missing Values
# Keep only columns in which at least half of the rows are populated
min_non_null = (len(data) + 1) // 2
data = data.dropna(thresh=min_non_null, axis=1)

# Fill remaining missing values in numeric columns with the median.
# Only float columns are selected: integer columns cannot contain NaN.
numeric_cols = data.select_dtypes(include='float64').columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Fill categorical columns with a placeholder (e.g., 'Unknown') if needed
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = data[categorical_cols].fillna('Unknown')

# Drop any remaining rows with missing values, if any (e.g. NaT release dates)
data = data.dropna()

# Final Dataset Preview
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print().
data.info()
print(data.head())

# Save the cleaned dataset
data.to_csv('Cleaned_Most_Streamed_Spotify_Songs.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Track                       4600 non-null   object        
 1   Album Name                  4600 non-null   object        
 2   Artist                      4600 non-null   object        
 3   Release Date                4600 non-null   datetime64[ns]
 4   ISRC                        4600 non-null   object        
 5   All Time Rank               4600 non-null   int64         
 6   Track Score                 4600 non-null   float64       
 7   Spotify Streams             4600 non-null   float64       
 8   Spotify Playlist Count      4600 non-null   float64       
 9   Spotify Playlist Reach      4600 non-null   float64       
 10  Spotify Popularity          4600 non-null   float64       
 11  YouTube Views               4600 non-null   float64     

In [None]:
from sklearn.model_selection import train_test_split


# Separate the prediction target ('Track Score') from the remaining columns.
y = data['Track Score']
X = data.drop(columns='Track Score')

# First cut: 60% for training, 40% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second cut: halve the held-out part into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the number of rows in each partition.
print(f"Training set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")
print(f"Test set size: {len(X_test)} samples")

Training set size: 2760 samples
Validation set size: 920 samples
Test set size: 920 samples
