In [None]:
#imported required packages
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
#load the song popularity prediction dataset
#NOTE(review): the path looks like a Google Colab working directory - adjust it when running elsewhere
DATA_PATH = '/content/song_data.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
#the shape of the dataset, as a (rows, columns) tuple
df.shape

(18835, 15)

In [None]:
#preview the first 5 rows of the dataset (head() returns 5 rows by default)
df.head()

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,Boulevard of Broken Dreams,73,262333,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474
1,In The End,66,216933,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37
2,Seven Nation Army,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324
3,By The Way,74,216933,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198
4,How You Remind Me,56,223826,0.000954,0.447,0.766,0.0,10,0.113,-5.065,1,0.0313,172.011,4,0.574


**Dataset:** https://www.kaggle.com/yasserh/song-popularity-dataset

This dataset contains song details such as song name, popularity, duration, etc. The main aim with this dataset is to predict song popularity.


---


# Data Description
> Observations = 18,835

> Feature Vectors = 15



---


# Feature Vectors


1.   song_name - Name of the song
2.   song_popularity - Song Popularity
3.   song_duration_ms - Song Duration (ms)
4.   acousticness - Acousticness
5.   danceability - Danceability
6.   energy - Energy
7.   instrumentalness - Instrumentalness
8.   key - Key
9.   liveness - Liveness
10.   loudness - Loudness
11.   audio_mode - Audio Mode
12.   speechiness - Speechiness
13.   tempo - Tempo
14.   time_signature - Time Signature
15.   audio_valence - Audio Valence


In [None]:
#converting categorical features to numerical form
#(`preprocessing` is imported from sklearn in the imports cell at the top of the notebook)
col_cat = ['song_name']

lab_en = preprocessing.LabelEncoder()

#the same encoder object is refit on every listed column, so after the loop
#it only remembers the classes of the last column it encoded
for c in col_cat:
    df[c] = lab_en.fit_transform(df[c])

#showing first 5 rows
df.head(5)

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,1561,73,262333,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474
1,5541,66,216933,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37
2,9638,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324
3,1760,74,216933,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198
4,4988,56,223826,0.000954,0.447,0.766,0.0,10,0.113,-5.065,1,0.0313,172.011,4,0.574


# Categorical to Numerical
Converting the categorical feature (`song_name`) to numerical form, since the later steps (correlation, scaling and the model) need numerical inputs.

In [None]:
#count the missing (null) values in each column
df.isna().sum()

song_name           0
song_popularity     0
song_duration_ms    0
acousticness        0
danceability        0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
audio_mode          0
speechiness         0
tempo               0
time_signature      0
audio_valence       0
dtype: int64

# Null Value Detection
The dataset doesn't have any null values.

In [None]:
#rank every column by the absolute value of its correlation with the
#target feature (song_popularity), strongest first
corr = (
    df.corr()['song_popularity']
    .abs()
    .sort_values(ascending=False)
    .rename_axis('Column')
    .reset_index(name='Correlation')
)
corr

Unnamed: 0,Column,Correlation
0,song_popularity,1.0
1,instrumentalness,0.130907
2,danceability,0.10429
3,loudness,0.099442
4,acousticness,0.065181
5,audio_valence,0.052895
6,liveness,0.038937
7,time_signature,0.034983
8,tempo,0.022672
9,speechiness,0.021479


# Feature Selection
Features are selected based on their correlation with the target, so that we can drop the feature vectors that are only weakly related to our target feature (Song Popularity).

In [None]:
#drop the columns that are not kept as model inputs
cols_to_drop = ['tempo', 'speechiness', 'song_duration_ms', 'key', 'song_name', 'audio_mode', 'energy']

#rebind `df` instead of mutating it in place; errors='ignore' skips columns that are
#already gone, so re-running this cell no longer raises a KeyError
df = df.drop(columns=cols_to_drop, errors='ignore')

# Features Dropped


1.   tempo
2.   speechiness
3.   song_duration_ms
4.   key
5.   song_name
6.   audio_mode
7.   energy

I dropped these features because they are not strongly related to our target feature (Song Popularity).

In [None]:
#display the dataframe after dropping columns (the rendered output elides the middle rows)
df

Unnamed: 0,song_popularity,acousticness,danceability,instrumentalness,liveness,loudness,time_signature,audio_valence
0,73,0.005520,0.496,0.000029,0.0589,-4.095,4,0.474
1,66,0.010300,0.542,0.000000,0.1080,-6.407,4,0.370
2,76,0.008170,0.737,0.447000,0.2550,-7.828,4,0.324
3,74,0.026400,0.451,0.003550,0.1020,-4.938,4,0.198
4,56,0.000954,0.447,0.000000,0.1130,-5.065,4,0.574
...,...,...,...,...,...,...,...,...
18830,60,0.893000,0.500,0.000065,0.1110,-16.107,4,0.300
18831,60,0.765000,0.495,0.000001,0.1050,-14.078,4,0.265
18832,23,0.847000,0.719,0.000000,0.1250,-12.222,4,0.286
18833,55,0.945000,0.488,0.015700,0.1190,-12.020,4,0.323


In [None]:
#shape of the dataframe after dropping columns, as a (rows, columns) tuple
df.shape

(18835, 8)

# After Dropping, Shape of The Dataset
After dropping some features, we have 8 columns left: 7 input features plus the target (song_popularity).

In [None]:
#separate the target column from the input features
#(the actual train/test split happens in the next cell)
y = df['song_popularity']
x = df.drop(columns='song_popularity')

In [None]:
#hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=47,
)


In [None]:
#scaling: learn the standardization statistics from the training split only,
#then apply that same fitted scaler to both splits
scaler = StandardScaler().fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

In [None]:
#build and fit the mlpregressor model: a single hidden layer of 64 ReLU units
#note: `(64)` is just the int 64; the trailing comma makes it a real one-element tuple
model = MLPRegressor(
    hidden_layer_sizes=(64,),
    activation="relu",
    random_state=47,
    max_iter=2500,
).fit(scaled_x_train, y_train)

In [None]:
#predicting on the scaled test data, then displaying the predictions
y_pred = model.predict(scaled_x_test)
y_pred

array([59.37238338, 55.20824535, 60.26784626, ..., 45.65804611,
       50.61322972, 53.6128387 ])

In [None]:
#mean squared error between the held-out targets and the model's predictions
mean_squared_error(y_true=y_test, y_pred=y_pred)

451.8936981439683