In [61]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
# Download the zipped Spotify dataset from Google Drive by its file ID.
# The ID is passed positionally: the `--id` flag has been deprecated since gdown 4.3.1
# and was announced for removal in gdown 5.0.
# Requires the `gdown` command-line tool on PATH (install once with `%pip install gdown`).
!gdown 1PY0cG1u96UAKY563Gt8bzA3tuTA4SRyN

zsh:1: command not found: gdown


In [None]:
# Extract dataset.csv from the downloaded archive.
# `-o` overwrites an existing extraction without prompting, so re-running the notebook
# does not stall on unzip's interactive "replace?" question.
!unzip -o spotify_dataset.zip

Archive:  spotify_dataset.zip
  inflating: dataset.csv             


In [33]:
# Read the CSV into a DataFrame, using the file's first column as the row index.
dataset_path = "dataset.csv"
df = pd.read_csv(dataset_path, index_col=[0])

In [34]:
# DataFrame.shape is a (rows, columns) tuple; unpack it for the report.
n_rows, n_cols = df.shape
print(f'There are {n_rows} rows and {n_cols} columns.')

There are 114000 rows and 20 columns.


In [35]:
# Preview the first five rows of the data.
df.head(n=5)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [36]:
# Use the dtypes property to list the data type of each column.
column_dtypes = df.dtypes
print(column_dtypes)

track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object


We can see there are several columns whose data type should be changed.

- track_id should be a string (the IDs are alphanumeric, e.g. `5SuOikwiRyPMVoIQDJUgSV`, so they cannot be converted to integers)
- artists should be a string
- album_name should be a string
- track_name should be a string
- track_genre should be a string

In [37]:
# Count the missing values in each column.
missing_by_column = df.isna().sum()
print(missing_by_column)

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


In [38]:
# Count the missing values in each row (i.e. how many columns are empty),
# listing the rows with the most gaps first.
missing_by_row = df.isna().sum(axis=1)
missing_by_row.sort_values(ascending=False)

65900     3
0         0
75997     0
76008     0
76007     0
         ..
37995     0
37994     0
37993     0
37992     0
113999    0
Length: 114000, dtype: int64

Out of 114000 rows, only one row has missing data (in "artists", "album_name", and "track_name"), so we can simply drop that row.

In [39]:
# Drop every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [40]:
# Remove fully duplicated rows so that every remaining row is unique.
# Reassign instead of using inplace=True: the cell then builds a new frame from its
# input rather than mutating the existing object behind any earlier reference to it.
df = df.drop_duplicates()

In [41]:
# Summarise every column (numeric and non-numeric) to look for invalid data.
summary_stats = df.describe(include='all')
summary_stats

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
count,113549,113549,113549,113549,113549.0,113549.0,113549,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549
unique,89740,31437,46589,73608,,,2,,,,,,,,,,,,,114
top,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,,,False,,,,,,,,,,,,,acoustic
freq,9,279,195,151,,,103831,,,,,,,,,,,,,1000
mean,,,,,33.324433,228081.4,,0.567031,0.642091,5.309452,-8.243408,0.637866,0.084674,0.314064,0.155703,0.213613,0.474205,122.175745,3.904218,
std,,,,,22.283855,106413.1,,0.173409,0.251053,3.560147,5.011422,0.48062,0.105762,0.331906,0.309217,0.190462,0.259204,29.972954,0.432117,
min,,,,,0.0,8586.0,,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,,,,,17.0,174184.0,,0.456,0.473,2.0,-9.998,0.0,0.0359,0.0168,0.0,0.098,0.26,99.296,4.0,
50%,,,,,35.0,213000.0,,0.58,0.685,5.0,-6.997,1.0,0.0489,0.168,4.1e-05,0.132,0.464,122.02,4.0,
75%,,,,,50.0,261588.0,,0.695,0.854,8.0,-5.001,1.0,0.0845,0.596,0.0487,0.273,0.683,140.074,4.0,


There does not seem to be any invalid data in the dataset. However, to prepare the dataset for later analysis, we should convert the popularity column to be out of 1 instead of 100, because that is the scale the other measures use (danceability, energy, speechiness, etc.).

In [13]:
# Divide popularity by 100 so it is expressed out of 1 instead of out of 100.
# NOTE: this cell is not idempotent -- every re-run divides the column by 100 again.
df['popularity'] = df['popularity'] / 100

In [None]:
# Re-run the summary to confirm the popularity column was rescaled.
rescaled_summary = df.describe(include='all')
rescaled_summary

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
count,113549,113549,113549,113549,113549.0,113549.0,113549,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549
unique,89740,31437,46589,73608,,,2,,,,,,,,,,,,,114
top,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,,,False,,,,,,,,,,,,,acoustic
freq,9,279,195,151,,,103831,,,,,,,,,,,,,1000
mean,,,,,0.333244,228081.4,,0.567031,0.642091,5.309452,-8.243408,0.637866,0.084674,0.314064,0.155703,0.213613,0.474205,122.175745,3.904218,
std,,,,,0.222839,106413.1,,0.173409,0.251053,3.560147,5.011422,0.48062,0.105762,0.331906,0.309217,0.190462,0.259204,29.972954,0.432117,
min,,,,,0.0,8586.0,,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,,,,,0.17,174184.0,,0.456,0.473,2.0,-9.998,0.0,0.0359,0.0168,0.0,0.098,0.26,99.296,4.0,
50%,,,,,0.35,213000.0,,0.58,0.685,5.0,-6.997,1.0,0.0489,0.168,4.1e-05,0.132,0.464,122.02,4.0,
75%,,,,,0.5,261588.0,,0.695,0.854,8.0,-5.001,1.0,0.0845,0.596,0.0487,0.273,0.683,140.074,4.0,


In [24]:
# Preview the first five rows of the cleaned data.
df.head(n=5)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## Encode and Transformation


Prepare our feature list, X

In [65]:
# Numeric audio-feature columns used as model inputs.
feature_columns = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
]
X = df[feature_columns]

# The target the model will predict.
y = df['popularity']

In [66]:
# Dummy-encode the boolean `explicit` column; drop_first drops the first level,
# leaving a single indicator column.
explicit_encoded = pd.get_dummies(
    df['explicit'],
    prefix="Explicit",
    drop_first=True,
)

In [67]:
# Preview the encoded indicator column.
explicit_encoded.head(n=5)

Unnamed: 0,Explicit_True
0,0
1,0
2,0
3,0
4,0


In [68]:
# Attach the encoded indicator to the feature matrix, aligning rows on the index.
# NOTE: re-running this cell once X already contains the indicator column fails,
# because join refuses overlapping column names when no suffix is given.
X = X.join(explicit_encoded, how='left')

In [69]:
# Preview the first five rows of the assembled feature matrix.
X.head(n=5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,Explicit_True
0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,0
1,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,0
2,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,0
3,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,0
4,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,0


In [70]:
# Split features and label into train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,                # the input features
    y,                # the label
    test_size=0.3,    # set aside 30% of the data as the test set
    random_state=7,   # fixed seed so the split can be reproduced
)

In [58]:
# Display the target series (popularity) as a quick sanity check of its values and dtype.
y


0         73
1         55
2         57
3         71
4         82
          ..
113995    21
113996    22
113997    22
113998    41
113999    22
Name: popularity, Length: 113549, dtype: int64

## Scaling/Normalizing Training Set 

In [71]:
# Standardise the model inputs with StandardScaler.
# The scaler is fitted on the training rows only, and the transformed values are kept:
# calling fit() alone only learns the per-column statistics and leaves the data unscaled.
# X_train itself is left untouched so cells that read it keep seeing the same frame.
scaled_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature', 'Explicit_True',
]
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[scaled_columns]),
    columns=scaled_columns,
    index=X_train.index,
)

In [72]:
# Look at the last five rows of the training features.
X_train.tail(n=5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,Explicit_True
104446,0.461,0.801,7,-11.133,1,0.0298,0.00115,3.6e-05,0.274,0.963,99.677,4,0
10756,0.65,0.759,6,-5.095,0,0.0304,0.00898,0.0024,0.0858,0.425,135.03,4,0
49898,0.823,0.837,10,-5.261,0,0.15,0.21,0.000137,0.0691,0.331,179.989,4,1
58810,0.362,0.97,8,-5.797,1,0.213,5.9e-05,0.0305,0.0489,0.297,172.189,3,0
61880,0.604,0.968,1,-1.939,1,0.0622,0.000155,0.62,0.0197,0.922,132.004,4,0


## Scaling/Normalizing Test Set 

In [73]:
# Scale the test set with the scaler that was already fitted on the training set.
# A second StandardScaler fitted on X_test would standardise the two splits with
# different means/variances, so the model would see inconsistent feature scales.
# `scaler` is therefore reused (not re-created) and only transform() is called here.
scaled_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'time_signature', 'Explicit_True',
]
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[scaled_columns]),
    columns=scaled_columns,
    index=X_test.index,
)

## Building the RandomForestRegressor

In [None]:
# Tune a random forest regressor with a 5-fold cross-validated grid search.
# random_state pins the forest's random sampling so repeated runs of this cell
# select the same model and report the same scores.
random_forest = RandomForestRegressor(random_state=7)

# Search grid: 4 forest sizes x 3 tree depths = 12 candidate settings.
params = {
    'n_estimators': np.arange(100, 300, 50),  # 100, 150, 200, 250
    'max_depth': np.arange(1, 4, 1),          # 1, 2, 3
}

# No `scoring` is passed, so candidates are ranked by the estimator's own score
# method (R^2 for scikit-learn regressors).
random_forest_grid = GridSearchCV(random_forest, params, cv=5, return_train_score=True)
random_forest_grid.fit(X_train, y_train)

In [80]:
# Report the winning hyper-parameters and the estimator refitted with them.
best_params = random_forest_grid.best_params_
best_model = random_forest_grid.best_estimator_
print(f'Best parameters were: {best_params}')
print(f'Best model: {best_model}')

Best parameters were: {'max_depth': 9, 'n_estimators': 9}
Best model: RandomForestRegressor(max_depth=9, n_estimators=9)


In [81]:
# Cross-validated score of the best parameter combination.
best_cv_score = random_forest_grid.best_score_
print(f'Best score was: {best_cv_score}')

Best score was: 0.1136852358977652


In [84]:
# Predict popularity for the held-out test rows using the refitted best estimator
# (this is what GridSearchCV.predict delegates to).
y_pred = random_forest_grid.best_estimator_.predict(X_test)


In [85]:
# Mean squared error of the predictions on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred)

434.5440763987305