In [28]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [2]:
!gdown --id 1PY0cG1u96UAKY563Gt8bzA3tuTA4SRyN

zsh:1: command not found: gdown


In [3]:
! unzip spotify_dataset.zip

unzip:  cannot find or open spotify_dataset.zip, spotify_dataset.zip.zip or spotify_dataset.zip.ZIP.


In [4]:
# load the data into a dataframe
df = pd.read_csv("dataset.csv", index_col=[0])

In [5]:
# use the shape property to find (rows, columns)
print('There are {} rows and {} columns.'.format(df.shape[0], df.shape[1]))

There are 114000 rows and 20 columns.


In [6]:
#view samples of the data
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [7]:
# use dtypes prooperty to find the data type of each column
print(df.dtypes)

track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object


We can see there are a couple columns which the data type should be changed.

- track_id should be a integer
- artists should be string
- album_name should be string
- track_name should be string
- track_genre should be string

In [8]:
# find the amount of missing data in each column
print(df.isnull().sum())

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


In [9]:
#count the frequency of missing values in each row (i.e. how many columns are empty)
df.isnull().sum(axis=1).sort_values(ascending = False)

65900     3
0         0
75997     0
76008     0
76007     0
         ..
37995     0
37994     0
37993     0
37992     0
113999    0
Length: 114000, dtype: int64

Out of 114000 rows, only one row have missing data for "artists", "album_name", and "track_name", so we can simply drop that row.

In [10]:
# delete the row with missing data
df = df.dropna()

In [11]:
#remove any duplicate rows (each of the row must be unique - duplication is not allowed)
df.drop_duplicates(inplace=True)

In [12]:
# check for invalid data
df.describe(include='all')

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
count,113549,113549,113549,113549,113549.0,113549.0,113549,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549
unique,89740,31437,46589,73608,,,2,,,,,,,,,,,,,114
top,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,,,False,,,,,,,,,,,,,acoustic
freq,9,279,195,151,,,103831,,,,,,,,,,,,,1000
mean,,,,,33.324433,228081.4,,0.567031,0.642091,5.309452,-8.243408,0.637866,0.084674,0.314064,0.155703,0.213613,0.474205,122.175745,3.904218,
std,,,,,22.283855,106413.1,,0.173409,0.251053,3.560147,5.011422,0.48062,0.105762,0.331906,0.309217,0.190462,0.259204,29.972954,0.432117,
min,,,,,0.0,8586.0,,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,,,,,17.0,174184.0,,0.456,0.473,2.0,-9.998,0.0,0.0359,0.0168,0.0,0.098,0.26,99.296,4.0,
50%,,,,,35.0,213000.0,,0.58,0.685,5.0,-6.997,1.0,0.0489,0.168,4.1e-05,0.132,0.464,122.02,4.0,
75%,,,,,50.0,261588.0,,0.695,0.854,8.0,-5.001,1.0,0.0845,0.596,0.0487,0.273,0.683,140.074,4.0,


There does not seem to be invalid data in the dataset. But one thing we should do to prepare the dataset for analysis later is to convert popularity column to be out of 1 instead of 100. Because that is what the other measures are out of (danceability, energy, speechiness, etc).

In [13]:
df['popularity'] = df['popularity'].div(100)

In [14]:
df["duration_mins"] = df["duration_ms"]/60000
df.drop(columns="duration_ms", inplace=True)

In [15]:
# check to see if things have been fixed
df.describe(include='all')

Unnamed: 0,track_id,artists,album_name,track_name,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_mins
count,113549,113549,113549,113549,113549.0,113549,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549,113549.0
unique,89740,31437,46589,73608,,2,,,,,,,,,,,,,114,
top,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,,False,,,,,,,,,,,,,acoustic,
freq,9,279,195,151,,103831,,,,,,,,,,,,,1000,
mean,,,,,0.333244,,0.567031,0.642091,5.309452,-8.243408,0.637866,0.084674,0.314064,0.155703,0.213613,0.474205,122.175745,3.904218,,3.801356
std,,,,,0.222839,,0.173409,0.251053,3.560147,5.011422,0.48062,0.105762,0.331906,0.309217,0.190462,0.259204,29.972954,0.432117,,1.773552
min,,,,,0.0,,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.1431
25%,,,,,0.17,,0.456,0.473,2.0,-9.998,0.0,0.0359,0.0168,0.0,0.098,0.26,99.296,4.0,,2.903067
50%,,,,,0.35,,0.58,0.685,5.0,-6.997,1.0,0.0489,0.168,4.1e-05,0.132,0.464,122.02,4.0,,3.55
75%,,,,,0.5,,0.695,0.854,8.0,-5.001,1.0,0.0845,0.596,0.0487,0.273,0.683,140.074,4.0,,4.3598


In [16]:
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_mins
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,0.73,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,3.844433
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,0.55,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,2.4935
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,0.57,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,3.513767
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,0.71,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,3.36555
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,0.82,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,3.314217


In [17]:
df.describe()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_mins
count,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0,113549.0
mean,0.333244,0.567031,0.642091,5.309452,-8.243408,0.637866,0.084674,0.314064,0.155703,0.213613,0.474205,122.175745,3.904218,3.801356
std,0.222839,0.173409,0.251053,3.560147,5.011422,0.48062,0.105762,0.331906,0.309217,0.190462,0.259204,29.972954,0.432117,1.773552
min,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1431
25%,0.17,0.456,0.473,2.0,-9.998,0.0,0.0359,0.0168,0.0,0.098,0.26,99.296,4.0,2.903067
50%,0.35,0.58,0.685,5.0,-6.997,1.0,0.0489,0.168,4.1e-05,0.132,0.464,122.02,4.0,3.55
75%,0.5,0.695,0.854,8.0,-5.001,1.0,0.0845,0.596,0.0487,0.273,0.683,140.074,4.0,4.3598
max,1.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0,87.28825


## Encode and Transformation


Prepare our feature list, X

In [231]:
#X = df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']]
X = df[['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'loudness',  'valence', 'key', 'duration_mins', 'tempo']]
y = df['popularity']

In [19]:
#explicit_encoded = pd.get_dummies(df['explicit'], prefix="Explicit", drop_first=True)

In [20]:
#explicit_encoded.head()

In [21]:
#X = X.join(explicit_encoded)

In [22]:
X.head()

Unnamed: 0,danceability,energy,speechiness,acousticness,instrumentalness,liveness,loudness,valence
0,0.676,0.461,0.143,0.0322,1e-06,0.358,-6.746,0.715
1,0.42,0.166,0.0763,0.924,6e-06,0.101,-17.235,0.267
2,0.438,0.359,0.0557,0.21,0.0,0.117,-9.734,0.12
3,0.266,0.0596,0.0363,0.905,7.1e-05,0.132,-18.515,0.143
4,0.618,0.443,0.0526,0.469,0.0,0.0829,-9.681,0.167


## Splitting the Data

In [232]:
X_train, X_test, y_train, y_test = train_test_split(X,              #the input features
                                                    y,              #the label
                                                    test_size=0.2,  #set aside 30% of the data as the test set
                                                    random_state=7, #reproduce the results
                                                    shuffle=True
                                                   )

In [228]:

def reset_test():
    X_train, X_test, y_train, y_test = train_test_split(X,              #the input features
                                                    y,              #the label
                                                    test_size=0.2,  #set aside 30% of the data as the test set
                                                    random_state=7, #reproduce the results
                                                    shuffle=True
                                                   )

In [215]:
X_train.describe()

Unnamed: 0,danceability,energy,speechiness,acousticness,instrumentalness,liveness,loudness,valence,key,duration_mins,tempo
count,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0
mean,0.566482,0.642271,0.08476,0.313801,0.155885,0.214062,-8.241697,0.473752,5.307511,3.801076,122.223644
std,0.173576,0.251179,0.106089,0.332084,0.309329,0.190953,5.010492,0.259443,3.561344,1.711484,30.002792
min,0.0,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.1431,0.0
25%,0.456,0.473,0.0359,0.0165,0.0,0.098,-9.997,0.259,2.0,2.9031,99.391
50%,0.58,0.685,0.049,0.168,4.2e-05,0.132,-6.985,0.462,5.0,3.550267,122.037
75%,0.694,0.854,0.0845,0.596,0.04885,0.275,-4.999,0.683,8.0,4.364667,140.1035
max,0.985,1.0,0.965,0.996,1.0,1.0,4.532,0.995,11.0,79.8171,243.372


In [233]:
ctr = ColumnTransformer([('minmax', MinMaxScaler(), ['tempo', 'duration_mins',]),
                        ('categorical', OneHotEncoder(), ['key']),
                        ('std', StandardScaler(), ['instrumentalness', 'liveness', 'speechiness'])],
                       remainder='passthrough')

In [234]:
train_enc = ctr.fit_transform(X_train)
encoded_column_feature_names = ctr.get_feature_names_out(X.columns)
X_train = pd.DataFrame(train_enc, columns=encoded_column_feature_names)
test_enc = ctr.fit_transform(X_test)
encoded_column_feature_names = ctr.get_feature_names_out(X.columns)
X_test = pd.DataFrame(test_enc, columns=encoded_column_feature_names)

In [235]:
X_train.describe()

Unnamed: 0,minmax__tempo,minmax__duration_mins,categorical__key_0,categorical__key_1,categorical__key_2,categorical__key_3,categorical__key_4,categorical__key_5,categorical__key_6,categorical__key_7,...,categorical__key_10,categorical__key_11,std__instrumentalness,std__liveness,std__speechiness,remainder__danceability,remainder__energy,remainder__acousticness,remainder__loudness,remainder__valence
count,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,...,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0
mean,0.502209,0.045912,0.115061,0.09399,0.101884,0.031704,0.079878,0.082079,0.068726,0.115765,...,0.065423,0.081617,-5.86259e-17,-8.068394e-17,-2.58126e-18,0.566482,0.642271,0.313801,-8.241697,0.473752
std,0.12328,0.021481,0.319097,0.291817,0.302497,0.175213,0.271105,0.274487,0.252989,0.319945,...,0.247273,0.273782,1.000006,1.000006,1.000006,0.173576,0.251179,0.332084,5.010492,0.259443
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.5039496,-1.121029,-0.7989604,0.0,0.0,0.0,-49.531,0.0
25%,0.408391,0.034641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.5039496,-0.60781,-0.4605619,0.456,0.473,0.0165,-9.997,0.259
50%,0.501442,0.042764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.5038148,-0.4297544,-0.3370794,0.58,0.685,0.168,-6.985,0.462
75%,0.575676,0.052985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.3460264,0.3191268,-0.002451395,0.694,0.854,0.596,-4.999,0.683
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.72887,4.115902,8.297266,0.985,1.0,0.996,4.532,0.995


In [236]:
X_test.describe()

Unnamed: 0,minmax__tempo,minmax__duration_mins,categorical__key_0,categorical__key_1,categorical__key_2,categorical__key_3,categorical__key_4,categorical__key_5,categorical__key_6,categorical__key_7,...,categorical__key_10,categorical__key_11,std__instrumentalness,std__liveness,std__speechiness,remainder__danceability,remainder__energy,remainder__acousticness,remainder__loudness,remainder__valence
count,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,...,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0
mean,0.569955,0.03911,0.112461,0.096962,0.102994,0.029414,0.075517,0.082299,0.072567,0.118142,...,0.06517,0.080889,1.2515060000000002e-17,-3.5667930000000005e-17,-1.26715e-17,0.569229,0.641371,0.315114,-8.250253,0.476017
std,0.139485,0.023051,0.31594,0.295912,0.303958,0.168969,0.26423,0.274825,0.25943,0.322783,...,0.24683,0.272671,1.000022,1.000022,1.000022,0.172724,0.250553,0.3312,5.015243,0.258243
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.5019094,-1.123831,-0.8074369,0.0,1.9e-05,0.0,-46.591,0.0
25%,0.462573,0.028754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.5019094,-0.6049301,-0.4646687,0.459,0.471,0.018,-9.99975,0.26225
50%,0.569915,0.036189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.5017811,-0.4287798,-0.3411573,0.582,0.685,0.171,-7.037,0.4705
75%,0.654243,0.045321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.3491242,0.3140229,0.002568344,0.697,0.85275,0.595,-5.011,0.684
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.73678,4.139457,8.393687,0.975,1.0,0.996,1.864,0.993


In [132]:
one = OneHotEncoder()
copye = X_train[['key']].copy()
encoded_column = one.fit_transform(copye)
encoded_column_feature_names = one.get_feature_names_out(['key'])
df_encoded_column = pd.DataFrame(encoded_column.toarray(), columns=encoded_column_feature_names)
df_encoded_column.describe()

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
count,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0
mean,0.115061,0.09399,0.101884,0.031704,0.079878,0.082079,0.068726,0.115765,0.064851,0.099021,0.065423,0.081617
std,0.319097,0.291817,0.302497,0.175213,0.271105,0.274487,0.252989,0.319945,0.246264,0.298692,0.247273,0.273782
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Scaling/Normalizing Training Set 

In [39]:
reset_test()

In [133]:
X_train

Unnamed: 0,danceability,energy,speechiness,acousticness,instrumentalness,liveness,loudness,valence,key
109325,0.643,0.968,0.0589,0.029000,0.042600,0.0994,-5.323,0.2300,4
11902,0.328,0.223,0.0366,0.555000,0.964000,0.0822,-16.202,0.0379,10
54180,0.727,0.978,0.0437,0.000694,0.781000,0.1150,-7.284,0.3330,1
41290,0.347,0.964,0.0668,0.000230,0.343000,0.1160,-6.264,0.1430,5
99711,0.481,0.590,0.0903,0.707000,0.000370,0.1410,-7.145,0.4540,2
...,...,...,...,...,...,...,...,...,...
104446,0.461,0.801,0.0298,0.001150,0.000036,0.2740,-11.133,0.9630,7
10756,0.650,0.759,0.0304,0.008980,0.002400,0.0858,-5.095,0.4250,6
49898,0.823,0.837,0.1500,0.210000,0.000137,0.0691,-5.261,0.3310,10
58810,0.362,0.970,0.2130,0.000059,0.030500,0.0489,-5.797,0.2970,8


In [134]:
standard_columns = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'valence']
min_max_columns = ['speechiness', 'liveness']
ct = ColumnTransformer(transformers=[('scaler', StandardScaler(), standard_columns), 
                                     ('minmax', MinMaxScaler(), min_max_columns),])
                       #('onehot', OneHotEncoder(), ['key'])])


In [135]:
ct.fit(X_train)

In [136]:
X_scaled = ct.transform(X_train)

In [137]:
# Get the names of the scaled columns
columns_scaled = standard_columns.copy()
min_max_scaled = min_max_columns.copy()

# Get the names of the encoded columns
# columns_encoded = []
# for transformer_name, transformer, features in ct.transformers:
#     if transformer_name == 'onehot':
#         print(features)
#         print(transformer_name)
#         columns_encoded += transformer.get_feature_names_out('key').tolist()

# Concatenate the scaled and encoded columns into a new DataFrame
df_scaled_encoded = pd.DataFrame(X_scaled, columns=columns_scaled + min_max_scaled)


In [138]:
X_train = pd.DataFrame(X_scaled, columns=standard_columns + min_max_columns)

In [141]:
X_train = pd.concat([X_train, df_encoded_column],axis=1)

In [142]:
X_train.describe()

Unnamed: 0,danceability,energy,acousticness,instrumentalness,valence,speechiness,liveness,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
count,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0,90839.0
mean,2.175298e-16,8.254166e-17,-1.5331120000000003e-17,-5.86259e-17,-6.527460000000001e-17,0.087834,0.214062,0.115061,0.09399,0.101884,0.031704,0.079878,0.082079,0.068726,0.115765,0.064851,0.099021,0.065423,0.081617
std,1.000006,1.000006,1.000006,1.000006,1.000006,0.109936,0.190953,0.319097,0.291817,0.302497,0.175213,0.271105,0.274487,0.252989,0.319945,0.246264,0.298692,0.247273,0.273782
min,-3.263605,-2.557035,-0.9449499,-0.5039496,-1.826049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6365056,-0.6739082,-0.8952634,-0.5039496,-0.8277495,0.037202,0.098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.07788097,0.170115,-0.4390512,-0.5038148,-0.04529832,0.050777,0.132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.7346557,0.8429447,0.8497859,-0.3460264,0.8065328,0.087565,0.275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.41116,1.424206,2.054307,2.72887,2.009118,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
train_std_scaler = StandardScaler()
X_train = train_std_scaler.fit_transform(X_train)

X_train = pd.DataFrame(X_train, columns=['danceability', 'energy', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'time_signature', 'Explicit_True'])

In [None]:
X_train.describe()

## Scaling/Normalizing Test Set 

In [143]:
one = OneHotEncoder()
copye = X_test[['key']].copy()
encoded_column = one.fit_transform(copye)
encoded_column_feature_names = one.get_feature_names_out(['key'])
df_encoded_column = pd.DataFrame(encoded_column.toarray(), columns=encoded_column_feature_names)
df_encoded_column.describe()

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
count,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0
mean,0.112461,0.096962,0.102994,0.029414,0.075517,0.082299,0.072567,0.118142,0.063672,0.099912,0.06517,0.080889
std,0.31594,0.295912,0.303958,0.168969,0.26423,0.274825,0.25943,0.322783,0.244174,0.299889,0.24683,0.272671
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [144]:
standard_columns = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'valence']
min_max_columns = ['speechiness', 'liveness']
ct = ColumnTransformer(transformers=[('scaler', StandardScaler(), standard_columns), 
                                     ('minmax', MinMaxScaler(), min_max_columns),])
                       #('onehot', OneHotEncoder(), ['key'])])


In [145]:
ct.fit(X_test)

In [146]:
X_scaled = ct.transform(X_test)

In [147]:
# Get the names of the scaled columns
columns_scaled = standard_columns.copy()
min_max_scaled = min_max_columns.copy()

# Get the names of the encoded columns
# columns_encoded = []
# for transformer_name, transformer, features in ct.transformers:
#     if transformer_name == 'onehot':
#         print(transformer_name)
#         columns_encoded += transformer.get_feature_names_out(['key']).tolist()

# Concatenate the scaled and encoded columns into a new DataFrame
df_scaled_encoded = pd.DataFrame(X_scaled, columns=columns_scaled + min_max_scaled)


In [148]:
X_test = pd.DataFrame(X_scaled, columns=standard_columns + min_max_columns)

In [149]:
X_test = pd.concat([X_test, df_encoded_column],axis=1)

In [150]:
X_test.describe()

Unnamed: 0,danceability,energy,acousticness,instrumentalness,valence,speechiness,liveness,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
count,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0,22710.0
mean,1.523709e-16,8.400736e-17,1.451747e-16,1.2515060000000002e-17,-2.258969e-16,0.087754,0.213523,0.112461,0.096962,0.102994,0.029414,0.075517,0.082299,0.072567,0.118142,0.063672,0.099912,0.06517,0.080889
std,1.000022,1.000022,1.000022,1.000022,1.000022,0.108685,0.189999,0.31594,0.295912,0.303958,0.168969,0.26423,0.274825,0.25943,0.322783,0.244174,0.299889,0.24683,0.272671
min,-3.295665,-2.559803,-0.9514528,-0.5019094,-1.84333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6381945,-0.679995,-0.8971037,-0.5019094,-0.8277922,0.037253,0.098589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.07393806,0.1741355,-0.4351365,-0.5017811,-0.02136412,0.050676,0.132056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.739753,0.8436701,0.8450864,-0.3491242,0.8053941,0.088033,0.273185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.349288,1.431384,2.055863,2.73678,2.001967,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
scaler = MinMaxScaler()
X_test = scaler.fit_transform(X_test[['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence',]])

In [None]:
test_std_scaler = StandardScaler()
X_test = test_std_scaler.fit_transform(X_test.to_numpy())

X_test = pd.DataFrame(X_test, columns=['danceability', 'energy','speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', ])

In [None]:
# lambda numpy array transformed 

In [None]:
X.describe()
# key, loudness, tempo, normalized

In [None]:
from sklearn.preprocessing import FunctionTransformer
# x < 0.1 -> 1, x > 0.95 -> 3, otherwise avg -> 2

#speechiness, 

# loudness, minmax 
def transform_instrumentalness():
    

In [None]:
X_train.head()

## Building the RandomForestRegressor

In [None]:
X_train.describe()

In [None]:
random_forest = RandomForestRegressor()
params = {'n_estimators': np.arange(60,90,10),
         #'max_depth':np.arange(10,20,2),
          "min_samples_split": np.arange(2,6,2),
         #'class_weight': np.arange(1,10,1),
         'max_features':[4,5,6]}
        
# Hyperparmeters problem
# Best possible config 
# cv 7/8
random_forest_grid = GridSearchCV(random_forest, params, cv=5, return_train_score=True)
random_forest_grid.fit(X_train, y_train)

In [None]:
print(f'Best parameters were: {random_forest_grid.best_params_}')

print(f'Best model: {random_forest_grid.best_estimator_}')

#print(f'Best features: {random_forest_grid.best_features_}')

In [None]:
print(f'Best score was: {random_forest_grid.best_score_}')

In [None]:
print(f'Best estimator was: {random_forest_grid.best_estimator_}')

In [None]:
y_pred = random_forest_grid.predict(X_test_preprocessed)


In [None]:
mean_squared_error(y_test, y_pred)

In [None]:
random_forest_grid.score(X_test_preprocessed, y_test)

In [None]:
reset()

In [269]:
model = RandomForestRegressor(n_estimators=100, max_features=10, min_samples_split=5, min_samples_leaf=2, max_depth=10,n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

#run_model(model, "Random Forrest Regressor")
# using link:
# score : 0.26
# mse: 0.03
# r2: 0.269

In [270]:
model.score(X_test, y_test)

0.12081173522298538

In [271]:
from sklearn.metrics import accuracy_score, r2_score
mean_squared_error(y_test, y_pred)

0.04356248395621316

In [272]:
r2_score(y_test, y_pred)

0.12081173522298538

In [None]:

cv_results = pd.DataFrame(random_forest_grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']].head()

In [None]:
#plot predicted vs actual y values

In [None]:
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)

In [None]:
y_pred_normal = rf.predict(X_test)

In [None]:
mean_squared_error(y_test, y_pred_normal)

In [None]:
rf.score(X_test, y_test)

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(y_test, y_pred, c='crimson')
plt.yscale('log')
plt.xscale('log')

p1 = max(max(y_pred), max(y_test))
p2 = min(min(y_pred), min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

In [None]:
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

In [None]:
y_test.describe()

In [None]:
y.describe()

In [None]:
importances = model.feature_importances_ # get all of the feature importances
forest_importances = pd.Series(importances, index=X.columns) # create separate series
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
fig, ax = plt.subplots() # create figure
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
X_train.columns

In [None]:
X_train_preprocessed

In [None]:
importances = model.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Rearrange feature names so they match the sorted feature importances
names = [X_train.columns[i] for i in indices]

# Create plot
plt.figure()

# Create plot title
plt.title("Feature Importance")

# Add bars
plt.bar(range(X_train.shape[1]), importances[indices])

# Add feature names as x-axis labels
plt.xticks(range(X_train.shape[1]), names, rotation=90)

# Show plot
plt.show()