In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

This dataset contains over 6000 spotify tracks ranging from the 1960s through the 2010s. Each track has been assigned a target variable '1' or '0' depending on if the track made it into the Billboard Top 100 charts. We will try to train a machine learning model to be able to predict a song's hit potential. 

In [3]:
datasets = ['dataset-of-60s.csv', 'dataset-of-70s.csv', 'dataset-of-80s.csv', 'dataset-of-90s.csv', 'dataset-of-00s.csv', 'dataset-of-10s.csv']
dflist = []

for dataset in datasets:
    df = pd.read_csv(dataset)
    dflist.append(df)
    
df_concat = pd.concat(dflist)
df_concat.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
0,Jealous Kind Of Fella,Garland Green,spotify:track:1dtKN6wwlolkM8XZy2y9C1,0.417,0.62,3,-7.727,1,0.0403,0.49,0.0,0.0779,0.845,185.655,173533,3,32.94975,9,1
1,Initials B.B.,Serge Gainsbourg,spotify:track:5hjsmSnUefdUqzsDogisiX,0.498,0.505,3,-12.475,1,0.0337,0.018,0.107,0.176,0.797,101.801,213613,4,48.8251,10,0
2,Melody Twist,Lord Melody,spotify:track:6uk8tI6pwxxdVTNlNOJeJh,0.657,0.649,5,-13.392,1,0.038,0.846,4e-06,0.119,0.908,115.94,223960,4,37.22663,12,0
3,Mi Bomba Sonó,Celia Cruz,spotify:track:7aNjMJ05FvUXACPWZ7yJmv,0.59,0.545,7,-12.058,0,0.104,0.706,0.0246,0.061,0.967,105.592,157907,4,24.75484,8,0
4,Uravu Solla,P. Susheela,spotify:track:1rQ0clvgkzWr001POOPJWx,0.515,0.765,11,-3.515,0,0.124,0.857,0.000872,0.213,0.906,114.617,245600,4,21.79874,14,0


In [4]:
X = df_concat[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections']]
y = df_concat['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
n_estimators = [50, 100, 150, 200, 250, 300]

scorelist = []

for estimator in n_estimators:
    clf = RandomForestClassifier(n_estimators=estimator, max_depth=2, random_state=42)
    clf.fit(X_train, y_train)
    scorelist.append(clf.score(X_test, y_test))

scorelist

[0.7302951670450859,
 0.729403178722024,
 0.7353227375932533,
 0.7362147259163153,
 0.7343496594226403,
 0.7375121634771327]

In [6]:
clf = RandomForestClassifier(n_estimators=300, max_depth=2, random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7375121634771327

In [7]:
clf.score(X_train, y_train)

0.7445958156669216

The first attempt at a model with a relatively basic random forest classifier is getting 73% accuracy on the test set. The score on the train set is not much higher, so this model doesn't seem to be overfitting. Below we'll inspect the feature importances in the model.

In [8]:
cols = list(X.columns)
features = list(clf.feature_importances_)

for i in range(len(cols)):
    print(cols[i], ':', features[i])

danceability : 0.1961608604247969
energy : 0.10186251245010265
key : 0.0
loudness : 0.10804753662129706
mode : 0.0012185736458913317
speechiness : 0.02482497234444852
acousticness : 0.1508940885169172
instrumentalness : 0.2650231879735413
liveness : 0.0017610270658901823
valence : 0.06123975608998724
tempo : 0.001843649491325074
duration_ms : 0.04616232259887296
time_signature : 0.007187765634488112
chorus_hit : 0.0013198312574551716
sections : 0.032453915884986216


While 73% accuracy right away is fairly good, let's see if we can optimize the hyperparameters more with a grid search

In [9]:
param_grid = {'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3, 5],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=1)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1296 out of 1296 | elapsed: 356.3min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [29]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 1000}

In [30]:
clf2 = grid_search.best_estimator_

clf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features=3, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
clf2.score(X_test, y_test)

0.7895718456049302

In [14]:
clf2.score(X_train, y_train)

0.9637519983318273

In [33]:
y_pred = clf2.predict(X_test)

f1_score(y_test, y_pred)

0.8037510398547985

In [32]:
cols = list(X.columns)
features = list(clf2.feature_importances_)

for i in range(len(cols)):
    print(cols[i], ':', features[i])

danceability : 0.1090932704107315
energy : 0.0840647059812641
key : 0.02313629298791549
loudness : 0.07427836670198854
mode : 0.010076323891850083
speechiness : 0.07038902012393433
acousticness : 0.12025238423451987
instrumentalness : 0.19726142550504241
liveness : 0.042531273133259564
valence : 0.06892994518585582
tempo : 0.04592572652586165
duration_ms : 0.07271957823320976
time_signature : 0.005067301791247238
chorus_hit : 0.04171871710088423
sections : 0.0345556681924354


While this model got 78% accuracy on the test set, the 96% accuracy on training set indicates that it has been overfit. We'll try reducing the max features and min_samples_split and see if this fixes the overfitting

In [19]:
clf3 = RandomForestClassifier(max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=5, bootstrap=True, n_estimators=1000)
clf3.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=90, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
clf3.score(X_test, y_test)

0.7877067791112553

In [21]:
clf3.score(X_train, y_train)

0.9633349551678598

Looks like we still have a significant amount of overfitting. Let's try using and XGBoost model instead. 

In [10]:
clf4 = xgb.XGBClassifier()
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

grid = GridSearchCV(clf4,
                    parameters, 
                    scoring="neg_log_loss",
                    cv=3, verbose=1)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 3840 candidates, totalling 11520 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 11520 out of 11520 | elapsed: 563.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_l...
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                         'eta': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
                         'gamma': [0.0

In [11]:
grid.best_params_

{'colsample_bytree': 0.7,
 'eta': 0.05,
 'gamma': 0.3,
 'max_depth': 8,
 'min_child_weight': 5}

In [12]:
clf5 = grid.best_estimator_
clf5.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, eta=0.05, gamma=0.3,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [13]:
clf5.score(X_test, y_test)

0.7873824197210509

In [14]:
clf5.score(X_train, y_train)

0.8517411552095642

In [15]:
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, roc_curve, f1_score

y_pred = clf5.predict(X_test)

roc_score = roc_auc_score(y_test, y_pred)
roc_score

0.7870413295136355

In [16]:
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix

array([[4385, 1752],
       [ 870, 5325]], dtype=int64)

In [17]:
precision_score = precision_score(y_test, y_pred)
precision_score

0.7524374735057228

In [18]:
f1_score(y_test, y_pred)

0.8024412296564195

We are getting fairly high accuracy with less overfitting on the whole dataset now. However, during exploratory data analysis we noticed differences in the features for each decade. We'll test the accuracy of this model by decade, rather than on the entire dataset. 

In [45]:
df60s = pd.read_csv('dataset-of-60s.csv')

X60s = df60s[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections']]
y60s = df60s['target']

X_train60s, X_test60s, y_train60s, y_test60s = train_test_split(X60s, y60s, test_size=0.3, random_state=42)

In [58]:
clf5.fit(X_train60s, y_train60s)
clf5.score(X_test60s, y_test60s)

0.7705360586193598

In [59]:
clf5.score(X_train60s, y_train60s)

0.9299057695486858

In [63]:
y_pred60s = clf5.predict(X_test60s)

roc60s = roc_auc_score(y_test60s, y_pred60s)
roc60s

0.7706084799305138

In [65]:
confusion60s = confusion_matrix(y_test60s, y_pred60s)
confusion60s

array([[ 919,  379],
       [ 216, 1079]], dtype=int64)

In [56]:
df70s = pd.read_csv('dataset-of-70s.csv')

X70s = df70s[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections']]
y70s = df70s['target']

X_train70s, X_test70s, y_train70s, y_test70s = train_test_split(X70s, y70s, test_size=0.3, random_state=42)

In [68]:
clf5.fit(X_train70s, y_train70s)
clf5.score(X_test70s, y_test70s)

0.7618025751072961

In [70]:
clf5.score(X_train70s, y_train70s)

0.9444444444444444

In [57]:
df80s = pd.read_csv('dataset-of-80s.csv')

X80s = df80s[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections']]
y80s = df80s['target']

X_train80s, X_test80s, y_train80s, y_test80s = train_test_split(X80s, y80s, test_size=0.3, random_state=42)

In [73]:
clf5.fit(X_train80s, y_train80s)
clf5.score(X_test80s, y_test80s)

0.804630969609262

In [74]:
clf5.score(X_train80s, y_train80s)

0.9404343329886247

In [58]:
df90s = pd.read_csv('dataset-of-60s.csv')

X90s = df90s[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections']]
y90s = df90s['target']

X_train90s, X_test90s, y_train90s, y_test90s = train_test_split(X90s, y90s, test_size=0.3, random_state=42)

In [39]:
clf5.fit(X_train90s, y_train90s)
clf5.score(X_test90s, y_test90s)

0.7705360586193598

In [40]:
clf5.score(X_train90s, y_train90s)

0.9299057695486858

In [59]:
df00s = pd.read_csv('dataset-of-00s.csv')

X00s = df00s[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections']]
y00s = df00s['target']

X_train00s, X_test00s, y_train00s, y_test00s = train_test_split(X00s, y00s, test_size=0.3, random_state=42)

In [79]:
clf5.fit(X_train00s, y_train00s)
clf5.score(X_test00s, y_test00s)

0.8513053348467651

In [80]:
clf5.score(X_train00s, y_train00s)

0.9559610705596107

In [60]:
df10s = pd.read_csv('dataset-of-10s.csv')

X10s = df10s[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit', 'sections']]
y10s = df10s['target']

X_train10s, X_test10s, y_train10s, y_test10s = train_test_split(X10s, y10s, test_size=0.3, random_state=42)

In [82]:
clf5.fit(X_train10s, y_train10s)
clf5.score(X_test10s, y_test10s)

0.8479166666666667

In [83]:
clf5.score(X_train10s, y_train10s)

0.9519874944171505

Overall we are getting fairly high accuracy scores by decade, however the accuracy scores are all about 10% higher on the training data. We'll try a slightly different model below.

In [35]:
clf6 = xgb.XGBClassifier(colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=3, min_child_weight=0.5)

clf6.fit(X_train, y_train)
clf6.score(X_test, y_test)


0.7759487512163478

In [123]:
clf6.score(X_train, y_train)

0.8017854190775334

In [158]:
y_pred6 = clf6.predict(X_test)
f1_score(y_test, y_pred6)

0.7827317763623496

In [42]:
clf6.fit(X_train90s, y_train90s)
clf6.score(X_test90s, y_test90s)

0.763208638642499

In [43]:
clf6.score(X_train90s, y_train90s)

0.8017854190775334

While we lost a bit of accuracy, this model has significantly less overfitting on the entire dataset. I also tested it on the 90s only data, as the original model overfit the most on that data. This resulted in a .01% accuracy loss but a much less overfit model. 

In [19]:
cols = list(X.columns)
features = list(clf5.feature_importances_)

for i in range(len(cols)):
    print(cols[i], ':', features[i])

danceability : 0.10043808
energy : 0.0479939
key : 0.020840691
loudness : 0.04576259
mode : 0.07882381
speechiness : 0.05622998
acousticness : 0.11864295
instrumentalness : 0.25530356
liveness : 0.02467498
valence : 0.043647353
tempo : 0.027719742
duration_ms : 0.061845098
time_signature : 0.05478216
chorus_hit : 0.02297637
sections : 0.040318668


In [25]:
cols = list(X.columns)
features = list(clf6.feature_importances_)

for i in range(len(cols)):
    print(cols[i], ':', features[i])

danceability : 0.11059045
energy : 0.051418554
key : 0.010074946
loudness : 0.06979672
mode : 0.048084114
speechiness : 0.06723784
acousticness : 0.13022204
instrumentalness : 0.3213578
liveness : 0.012752331
valence : 0.037447594
tempo : 0.01568847
duration_ms : 0.062776536
time_signature : 0.020031685
chorus_hit : 0.0072689927
sections : 0.03525188


In [46]:
clf6.fit(X_train60s, y_train60s)
clf6.score(X_test60s, y_test60s)

0.763208638642499

In [47]:
clf6.score(X_train60s, y_train60s)

0.8017854190775334

In [49]:
clf6.fit(X_train, y_train)
clf6.score(X_test, y_test)

0.7759487512163478

In [50]:
y_pred = clf6.predict(X_test)

In [51]:
f1_score(y_test, y_pred)

0.7941592788497356

In [52]:
roc_auc_score(y_test, y_pred)

0.7755498180304096

In [54]:
confusion_matrix(y_test, y_pred)

TypeError: 'numpy.ndarray' object is not callable

In [61]:
clf60s = clf6.fit(X_train60s, y_train60s)
clf70s = clf6.fit(X_train70s, y_train70s)
clf80s = clf6.fit(X_train80s, y_train80s)
clf90s = clf6.fit(X_train90s, y_train90s)
clf00s = clf6.fit(X_train00s, y_train00s)
clf10s = clf6.fit(X_train10s, y_train10s)

In [62]:
cols = list(X.columns)
features = list(clf60s.feature_importances_)

for i in range(len(cols)):
    print(cols[i], ':', features[i])

danceability : 0.07090861
energy : 0.08476231
key : 0.013301995
loudness : 0.10419984
mode : 0.021957884
speechiness : 0.03284651
acousticness : 0.104245745
instrumentalness : 0.36600888
liveness : 0.017663071
valence : 0.042346768
tempo : 0.018244218
duration_ms : 0.066186026
time_signature : 0.031244889
chorus_hit : 0.010634071
sections : 0.015449172


In [63]:
cols = list(X.columns)
features = list(clf70s.feature_importances_)

for i in range(len(cols)):
    print(cols[i], ':', features[i])

danceability : 0.07090861
energy : 0.08476231
key : 0.013301995
loudness : 0.10419984
mode : 0.021957884
speechiness : 0.03284651
acousticness : 0.104245745
instrumentalness : 0.36600888
liveness : 0.017663071
valence : 0.042346768
tempo : 0.018244218
duration_ms : 0.066186026
time_signature : 0.031244889
chorus_hit : 0.010634071
sections : 0.015449172
