In [1]:
# mass imports

%matplotlib inline
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor


In [2]:
# Load the combined Spotify/Billboard hit-miss table and preview the first rows.
SONGS_CSV_PATH = "song_features/spotifybillboard_hitmiss_2014to18.csv"
songs_combine = pd.read_csv(SONGS_CSV_PATH)
songs_combine.head()

Unnamed: 0,id,popularity,release_date,hitmiss_spotify,duration,loudness,tempo,tempo_confidence,time_signature,time_signature_confidence,...,danceability,energy,instrumentalness,liveness,speechiness,valence,chart_scraped,peak_position,weeks_billboard,hitmiss_billboard
0,2bezJO9Nc1yUCKTTuU1Y93,0,31/1/2014,0,140.30766,-20.35,126.79,0.752,4,1.0,...,0.519,0.184,0.129,0.114,0.0359,0.444,0,0,0,0
1,2cMAHLrkaspfMWD8QRlODb,0,30/6/2014,0,311.12154,-13.881,103.119,0.624,4,0.62,...,0.42,0.594,3e-06,0.126,0.0537,0.824,0,0,0,0
2,37ENbdGJLFfkwzlpQhZtyf,0,13/6/2014,0,155.23084,-9.387,88.046,0.133,4,1.0,...,0.761,0.723,7.2e-05,0.111,0.0472,0.81,0,0,0,0
3,3ctaMit7CuiHIPVYrRvm15,41,21/4/2014,0,325.58195,-4.549,129.969,0.798,4,1.0,...,0.619,0.88,0.00104,0.166,0.0535,0.392,0,0,0,0
4,5HQp90TwnVEJ2rsABskmxI,1,20/12/2014,0,267.44866,-7.651,93.992,0.426,4,0.992,...,0.768,0.567,0.00116,0.0831,0.0552,0.348,0,0,0,0


In [3]:
#model training
def model_score(x, target=None):
    """Return the mean 10-fold cross-validated ROC AUC for a feature matrix.

    The data is split 80/20 with a fixed seed and only the 80% training part
    is cross-validated; the 20% hold-out part is not used by this function.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns to evaluate (one row per song).
    target : pandas.DataFrame or pandas.Series, optional
        Binary labels aligned with ``x``. When omitted, the notebook-level
        variable ``y`` is used, so ``y`` must be defined before the call.

    Returns
    -------
    float
        Mean of the 10 per-fold ROC AUC scores.
    """
    labels = y if target is None else target
    x_train, _, y_train, _ = train_test_split(
        x, labels, train_size=0.8, test_size=0.2, random_state=1
    )

    xgb1 = XGBClassifier(
        learning_rate=0.0775,
        n_estimators=100,
        max_depth=5,
        min_child_weight=0.1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    )
    # cross_val_score fits its own clones of the estimator on every fold, so a
    # separate fit on the whole training split would not influence the scores.
    scores = cross_val_score(
        xgb1, x_train.values, y_train.values.ravel(), cv=10, scoring="roc_auc"
    )
    return scores.mean()

In [4]:
# Target labels: the 'hitmiss_billboard' column, kept as a one-column DataFrame.
# NOTE(review): no cut-off is applied here -- presumably the flag was already
# thresholded when the CSV was built; confirm against the data preparation step.
TARGET_COLUMN = 'hitmiss_billboard'
y = songs_combine[[TARGET_COLUMN]]

In [5]:
# Candidate predictor columns, kept in alphabetical order so that feature
# subsets are generated in a stable, readable order.
features_all = sorted([
    'duration', 'loudness', 'tempo', 'tempo_confidence',
    'time_signature_confidence', 'time_signature',
    'key_confidence', 'mode_confidence',
    'key', 'mode', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'speechiness', 'valence', 'popularity',
])
print(len(features_all))
features_all

18


['acousticness',
 'danceability',
 'duration',
 'energy',
 'instrumentalness',
 'key',
 'key_confidence',
 'liveness',
 'loudness',
 'mode',
 'mode_confidence',
 'popularity',
 'speechiness',
 'tempo',
 'tempo_confidence',
 'time_signature',
 'time_signature_confidence',
 'valence']

In [6]:
# Exhaustive search over every feature subset of size 16, 17 and 18.
# Each subset is scored with model_score (mean 10-fold CV ROC AUC), so this
# cell is expensive: one cross-validation run per subset.
# features_combination[k] and models_score_combination[k] describe the same subset.
features_combination=[]
models_score_combination=[]
for subset_size in range(16,19):
    # Distinct names for the two loops: the original reused `i` for both the
    # subset size and the subset itself.
    for feature_subset in combinations(features_all, subset_size):
        features_combination.append(feature_subset)
        models_score_combination.append(model_score(songs_combine[list(feature_subset)]))

In [7]:
# Sanity check: there should be exactly one score per evaluated feature subset.
for collected in (features_combination, models_score_combination):
    print(len(collected))

172
172


In [8]:
# Highest mean cross-validated ROC AUC among all evaluated feature subsets.
best_subset_score = max(models_score_combination)
best_subset_score

0.8109921904559627

In [9]:
# Pick the feature subset with the highest score; on ties list.index returns
# the first (earliest generated) subset.
best_subset_position = models_score_combination.index(max(models_score_combination))
selected_features = list(features_combination[best_subset_position])
selected_features

['acousticness',
 'danceability',
 'duration',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'mode_confidence',
 'popularity',
 'speechiness',
 'tempo',
 'time_signature',
 'time_signature_confidence',
 'valence']

In [10]:
# Feature matrix restricted to the selected feature subset (all rows kept).
X = songs_combine.loc[:, selected_features]

In [22]:
#optimizing learning rate
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
learning_rate=[0.0773,0.0774,0.0775]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

learning_rate_used=[]
scores_generated=[]
for rate in learning_rate:
    xgb1 = XGBClassifier(
        learning_rate=rate,
        n_estimators=100,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    learning_rate_used.append(rate)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of learning rate tested:', len(learning_rate_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_learning_rate_used=learning_rate_used[scores_generated.index(max(scores_generated))]
print('Best learning rate:', best_learning_rate_used)

No. of scores generated: 3
Scores generated: [0.8098468046930438, 0.8110419943698313, 0.8090547727692599]
No. of learning rate tested: 3
Best score achieved: 0.8110419943698313
Best learning rate: 0.0774


In [26]:
#optimizing n estimators
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
n_estimators=[97,98,99]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

n_estimators_used=[]
scores_generated=[]
for n_trees in n_estimators:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=n_trees,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    n_estimators_used.append(n_trees)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of n estimator tested:', len(n_estimators_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_n_estimators_used=n_estimators_used[scores_generated.index(max(scores_generated))]
print('Best n estimator:', best_n_estimators_used)

No. of scores generated: 3
Scores generated: [0.8109429441654535, 0.8112509487133691, 0.8110259575273184]
No. of n estimator tested: 3
Best score achieved: 0.8112509487133691
Best n estimator: 98


In [27]:
#optimizing max depth
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
max_depth=[4,5,6]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

max_depth_used=[]
scores_generated=[]
for depth in max_depth:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=98,
        max_depth=depth,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    max_depth_used.append(depth)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of max depth tested:', len(max_depth_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_max_depth_used=max_depth_used[scores_generated.index(max(scores_generated))]
print('Best max depth:', best_max_depth_used)

No. of scores generated: 3
Scores generated: [0.8087027920117024, 0.8112509487133691, 0.8110419665682163]
No. of max depth tested: 3
Best score achieved: 0.8112509487133691
Best max depth: 5


In [29]:
#optimizing min child weight
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
min_child_weight=[0.04,0.05,0.06]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

min_child_weight_used=[]
scores_generated=[]
for child_weight in min_child_weight:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=98,
        max_depth=5,
        min_child_weight=child_weight,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    min_child_weight_used.append(child_weight)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of min child weight tested:', len(min_child_weight_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_min_child_weight_used=min_child_weight_used[scores_generated.index(max(scores_generated))]
print('Best min child weight:', best_min_child_weight_used)

No. of scores generated: 3
Scores generated: [0.8127344094018818, 0.8127477339121214, 0.8124398183796183]
No. of min child weight tested: 3
Best score achieved: 0.8127477339121214
Best min child weight: 0.05


In [30]:
#optimizing gamma
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
gamma=[0, 0.05,0.1,0.5,1,2]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

gamma_used=[]
scores_generated=[]
for gamma_value in gamma:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=98,
        max_depth=5,
        min_child_weight=0.05,
        gamma=gamma_value,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    gamma_used.append(gamma_value)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of gamma tested:', len(gamma_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_gamma_used=gamma_used[scores_generated.index(max(scores_generated))]
print('Best gamma:', best_gamma_used)

No. of scores generated: 6
Scores generated: [0.8127477339121214, 0.8122417349700433, 0.8115541624233196, 0.8098882820231749, 0.8092947500307541, 0.8110663445651927]
No. of gamma tested: 6
Best score achieved: 0.8127477339121214
Best gamma: 0


In [31]:
#optimizing subsample
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
subsample=[0.79,0.8,0.81]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

subsample_used=[]
scores_generated=[]
for row_fraction in subsample:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=98,
        max_depth=5,
        min_child_weight=0.05,
        gamma=0,
        subsample=row_fraction,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    subsample_used.append(row_fraction)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of subsample tested:', len(subsample_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_subsample_used=subsample_used[scores_generated.index(max(scores_generated))]
print('Best subsample:', best_subsample_used)

No. of scores generated: 3
Scores generated: [0.8090346840925582, 0.8127477339121214, 0.8074010222473014]
No. of subsample tested: 3
Best score achieved: 0.8127477339121214
Best subsample: 0.8


In [34]:
#optimizing colsample_bytree
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
colsample_bytree=[0.74,0.75,0.76]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

colsample_bytree_used=[]
scores_generated=[]
for column_fraction in colsample_bytree:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=98,
        max_depth=5,
        min_child_weight=0.05,
        gamma=0,
        subsample=0.8,
        colsample_bytree=column_fraction,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    colsample_bytree_used.append(column_fraction)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of colsample_bytree tested:', len(colsample_bytree_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_colsample_bytree_used=colsample_bytree_used[scores_generated.index(max(scores_generated))]
print('Best colsample_bytree:', best_colsample_bytree_used)

No. of scores generated: 3
Scores generated: [0.8068894262204187, 0.8127477339121214, 0.8127477339121214]
No. of colsample_bytree tested: 3
Best score achieved: 0.8127477339121214
Best colsample_bytree: 0.75


In [36]:
#optimizing nthread
# NOTE(review): nthread sets how many threads XGBoost uses, so it is a runtime
# setting rather than a model hyperparameter. The recorded run produced the
# same score for every value, which means the "best" value reported below is
# simply the first entry of the list (index() breaks ties that way).
nthread=[0,4,10]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

nthread_used=[]
scores_generated=[]
for thread_count in nthread:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=98,
        max_depth=5,
        min_child_weight=0.05,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.75,
        objective='binary:logistic',
        nthread=thread_count,
        scale_pos_weight=1,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    nthread_used.append(thread_count)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of nthread tested:', len(nthread_used))
print('Best score achieved:',max(scores_generated))
best_nthread_used=nthread_used[scores_generated.index(max(scores_generated))]
print('Best nthread:', best_nthread_used)

No. of scores generated: 3
Scores generated: [0.8127477339121214, 0.8127477339121214, 0.8127477339121214]
No. of nthread tested: 3
Best score achieved: 0.8127477339121214
Best nthread: 0


In [37]:
#optimizing scale_pos_weight
# Each candidate is scored by mean 10-fold CV ROC AUC on the 80% training split;
# all other hyperparameters are held fixed.
scale_pos_weight=[0.9,1,1.1]

# The split is deterministic (random_state=1) and identical for every
# candidate, so it is computed once instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 1)
x_train = x_train.values
x_test = x_test.values
y_train_labels = y_train.values.ravel()

scale_pos_weight_used=[]
scores_generated=[]
for positive_weight in scale_pos_weight:
    xgb1 = XGBClassifier(
        learning_rate=0.0774,
        n_estimators=98,
        max_depth=5,
        min_child_weight=0.05,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.75,
        objective='binary:logistic',
        scale_pos_weight=positive_weight,
        seed=27)
    # cross_val_score fits clones of the estimator per fold, so no separate
    # fit is needed to obtain the scores.
    scores = cross_val_score(xgb1, x_train, y_train_labels, cv=10, scoring = "roc_auc")
    scores_generated.append(scores.mean())
    scale_pos_weight_used.append(positive_weight)
print('No. of scores generated:', len(scores_generated))
print('Scores generated:', scores_generated)
print('No. of scale_pos_weight tested:', len(scale_pos_weight_used))
print('Best score achieved:',max(scores_generated))
# On ties, index() keeps the first candidate that reached the best score.
best_scale_pos_weight_used=scale_pos_weight_used[scores_generated.index(max(scores_generated))]
print('Best scale_pos_weight:', best_scale_pos_weight_used)

No. of scores generated: 3
Scores generated: [0.808840399622962, 0.8127477339121214, 0.8095936849716454]
No. of scale_pos_weight tested: 3
Best score achieved: 0.8127477339121214
Best scale_pos_weight: 1


In [44]:

# Final model: the hyperparameter values chosen by the tuning cells.
final_xgb_params = dict(
    learning_rate=0.0774,
    n_estimators=98,
    max_depth=5,
    min_child_weight=0.05,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.75,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=27,
)

# Same deterministic 80/20 split used throughout the tuning cells.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)
x_train, x_test = x_train.values, x_test.values

# Fit on the full training split so xgb1 is a trained model after this cell.
xgb1 = XGBClassifier(**final_xgb_params)
xgb1.fit(x_train, y_train.values.ravel())

# Mean 10-fold CV ROC AUC on the training split.
scores = cross_val_score(
    xgb1, x_train, y_train.values.ravel(), cv=10, scoring="roc_auc"
)
scores.mean()

0.8127477339121214