# Prep Work incl. importing libraries, combining datasets

Datasets (no. of songs): <br>
2014 = 232 <br>
2015 = 370 <br>
2016 = 582 <br>
2017 = 1294 <br>
2018 = 2516 <br>
TOTAL = 4994

In [26]:
# mass imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from xgboost.sklearn import XGBClassifier

# combine each year's dataset into dataframe "songs_combine", containing all 4994 songs and metadata
songs_2014 = pd.read_csv("song_features/2014_10000_song_features.csv")
songs_2015 = pd.read_csv("song_features/2015_10000_song_features.csv")
songs_2016 = pd.read_csv("song_features/2016_10000_song_features.csv")
songs_2017 = pd.read_csv("song_features/2017_10000_song_features.csv")
songs_2018 = pd.read_csv("song_features/2018_10000_song_features (artcclean 280419 2120hrs).csv")

songs_combine = pd.concat([songs_2014, songs_2015, songs_2016, songs_2017, songs_2018], axis = 0, sort = False, ignore_index = True)
songs_combine = pd.DataFrame(songs_combine)
print(songs_combine.head())

                         name                      id  popularity  \
0               Sohore Ekhono  2bezJO9Nc1yUCKTTuU1Y93           0   
1  Varthinkalai - Male Vocals  2cMAHLrkaspfMWD8QRlODb           0   
2                    Hari Ohm  37ENbdGJLFfkwzlpQhZtyf           0   
3                   Raat Bhar  3ctaMit7CuiHIPVYrRvm15          41   
4     Hello Mr. How Do You Do  5HQp90TwnVEJ2rsABskmxI           1   

  release_date hit_miss   duration  loudness    tempo  tempo_confidence  \
0   2014-01-31     miss  140.30766   -20.350  126.790             0.752   
1   2014-06-30     miss  311.12154   -13.881  103.119             0.624   
2   2014-06-13     miss  155.23084    -9.387   88.046             0.133   
3   2014-04-21     miss  325.58195    -4.549  129.969             0.798   
4   2014-12-20     miss  267.44866    -7.651   93.992             0.426   

   time_signature  ...  key_confidence  mode  mode_confidence  acousticness  \
0             4.0  ...           0.531   1.0           

# Correlation Tests
Reject: Plotting popularity against single audio features shows no clear correlation

In [None]:
# plot popularity against single audio feature e.g. valence
plt.plot(songs_combine["key"], songs_combine["popularity"])
plt.show()

# #1 Multiple Linear Regression
Reject: R**2 around 0.11 to 0.16 only

In [None]:
# define x = song features
x = songs_combine[['duration','loudness', 'tempo', 'time_signature',
       'key', 'mode', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'speechiness', 'valence']]

# define y = popularity
y = songs_combine[['popularity']]

# split dataset into 80% for training, 20% for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, test_size = 0.2, random_state = 1)

# create Multiple Linear Regression model and train it using .fit()
predictor = LinearRegression()
predictor.fit(x_train, y_train)

# print R**2 for both training and testing
print(predictor.score(x_train, y_train))
print(predictor.score(x_test, y_test))

# #2 K Nearest Neighbors
Some potential: Model with optimal k has accuracy of around 61.7%...slightly better than flipping a coin?

In [None]:
# define x = song features
x = songs_combine[['duration','loudness', 'tempo', 'time_signature',
       'key', 'mode', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'speechiness', 'valence']]

# define y = hit/miss labels
y = songs_combine[['hit_miss']]

# split dataset into 80% for training, 20% for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, test_size = 0.2, random_state = 1)

# for KNeighborsClassifier, need to identify best k neighbors model by checking which k gives the highest validation accuracy a.k.a. score

# create empty list to store scores for each k
scores = []

# for each k from 1 to 300, create a model and add its score to the "scores" list
score = 0
for i in range(1,301):
    classifier = KNeighborsClassifier(n_neighbors = i)
    classifier.fit(x_train,y_train.values.ravel())
    if classifier.score(x_test,y_test) > score:
        score = classifier.score(x_test,y_test)
        
print(score)

## #3 XGBOOST

In [46]:
# define x = song features
x = songs_combine[['duration','loudness', 'tempo', 'time_signature',
       'key', 'mode', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'speechiness', 'valence']]

# define y = hit/miss labels
y = songs_combine[['hit_miss']]

# split dataset into 80% for training, 20% for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, test_size = 0.2, random_state = 1)

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=100,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
x_train = x_train.values
x_test = x_test.values
xgb1.fit(x_train,y_train.values.ravel())
scores = cross_val_score(xgb1, x_train, y_train.values.ravel(), cv=10, scoring = "roc_auc")
print(len(scores))

song3 = x.iloc[46]

prediction = xgb1.predict_proba(song3)
print(song3)

print("Probabily of being a hit song {:.2f}".format(prediction[0][1]))

10
duration            293.040
loudness             -9.236
tempo               133.982
time_signature        4.000
key                   5.000
mode                  1.000
acousticness          0.564
danceability          0.801
energy                0.543
instrumentalness      0.000
liveness              0.116
speechiness           0.091
valence               0.534
Name: 46, dtype: float64
Probabily of being a hit song 0.29


### Just in case: Find missing audio features by searching track id

import pandas as pd
import numpy as np
import os, json, logging
from requests import get, post
from time import time, sleep
from random import randint
from bs4 import BeautifulSoup

res = post('https://accounts.spotify.com/api/token', headers = {'Authorization': 'Basic NDM0YmFiM2VhNmM2NDg2MmI3NmJkYWUwOTA0NmU2Njg6ZjFlZmFhZmM5MjA1NDFiYzkyZGNlMTk2MzBhZjk1NzE='}, data= {'grant_type': 'client_credentials'})
token = 'Bearer {}'.format(res.json()['access_token'])

headers = {'Authorization': token, "Accept": 'application/json', 'Content-Type': "application/json"}

url="https://api.spotify.com/v1/audio-features/{}".format("7kwnOwh31BLpdSzgXgNS2D")
r=get(url, headers=headers)
print(r.json())