# Prep Work incl. importing libraries, combining datasets

Datasets (no. of songs): <br>
2014 = 232 <br>
2015 = 370 <br>
2016 = 582 <br>
2017 = 1294 <br>
2018 = 2516 <br>
TOTAL = 4994

In [1]:
# mass imports
%matplotlib inline
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from xgboost.sklearn import XGBClassifier

# Load the already-combined 2014-2018 hit/miss CSV (one row per song, with
# its metadata and audio features) into the dataframe "songs_combine".
dataset_path = "song_features/spotifybillboard_hitmiss_2014to18.csv"
songs_combine = pd.read_csv(dataset_path)

# Preview the first five rows as plain text.
preview = songs_combine.head()
print(preview)

                       id  popularity release_date  hitmiss_spotify  \
0  2bezJO9Nc1yUCKTTuU1Y93           0    31/1/2014                0   
1  2cMAHLrkaspfMWD8QRlODb           0    30/6/2014                0   
2  37ENbdGJLFfkwzlpQhZtyf           0    13/6/2014                0   
3  3ctaMit7CuiHIPVYrRvm15          41    21/4/2014                0   
4  5HQp90TwnVEJ2rsABskmxI           1   20/12/2014                0   

    duration  loudness    tempo  tempo_confidence  time_signature  \
0  140.30766   -20.350  126.790             0.752             4.0   
1  311.12154   -13.881  103.119             0.624             4.0   
2  155.23084    -9.387   88.046             0.133             4.0   
3  325.58195    -4.549  129.969             0.798             4.0   
4  267.44866    -7.651   93.992             0.426             4.0   

   time_signature_confidence  ...  danceability  energy  instrumentalness  \
0                      1.000  ...         0.519   0.184          0.129000   
1   

# Correlation Tests
Reject: Plotting popularity against single audio features shows no clear correlation

In [None]:
# Scatter popularity against a single audio feature (here "key"; swap the
# column name for e.g. "valence" to inspect another feature).
# A scatter plot is used instead of plt.plot because the rows have no
# meaningful order, so joining consecutive points with lines only adds noise.
plt.scatter(songs_combine["key"], songs_combine["popularity"], s=5, alpha=0.3)
plt.xlabel("key")
plt.ylabel("popularity")
plt.title("popularity vs. key")
plt.show()

# #1 Multiple Linear Regression
Reject: $R^2$ is only around 0.11 to 0.16

In [None]:
# Feature matrix: the audio features plus the Spotify popularity score.
feature_cols = [
    'duration', 'loudness', 'tempo', 'time_signature',
    'key', 'mode', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'speechiness', 'valence', 'popularity',
]
x = songs_combine[feature_cols]

# Target: the Billboard hit/miss label, kept as a single-column dataframe.
# (Popularity is a feature here, not the target.)
y = songs_combine[['hitmiss_billboard']]

# 80% of the rows go to training and 20% to testing; fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=1
)

# Fit a Multiple Linear Regression model on the training split.
predictor = LinearRegression()
predictor.fit(x_train, y_train)

# Report R**2 on the training split first, then on the testing split.
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(predictor.score(features, target))

# #2 K Nearest Neighbors
Some potential: Model with optimal k has accuracy of around 61.7%...slightly better than flipping a coin?

In [None]:
# define x = song features (audio features plus Spotify popularity)
x = songs_combine[['duration','loudness', 'tempo', 'time_signature',
       'key', 'mode', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'speechiness', 'valence', 'popularity']]

# define y = hit/miss labels
y = songs_combine[['hitmiss_billboard']]

# split dataset into 80% for training, 20% for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, test_size = 0.2, random_state = 1)

# for KNeighborsClassifier, need to identify best k neighbors model by checking
# which k gives the highest accuracy a.k.a. score
# NOTE(review): k is chosen on the test split, so the printed accuracy is an
# optimistic estimate; a separate validation split or cross-validation would
# give an unbiased number.

# the classifier expects a flat 1-D label array
y_train_flat = y_train.values.ravel()

# list of the score obtained for each k (index 0 holds k = 1)
scores = []

# best score seen so far, the k that produced it, and the fitted model itself;
# accuracy is never negative, so the first candidate always replaces the seed
score = -1.0
best_k = None
classifier = None

# for each k from 1 to 300, create a model, record its score and keep the best model
for i in range(1,301):
    candidate = KNeighborsClassifier(n_neighbors = i)
    candidate.fit(x_train, y_train_flat)
    candidate_score = candidate.score(x_test, y_test)  # scored once per k
    scores.append(candidate_score)
    if candidate_score > score:
        score = candidate_score
        best_k = i
        classifier = candidate

# "classifier" now refers to the best-scoring model (k = best_k), not simply
# the last one fitted (k = 300)
print(score)
song = x.iloc[3]
prediction = classifier.predict_proba([song])
print(prediction)

# persist the best model; the context manager guarantees the file is closed
with open("knn", "wb") as model_file:
    pickle.dump(classifier, model_file, protocol=4)

# #3 XGBoost

In [2]:
# define x = song features (audio features plus Spotify popularity)
x = songs_combine[['duration','loudness', 'tempo', 'time_signature',
       'key', 'mode', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'speechiness', 'valence', 'popularity']]

# define y = hit/miss labels
y = songs_combine[['hitmiss_billboard']]

# split dataset into 80% for training, 20% for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, test_size = 0.2, random_state = 1)

# gradient-boosted tree classifier with a binary logistic objective
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=250,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=1,
 colsample_bytree=1,
 objective= 'binary:logistic',
 n_jobs=4,
 scale_pos_weight=1,
 random_state=1)

# train on plain NumPy arrays (no column names attached to the model)
x_train = x_train.values
x_test = x_test.values
xgb1 = xgb1.fit(x_train,y_train.values.ravel())

# 10-fold cross-validated ROC AUC on the training split
scores = cross_val_score(xgb1, x_train, y_train.values.ravel(), cv=10, scoring = "roc_auc")
print(scores)

# pick one song and show its feature values
song3 = x.iloc[33]
print(song3)

# predict_proba expects a 2-D (n_samples, n_features) input, so the single
# song is reshaped into one row of NumPy values, matching the training input
prediction = xgb1.predict_proba(song3.values.reshape(1, -1))

# column 1 of the result is the probability of the positive (hit) class
print("Probabily of being a hit song {:.2f}".format(prediction[0][1]))

# persist the fitted model; the context manager guarantees the file is closed
with open("model3", "wb") as model_file:
    pickle.dump(xgb1, model_file, protocol=4)

[0.76414253 0.78640246 0.81063509 0.79011378 0.81295716 0.83002438
 0.80419134 0.8086323  0.76313184 0.82008502]
duration            199.38667
loudness             -3.41700
tempo               150.03500
time_signature        4.00000
key                   0.00000
mode                  0.00000
acousticness          0.26000
danceability          0.70600
energy                0.78600
instrumentalness      0.00000
liveness              0.38000
speechiness           0.09090
valence               0.74900
popularity           79.00000
Name: 33, dtype: float64
Probabily of being a hit song 0.40


### Just in case: Find missing audio features by searching track id

import pandas as pd
import numpy as np
import os, json, logging
from requests import get, post
from time import time, sleep
from random import randint
from bs4 import BeautifulSoup

# Spotify client credentials are read from the environment instead of being
# hard-coded in the notebook. Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET
# before running this cell (a missing variable raises KeyError).
# NOTE(review): the credential that used to be embedded here has been exposed
# in the notebook history and should be rotated.
client_id = os.environ["SPOTIFY_CLIENT_ID"]
client_secret = os.environ["SPOTIFY_CLIENT_SECRET"]

# Client-credentials token request; auth=(id, secret) makes requests send the
# "Authorization: Basic <base64(id:secret)>" header.
res = post(
    'https://accounts.spotify.com/api/token',
    auth=(client_id, client_secret),
    data={'grant_type': 'client_credentials'},
    timeout=10,
)
# fail with a clear HTTP error rather than a KeyError on 'access_token'
res.raise_for_status()
token = 'Bearer {}'.format(res.json()['access_token'])

headers = {'Authorization': token, "Accept": 'application/json', 'Content-Type': "application/json"}

# look up the audio features of a single track by its Spotify track id
url = "https://api.spotify.com/v1/audio-features/{}".format("7kwnOwh31BLpdSzgXgNS2D")
r = get(url, headers=headers, timeout=10)
print(r.json())