In [2]:
import copy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from demo import get_info
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Helper Functions

In [3]:
def preview_df(_df):
    """Print the shape of ``_df`` and return its leading rows.

    Parameters
    ----------
    _df : object exposing ``.shape`` and ``.head()`` (e.g. a pandas DataFrame)

    Returns
    -------
    Whatever ``_df.head()`` returns, so that a notebook cell ending in this
    call renders the preview as its output.
    """
    dimensions = _df.shape
    print(dimensions)
    leading_rows = _df.head()
    return leading_rows

def testingModel(model, X_train, Y_train):
    scores = cross_val_score(model, X_train, Y_train, cv=10, scoring = "roc_auc")
    print("Mean:", scores.mean())
    print("Best Score:", scores.max())
    print("Worst Score:", scores.min())
    print("Standard Deviation:", scores.std())
    return scores.mean()

# Data Preparation

In [4]:
# Read the song dataset from a CSV file resolved relative to the working directory.
data = pd.read_csv("Songset_wBB_marks.csv")
# Print the frame's shape and display its first rows as the cell output.
preview_df(data)

(10001, 23)


Unnamed: 0,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,end_of_fade_in,key,...,mode_confidence,release,song_hotttnesss,start_of_fade_out,tempo,time_signature,time_signature_confidence,title,year,bbhot
0,0.780462,0.574275,ARMQHX71187B9890D3,,"Atlanta, GA",,Mastodon,280.21506,0.238,5,...,0.5,Call of the Mastodon,0.597641,275.528,173.205,5,0.12,Deep Sea Creature,2001,0
1,0.581794,0.401998,ARD7TVE1187B99BFB1,,California - LA,,Casual,218.93179,0.247,1,...,0.636,Fear Itself,0.60212,218.932,92.198,4,0.778,I Didn't Mean To,0,0
2,0.63063,0.4175,ARMJAGH1187FB546F3,35.14968,"Memphis, TN",-90.04892,The Box Tops,148.03546,0.148,6,...,0.43,Dimensions,,137.915,121.274,4,0.384,Soul Deep,1969,1
3,0.487357,0.343428,ARKRRTF1187B9984DA,,,,Sonora Santanera,177.47546,0.282,8,...,0.565,Las Numero 1 De La Sonora Santanera,,172.304,100.07,1,0.0,Amor De Cabaret,0,0
4,0.630382,0.454231,AR7G5I41187FB4CE6C,,"London, England",,Adam Ant,233.40363,0.0,0,...,0.749,Friend Or Foe,,217.124,119.293,4,0.0,Something Girls,1982,0


In [5]:
# Replace the artist_id strings with integer codes so the column is numeric.
# The codes are arbitrary labels assigned by LabelEncoder, not a meaningful quantity.
# NOTE: this overwrites data['artist_id'] in place, so re-running the cell
# re-encodes the already-encoded values.
artist_ids_as_text = data["artist_id"].astype("str")
le = preprocessing.LabelEncoder()
data["artist_id"] = le.fit_transform(artist_ids_as_text)

In [6]:
# Summarise each column's dtype and non-null count to reveal missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 23 columns):
artist_familiarity           9997 non-null float64
artist_hotttnesss            10001 non-null float64
artist_id                    10001 non-null int32
artist_latitude              3742 non-null float64
artist_location              5709 non-null object
artist_longitude             3742 non-null float64
artist_name                  10001 non-null object
duration                     10001 non-null float64
end_of_fade_in               10001 non-null float64
key                          10001 non-null int64
key_confidence               10001 non-null float64
loudness                     10001 non-null float64
mode                         10001 non-null int64
mode_confidence              10001 non-null float64
release                      10001 non-null object
song_hotttnesss              5649 non-null float64
start_of_fade_out            10001 non-null float64
tempo                

In [7]:
# Columns excluded from the model inputs:
#   - artist_id: categorical identifier
#   - artist_familiarity: not a metric Spotify can give us
#   - the remaining entries are listed below without a stated reason in the
#     original notebook
drop_list = [
    'artist_id',
    'artist_familiarity',
    'artist_location',
    'artist_latitude',
    'artist_longitude',
    'artist_name',
    'release',
    'title',
    'song_hotttnesss',
]
# drop() returns a new frame, so `data` itself is left untouched.
train = data.drop(columns=drop_list)

In [8]:
# Confirm which columns remain after the drop and that none contain nulls.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 14 columns):
artist_hotttnesss            10001 non-null float64
duration                     10001 non-null float64
end_of_fade_in               10001 non-null float64
key                          10001 non-null int64
key_confidence               10001 non-null float64
loudness                     10001 non-null float64
mode                         10001 non-null int64
mode_confidence              10001 non-null float64
start_of_fade_out            10001 non-null float64
tempo                        10001 non-null float64
time_signature               10001 non-null int64
time_signature_confidence    10001 non-null float64
year                         10001 non-null int64
bbhot                        10001 non-null int64
dtypes: float64(9), int64(5)
memory usage: 1.1 MB


In [9]:
# Take the target labels as an independent (deep) copy of the bbhot column.
Y = train["bbhot"].copy()
# Display the number of labels as the cell output.
Y.shape

(10001,)

In [10]:
# Feature matrix: every remaining column except the bbhot target.
final_training_set = train.drop(columns=["bbhot"])

# Classifiers

### KNN Classifiers

In [11]:
# Fit a 3-nearest-neighbours classifier on the full feature set; this fitted
# instance is the one the later prediction cell calls.
knn = KNeighborsClassifier(n_neighbors=3).fit(final_training_set, Y)

# NOTE: despite its name, acc_knn stores the mean cross-validated score
# returned by testingModel (ROC AUC by default), not an accuracy.
acc_knn = testingModel(model=knn, X_train=final_training_set, Y_train=Y)

Mean: 0.5236965880150429
Best Score: 0.5566106124629193
Worst Score: 0.4872304199772985
Standard Deviation: 0.022621473817312436


### Decision Tree Classifier

In [12]:
# Pin random_state so the fitted tree and its cross-validation scores are
# reproducible across kernel restarts; without it, tie-breaking between
# equally good splits can differ from run to run.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(final_training_set, Y)

# NOTE: despite its name, acc_decision_tree stores the mean cross-validated
# score returned by testingModel (ROC AUC by default), not an accuracy.
acc_decision_tree = testingModel(decision_tree, final_training_set, Y)

Mean: 0.527993874156824
Best Score: 0.5521471812334469
Worst Score: 0.503891681530728
Standard Deviation: 0.012792763233459714


# Observations...

Both of these classifiers achieve very similar cross-validated ROC AUC scores (about 0.52–0.53, only slightly better than chance), so I will attempt to use both to predict whether a song will become a Billboard hit.

### KNN Classifiers

In [14]:
# Song to score with the fitted KNN model.
knn_song_feats, knn_popularity = get_info(
    artist_name='jim chappell',
    song_name='still',
)

# Reshape the returned features for predict(); presumably this yields a single
# row whose columns line up with the training features -- TODO confirm against get_info.
knn_selected_song_feats = pd.DataFrame(knn_song_feats).T
knn_prediction = knn.predict(knn_selected_song_feats)

# Label 1 marks a hit; any other predicted label is reported as not a hit.
print(
    'The song is predicted to be a hit!'
    if knn_prediction[0] == 1
    else 'The song is not predicted to be a hit.'
)

The song is not predicted to be a hit.


### Decision Tree Classifier

In [15]:
# Song to score with the fitted decision tree.
dtc_song_feats, dtc_popularity = get_info(artist_name='the killers',
                                          song_name='spaceman')

# Reshape the returned features for predict_proba(); presumably this yields a
# single row whose columns line up with the training features -- TODO confirm
# against get_info.
dtc_selected_song_feats = pd.DataFrame(dtc_song_feats).transpose()
dtc_prediction = decision_tree.predict_proba(dtc_selected_song_feats)

# predict_proba orders its columns like decision_tree.classes_, so look up the
# column for label 1 rather than assuming it is at index 1.
dtc_hit_column = list(decision_tree.classes_).index(1)
dtc_hit_probability = dtc_prediction[0][dtc_hit_column]

# Compare against a threshold instead of testing for a probability of exactly
# 1, which reported any probability below 1.0 as "not a hit". A strict
# "> 0.5" means an exact 50/50 tie is reported as not a hit.
if dtc_hit_probability > 0.5:
    print('The song is predicted to be a hit!')
else:
    print('The song is not predicted to be a hit.')

The song is predicted to be a hit!
