In this workbook I will split my data into testing and trianing sets and evaluate several different models: 
    1. Shuffle the data
    2. Split the data into y, and X
    3. Split the data into train and test
    4. Models:
        i) Logistic Regression
        ii) KNearestNeighbors
        iii) Decision Tree
        iv) Linear kernel SVC 
        v) SVC 
        vi) Neural Netwrok
        vii) Random Forest
        viii) Gradient Boosting

https://towardsdatascience.com/predicting-popularity-on-spotify-when-data-needs-culture-more-than-culture-needs-data-2ed3661f75f1


In [1]:
#Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score  
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC



In [2]:
#Import data
data = pd.read_csv('C:\\Users\\ziggy\\Springboard\\Python_Projects\\Spotify Hit Prediction\\2_Exploratory Data Analysis\\Cleaned_FullDataset', index_col=0)


In [3]:
data.head()

Unnamed: 0,Title,Artist,URI,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Target
0,hanging by a moment,lifehouse,0wqOReZDnrefefEsrIGeR4,0.541,0.864,1.0,-4.915,1.0,0.0357,0.00118,0.0,0.0896,0.435,124.557,216067.0,4.0,1
1,fallin’,alicia keys,0KQx6HOpJueiSkztcS0r7D,0.652,0.609,11.0,-7.519,0.0,0.037,0.263,0.00101,0.233,0.482,95.986,210200.0,3.0,1
2,all for you,janet jackson,5X8kkUaUlAyAUr9TYqDFTH,0.753,0.934,2.0,-3.011,1.0,0.0736,0.0174,0.065,0.128,0.73,113.525,329933.0,3.0,1
3,drops of jupiter (tell me),train,2hKdd3qO7cWr2Jo0Bcs0MA,0.481,0.638,0.0,-5.862,1.0,0.0276,0.153,0.0,0.154,0.497,79.064,259933.0,4.0,1
4,i’m real,jennifer lopez feat. ja rule,6MLsahMK3VvH9NUwXOktdS,0.708,0.587,11.0,-7.93,0.0,0.151,0.273,0.0,0.0718,0.554,83.46,262133.0,4.0,1


In [5]:
#Shuffle dataset

data = data.sample(frac=1, random_state=25).reset_index(drop=True)

In [6]:
data.head()

Unnamed: 0,Title,Artist,URI,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Target
0,toyota,supernichts,0W37YadDSSReqrgk8vZyLj,0.506,0.922,2.0,-4.883,1.0,0.0386,0.00721,0.0,0.168,0.911,103.476,68427.0,4.0,0
1,ringside rocking furs (feat. tiona d & keisha ...,westside gunn,6FOIxFtHOV2oAywczkPERO,0.479,0.707,6.0,-4.078,1.0,0.312,0.0161,0.0,0.584,0.479,128.901,297125.0,4.0,0
2,vegas (feat. steven booth & andy mientus),kait kerrigan & brian lowdermilk,2T53E1DG2ULruBg8uXPqIe,0.543,0.704,11.0,-5.553,1.0,0.139,0.536,0.0,0.688,0.579,131.066,332326.0,4.0,0
3,in common fabrics,electric universe,0tgWKSTNgvetbPRCV9g2aY,0.637,0.978,1.0,-6.413,1.0,0.0511,0.000125,0.885,0.215,0.257,144.998,459337.0,4.0,0
4,lost and found - with tyler booth,brooks & dunn,7hzyuLLHrQALKn7kIAc6cR,0.625,0.842,4.0,-5.171,1.0,0.0321,0.01,0.0,0.286,0.678,117.998,235053.0,4.0,0


In [7]:
#preprocessing & split data into y, X and then into train and test.
def preprocess_inputs(dataframe):
    #Drop categorical variables
    dataframe = dataframe.drop(['Title','Artist', 'URI'], axis=1)
    #Split y, X
    y= dataframe['Target']
    X=dataframe.drop('Target', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7, shuffle=True, random_state=25)
    #Scale 
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)


In [14]:
y_train.value_counts()

0    10802
1     1277
Name: Target, dtype: int64

In [16]:
print('The number of hit songs represents',round(1277/10802*100, 2),'% of the y_train datset.')

The number of hit songs represents 11.82 % of the y_train datset.


In [17]:
y_test.value_counts()

0    4657
1     520
Name: Target, dtype: int64

In [18]:
print('The number of hit songs represents',round(520/4657*100, 2),'% of the y_test datset.')

The number of hit songs represents 11.17 % of the y_test datset.


In [9]:
X_train.var()

danceability        1.000083
energy              1.000083
key                 1.000083
loudness            1.000083
mode                1.000083
speechiness         1.000083
acousticness        1.000083
instrumentalness    1.000083
liveness            1.000083
valence             1.000083
tempo               1.000083
duration_ms         1.000083
time_signature      1.000083
dtype: float64

Training 


In [10]:
#Define models

models = {
    'Logistic Regression':LogisticRegression(),
    'K-Nearest Neighbors':KNeighborsClassifier(),
    'Decision Tree':DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "Support Vector Machine (RBF Kernel)":SVC(),
    "Neural Network":MLPClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boosting":GradientBoostingClassifier()
}
    


In [11]:
#Train the model

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained')
    


Logistic Regression trained
K-Nearest Neighbors trained
Decision Tree trained




Support Vector Machine (Linear Kernel) trained
Support Vector Machine (RBF Kernel) trained




Neural Network trained
Random Forest trained
Gradient Boosting trained


Results

In [12]:
#Get accuracy scores for all models for X, y test
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test)*100))

Logistic Regression: 89.99%
K-Nearest Neighbors: 87.89%
Decision Tree: 85.61%
Support Vector Machine (Linear Kernel): 89.92%
Support Vector Machine (RBF Kernel): 89.96%
Neural Network: 89.69%
Random Forest: 90.42%
Gradient Boosting: 90.15%
