In this workbook I will split my data into training and testing sets and evaluate several different models:
    1. Shuffle the data
    2. Split the data into y, and X
    3. Split the data into train and test
    4. Models:
        i) Logistic Regression
        ii) KNearestNeighbors
        iii) Decision Tree
        iv) Linear kernel SVC 
        v) SVC 
        vi) Neural Network
        vii) Random Forest
        viii) Gradient Boosting

https://towardsdatascience.com/predicting-popularity-on-spotify-when-data-needs-culture-more-than-culture-needs-data-2ed3661f75f1


In [59]:
#Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score  
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC



In [20]:
#Load the cleaned dataset; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, machine-specific path - consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
data = pd.read_csv(
    filepath_or_buffer='C:\\Users\\ziggy\\Springboard\\Python_Projects\\Spotify Hit Prediction\\2_Exploratory Data Analysis\\Cleaned_FullDataset',
    index_col=0,
)


In [21]:
# Preview the first rows of the freshly loaded frame to confirm the columns parsed as expected.
data.head()

Unnamed: 0,Title,Artist,URI,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Target
0,hanging by a moment,lifehouse,0wqOReZDnrefefEsrIGeR4,0.541,0.864,1.0,-4.915,1.0,0.0357,0.00118,0.0,0.0896,0.435,124.557,216067.0,4.0,1
1,fallin’,alicia keys,0KQx6HOpJueiSkztcS0r7D,0.652,0.609,11.0,-7.519,0.0,0.037,0.263,0.00101,0.233,0.482,95.986,210200.0,3.0,1
2,all for you,janet jackson,5X8kkUaUlAyAUr9TYqDFTH,0.753,0.934,2.0,-3.011,1.0,0.0736,0.0174,0.065,0.128,0.73,113.525,329933.0,3.0,1
3,drops of jupiter (tell me),train,2hKdd3qO7cWr2Jo0Bcs0MA,0.481,0.638,0.0,-5.862,1.0,0.0276,0.153,0.0,0.154,0.497,79.064,259933.0,4.0,1
4,i’m real,jennifer lopez feat. ja rule,6MLsahMK3VvH9NUwXOktdS,0.708,0.587,11.0,-7.93,0.0,0.151,0.273,0.0,0.0718,0.554,83.46,262133.0,4.0,1


In [22]:
#Shuffle dataset

# frac=1 returns every row in a random order; reset_index(drop=True) discards
# the old index so rows are renumbered 0..n-1 in their shuffled order.
# A fixed random_state makes the permutation identical on every
# Restart & Run All, which keeps the train/test membership (and therefore the
# reported scores) reproducible. 25 matches the seed used for the split.
data = data.sample(frac=1, random_state=25).reset_index(drop=True)

In [23]:
# Preview the first rows again to confirm the rows were shuffled and the index was reset.
data.head()

Unnamed: 0,Title,Artist,URI,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Target
0,"#bday (feat. chris brown, siya and sage the ge...",tank,1lp1YyncJKAabaSuP3vvuf,0.661,0.604,5.0,-6.827,0.0,0.052,0.24,0.0,0.116,0.64,99.969,312027.0,4.0,0
1,angels and acrobats,rod picott,0RMSsq3bIXD8Wgoezmy4GK,0.698,0.287,2.0,-13.358,1.0,0.0332,0.815,1e-06,0.342,0.606,116.633,175360.0,4.0,0
2,i still havent found what i'm looking for (wit...,u2,5qtih5X7uBV8c6NhUOMfMk,0.161,0.782,10.0,-8.935,1.0,0.0713,0.00822,0.0,0.974,0.396,202.163,342307.0,4.0,0
3,zumba he zumba ha (feat. soldat jahman & luis ...,dj mam's,00PhRxMpNWMaIgGLrYeSEB,0.854,0.8,4.0,-6.139,0.0,0.115,0.104,4e-06,0.0506,0.822,127.141,199667.0,4.0,0
4,ratings & views,jae millz,7JRNvu9xtBo7HrD0Tq4eyg,0.624,0.721,2.0,-6.937,1.0,0.235,0.242,0.0,0.196,0.485,145.035,283035.0,4.0,0


In [52]:
#preprocessing & split data into y, X and then into train and test.
def preprocess_inputs(dataframe, train_size=.7, random_state=25):
    """Split a song dataframe into standardized train/test features and targets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the identifier columns 'Title', 'Artist' and 'URI'
        (dropped here) and the label column 'Target'. Every remaining column
        is treated as a numeric feature.
    train_size : float or int, default .7
        Forwarded to ``train_test_split``; the share (or absolute number) of
        rows placed in the training split.
    random_state : int or None, default 25
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features, keeping the row index and column names of the
        corresponding split.
    y_train, y_test : pandas.Series
        'Target' values aligned with ``X_train`` / ``X_test``.

    Raises
    ------
    KeyError
        If any of 'Title', 'Artist', 'URI' or 'Target' is missing.
    """
    #Drop categorical variables (returns a new frame; the caller's frame is untouched)
    features = dataframe.drop(['Title', 'Artist', 'URI'], axis=1)
    #Split y, X
    y = features['Target']
    X = features.drop('Target', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )
    #Scale: fit on the training split only so no test-set statistics leak
    #into the transformation, then apply the same transform to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    #scaler.transform returns a bare array, so re-wrap to keep index/column labels
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    return X_train, X_test, y_train, y_test

In [53]:
# Build the scaled train/test feature frames and their matching target series from the shuffled data.
X_train, X_test, y_train, y_test = preprocess_inputs(data)


In [54]:
# Sanity check: each scaled training feature should have variance close to 1.
# It is not exactly 1 because StandardScaler divides by the population standard
# deviation (ddof=0) while DataFrame.var defaults to the sample variance (ddof=1).
X_train.var()

danceability        1.000083
energy              1.000083
key                 1.000083
loudness            1.000083
mode                1.000083
speechiness         1.000083
acousticness        1.000083
instrumentalness    1.000083
liveness            1.000083
valence             1.000083
tempo               1.000083
duration_ms         1.000083
time_signature      1.000083
dtype: float64

Training 


In [63]:
#Define models

# Seed passed to every estimator below that accepts random_state and has a
# stochastic component in its default configuration, so the reported scores
# are repeatable across kernel restarts. Same value as the train/test split.
RANDOM_STATE = 25

# Display name -> unfitted estimator. All hyperparameters are left at their
# scikit-learn defaults; only the seed is pinned.
models = {
    'Logistic Regression':LogisticRegression(),
    'K-Nearest Neighbors':KNeighborsClassifier(),
    'Decision Tree':DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVC(random_state=RANDOM_STATE),
    "Support Vector Machine (RBF Kernel)":SVC(),
    "Neural Network":MLPClassifier(random_state=RANDOM_STATE),
    "Random Forest":RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting":GradientBoostingClassifier(random_state=RANDOM_STATE)
}
    


In [64]:
#Train the models

# Fit each candidate estimator in place on the scaled training split and
# print a progress line as soon as it finishes.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    print(f"{name} trained")
    


Logistic Regression trained
K-Nearest Neighbors trained
Decision Tree trained




Support Vector Machine (Linear Kernel) trained
Support Vector Machine (RBF Kernel) trained




Neural Network trained
Random Forest trained
Gradient Boosting trained


Results

In [66]:
# Report each fitted classifier's score on the held-out test split as a
# percentage with two decimals (score() is mean accuracy for classifiers).
for name, model in models.items():
    print(f"{name}: {model.score(X_test, y_test) * 100:.2f}%")

Logistic Regression: 89.59%
K-Nearest Neighbors: 88.06%
Decision Tree: 85.53%
Support Vector Machine (Linear Kernel): 89.63%
Support Vector Machine (RBF Kernel): 89.61%
Neural Network: 89.94%
Random Forest: 90.03%
Gradient Boosting: 89.82%
