In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including any emitted while the models below are being fitted --
# consider narrowing the filter to specific categories.
warnings.filterwarnings(action='ignore')

In [2]:
# Load one CSV per decade, in chronological order (60s, 70s, ..., 10s).
decade_prefixes = ('6', '7', '8', '9', '0', '1')
dfs = []
for prefix in decade_prefixes:
    dfs.append(pd.read_csv(f'dataset-of-{prefix}0s.csv'))

In [3]:
# Tag each per-decade frame with the decade it was loaded for; the list order
# matches the order in which the CSVs were read into `dfs`.
decade_years = [1960, 1970, 1980, 1990, 2000, 2010]
for position in range(len(decade_years)):
    frame = dfs[position]
    frame['decade'] = pd.Series(decade_years[position], index=frame.index)

# Stack all decades, shuffle the rows with a fixed seed, and renumber them 0..n-1.
data = (
    pd.concat(dfs, axis=0)
    .sample(frac=1.0, random_state=1)
    .reset_index(drop=True)
)

In [4]:
# Preview the first rows of the combined, shuffled table.
data.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,decade
0,Attaining - Take 1 / Alternate Version,John Coltrane,spotify:track:3EwLV5hZqLKx5e0Lp1QcB7,0.342,0.462,4,-12.931,0,0.0389,0.514,0.0181,0.0977,0.397,98.466,816867,4,24.84938,33,0,1970
1,So Fly,NB Ridaz Featuring Gemini,spotify:track:2BjIi07kN0yKSur0Fwrnss,0.861,0.519,2,-6.404,1,0.112,0.136,0.0,0.123,0.519,81.966,254533,4,50.0359,11,1,2000
2,Because I Got It Like That,Jungle Brothers,spotify:track:5unLExF3iiG3YkU11u6wFO,0.9,0.916,1,-7.481,0,0.115,0.22,0.000141,0.0323,0.538,102.916,277894,4,24.71271,16,0,1980
3,Babylon a Fall - Remastered,Yabby You,spotify:track:6xfe0G2HwRDQaChxkzvNKw,0.714,0.301,2,-14.8,1,0.123,0.156,0.661,0.229,0.651,74.856,357671,4,104.64231,11,0,1970
4,Fins,Jimmy Buffett,spotify:track:4h0gZ422QxBRdTV14u0P8y,0.661,0.645,4,-13.52,1,0.0487,0.00895,0.0,0.0362,0.93,136.693,204640,4,31.99617,10,1,1970


In [5]:
# Count missing values in each column.
data.isnull().sum()

track               0
artist              0
uri                 0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
chorus_hit          0
sections            0
target              0
decade              0
dtype: int64

In [6]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41106 entries, 0 to 41105
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track             41106 non-null  object 
 1   artist            41106 non-null  object 
 2   uri               41106 non-null  object 
 3   danceability      41106 non-null  float64
 4   energy            41106 non-null  float64
 5   key               41106 non-null  int64  
 6   loudness          41106 non-null  float64
 7   mode              41106 non-null  int64  
 8   speechiness       41106 non-null  float64
 9   acousticness      41106 non-null  float64
 10  instrumentalness  41106 non-null  float64
 11  liveness          41106 non-null  float64
 12  valence           41106 non-null  float64
 13  tempo             41106 non-null  float64
 14  duration_ms       41106 non-null  int64  
 15  time_signature    41106 non-null  int64  
 16  chorus_hit        41106 non-null  float6

In [7]:
# Preprocessing

In [8]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Split a track table into standardised train/test features and labels.

    The ``track``, ``artist`` and ``uri`` columns are dropped, ``target`` is
    used as the label, and all remaining columns are passed through a
    ``StandardScaler``. The scaler is fitted on the training rows only and
    then applied to both splits, so no test-set statistics reach the
    training features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain the ``track``, ``artist``, ``uri`` and
        ``target`` columns. It is not modified.
    train_size : float, default 0.7
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, default 1
        Forwarded to ``train_test_split`` so the shuffled split is repeatable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features, keeping the row index and column names of the
        corresponding unscaled split.
    y_train, y_test : pandas.Series
        The ``target`` values for the same rows.
    """
    # drop() already returns a new frame, so the caller's DataFrame stays
    # untouched without an extra up-front copy.
    features = df.drop(columns=['track', 'artist', 'uri'])

    # Split into feature matrix X and label vector y
    y = features['target']
    X = features.drop(columns='target')

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)

    def _scale(part):
        # transform() returns a bare array; re-wrap it to keep index and columns
        return pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)

    return _scale(X_train), _scale(X_test), y_train, y_test

In [9]:
# Build the scaled train/test splits from the combined table.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [10]:
# Inspect the standardised training features.
X_train

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,decade
17085,-0.803046,-1.153602,-0.906748,-1.115340,0.663974,-0.469839,1.596027,2.353303,-0.732449,-0.558102,0.596289,0.893514,0.252505,-0.113837,1.143941,-0.730326
5559,-1.851264,1.471369,0.226205,1.141128,-1.506083,0.289985,-0.869966,-0.413850,0.358139,-0.546842,0.305511,-0.402735,0.252505,2.363926,-1.129718,-0.159019
32888,-0.216946,0.964588,-1.473224,0.842092,0.663974,-0.398312,0.227342,-0.509835,1.320937,1.010682,1.219771,0.261257,0.252505,-0.828083,0.523852,-0.159019
14170,-1.158088,1.471369,-0.340272,0.740650,-1.506083,-0.145038,-1.073492,0.351407,0.486512,-0.122745,-0.771205,0.672481,0.252505,1.298114,0.937245,0.983596
7958,0.656568,-1.264461,-1.473224,0.218706,0.663974,-0.521432,0.560664,-0.509835,-0.318737,-1.015976,-1.251805,-0.672715,0.252505,-0.553131,-0.716326,-0.730326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,1.067965,1.621820,1.075919,1.057254,-1.506083,-0.276365,-1.043411,-0.509689,1.414299,1.119521,0.240131,0.998506,0.252505,-1.142409,1.143941,-0.159019
32511,-1.242621,0.588460,-0.906748,-0.133035,-1.506083,1.072087,-0.215121,-0.509818,-0.132013,-0.381707,1.932499,1.910169,0.252505,1.016528,0.937245,-0.730326
5192,-0.819953,-1.169439,1.642396,-1.394164,-1.506083,-0.448733,-0.241669,-0.509831,0.072217,-0.081462,-0.841564,-0.668891,0.252505,-0.313843,-0.509629,0.412289
12172,0.194451,1.352592,-1.473224,0.843036,-1.506083,-0.035989,-1.068867,1.051577,2.733040,-1.068519,0.086466,0.696900,0.252505,-0.347148,-0.923022,0.983596


In [11]:
# Inspect the training labels.
y_train

17085    0
5559     0
32888    0
14170    0
7958     1
        ..
7813     1
32511    0
5192     0
12172    0
33003    1
Name: target, Length: 28774, dtype: int64

In [13]:
# (rows, columns) of the training feature matrix.
X_train.shape

(28774, 16)

In [14]:
# Training

In [15]:
# Seed handed to every estimator that draws random numbers while fitting, so
# that a fresh "Restart & Run All" reproduces the same fitted models and the
# same scores. Same value as the seed used for the shuffle and the split.
MODEL_SEED = 1

# Keys are left-padded to a common width so the printed lines align.
# LogisticRegression, KNeighborsClassifier and SVC are left unseeded: with
# their default settings they have nothing random to seed (per the
# scikit-learn parameter docs -- re-check if solver/probability options change).
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVC(random_state=MODEL_SEED),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(random_state=MODEL_SEED),
    "                         Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED)
}

# Fit every model on the same scaled training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.


In [16]:
# Results: test-set accuracy of each model

In [17]:
# Report each fitted model's score on the held-out split as a percentage.
for label, fitted in models.items():
    score_pct = fitted.score(X_test, y_test) * 100
    print(f"{label}: {score_pct:.2f}%")

                   Logistic Regression: 74.50%
                   K-Nearest Neighbors: 75.26%
                         Decision Tree: 72.34%
Support Vector Machine (Linear Kernel): 74.24%
   Support Vector Machine (RBF Kernel): 80.32%
                        Neural Network: 80.10%
                         Random Forest: 80.89%
                     Gradient Boosting: 79.87%
