In [7]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
import pickle

In [8]:
master_data = pd.read_csv('../data/processed/tracks.csv')

master_data.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,num_artists,release_type
0,Walking Blues,Big Joe Williams,spotify:track:1ZjN5X8LmUB67pWPgimW3B,0.509,0.277,6,-14.323,1,0.0495,0.827,...,0.0756,0.64,101.157,161893,4,68.4653,7,0,1,album
1,Suddenly Last Summer,The Motels,spotify:track:4fLIM0B1WwrLux9RdnMvze,0.716,0.753,2,-5.682,1,0.0286,0.162,...,0.0831,0.561,120.141,222000,4,57.71583,11,1,1,album
2,Sanctuary,Béla Fleck,spotify:track:3DwlNfiCQSdj0GOxYkR9Rq,0.36,0.542,5,-13.885,1,0.0339,0.368,...,0.116,0.803,116.831,444907,4,30.34574,17,0,6,album
3,The Wild Rover,The Pogues,spotify:track:6JyYNPLalPgGa7XnclF5FO,0.656,0.512,7,-11.872,1,0.029,0.585,...,0.072,0.88,97.5,157893,3,50.97022,7,0,1,album
4,In The Driver's Seat,John Schneider,spotify:track:6jJi8OXF5qaFdysB6sjWIT,0.642,0.889,2,-5.62,0,0.0494,0.375,...,0.18,0.764,163.351,162293,4,33.62053,7,1,1,album


In [9]:
master_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24698 entries, 0 to 24697
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track             24698 non-null  object 
 1   artist            24698 non-null  object 
 2   uri               24698 non-null  object 
 3   danceability      24698 non-null  float64
 4   energy            24698 non-null  float64
 5   key               24698 non-null  int64  
 6   loudness          24698 non-null  float64
 7   mode              24698 non-null  int64  
 8   speechiness       24698 non-null  float64
 9   acousticness      24698 non-null  float64
 10  instrumentalness  24698 non-null  float64
 11  liveness          24698 non-null  float64
 12  valence           24698 non-null  float64
 13  tempo             24698 non-null  float64
 14  duration_ms       24698 non-null  int64  
 15  time_signature    24698 non-null  int64  
 16  chorus_hit        24698 non-null  float6

In [10]:
master_data.shape

(24698, 21)

In [11]:
master_data = shuffle(master_data)
master_data.reset_index(inplace=True, drop=True)
master_data.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,num_artists,release_type
0,Night Shift,Quarterflash,spotify:track:7sBdj1AzlA4XM4Praaz8t7,0.743,0.612,7,-10.928,0,0.0316,0.251,...,0.113,0.617,124.339,244027,4,35.65859,10,1,1,compilation
1,Cat People (Putting Out Fire),David Bowie,spotify:track:3CD75JGmSMKA3RW5RtOwAz,0.73,0.526,10,-10.909,1,0.0429,0.0256,...,0.0913,0.385,118.208,252493,4,59.89523,9,1,1,compilation
2,Oh Ho Ho Ho (Remix),Sukhbir,spotify:track:3F0I9AkbRwz12yz7Hxn0bF,0.676,0.935,5,-3.499,0,0.0892,0.00917,...,0.097,0.802,144.944,244417,4,34.83474,12,0,2,single
3,Jump in the Line,Harry Belafonte,spotify:track:6KkAC41nNRiWA6w3ZD9cJ8,0.601,0.8,9,-9.509,0,0.322,0.676,...,0.611,0.868,113.612,224733,4,18.69713,12,0,1,compilation
4,Forgotten,Somewhere Off Jazz Street,spotify:track:2tdocYv2igbQr7HCsgStOM,0.314,0.388,1,-12.941,1,0.0389,0.165,...,0.0935,0.202,174.823,243728,4,61.53732,9,0,1,album


In [12]:
master_data.drop_duplicates(keep='first', inplace=True)
master_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24435 entries, 0 to 24697
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track             24435 non-null  object 
 1   artist            24435 non-null  object 
 2   uri               24435 non-null  object 
 3   danceability      24435 non-null  float64
 4   energy            24435 non-null  float64
 5   key               24435 non-null  int64  
 6   loudness          24435 non-null  float64
 7   mode              24435 non-null  int64  
 8   speechiness       24435 non-null  float64
 9   acousticness      24435 non-null  float64
 10  instrumentalness  24435 non-null  float64
 11  liveness          24435 non-null  float64
 12  valence           24435 non-null  float64
 13  tempo             24435 non-null  float64
 14  duration_ms       24435 non-null  int64  
 15  time_signature    24435 non-null  int64  
 16  chorus_hit        24435 non-null  float64
 17

In [13]:
X = master_data.drop(labels='target', axis=1, inplace=False)
y = master_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7)

In [14]:
# Gathering all numeric features together
numeric_features = [
    'danceability','energy','loudness','speechiness','acousticness',
    'instrumentalness','liveness','valence','tempo','duration_ms',
    'chorus_hit','sections', 'num_artists']

# Gathering all categorical features together
categorical_features = [
    'artist','key','mode','time_signature', 'release_type'
]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [15]:
X_train = X_train[[*numeric_features, *categorical_features]]
X_test = X_test[[*numeric_features, *categorical_features]]

# Logistic Regression

In [16]:
clf_log = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression())
])

param_grid_lr = {
    'clf__penalty': ['l2'],
    'clf__fit_intercept': [True, False],
    'clf__class_weight': ['balanced', None],
    'clf__solver': ['newton-cg','lbfgs','liblinear','sag','saga']
}

In [17]:
random_log = RandomizedSearchCV(clf_log, param_grid_lr, cv=5, verbose=1, n_jobs=-1, scoring='accuracy')

random_log.fit(X_train, y_train)

print(f'Best Score = {random_log.best_score_}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best Score = 0.8353342428376536


# Random Forest Classifier

In [18]:
clf_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier())
])

param_grid_rf = {
    'clf__n_estimators': [20,50,100,200],
    'clf__criterion': ['gini','entropy'],
    'clf__max_depth': [20,50],
    'clf__max_features': ['auto','sqrt','log2'],
    'clf__class_weight': ['balanced','balanced_subsample']
}

In [19]:
random_rf = RandomizedSearchCV(clf_rf, param_grid_rf, cv=5, verbose=1, n_jobs=-1, scoring='accuracy')

random_rf.fit(X_train, y_train)

print(f'Best score = {random_rf.best_score_}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
13 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/song-savvy/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/song-savvy/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/song-savvy/lib/python3.11/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/anacond

Best score = 0.826193724420191


# Multi-Layered Perceptron

In [20]:
clf_mlp = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', MLPClassifier())
])

param_grid_mlp = {
    'clf__hidden_layer_sizes': [(10,), (20,), (10, 10), (20, 20)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'clf__solver': ['lbfgs', 'sgd', 'adam'],
    'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'clf__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [21]:
random_mlp = RandomizedSearchCV(clf_mlp, param_grid_mlp, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

random_mlp.fit(X_train, y_train)

print(f'Best score = {random_mlp.best_score_}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Best score = 0.8905866302864938


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [22]:
print(f'Best score = {random_mlp.best_score_}')

Best score = 0.8905866302864938


In [23]:
y_pred = random_mlp.predict(X_test)


# Getting the accuracy of the model
conf_mat = confusion_matrix(y_pred, y_test)
acc = np.sum(conf_mat.diagonal()) / np.sum(conf_mat)
print('Overall accuracy: {} %'.format(acc*100))

Overall accuracy: 89.99123063431746 %


In [24]:
pickle.dump(random_mlp, open('model.pkl', 'wb'))