In [1]:
from imblearn.over_sampling import SMOTE
import numpy as np, pandas as pd
from matplotlib import pyplot
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from tensorflow import keras
import os
import tempfile
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from numpy import log
from sklearn.metrics import f1_score, make_scorer, confusion_matrix, accuracy_score, precision_score, recall_score, precision_recall_curve
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance


In [3]:
df = pd.read_csv("C:/Users/szymb/Desktop/Blazej/ML/spotify_preprocessed.csv")

## Splitting dataset - train, validation, test

In [4]:
def split_dataset(df, sample=1):
    df = df.sample(int(sample * df.shape[0]))
    train, validate, test = np.split(df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])

    print(f" Train: {train.shape}\n", f"Validate: {validate.shape}\n", f"Test: {test.shape}\n")

    Xtrain = train.loc[:, df.columns != 'label']

    Xvalidate = validate.loc[:, df.columns != 'label']

    Xtest = test.loc[:, df.columns != 'label']


    ytrain = train.label.values.ravel()

    yvalidate = validate.label.values.ravel()

    ytest = test.label.values.ravel()

    return Xtrain, Xvalidate, Xtest, ytrain, yvalidate, ytest

In [5]:
trainX, validateX, testX, trainy, validatey, testy = split_dataset(df,sample=1)

 Train: (345369, 19)
 Validate: (115123, 19)
 Test: (115123, 19)



Although the opposite is unlikely, it is worth making sure that the distribution of the variables labeled as 1 is comparable in all subsets

In [6]:
round(trainy.sum()/len(trainy),4)

0.1006

In [7]:
round(validatey.sum()/len(validatey),4)

0.1005

In [8]:
round(testy.sum()/len(testy),4)

0.0991

## Data normalization

Although data normalization is not necessary for artificial neural networks, and especially in 'MLP'. 
This is due to the fact that weights and biases can always be appropriately 'shifted' and scaled up or down. 
However, there are practical reasons for this approach - especially speeding up the calculation and reducing the risk of 
getting stuck at local minima.

In [9]:
s = StandardScaler()
trainX = s.fit_transform(trainX)
validateX = s.transform(validateX)
testX = s.transform(testX)

## Model preparation

Since the ratio of the dependent variables is skewed, additional metrics than accuracy are necessary.

In order to compare the results of the neural network with other models, we decided that the best metric would be f1. However, it is not built into the library, so custom functions are required.

In [11]:
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint 
class Metrics(Callback):
    def __init__(self, validation):   
        super(Metrics, self).__init__()
        self.validation = validation    
            
        print('validation shape', len(self.validation[0]))
        
    def on_train_begin(self, logs={}):        
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
     
    def on_epoch_end(self, epoch, logs={}):
        val_targ = self.validation[1]   
        val_predict = (np.asarray(self.model.predict(self.validation[0]))).round()        
    
        val_f1 = f1_score(val_targ, val_predict)
        val_recall = recall_score(val_targ, val_predict)         
        val_precision = precision_score(val_targ, val_predict)
        
        self.val_f1s.append(round(val_f1, 6))
        self.val_recalls.append(round(val_recall, 6))
        self.val_precisions.append(round(val_precision, 6))
 
        print(f' — val_f1: {val_f1} — val_precision: {val_precision}, — val_recall: {val_recall}')

In [12]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [13]:
def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [14]:
def f1_weighted_loss(true, pred): #shapes (batch, 4)
    ground_positives = K.sum(true, axis=0) + K.epsilon()
    pred_positives = K.sum(pred, axis=0) + K.epsilon()
    true_positives = K.sum(true * pred, axis=0) + K.epsilon() 
    precision = true_positives / pred_positives 
    recall = true_positives / ground_positives
    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
    weighted_f1 = f1 * ground_positives / K.sum(ground_positives) 
    weighted_f1 = K.sum(weighted_f1)
    return 1 - weighted_f1

In [None]:
def f1_weighted_metric(true, pred):
    predLabels = K.argmax(pred, axis=-1)
    pred = K.one_hot(predLabels, 4) 
    ground_positives = K.sum(true, axis=0) + K.epsilon()
    pred_positives = K.sum(pred, axis=0) + K.epsilon()
    true_positives = K.sum(true * pred, axis=0) + K.epsilon()
    precision = true_positives / pred_positives 
    recall = true_positives / ground_positives
    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
    weighted_f1 = f1 * ground_positives / K.sum(ground_positives) 
    weighted_f1 = K.sum(weighted_f1)
    return weighted_f1

Relu function is used for hidden layer as it has the best performance for that architecture (and architecture is best for problem type).

And sigmoid function is used for output layer, as the output is binar

In [None]:
# This function is intended to help deciding on the architecture of the network

def prepare_model (output_bias = None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    model = Sequential()
    model.add(Dense(40, input_dim=18, activation='relu'))
    model.add(Dense(20, input_dim=40, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', 
                  metrics=[f1,
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall'),
                           keras.metrics.AUC(name='auc'),
                           keras.metrics.AUC(name='prc', curve='PR'),])
    return model

In [None]:
model = prepare_model()
model.summary()

In [None]:
model.predict(trainX[:50])

In [None]:
results = model.evaluate(trainX, trainy, batch_size=50, verbose=0)
print("Loss of the library built-in initialization: " + str(round(results[0],5)))

As the 'loss' value, although obviously too large to be accepted in the final model, does not exceed .9 (in the iterations so far).

Therefore, it will not be necessary to manually modify the biases before initializing the model - the first few epochs are enough for the model to learn that the ratio of the dependent variables is not comparable (values labeled as 0 and 1)

In [None]:
num_positives = df['label'].sum()
print(num_positives)
num_negative = len(df['label'])-num_positives
initial_bias = np.log([num_positives/num_negative])
initial_bias

In [None]:
model = prepare_model(output_bias=initial_bias)
model.predict(trainX[:50])

In [None]:
results = model.evaluate(trainX, trainy, batch_size=50, verbose=0)
print("Loss of the initialization with self output-bias initialization: " + str(round(results[0],5)))

To prevent overfitting, early stopping is used.

## Grid search - checking best suited number of nods in hidden layer

In [None]:
from keras import backend as K
K.set_value(model.optimizer.learning_rate, 0.2)
def grid_search(
    trainX, trainy, validateX, validatey, testX, testy, max_hidden_layers, nods
):
    sm = SMOTE(random_state=42, n_jobs=3)
    trainX, trainy = sm.fit_resample(trainX, trainy)
    trainX, trainy, validateX, validatey, testX, testy = (
        trainX.astype(np.float32),
        trainy.astype(np.float32),
        validateX.astype(np.float32),
        validatey.astype(np.float32),
        testX.astype(np.float32),
        testy.astype(np.float32),
    )
    best_model = [-1, 0]
    if max_hidden_layers == 1:
        for nod1 in nods:

            def prepare_model(nod1=nod1):
                model = Sequential()
                model.add(Dense(nod1, input_dim=18, activation="relu"))
                model.add(Dense(1, activation="sigmoid"))
                model.compile(
                    loss=f1_loss,
                    optimizer="adam",
                    metrics=[
                        f1,
                        keras.metrics.Precision(name="precision"),
                        keras.metrics.Recall(name="recall"),
                        keras.metrics.AUC(name="auc"),
                        keras.metrics.AUC(name="prc", curve="PR"),
                    ],
                )
                return model

            model = prepare_model()
            history = (
                model.fit(
                    trainX,
                    trainy,
                    callbacks=[Metrics(validation=(validateX, validatey))],
                    epochs=100,
                    batch_size=100,
                    verbose=1,
                ),
            )
            test_results = model.evaluate(testX, testy, verbose=0)
            if test_results[1] > best_model[0]:
                best_model = [test_results[1], nod1]
            print(best_model)
    if max_hidden_layers == 2:
        for nod1 in nods:
            for nod2 in nods:

                def prepare_model(nod1=nod1, nod2=nod2):
                    model = Sequential()
                    model.add(Dense(nod1, input_dim=18, activation="relu"))
                    model.add(Dense(nod2, activation="relu"))
                    model.add(Dense(1, activation="sigmoid"))
                    model.compile(
                        loss=f1_loss,
                        optimizer="adam",
                        metrics=[
                            f1,
                            keras.metrics.Precision(name="precision"),
                            keras.metrics.Recall(name="recall"),
                            keras.metrics.AUC(name="auc"),
                            keras.metrics.AUC(name="prc", curve="PR"),
                        ],
                    )
                    return model

                model = prepare_model()
                history = (
                    model.fit(
                        trainX,
                        trainy,
                        callbacks=[Metrics(validation=(validateX, validatey))],
                        epochs=100,
                        batch_size=100,
                        verbose=1,
                    ),
                )
                tf.keras.callbacks.EarlyStopping(
                    monitor="val_loss",
                    verbose=1,
                    patience=20,
                    mode="max",
                    restore_best_weights=True,
                ),
                test_results = model.evaluate(testX, testy, verbose=0)
                if test_results[1] > best_model[0]:
                    best_model = [test_results[1], nod1, nod2]
                print(best_model)
        
    if max_hidden_layers == 4:
        for nod1 in nods:
            for nod2 in nods:
                for nod3 in nods:
                    for nod4 in nods:

                        def prepare_model(nod1=nod1, nod2=nod2, nod3=nod3,nod4=nod4):
                            
                            model = Sequential()
                            model.add(Dense(nod1, input_dim=18, activation="relu"))
                            model.add(Dense(nod2, activation="relu"))
                            model.add(Dense(nod3, activation="relu"))
                            model.add(Dense(nod4, activation="relu"))
                            model.add(Dense(1, activation="sigmoid"))
                            model.compile(
                                loss=f1_loss,
                                optimizer="adam",
                                metrics=[
                                    f1,
                                    keras.metrics.Precision(name="precision"),
                                    keras.metrics.Recall(name="recall"),
                                    keras.metrics.AUC(name="auc"),
                                    keras.metrics.AUC(name="prc", curve="PR"),
                                ],
                            )
                            return model

                        model = prepare_model()
                        history = (
                            model.fit(
                                trainX,
                                trainy,
                                callbacks=[Metrics(validation=(validateX, validatey))],
                                epochs=100,
                                batch_size=100,
                                verbose=1,
                            ),
                        )
                        tf.keras.callbacks.EarlyStopping(
                            monitor="val_loss",
                            verbose=1,
                            patience=20,
                            mode="max",
                            restore_best_weights=True,
                        ),
                        test_results = model.evaluate(testX, testy, verbose=0)
                        if test_results[1] > best_model[0]:
                            best_model = [test_results[1], nod1, nod2,nod3,nod4]
                        print(best_model)
    return best_model


In [None]:
nods = [4,5,8,10,12,15,20,30,40]
grid_search(trainX,trainy,testX,testy,validateX,validatey, 4 , nods)

In [None]:
nods = [1,2,3,4,5,10,15,20,25,30,35,40,45,50,55,60]
grid_search(trainX,trainy,testX,testy,validateX,validatey, 1 , nods)

In [None]:
nods = [2,3,4,5,10,15,20,25,30,35,40,45,50,55,60]
grid_search(trainX,trainy,testX,testy,validateX,validatey, 2 , nods)

## Running model with best performacne (regarding f1 score)

In [41]:

sm = SMOTE(random_state=42, n_jobs=3)
trainX, trainy = sm.fit_resample(trainX, trainy)
trainX, trainy, validateX, validatey, testX, testy = (
    trainX.astype(np.float32),
    trainy.astype(np.float32),
    validateX.astype(np.float32),
    validatey.astype(np.float32),
    testX.astype(np.float32),
    testy.astype(np.float32),)
def prepare_model(nod1=40, nod2=30, nod3=20,nod4=10):
                            model = Sequential()
                            model.add(Dense(nod1, input_dim=18, activation="relu"))
                            model.add(Dense(nod2, activation="relu"))
                            model.add(Dense(nod3, activation="relu"))
                            model.add(Dense(nod4, activation="relu"))
                            model.add(Dense(1, activation="sigmoid"))
                            model.compile(
                                loss=f1_loss,
                                optimizer="adam",
                                metrics=[
                                    f1,
                                    keras.metrics.Precision(name="precision"),
                                    keras.metrics.Recall(name="recall"),
                                    keras.metrics.AUC(name="auc"),
                                    keras.metrics.AUC(name="prc", curve="PR"),],)
                            return model

model = prepare_model()
history = (
    model.fit(
        trainX,
        trainy,
        callbacks=[Metrics(validation=(validateX, validatey))],
        epochs=150,
        batch_size=100,
        verbose=1,),)
tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    verbose=1,
    patience=20,
    mode="max",
    restore_best_weights=True,),
test_results = model.evaluate(testX, testy, verbose=0)

validation shape 115123
Epoch 1/150
 — val_f1: 0.3726526611182655 — val_precision: 0.2396242738845631, — val_recall: 0.8377117179398548
Epoch 2/150
 — val_f1: 0.3690245661996431 — val_precision: 0.23646051287041994, — val_recall: 0.8398721050812306
Epoch 3/150
 — val_f1: 0.38164646686500403 — val_precision: 0.24816286255874945, — val_recall: 0.8258727964051158
Epoch 4/150
 — val_f1: 0.3635131173970394 — val_precision: 0.23065656095973217, — val_recall: 0.8573280331835464
Epoch 5/150
 — val_f1: 0.37574926542605286 — val_precision: 0.24294543796544912, — val_recall: 0.8288109229173868
Epoch 6/150
 — val_f1: 0.406048507377068 — val_precision: 0.2764222562404324, — val_recall: 0.7646042170756999
Epoch 7/150
 — val_f1: 0.3699958338067644 — val_precision: 0.23691613716835622, — val_recall: 0.844192879363982
Epoch 8/150
 — val_f1: 0.37614247048209315 — val_precision: 0.2432891865834475, — val_recall: 0.8286380919460767
Epoch 9/150
 — val_f1: 0.3705585666967781 — val_precision: 0.2368636319892

 — val_f1: 0.3954370964927587 — val_precision: 0.26447076671723657, — val_recall: 0.7833563774628414
Epoch 14/150
 — val_f1: 0.3796484515126151 — val_precision: 0.24672330725793917, — val_recall: 0.8231075008641549
Epoch 15/150
 — val_f1: 0.39809536010932467 — val_precision: 0.26437140183205243, — val_recall: 0.8055651572761839
Epoch 16/150
 — val_f1: 0.406028723002819 — val_precision: 0.27318617640909226, — val_recall: 0.7903560318008988
Epoch 17/150
 — val_f1: 0.3811167996190023 — val_precision: 0.24735974447478234, — val_recall: 0.8298479087452472
Epoch 18/150
 — val_f1: 0.3737489934430001 — val_precision: 0.24015670428226482, — val_recall: 0.8422917386795714
Epoch 19/150
 — val_f1: 0.372515486641584 — val_precision: 0.23875870804306523, — val_recall: 0.847044590390598
Epoch 20/150
 — val_f1: 0.3849418262299742 — val_precision: 0.251585848130159, — val_recall: 0.8191323885240235
Epoch 21/150
 — val_f1: 0.39782494299245746 — val_precision: 0.2665413092020214, — val_recall: 0.78396128

 — val_f1: 0.3940630953128598 — val_precision: 0.26002154874713374, — val_recall: 0.8133425509851365
Epoch 32/150
 — val_f1: 0.3821937435325957 — val_precision: 0.24826783867631852, — val_recall: 0.8298479087452472
Epoch 33/150
 — val_f1: 0.38903757183460924 — val_precision: 0.25597204400884543, — val_recall: 0.8102315935015555
Epoch 34/150
 — val_f1: 0.3980743192417631 — val_precision: 0.2649254798752753, — val_recall: 0.8002938126512271
Epoch 35/150
 — val_f1: 0.3602954755309326 — val_precision: 0.22910892949410494, — val_recall: 0.8429830625648116
Epoch 36/150
 — val_f1: 0.3810017133521935 — val_precision: 0.2475790999948216, — val_recall: 0.8263048738333909
Epoch 37/150
 — val_f1: 0.40143788302042055 — val_precision: 0.2678359869390586, — val_recall: 0.8009851365364673
Epoch 38/150
 — val_f1: 0.381484373435842 — val_precision: 0.24826058631921824, — val_recall: 0.8232803318354649
Epoch 39/150
 — val_f1: 0.3683816090231199 — val_precision: 0.2351801349217741, — val_recall: 0.8495506

 — val_f1: 0.37766573149437915 — val_precision: 0.24429554049595167, — val_recall: 0.8317490494296578
Epoch 45/150
 — val_f1: 0.3771815475611417 — val_precision: 0.24338983050847457, — val_recall: 0.8376253024541997
Epoch 46/150
 — val_f1: 0.36508437447788067 — val_precision: 0.2324861094692044, — val_recall: 0.8497234704459039
Epoch 47/150
 — val_f1: 0.3998853312163712 — val_precision: 0.26844504973946, — val_recall: 0.7835292084341514
Epoch 48/150
 — val_f1: 0.38069852220610717 — val_precision: 0.2461850443599493, — val_recall: 0.8392671966816454
Epoch 49/150
 — val_f1: 0.3902642153698807 — val_precision: 0.25706997284472116, — val_recall: 0.8098859315589354
Epoch 50/150
 — val_f1: 0.3346390780683647 — val_precision: 0.2060819603211147, — val_recall: 0.8895610093328724
Epoch 51/150
 — val_f1: 0.38430805677688773 — val_precision: 0.2503072939822685, — val_recall: 0.8270826132042862
Epoch 52/150
 — val_f1: 0.3721045601984354 — val_precision: 0.23877271169009257, — val_recall: 0.8426374

 — val_f1: 0.372242141407243 — val_precision: 0.23847256994239324, — val_recall: 0.8478223297614933
Epoch 66/150
 — val_f1: 0.34660572081562685 — val_precision: 0.21843633253814645, — val_recall: 0.8387487037677152
Epoch 67/150
 — val_f1: 0.3901486709716173 — val_precision: 0.2594960189492309, — val_recall: 0.7857760110611821
Epoch 68/150
 — val_f1: 0.37086449182029646 — val_precision: 0.23848493089722353, — val_recall: 0.8335637746284134
Epoch 69/150
 — val_f1: 0.3844697363766879 — val_precision: 0.2531995943983118, — val_recall: 0.7983926719668164
Epoch 70/150
 — val_f1: 0.3724201017666228 — val_precision: 0.2388939145742158, — val_recall: 0.8443657103352921
Epoch 71/150
 — val_f1: 0.39346288246825195 — val_precision: 0.2611142533936652, — val_recall: 0.7978741790528863
Epoch 72/150
 — val_f1: 0.3890005595043206 — val_precision: 0.25585389123620006, — val_recall: 0.8110957483581058
Epoch 73/150
 — val_f1: 0.3776586273074412 — val_precision: 0.2449000538862231, — val_recall: 0.8247493

 — val_f1: 0.37376342430424575 — val_precision: 0.24042985192888625, — val_recall: 0.8390943657103352
Epoch 77/150
 — val_f1: 0.384595221292469 — val_precision: 0.2508218709728322, — val_recall: 0.8241444866920152
Epoch 78/150
 — val_f1: 0.3894118148132374 — val_precision: 0.25837901995139323, — val_recall: 0.7900967853439337
Epoch 79/150
 — val_f1: 0.3845730476792303 — val_precision: 0.25164732559690545, — val_recall: 0.8151572761838921
Epoch 80/150
 — val_f1: 0.3681503946352214 — val_precision: 0.23538480067445502, — val_recall: 0.8444521258209471
Epoch 81/150
 — val_f1: 0.4013320732640295 — val_precision: 0.27131122604198354, — val_recall: 0.770653301071552
Epoch 82/150
 — val_f1: 0.36841201061332013 — val_precision: 0.23643268406223203, — val_recall: 0.8339094365710336
Epoch 83/150
 — val_f1: 0.34683587140439925 — val_precision: 0.2156413061774112, — val_recall: 0.8856723124783962
Epoch 84/150
 — val_f1: 0.36326916306905566 — val_precision: 0.23120828321422676, — val_recall: 0.8471

 — val_f1: 0.35625380317530564 — val_precision: 0.22644693968447455, — val_recall: 0.8347735914275838
Epoch 89/150
 — val_f1: 0.3987473903966598 — val_precision: 0.26833861421653615, — val_recall: 0.7757518147251987
Epoch 90/150
 — val_f1: 0.4069406477320113 — val_precision: 0.2779958219915174, — val_recall: 0.758987210508123
Epoch 91/150
 — val_f1: 0.3903660104187809 — val_precision: 0.2641054590949315, — val_recall: 0.7479260283442793
Epoch 92/150
 — val_f1: 0.4079511132963405 — val_precision: 0.279777770641318, — val_recall: 0.752851711026616
Epoch 93/150
 — val_f1: 0.37748743718592964 — val_precision: 0.24595316674524595, — val_recall: 0.8114414103007259
Epoch 94/150
 — val_f1: 0.39069457738565905 — val_precision: 0.2573041714630269, — val_recall: 0.8112685793294159
Epoch 95/150
 — val_f1: 0.4091610792378056 — val_precision: 0.2789594395048541, — val_recall: 0.7672830971310058
Epoch 96/150
 — val_f1: 0.3916325299952174 — val_precision: 0.2578657685040664, — val_recall: 0.8137746284

 — val_f1: 0.3646583654400952 — val_precision: 0.2323805005337445, — val_recall: 0.8465260974766678
Epoch 99/150
 — val_f1: 0.35730908697299346 — val_precision: 0.2257917417177774, — val_recall: 0.8557725544417559
Epoch 100/150
 — val_f1: 0.38592198364633673 — val_precision: 0.25255544583522377, — val_recall: 0.817749740753543
Epoch 101/150
 — val_f1: 0.38817960120499206 — val_precision: 0.2544257891202149, — val_recall: 0.8184410646387833
Epoch 102/150
 — val_f1: 0.3996428898452523 — val_precision: 0.27182984554060785, — val_recall: 0.7543207742827515
Epoch 103/150
 — val_f1: 0.40158555976082233 — val_precision: 0.27103171004504095, — val_recall: 0.7748012443829935
Epoch 104/150
 — val_f1: 0.3724198506807203 — val_precision: 0.2390254663104488, — val_recall: 0.8427238161078465
Epoch 105/150
 — val_f1: 0.3776007516000862 — val_precision: 0.24408512361142742, — val_recall: 0.8335637746284134
Epoch 106/150
 — val_f1: 0.38851493479175425 — val_precision: 0.2567560053380783, — val_recall: 

 — val_f1: 0.3585233480807466 — val_precision: 0.2261337189597126, — val_recall: 0.8648461804355341
Epoch 111/150
 — val_f1: 0.3591265087994103 — val_precision: 0.22823413674372847, — val_recall: 0.8420324922226063
Epoch 112/150
 — val_f1: 0.37601052102270527 — val_precision: 0.24221358449195196, — val_recall: 0.8400449360525406
Epoch 113/150
 — val_f1: 0.3865726238354115 — val_precision: 0.2522883747922657, — val_recall: 0.826477704804701
Epoch 114/150
 — val_f1: 0.3859415341051053 — val_precision: 0.2542242703533026, — val_recall: 0.8008987210508123
Epoch 115/150
 — val_f1: 0.4052861442375145 — val_precision: 0.274533247493792, — val_recall: 0.7738506740407881
Epoch 116/150
 — val_f1: 0.3476321936716925 — val_precision: 0.21857753249065337, — val_recall: 0.8487729001036985
Epoch 117/150
 — val_f1: 0.40201729106628237 — val_precision: 0.2718304713189624, — val_recall: 0.7715174559281023
Epoch 118/150
 — val_f1: 0.39912590198265463 — val_precision: 0.27092926233814396, — val_recall: 0.

 — val_f1: 0.39655966825372446 — val_precision: 0.2657550360241141, — val_recall: 0.7809367438645005
Epoch 122/150
 — val_f1: 0.39807558342041105 — val_precision: 0.26606332208124783, — val_recall: 0.7900967853439337
Epoch 123/150
 — val_f1: 0.3852130325814536 — val_precision: 0.2539936102236422, — val_recall: 0.7969236087106809
Epoch 124/150
 — val_f1: 0.40048645867440014 — val_precision: 0.27454627076252164, — val_recall: 0.7398893881783616
Epoch 125/150
 — val_f1: 0.3974171958125965 — val_precision: 0.2643248487615569, — val_recall: 0.8004666436225372
Epoch 126/150
 — val_f1: 0.3759645250141626 — val_precision: 0.24288851308715514, — val_recall: 0.8315762184583477
Epoch 127/150
 — val_f1: 0.38434712084347117 — val_precision: 0.2510861502596164, — val_recall: 0.8190459730383685
Epoch 128/150
 — val_f1: 0.37141352987628323 — val_precision: 0.2373480078819628, — val_recall: 0.8535257518147252
Epoch 129/150
 — val_f1: 0.40612349355544186 — val_precision: 0.27787328876154094, — val_recal

 — val_f1: 0.3556536322711625 — val_precision: 0.2234819071284361, — val_recall: 0.870463187003111
Epoch 131/150
 — val_f1: 0.3587411707615162 — val_precision: 0.22647332471010054, — val_recall: 0.8624265468371932
Epoch 132/150
 — val_f1: 0.37641563131759215 — val_precision: 0.24351778957259315, — val_recall: 0.8286380919460767
Epoch 133/150
 — val_f1: 0.3924126455906822 — val_precision: 0.26397636032594096, — val_recall: 0.7642585551330798
Epoch 134/150
 — val_f1: 0.3694167906320327 — val_precision: 0.2370172776662914, — val_recall: 0.8369339785689596
Epoch 135/150
 — val_f1: 0.3843511296124156 — val_precision: 0.25083082766260484, — val_recall: 0.8218112685793294
Epoch 136/150
 — val_f1: 0.36536184697350893 — val_precision: 0.23244044816872234, — val_recall: 0.8533529208434152
Epoch 137/150
 — val_f1: 0.37668057037540015 — val_precision: 0.24286393315487953, — val_recall: 0.8389215347390252
Epoch 138/150
 — val_f1: 0.39020147847514136 — val_precision: 0.2565834263772773, — val_recall

 — val_f1: 0.3738092505451624 — val_precision: 0.24003832358866015, — val_recall: 0.8443657103352921
Epoch 141/150
 — val_f1: 0.38556027044037927 — val_precision: 0.25198375839282394, — val_recall: 0.820515036294504
Epoch 142/150
 — val_f1: 0.3910533311181259 — val_precision: 0.2573810825587753, — val_recall: 0.8136017974421016
Epoch 143/150
 — val_f1: 0.3860702128945195 — val_precision: 0.25170050161514823, — val_recall: 0.8282060145178016
Epoch 144/150
 — val_f1: 0.39966804272885903 — val_precision: 0.2651177234487042, — val_recall: 0.8115278257863809
Epoch 145/150
 — val_f1: 0.3887603170377046 — val_precision: 0.25475775063749834, — val_recall: 0.8201693743518839
Epoch 146/150
 — val_f1: 0.3915877761169241 — val_precision: 0.25770576019237074, — val_recall: 0.8149844452125821
Epoch 147/150
 — val_f1: 0.3909660359364501 — val_precision: 0.2609286850860697, — val_recall: 0.77938126512271
Epoch 148/150
 — val_f1: 0.40054965428490413 — val_precision: 0.26789204959883295, — val_recall: 0

## Results

In [42]:
test_results = model.evaluate(testX, testy, verbose=0)
for x, y in zip(model.metrics_names, test_results):
  print(x, ': ', y)
print()

loss :  0.6198608875274658
f1 :  0.3801007568836212
precision :  0.7631115317344666
recall :  0.8763002157211304
auc :  0.803955078125
prc :  0.7452502250671387



## Feature importance

In [None]:
trainX_raw, validateX_raw, testX_raw, trainy_raw, validatey_raw, testy_raw = split_dataset(df,sample=1)

In [None]:
my_model = KerasRegressor(build_fn=prepare_model)    
my_model.fit(trainX,trainy)

perm = PermutationImportance(my_model, random_state=1).fit(trainX,trainy)
eli5.show_weights(perm, feature_names = trainX_raw.columns.tolist())

In [62]:
eli5.show_weights(perm, feature_names = trainX_raw.columns.tolist())

Weight,Feature
0.1126  ± 0.0012,combined_followers
0.0313  ± 0.0003,loudness
0.0258  ± 0.0007,danceability
0.0149  ± 0.0004,valence
0.0140  ± 0.0004,acousticness
0.0093  ± 0.0006,duration_ms
0.0082  ± 0.0004,energy
0.0063  ± 0.0003,release_day
0.0044  ± 0.0003,explicit
0.0025  ± 0.0002,speechiness


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=efdb1931-d19c-4850-b12b-726a7087f8c0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>