In [121]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn import linear_model, svm, neighbors, preprocessing, metrics, model_selection, ensemble, multiclass, neural_network
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import random
from collections import Counter
from tqdm.notebook import tqdm
from imblearn.ensemble import BalancedRandomForestClassifier
import imblearn
import xgboost as xgb

In [2]:
RANDOM_STATE = 789
genres_df = pd.read_csv("src/data/genres_v2.csv")
playlist_df = pd.read_csv("src/data/playlists.csv")

# data cleaning
take_notna = lambda s1,s2: s1 if type(s1) == str else s2
genres_df["song_title"]=genres_df["song_name"].combine(genres_df["title"],take_notna, fill_value=None)
genres_df.drop(columns = ["song_name","title","Unnamed: 0"], inplace=True)
genres_df.dropna(inplace=True)
# changing dtype
genres_df["key"] = genres_df["key"].astype(int)
genres_df["mode"] = genres_df["mode"].astype(int)
genres_df["duration_ms"] = genres_df["duration_ms"].astype(int)
genres_df["time_signature"] = genres_df["time_signature"].astype(int)
# selecting columns to use
valid_columns = ["danceability","energy","key","loudness","mode","speechiness","acousticness","instrumentalness",
                "liveness","valence","tempo","duration_ms","time_signature","genre"]
genres_df = genres_df[valid_columns].reset_index(drop=True)

genres_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre
0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,156.985,124539,4,Dark Trap
1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,115.08,224427,4,Dark Trap
2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,218.05,98821,4,Dark Trap
3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,186.948,123661,3,Dark Trap
4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,147.988,123298,4,Dark Trap


In [3]:
train_data, test_data = model_selection.train_test_split(genres_df, random_state = RANDOM_STATE)
train_data.reset_index(drop = True, inplace = True)
test_data.reset_index(drop = True, inplace = True)

# train_data_sub, val_data = model_selection.train_test_split(train_data, random_state = RANDOM_STATE)
# train_data_sub.reset_index(drop = True, inplace = True)
# val_data.reset_index(drop = True, inplace = True)

In [31]:
# encoding target variable
le = preprocessing.LabelEncoder()
train_label = le.fit_transform(train_data["genre"])
test_label = le.transform(test_data["genre"])
train_data["labels"] = train_label
test_data["labels"] = test_label

# create val dataset
train_data_sub, val_data = model_selection.train_test_split(train_data, test_size = 0.1, random_state = RANDOM_STATE)
train_data_sub.reset_index(drop = True, inplace = True)
val_data.reset_index(drop = True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [33]:
# preparing datasets to train on xgboost
data_cols = []
for col in train_data.columns.tolist():
    if col == "genre" or col =="labels":
        pass
    else:
        data_cols.append(col)
dtrain = xgb.DMatrix(data = train_data[data_cols], label = train_data["labels"])
dtest = xgb.DMatrix(data = test_data[data_cols], label = test_data["labels"])
dtrain_sub = xgb.DMatrix(data = train_data_sub[data_cols], label = train_data_sub["labels"])
dval = xgb.DMatrix(data = val_data[data_cols], label = val_data["labels"])
# saving to binary to decrease the loading speed
dtrain.save_binary("src/data/xbg_train.buffer")
dtest.save_binary("src/data/xgb_test.buffer")
dtrain_sub.save_binary("src/data/xgb_train_sub.buffer")
dval.save_binary("src/data/xgb_val.buffer")


In [27]:
# training xgboost
num_classes = len(train_data["labels"].unique())
param = {"max_depth":6,
         "eta":0.1,
         "nthread":-1,
         "gpu_id":0,
         "tree_method":"gpu_hist",
         "objective":"multi:softmax",
         "num_class":num_classes,
         "eval_metric":"auc"}
n_rounds = 100
bst = xgb.train(param, dtrain, n_rounds)

In [28]:
# classification report on xgboost classifier
print(metrics.classification_report(train_label, bst.predict(dtrain), target_names = le.classes_))
print(metrics.classification_report(test_label, bst.predict(dtest), target_names = le.classes_))

                 precision    recall  f1-score   support

      Dark Trap       0.73      0.59      0.65      3445
            Emo       0.84      0.83      0.83      1265
         Hiphop       0.64      0.58      0.61      2275
            Pop       0.81      0.32      0.46       344
            Rap       0.89      0.36      0.52      1401
            RnB       0.61      0.61      0.61      1558
     Trap Metal       0.69      0.46      0.55      1488
Underground Rap       0.52      0.76      0.62      4380
            dnb       0.99      1.00      0.99      2250
      hardstyle       0.91      0.97      0.93      2205
      psytrance       0.96      0.95      0.95      2172
      techhouse       0.93      0.96      0.94      2278
         techno       0.92      0.92      0.92      2204
         trance       0.88      0.93      0.90      2216
           trap       0.91      0.91      0.91      2243

       accuracy                           0.78     31724
      macro avg       0.81   

In [29]:
# imbalanced classification report on xgboost
print(imblearn.metrics.classification_report_imbalanced(train_label, bst.predict(dtrain), target_names = le.classes_))
print(imblearn.metrics.classification_report_imbalanced(test_label, bst.predict(dtest), target_names = le.classes_))

                       pre       rec       spe        f1       geo       iba       sup

      Dark Trap       0.73      0.59      0.97      0.65      0.76      0.55      3445
            Emo       0.84      0.83      0.99      0.83      0.91      0.82      1265
         Hiphop       0.64      0.58      0.97      0.61      0.75      0.54      2275
            Pop       0.81      0.32      1.00      0.46      0.56      0.30       344
            Rap       0.89      0.36      1.00      0.52      0.60      0.34      1401
            RnB       0.61      0.61      0.98      0.61      0.78      0.58      1558
     Trap Metal       0.69      0.46      0.99      0.55      0.68      0.43      1488
Underground Rap       0.52      0.76      0.89      0.62      0.82      0.67      4380
            dnb       0.99      1.00      1.00      0.99      1.00      1.00      2250
      hardstyle       0.91      0.97      0.99      0.93      0.98      0.96      2205
      psytrance       0.96      0.95      

In [67]:
# listing out the metrics system available from sklearn
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [84]:
# optimize xgboost
train_data_sub, val_data = model_selection.train_test_split(train_data, test_size = 0.1, random_state = RANDOM_STATE)
num_classes  = len(train_data["genre"].unique())
eval_list = [(dtrain,"train"),(dval,"validate"),(dtest,"test")]
param = {"nthread":-1,
         "gpu_id":0,
         "tree_method":"gpu_hist",
         "objective":"multi:softmax",
         "num_class":num_classes,
         "eval_metric":"multi:softmax",
         "use_label_encoder":False,
        "seed":RANDOM_STATE}
xgb_pipe = xgb.XGBClassifier(param)

params = {"eta":[0.03, 0.01, 0.005, 0.001],
         "min_child_weight":[1,3,5,7,10],
         "gamma":[0, 0.5, 1, 1.5, 2, 2.5],
         "subsample":[0.6,0.8, 1.0],
         "colsample_bytree":[0.6,0.8, 1.0],
         "max_depth":[3,4,5,6,7,8,9,10],
         "reg_lambda":[0.4, 0.6, 0.8, 1, 1.2, 1.4]}
fit_params = {"early_stopping_rounds":10,
             "eval_set":[(val_data[data_cols],val_data["labels"])],
             "eval_metric":"multi:softmax"}

rs_clf = model_selection.RandomizedSearchCV(xgb_pipe, params, scoring="roc_auc_ovr",
                                  n_jobs=1,n_iter=20, cv=3,
                                   random_state = RANDOM_STATE)

rs_clf.fit(train_data[data_cols],train_data["labels"])





















































































































































































































































RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           subsample=None, tree_method=None,
                                           validate_parameters=None,
                                   

In [89]:
# imbalanced classification report on optimized xgboost using roc auc as a metrics system
print(imblearn.metrics.classification_report_imbalanced(train_data["labels"],
                                                  rs_clf.predict(train_data[data_cols]),
                                                  target_names=le.classes_))

print(imblearn.metrics.classification_report_imbalanced(test_data["labels"],
                                                  rs_clf.predict(test_data[data_cols]),
                                                  target_names=le.classes_))

pd.DataFrame(rs_clf.cv_results_).sort_values("rank_test_score")

                       pre       rec       spe        f1       geo       iba       sup

      Dark Trap       0.68      0.54      0.97      0.61      0.73      0.50      3114
            Emo       0.82      0.77      0.99      0.79      0.87      0.75      1140
         Hiphop       0.63      0.56      0.97      0.59      0.74      0.52      2064
            Pop       0.81      0.13      1.00      0.22      0.36      0.12       300
            Rap       0.95      0.30      1.00      0.45      0.55      0.28      1276
            RnB       0.57      0.54      0.98      0.55      0.73      0.51      1398
     Trap Metal       0.68      0.39      0.99      0.49      0.62      0.36      1336
Underground Rap       0.49      0.77      0.87      0.60      0.82      0.66      3948
            dnb       0.97      0.99      1.00      0.98      1.00      0.99      2026
      hardstyle       0.88      0.95      0.99      0.91      0.97      0.94      1977
      psytrance       0.95      0.94      

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_reg_lambda,param_min_child_weight,param_max_depth,param_gamma,param_eta,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,19.005445,0.294881,0.137581,0.010002,1.0,1.2,3,7,1.5,0.03,0.8,"{'subsample': 1.0, 'reg_lambda': 1.2, 'min_chi...",0.950181,0.95086,0.950343,0.950461,0.00029,1
12,23.500025,0.12935,0.146319,0.004166,1.0,0.6,7,10,0.5,0.01,0.6,"{'subsample': 1.0, 'reg_lambda': 0.6, 'min_chi...",0.947933,0.948925,0.948339,0.948399,0.000407,2
9,19.03835,0.098154,0.143256,0.014478,0.6,0.6,1,7,0.5,0.01,0.8,"{'subsample': 0.6, 'reg_lambda': 0.6, 'min_chi...",0.947627,0.948614,0.948234,0.948158,0.000406,3
0,15.112634,0.76811,0.120299,0.014379,0.6,1.4,7,6,0.0,0.01,0.6,"{'subsample': 0.6, 'reg_lambda': 1.4, 'min_chi...",0.947403,0.948768,0.948019,0.948064,0.000558,4
17,17.987884,0.055443,0.117686,0.002098,0.8,1.0,1,6,1.0,0.01,0.6,"{'subsample': 0.8, 'reg_lambda': 1, 'min_child...",0.948115,0.948486,0.947454,0.948018,0.000427,5
13,25.087349,0.098218,0.139582,0.01339,1.0,0.4,1,9,2.0,0.01,0.6,"{'subsample': 1.0, 'reg_lambda': 0.4, 'min_chi...",0.947056,0.947549,0.946818,0.947141,0.000305,6
4,13.301558,0.157031,0.104106,0.003792,1.0,0.6,10,5,1.0,0.01,0.6,"{'subsample': 1.0, 'reg_lambda': 0.6, 'min_chi...",0.945127,0.946633,0.946009,0.945923,0.000618,7
15,14.022394,0.057102,0.120369,0.016562,0.6,1.0,5,5,1.0,0.005,0.6,"{'subsample': 0.6, 'reg_lambda': 1, 'min_child...",0.944343,0.945689,0.944817,0.944949,0.000557,8
10,19.809258,0.091378,0.120953,0.004964,0.6,0.4,7,8,1.5,0.01,1.0,"{'subsample': 0.6, 'reg_lambda': 0.4, 'min_chi...",0.943793,0.944025,0.944904,0.944241,0.000479,9
7,20.385061,0.076978,0.120643,0.002177,0.8,1.0,3,7,1.5,0.001,0.8,"{'subsample': 0.8, 'reg_lambda': 1, 'min_child...",0.94339,0.945132,0.943979,0.944167,0.000723,10


In [100]:
# making xgboost classifier using the best parameter
rs_roc_auc_ovr = pd.DataFrame(rs_clf.cv_results_)
best_params = rs_clf.best_params_
best_params.update(param)

bst = xgb.XGBClassifier(best_params)
bst.fit(train_data[data_cols], train_data["labels"])





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [103]:
# classification report on xgboost with best param
print(imblearn.metrics.classification_report_imbalanced(train_data["labels"],bst.predict(train_data[data_cols]),
                                                  target_names = le.classes_))
print(imblearn.metrics.classification_report_imbalanced(test_data["labels"],bst.predict(test_data[data_cols]),
                                                  target_names = le.classes_))
best_params.update(param);

                       pre       rec       spe        f1       geo       iba       sup

      Dark Trap       0.88      0.77      0.99      0.82      0.87      0.75      3114
            Emo       0.95      0.97      1.00      0.96      0.98      0.96      1140
         Hiphop       0.83      0.77      0.99      0.79      0.87      0.74      2064
            Pop       0.94      0.87      1.00      0.90      0.93      0.86       300
            Rap       0.87      0.66      1.00      0.75      0.81      0.63      1276
            RnB       0.81      0.81      0.99      0.81      0.89      0.79      1398
     Trap Metal       0.82      0.70      0.99      0.75      0.83      0.67      1336
Underground Rap       0.68      0.85      0.94      0.76      0.89      0.79      3948
            dnb       1.00      1.00      1.00      1.00      1.00      1.00      2026
      hardstyle       0.99      1.00      1.00      1.00      1.00      1.00      1977
      psytrance       1.00      1.00      

In [120]:
# getting the feature importance extracted from best xgboost model
pd.DataFrame([{"feat_importance":importance} for importance, dcol in zip(rs_clf.best_estimator_.feature_importances_, data_cols)], index=data_cols).sort_values("feat_importance",ascending=False)

Unnamed: 0,feat_importance
tempo,0.23562
instrumentalness,0.153189
duration_ms,0.147276
danceability,0.098365
loudness,0.077594
energy,0.058657
speechiness,0.051928
valence,0.049582
acousticness,0.043925
mode,0.029884


In [112]:
knn_clf = neighbors.KNeighborsClassifier(n_neighbors = 10)
knn_clf.fit(train_data[data_cols], train_data["labels"])

KNeighborsClassifier(n_neighbors=10)

In [113]:
print(imblearn.metrics.classification_report_imbalanced(train_data["labels"],knn_clf.predict(train_data[data_cols]),
                                                  target_names = le.classes_))
print(imblearn.metrics.classification_report_imbalanced(test_data["labels"],knn_clf.predict(test_data[data_cols]),
                                                  target_names = le.classes_))


                       pre       rec       spe        f1       geo       iba       sup

      Dark Trap       0.32      0.42      0.89      0.36      0.61      0.36      3114
            Emo       0.28      0.23      0.98      0.25      0.47      0.21      1140
         Hiphop       0.32      0.29      0.95      0.30      0.52      0.26      2064
            Pop       0.30      0.04      1.00      0.07      0.20      0.04       300
            Rap       0.32      0.14      0.99      0.20      0.38      0.13      1276
            RnB       0.31      0.16      0.98      0.21      0.39      0.14      1398
     Trap Metal       0.39      0.25      0.98      0.30      0.49      0.22      1336
Underground Rap       0.41      0.48      0.89      0.45      0.66      0.41      3948
            dnb       0.46      0.63      0.94      0.53      0.77      0.58      2026
      hardstyle       0.37      0.38      0.95      0.38      0.60      0.34      1977
      psytrance       0.55      0.76      

In [137]:
# training multilayer classifier
# scaling since neural networks need variabels to be scaled
ss = preprocessing.StandardScaler()
ss_train_data = ss.fit_transform(train_data[data_cols])
ss_test_data = ss.transform(test_data[data_cols])
mlp = neural_network.MLPClassifier(hidden_layer_sizes = (200,50), learning_rate = "adaptive",
                                  max_iter=3000,early_stopping = True)
mlp.fit(ss_train_data, train_data["labels"])
print(imblearn.metrics.classification_report_imbalanced(train_data["labels"],mlp.predict(ss_train_data),
                                                  target_names = le.classes_))
print(imblearn.metrics.classification_report_imbalanced(test_data["labels"],mlp.predict(ss_test_data),
                                                  target_names = le.classes_))


                       pre       rec       spe        f1       geo       iba       sup

      Dark Trap       0.55      0.54      0.95      0.55      0.71      0.49      3114
            Emo       0.71      0.69      0.99      0.70      0.82      0.66      1140
         Hiphop       0.52      0.42      0.97      0.47      0.64      0.38      2064
            Pop       0.32      0.13      1.00      0.18      0.36      0.12       300
            Rap       0.64      0.35      0.99      0.45      0.59      0.32      1276
            RnB       0.43      0.44      0.97      0.44      0.65      0.41      1398
     Trap Metal       0.58      0.27      0.99      0.37      0.52      0.25      1336
Underground Rap       0.48      0.63      0.89      0.54      0.75      0.55      3948
            dnb       0.95      0.98      1.00      0.97      0.99      0.98      2026
      hardstyle       0.84      0.91      0.99      0.87      0.95      0.89      1977
      psytrance       0.92      0.91      

In [136]:
mlp.loss_ # the loss funciton defaults to adam

0.8245094316157416

In [196]:
# classification report for mlp for test set
print(metrics.classification_report(test_data["labels"],mlp.predict(ss_test_data),
                                                  target_names = le.classes_))


                 precision    recall  f1-score   support

      Dark Trap       0.49      0.49      0.49      1133
            Emo       0.63      0.61      0.62       415
         Hiphop       0.45      0.38      0.41       747
            Pop       0.31      0.12      0.17       117
            Rap       0.58      0.32      0.42       447
            RnB       0.40      0.37      0.38       541
     Trap Metal       0.56      0.26      0.36       468
Underground Rap       0.47      0.61      0.53      1495
            dnb       0.94      0.96      0.95       716
      hardstyle       0.81      0.89      0.85       731
      psytrance       0.92      0.91      0.91       789
      techhouse       0.82      0.87      0.84       697
         techno       0.86      0.84      0.85       752
         trance       0.75      0.86      0.80       783
           trap       0.77      0.81      0.79       744

       accuracy                           0.67     10575
      macro avg       0.65   