# Music Genre Prediction

In [1]:
class Config:
    """Notebook-wide settings: experiment ids, data directories, CV setup and column names."""

    # --- Column names ---
    target = 'genre'    # label column
    row_id = 'index'    # row identifier column

    # --- Cross-validation ---
    n_folds = 5
    random_seed = 42

    # --- Data directories (relative paths, each ending with a slash) ---
    submission_dir = '../data/submission/'
    interim_dir = '../data/interim/'
    processed_data_dir = '../data/processed/'
    raw_data_dir = '../data/raw/'

    # --- Experiment identifiers ---
    # Notebook numbers whose saved predictions are added as stacking features,
    # e.g. ['212', '213', '214']; False switches stacking off.
    stacking_NB = False
    dataset_NB = '103'  # selects the processed train/test pickles to load
    NB = '205'          # used in the names of the files this notebook writes

## Import libraries

In [2]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Shared Plotly layout (dark theme, fixed font and canvas size); figures reuse it
# through plotly_template['layout'] or by passing the dict as a template.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}

# Hex colour sets for the plots: 'Bin' has two colours, 'Cat5' has five, and
# 'Cat10' is the 'Cat5' sequence repeated twice.
color_palette = {
    'Bin': ['#016CC9', '#E876A3'],
    'Cat5': ['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
}
color_palette['Cat10'] = color_palette['Cat5'] * 2

In [4]:
import random
import joblib
import itertools
from itertools import combinations
from imblearn import FunctionSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

from sklearn.metrics import roc_auc_score, roc_curve, auc, f1_score, confusion_matrix
import scipy.stats as stats
from lightgbm import LGBMClassifier, early_stopping

## Load and check data

In [5]:
# Processed feature tables are zip-compressed pickles named after Config.dataset_NB.
train_pickle_path = f'{Config.processed_data_dir}nb{Config.dataset_NB}_train.pkl'
test_pickle_path = f'{Config.processed_data_dir}nb{Config.dataset_NB}_test.pkl'
df_train = pd.read_pickle(train_pickle_path, compression='zip')
df_test = pd.read_pickle(test_pickle_path, compression='zip')

# The sample submission is read without treating its first row as a header.
submission = pd.read_csv(f'{Config.raw_data_dir}sample_submit.csv', header=None)

# Show (rows, columns) of the training table.
df_train.shape

(4046, 285)

In [6]:
df_train.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_int,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,duration_long,popularity_add_duration_ms,popularity_sub_duration_ms,popularity_mul_duration_ms,popularity_bigger_duration_ms,popularity_add_acousticness,popularity_sub_acousticness,popularity_mul_acousticness,popularity_bigger_acousticness,popularity_add_positiveness,popularity_sub_positiveness,popularity_mul_positiveness,popularity_bigger_positiveness,popularity_add_danceability,popularity_sub_danceability,popularity_mul_danceability,popularity_bigger_danceability,popularity_add_loudness,popularity_sub_loudness,popularity_mul_loudness,popularity_bigger_loudness,popularity_add_energy,popularity_sub_energy,popularity_mul_energy,popularity_bigger_energy,popularity_add_liveness,popularity_sub_liveness,popularity_mul_liveness,popularity_bigger_liveness,popularity_add_speechiness,popularity_sub_speechiness,popularity_mul_speechiness,popularity_bigger_speechiness,popularity_add_instrumentalness,popularity_sub_instrumentalness,popularity_mul_instrumentalness,popularity_bigger_instrumentalness,popularity_add_tempo_int,popularity_sub_tempo_int,popularity_mul_tempo_int,popularity_bigger_tempo_int,duration_ms_add_acousticness,duration_ms_sub_acousticness,duration_ms_mul_acousticness,duration_ms_bigger_acousticness,duration_ms_add_positiveness,duration_ms_sub_positiveness,duration_ms_mul_positiveness,duration_ms_bigger_positiveness,duration_ms_add_danceability,duration_ms_sub_danceability,duration_ms_mul_danceability,duration_ms_bigger_danceability,duration_ms_add_loudness,duration_ms_sub_loudness,duration_ms_mul_loudness,duration_ms_bigger_loudness,duration_ms_add_energy,duration_ms_sub_energy,duration_ms_mul_energy,duration_ms_bigger_energy,dura
tion_ms_add_liveness,duration_ms_sub_liveness,duration_ms_mul_liveness,duration_ms_bigger_liveness,duration_ms_add_speechiness,duration_ms_sub_speechiness,duration_ms_mul_speechiness,duration_ms_bigger_speechiness,duration_ms_add_instrumentalness,duration_ms_sub_instrumentalness,duration_ms_mul_instrumentalness,duration_ms_bigger_instrumentalness,duration_ms_add_tempo_int,duration_ms_sub_tempo_int,duration_ms_mul_tempo_int,duration_ms_bigger_tempo_int,acousticness_add_positiveness,acousticness_sub_positiveness,acousticness_mul_positiveness,acousticness_bigger_positiveness,acousticness_add_danceability,acousticness_sub_danceability,acousticness_mul_danceability,acousticness_bigger_danceability,acousticness_add_loudness,acousticness_sub_loudness,acousticness_mul_loudness,acousticness_bigger_loudness,acousticness_add_energy,acousticness_sub_energy,acousticness_mul_energy,acousticness_bigger_energy,acousticness_add_liveness,acousticness_sub_liveness,acousticness_mul_liveness,acousticness_bigger_liveness,acousticness_add_speechiness,acousticness_sub_speechiness,acousticness_mul_speechiness,acousticness_bigger_speechiness,acousticness_add_instrumentalness,acousticness_sub_instrumentalness,acousticness_mul_instrumentalness,acousticness_bigger_instrumentalness,acousticness_add_tempo_int,acousticness_sub_tempo_int,acousticness_mul_tempo_int,acousticness_bigger_tempo_int,positiveness_add_danceability,positiveness_sub_danceability,positiveness_mul_danceability,positiveness_bigger_danceability,positiveness_add_loudness,positiveness_sub_loudness,positiveness_mul_loudness,positiveness_bigger_loudness,positiveness_add_energy,positiveness_sub_energy,positiveness_mul_energy,positiveness_bigger_energy,positiveness_add_liveness,positiveness_sub_liveness,positiveness_mul_liveness,positiveness_bigger_liveness,positiveness_add_speechiness,positiveness_sub_speechiness,positiveness_mul_speechiness,positiveness_bigger_speechiness,positiveness_add_instrumentalness,positiveness_sub_instrument
alness,positiveness_mul_instrumentalness,positiveness_bigger_instrumentalness,positiveness_add_tempo_int,positiveness_sub_tempo_int,positiveness_mul_tempo_int,positiveness_bigger_tempo_int,danceability_add_loudness,danceability_sub_loudness,danceability_mul_loudness,danceability_bigger_loudness,danceability_add_energy,danceability_sub_energy,danceability_mul_energy,danceability_bigger_energy,danceability_add_liveness,danceability_sub_liveness,danceability_mul_liveness,danceability_bigger_liveness,danceability_add_speechiness,danceability_sub_speechiness,danceability_mul_speechiness,danceability_bigger_speechiness,danceability_add_instrumentalness,danceability_sub_instrumentalness,danceability_mul_instrumentalness,danceability_bigger_instrumentalness,danceability_add_tempo_int,danceability_sub_tempo_int,danceability_mul_tempo_int,danceability_bigger_tempo_int,loudness_add_energy,loudness_sub_energy,loudness_mul_energy,loudness_bigger_energy,loudness_add_liveness,loudness_sub_liveness,loudness_mul_liveness,loudness_bigger_liveness,loudness_add_speechiness,loudness_sub_speechiness,loudness_mul_speechiness,loudness_bigger_speechiness,loudness_add_instrumentalness,loudness_sub_instrumentalness,loudness_mul_instrumentalness,loudness_bigger_instrumentalness,loudness_add_tempo_int,loudness_sub_tempo_int,loudness_mul_tempo_int,loudness_bigger_tempo_int,energy_add_liveness,energy_sub_liveness,energy_mul_liveness,energy_bigger_liveness,energy_add_speechiness,energy_sub_speechiness,energy_mul_speechiness,energy_bigger_speechiness,energy_add_instrumentalness,energy_sub_instrumentalness,energy_mul_instrumentalness,energy_bigger_instrumentalness,energy_add_tempo_int,energy_sub_tempo_int,energy_mul_tempo_int,energy_bigger_tempo_int,liveness_add_speechiness,liveness_sub_speechiness,liveness_mul_speechiness,liveness_bigger_speechiness,liveness_add_instrumentalness,liveness_sub_instrumentalness,liveness_mul_instrumentalness,liveness_bigger_instrumentalness,liveness_add_tempo_int,liven
ess_sub_tempo_int,liveness_mul_tempo_int,liveness_bigger_tempo_int,speechiness_add_instrumentalness,speechiness_sub_instrumentalness,speechiness_mul_instrumentalness,speechiness_bigger_instrumentalness,speechiness_add_tempo_int,speechiness_sub_tempo_int,speechiness_mul_tempo_int,speechiness_bigger_tempo_int,instrumentalness_add_tempo_int,instrumentalness_sub_tempo_int,instrumentalness_mul_tempo_int,instrumentalness_bigger_tempo_int,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,PCA16,PCA17,PCA18,PCA19,PCA20,PCA21,PCA22,PCA23,PCA24,PCA25,PCA26,PCA27,PCA28,PCA29,PCA30
0,0,10.0,0.118421,0.336377,0.06848,0.107533,0.068271,0.937857,0.956448,0.355402,0.749919,1.0,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.039834,0.217956,0.039834,0,0.00811,0.049941,0.00811,1,0.012734,0.010888,0.012734,1,0.008085,0.05015,0.008085,1,0.111062,0.819436,0.111062,0,0.113264,0.838027,0.113264,0,0.042087,0.236981,0.042087,0,0.088806,0.631498,0.088806,0,0.118421,0.881579,0.118421,0,18.0,151.881579,18.0,0,0.023035,0.267896,0.023035,1,0.036172,0.228843,0.036172,1,0.022965,0.268106,0.022965,1,0.315473,0.60148,0.315473,0,0.321727,0.620072,0.321727,0,0.119549,0.019026,0.119549,0,0.252255,0.413542,0.252255,0,0.336377,0.663623,0.336377,0,51.129246,151.663623,51.129246,0,0.007364,0.039053,0.007364,0,0.004675,0.00021,0.004675,1,0.064225,0.869377,0.064225,0,0.065498,0.887968,0.065498,0,0.024338,0.286922,0.024338,0,0.051355,0.681438,0.051355,0,0.06848,0.93152,0.06848,0,10.409008,151.93152,10.409008,0,0.007341,0.039262,0.007341,1,0.100851,0.830324,0.100851,0,0.10285,0.848915,0.10285,0,0.038218,0.247869,0.038218,0,0.080641,0.642385,0.080641,0,0.107533,0.892467,0.107533,0,16.345063,151.892467,16.345063,0,0.064028,0.869586,0.064028,0,0.065297,0.888177,0.065297,0,0.024264,0.287131,0.024264,0,0.051198,0.681648,0.051198,0,0.068271,0.931729,0.068271,0,10.377163,151.931729,10.377163,0,0.897011,0.018591,0.897011,0,0.333316,0.582455,0.333316,1,0.703316,0.187938,0.703316,1,0.937857,0.062143,0.937857,0,142.554238,151.062143,142.554238,0,0.339924,0.601046,0.339924,1,0.717258,0.206529,0.717258,1,0.956448,0.043552,0.956448,0,145.380118,151.043552,145.380118,0,0.266523,0.394516,0.266523,0,0.355402,0.644598,0.355402,0,54.021142,151.644598,54.021142,0,0.749919,0.250081,0.749919,0,113.987648,151.250081,113.987648,0,152.0,151.0,152.0,0,79.075949,-85.613693,204.597951,81.714939,13.522066,123.905658,-16.988429,38.806989,-25.523688,-27.680075,23.408785,-0.043396,-1.136522,0.29251,-0.233483,0.536992,0.133966,-0.246089,-0.092782,-0.175006,0.107648
,-0.092135,0.47041,0.549447,0.55457,0.20403,-0.416361,0.501871,0.659917,0.09703
1,1,8.0,0.881579,0.584169,0.054931,0.330301,0.591662,0.752644,0.932672,0.145787,0.21807,0.056041,176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.514991,0.29741,0.514991,1,0.048426,0.826648,0.048426,1,0.291186,0.551278,0.291186,1,0.521596,0.289917,0.521596,1,0.663515,0.128935,0.663515,1,0.822224,0.051093,0.822224,0,0.128522,0.735792,0.128522,1,0.192246,0.663509,0.192246,1,0.049405,0.825538,0.049405,1,155.157895,175.118421,155.157895,0,0.032089,0.529238,0.032089,1,0.192952,0.253868,0.192952,1,0.34563,0.007492,0.34563,0,0.439671,0.168475,0.439671,0,0.544838,0.348503,0.544838,0,0.085164,0.438382,0.085164,1,0.12739,0.366099,0.12739,1,0.032737,0.528128,0.032737,1,102.813757,175.415831,102.813757,0,0.018144,0.27537,0.018144,0,0.032501,0.536731,0.032501,0,0.041343,0.697713,0.041343,0,0.051233,0.877741,0.051233,0,0.008008,0.090856,0.008008,0,0.011979,0.163139,0.011979,0,0.003078,0.00111,0.003078,0,9.667848,175.945069,9.667848,0,0.195426,0.261361,0.195426,0,0.248599,0.422343,0.248599,0,0.308062,0.602371,0.308062,0,0.048153,0.184514,0.048153,1,0.072029,0.11223,0.072029,1,0.01851,0.27426,0.01851,1,58.13294,175.669699,58.13294,0,0.44531,0.160982,0.44531,0,0.551826,0.34101,0.551826,0,0.086256,0.445875,0.086256,1,0.129024,0.373591,0.129024,1,0.033157,0.535621,0.033157,1,104.132436,175.408338,104.132436,0,0.701969,0.180028,0.701969,0,0.109725,0.606857,0.109725,1,0.164129,0.534573,0.164129,1,0.042179,0.696603,0.042179,1,132.4653,175.247356,132.4653,0,0.135971,0.786885,0.135971,1,0.203388,0.714601,0.203388,1,0.052268,0.876631,0.052268,1,164.150202,175.067328,164.150202,0,0.031792,0.072284,0.031792,0,0.00817,0.089746,0.00817,1,25.658459,175.854213,25.658459,0,0.012221,0.162029,0.012221,1,38.380386,175.78193,38.380386,0,9.863219,175.943959,9.863219,0,203.071199,-35.550623,12.464042,-120.757182,-12.264657,-10.39991,17.493927,-15.055537,-11.32755,-4.638366,-50.439856,1.0605,0.271718,-0.424551,0.593342,0.6292,0.011083,0.041594,0.150102,0.136982,
0.359968,0.383478,-0.018124,-0.190077,0.313286,0.046817,0.711192,-0.19125,-0.18945,0.167306
2,2,3.0,0.539474,0.32745,0.52132,0.234785,0.45346,0.565026,0.403174,0.174515,0.700945,0.111347,76,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.176651,0.212024,0.176651,1,0.281239,0.018153,0.281239,1,0.126661,0.304688,0.126661,1,0.24463,0.086014,0.24463,1,0.304817,0.025553,0.304817,0,0.217502,0.136299,0.217502,1,0.094146,0.364959,0.094146,1,0.378141,0.161471,0.378141,0,0.060069,0.428126,0.060069,1,41.0,75.460526,41.0,0,0.170706,0.19387,0.170706,0,0.076881,0.092665,0.076881,1,0.148485,0.12601,0.148485,0,0.185018,0.237576,0.185018,0,0.132019,0.075724,0.132019,0,0.057145,0.152935,0.057145,1,0.229524,0.373495,0.229524,0,0.036461,0.216103,0.036461,1,24.886201,75.67255,24.886201,0,0.122398,0.286535,0.122398,1,0.236398,0.067861,0.236398,1,0.29456,0.043706,0.29456,0,0.210183,0.118146,0.210183,1,0.090978,0.346806,0.090978,1,0.365417,0.179624,0.365417,0,0.058048,0.409973,0.058048,1,39.620347,75.47868,39.620347,0,0.106466,0.218674,0.106466,0,0.13266,0.330241,0.13266,0,0.094659,0.168389,0.094659,0,0.040973,0.060271,0.040973,1,0.164572,0.466159,0.164572,0,0.026143,0.123438,0.026143,1,17.843695,75.765215,17.843695,0,0.256217,0.111567,0.256217,0,0.182823,0.050285,0.182823,1,0.079135,0.278945,0.079135,1,0.31785,0.247485,0.31785,0,0.050491,0.342112,0.050491,1,34.462934,75.54654,34.462934,0,0.227804,0.161852,0.227804,1,0.098605,0.390512,0.098605,1,0.396052,0.135918,0.396052,0,0.062914,0.453679,0.062914,1,42.942007,75.434974,42.942007,0,0.07036,0.22866,0.07036,1,0.282603,0.29777,0.282603,0,0.044892,0.291827,0.044892,1,30.641253,75.596826,30.641253,0,0.122325,0.52643,0.122325,0,0.019432,0.063167,0.019432,1,13.263111,75.825485,13.263111,0,0.078048,0.589597,0.078048,1,53.271786,75.299055,53.271786,0,8.462392,75.888653,8.462392,0,-236.416901,-15.5641,4.817937,-7.489255,25.961638,14.563967,-5.688494,39.226703,8.871363,-1.148695,3.203081,-0.634603,-0.554847,-0.258907,-0.628868,-0.14804,-0.416359,0.445444,-0.003575,0.535081,-0.192602,0.215998,-0.4080
97,0.233628,-0.21153,0.213018,0.225363,0.022812,-0.953344,0.166199
3,3,10.0,0.565789,0.567093,0.130875,0.211419,0.309173,0.775795,0.925719,0.372026,0.369707,0.122951,192,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.320855,0.001304,0.320855,0,0.074048,0.434914,0.074048,1,0.119618,0.354371,0.119618,1,0.174927,0.256616,0.174927,1,0.438937,0.210006,0.438937,0,0.523762,0.35993,0.523762,0,0.210488,0.193764,0.210488,1,0.209176,0.196082,0.209176,1,0.069564,0.442838,0.069564,1,108.631579,191.434211,108.631579,0,0.074218,0.436218,0.074218,1,0.119894,0.355675,0.119894,1,0.17533,0.25792,0.17533,1,0.439948,0.208702,0.439948,0,0.524969,0.358626,0.524969,0,0.210973,0.195067,0.210973,1,0.209658,0.197386,0.209658,1,0.069725,0.444142,0.069725,1,108.88193,191.432907,108.88193,0,0.027669,0.080543,0.027669,0,0.040463,0.178298,0.040463,0,0.101532,0.64492,0.101532,0,0.121154,0.794844,0.121154,0,0.048689,0.241151,0.048689,0,0.048385,0.238832,0.048385,0,0.016091,0.007924,0.016091,1,25.128032,191.869125,25.128032,0,0.065365,0.097755,0.065365,0,0.164018,0.564377,0.164018,0,0.195714,0.714301,0.195714,0,0.078653,0.160607,0.078653,0,0.078163,0.158288,0.078163,0,0.025994,0.088468,0.025994,1,40.592366,191.788581,40.592366,0,0.239855,0.466622,0.239855,0,0.286208,0.616546,0.286208,0,0.11502,0.062853,0.11502,0,0.114304,0.060534,0.114304,0,0.038013,0.186222,0.038013,1,59.361257,191.690827,59.361257,0,0.718169,0.149924,0.718169,0,0.288616,0.40377,0.288616,1,0.286817,0.406088,0.286817,1,0.095385,0.652844,0.095385,1,148.952733,191.224205,148.952733,0,0.344392,0.553693,0.344392,1,0.342245,0.556012,0.342245,1,0.113818,0.802768,0.113818,1,177.738123,191.074281,177.738123,0,0.137541,0.002319,0.137541,1,0.045741,0.249075,0.045741,1,71.428986,191.627974,71.428986,0,0.045456,0.246756,0.045456,1,70.983745,191.630293,70.983745,0,23.606601,191.877049,23.606601,0,243.516586,-30.799107,105.447096,-45.473209,41.533841,-28.852839,9.885458,4.79534,-2.188745,-9.335878,-37.188466,0.297327,-0.397318,0.605329,-0.237236,0.254035,-0.337007,-0.53083
4,-0.293533,-0.072743,-0.285195,-0.378022,0.275364,-0.02261,-0.114892,-0.139512,0.132943,-0.052694,0.124329,0.101715
4,4,3.0,0.723684,0.512311,0.16045,0.837474,0.985751,0.834212,0.659366,0.115435,0.359962,0.188746,120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.370751,0.211373,0.370751,1,0.116115,0.563234,0.116115,1,0.606067,0.11379,0.606067,0,0.713373,0.262067,0.713373,0,0.603706,0.110528,0.603706,0,0.477173,0.064318,0.477173,1,0.083538,0.60825,0.083538,1,0.260499,0.363722,0.260499,1,0.136593,0.534938,0.136593,1,86.842105,119.276316,86.842105,0,0.0822,0.351861,0.0822,1,0.429047,0.325163,0.429047,0,0.505011,0.47344,0.505011,0,0.427376,0.321901,0.427376,0,0.3378,0.147055,0.3378,0,0.059138,0.396876,0.059138,1,0.184412,0.152349,0.184412,1,0.096697,0.323564,0.096697,1,61.477308,119.487689,61.477308,0,0.134373,0.677024,0.134373,0,0.158164,0.825301,0.158164,0,0.133849,0.673762,0.133849,0,0.105795,0.498916,0.105795,0,0.018521,0.045016,0.018521,1,0.057756,0.199512,0.057756,0,0.030284,0.028296,0.030284,0,19.25402,119.83955,19.25402,0,0.825541,0.148277,0.825541,0,0.698631,0.003262,0.698631,1,0.552202,0.178108,0.552202,1,0.096673,0.722039,0.096673,1,0.301459,0.477512,0.301459,1,0.15807,0.648727,0.15807,1,100.496871,119.162526,100.496871,0,0.822326,0.151539,0.822326,1,0.649971,0.326385,0.649971,1,0.11379,0.870317,0.11379,1,0.354833,0.62579,0.354833,1,0.186057,0.797005,0.186057,1,118.290159,119.014249,118.290159,0,0.550051,0.174846,0.550051,1,0.096297,0.718778,0.096297,1,0.300284,0.47425,0.300284,1,0.157455,0.645466,0.157455,1,100.105463,119.165788,100.105463,0,0.076114,0.543931,0.076114,1,0.237347,0.299404,0.237347,1,0.124453,0.47062,0.124453,1,79.123925,119.340634,79.123925,0,0.041552,0.244527,0.041552,0,0.021788,0.073312,0.021788,0,13.852151,119.884565,13.852151,0,0.067941,0.171215,0.067941,1,43.195405,119.640038,43.195405,0,22.649572,119.811254,22.649572,0,-5.809267,-48.013107,-84.286692,-26.442266,-20.558598,25.315402,22.450212,1.098041,-16.712723,-24.53409,17.63921,0.566225,-0.225016,0.082544,0.204415,-0.201811,0.815365,-0.573887,-0.317879,-0.3
54251,-0.147024,-0.121148,-0.007003,-0.189997,-0.257762,-0.515536,-0.087098,0.269397,0.251399,0.725347


In [7]:
# List every column name of the training table, one per line.
for column_name in df_train.columns:
    print(column_name)

index
genre
popularity
duration_ms
acousticness
positiveness
danceability
loudness
energy
liveness
speechiness
instrumentalness
tempo_int
region_A
region_B
region_C
region_D
region_E
region_F
region_G
region_H
region_I
region_J
region_K
region_L
region_M
region_N
region_O
region_P
region_Q
region_R
region_S
region_T
unknown
duration_long
popularity_add_duration_ms
popularity_sub_duration_ms
popularity_mul_duration_ms
popularity_bigger_duration_ms
popularity_add_acousticness
popularity_sub_acousticness
popularity_mul_acousticness
popularity_bigger_acousticness
popularity_add_positiveness
popularity_sub_positiveness
popularity_mul_positiveness
popularity_bigger_positiveness
popularity_add_danceability
popularity_sub_danceability
popularity_mul_danceability
popularity_bigger_danceability
popularity_add_loudness
popularity_sub_loudness
popularity_mul_loudness
popularity_bigger_loudness
popularity_add_energy
popularity_sub_energy
popularity_mul_energy
popularity_bigger_energy
popularity_add

## Stacking Setting

In [8]:
# Optionally append the predictions of earlier notebooks as stacking features.
# Config.stacking_NB is either False (stacking disabled) or an iterable of
# notebook numbers whose prediction files are read from disk.
if Config.stacking_NB is False:
    print('stacking is not setting.')
else:
    for i in Config.stacking_NB:
        # Out-of-fold predictions are stored with a header row (row id, nb<i>).
        df_train_NB = pd.read_csv(Config.interim_dir + f'nb{i}.csv')

        # Submission files are written with header=False, so column names must be
        # supplied here; with the default header the first prediction row would be
        # consumed as column names and the Config.target lookup would raise KeyError.
        df_test_NB = pd.read_csv(
            Config.submission_dir + f'nb{i}.csv',
            header=None,
            names=[Config.row_id, Config.target],
        )

        # Fail loudly instead of silently producing NaN features on a size mismatch.
        if len(df_train_NB) != len(df_train) or len(df_test_NB) != len(df_test):
            raise ValueError(
                f'nb{i}: prediction files have {len(df_train_NB)} train / {len(df_test_NB)} test rows, '
                f'expected {len(df_train)} / {len(df_test)}'
            )

        # Assign by position (the files are assumed to be in the same row order as the
        # feature tables -- TODO confirm). This leaves the index and the row-id column
        # of df_test untouched, so later cells can still read df_test[Config.row_id].
        df_train[f'nb{i}'] = df_train_NB[f'nb{i}'].to_numpy()
        df_test[f'nb{i}'] = df_test_NB[Config.target].to_numpy()

stacking is not setting.


## Training

In [9]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with ``seed``,
    then stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    Returns nothing.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [10]:
# Model inputs: every training column except the row identifier and the label.
excluded_columns = {Config.row_id, Config.target}
features = [col for col in df_train.columns if col not in excluded_columns]

# LightGBM settings for the 11-class genre model; the commented-out entries are
# regularisation / sub-sampling options that are currently switched off.
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=11,
    metric='multi_logloss',
    seed=Config.random_seed,
    n_estimators=20000,
    max_depth=-1,
    num_leaves=32,
    learning_rate=0.01,
    # feature_fraction=0.20,
    # bagging_freq=10,
    # bagging_fraction=0.95,
    n_jobs=-1,
    # lambda_l2=2,
    min_data_in_leaf=5,
)

# Early stopping with a patience of 50 rounds, and an evaluation log line every 50 rounds.
callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50),
]

In [11]:
# Takes roughly 10 seconds
#
# Stratified K-fold training of a LightGBM multiclass model. For every fold the
# training part is oversampled with SMOTE, a model is fitted with the shared
# `params` / `callbacks`, and hard label predictions are stored for the validation
# rows (out-of-fold) and for the test set. Test labels are finally combined by a
# per-row majority vote over the folds.
# NOTE(review): seed_everything() is not called in this cell; randomness is
# controlled only through the explicit random_state / seed arguments below.

# Create a numpy array to store test predictions (one column of predicted labels per fold)
test_predictions = np.zeros((len(df_test), Config.n_folds))

# Create a numpy array to store out of folds predictions (one predicted label per training row)
oof_predictions = np.zeros(len(df_train))

# One row per feature; a gain-importance column is added for each fold
feature_importance_df = pd.DataFrame(index=features)
# Per-fold validation targets and predicted labels
y_valids, val_preds =[],[]

kfold = StratifiedKFold(n_splits=Config.n_folds, shuffle=True, random_state=Config.random_seed)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(df_train, df_train[Config.target])):

    print(' ')
    print('-'*50)
    print(f'Training fold {fold+1} with {len(features)} features...')

    X_train, X_val = df_train[features].iloc[train_idx], df_train[features].iloc[valid_idx]
    y_train, y_val = df_train[Config.target].iloc[train_idx], df_train[Config.target].iloc[valid_idx]

    # Over Sampling: only the training part is resampled; the validation part is left as-is
    sm = SMOTE(random_state=Config.random_seed)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # print(y_train.value_counts())

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_val, y_val)

    # Both sets are evaluated and logged under the names 'train' and 'valid'
    model = lgb.train(params=params, train_set=lgb_train, valid_sets=[lgb_train, lgb_valid], valid_names=['train', 'valid'], callbacks=callbacks)
    print(f'================================== training {fold+1} fin. ==================================')

    # Predict validation data
    print(f'================================== validation-data predicting ... ==================================')
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    # Keep only the index of the highest-scoring class
    # (assumes class indices coincide with the genre labels -- TODO confirm)
    val_pred = np.argmax(val_pred, axis=1)
    oof_predictions[valid_idx] = val_pred

    # Predict test data
    print(f'================================== test-data predicting ... ==================================')
    test_pred = model.predict(df_test[features], num_iteration=model.best_iteration)
    test_pred = np.argmax(test_pred, axis=1)

    # Column `fold` holds this fold's predicted test labels
    test_predictions[:, fold] += test_pred

    # save results
    y_valids.append(y_val)
    val_preds.append(val_pred)
    feature_importance_df["Importance_Fold"+str(fold+1)]=model.feature_importance(importance_type='gain')

    # Compute fold metric (macro-averaged F1 on this fold's validation rows)
    val_pred = pd.DataFrame(data={'prediction': val_pred})
    y_val = pd.DataFrame(data={'target': y_val.reset_index(drop=True)})
    score = f1_score(y_val, val_pred, average='macro')

    print(f'Fold {fold+1} CV result')
    print(f'metric : {score}')

    # Release the per-fold objects before the next iteration
    del X_train, X_val, y_train, y_val, lgb_train, lgb_valid
    _ = gc.collect()

# Compute out of folds metric
oof_predictions = pd.DataFrame(data={'prediction': oof_predictions})
# NOTE(review): y_true is not referenced again in the visible part of the notebook
y_true = pd.DataFrame(data={Config.target: df_train[Config.target]})

print(' ')
print('-'*50)
print(f'TOTAL socre : {f1_score(df_train[Config.target], oof_predictions["prediction"], average="macro")}')
print('-'*50)

# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({Config.row_id: df_train[Config.row_id], Config.target: df_train[Config.target], 'prediction': oof_predictions['prediction']})

# Create a dataframe to store test prediction


# Majority vote over the per-fold labels of each test row
test_predictions, _ = stats.mode(test_predictions, axis=1)
test_predictions = test_predictions.reshape(-1)

test_df = pd.DataFrame({Config.row_id: df_test[Config.row_id], Config.target: test_predictions})

 
--------------------------------------------------
Training fold 1 with 283 features...
10.0    1069
8.0     1069
3.0     1069
7.0     1069
2.0     1069
1.0     1069
5.0     1069
9.0     1069
6.0     1069
4.0     1069
0.0     1069
Name: genre, dtype: int64
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55803
[LightGBM] [Info] Number of data points in the train set: 11759, number of used features: 272
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [In

In [12]:
oof_df.head()

Unnamed: 0,index,genre,prediction
0,0,10.0,10.0
1,1,8.0,8.0
2,2,3.0,8.0
3,3,10.0,10.0
4,4,3.0,3.0


In [13]:
# Persist the out-of-fold predictions (row id + a prediction column named after this
# notebook) so that other notebooks can read them back as a stacking feature.
oof_output_path = f'{Config.interim_dir}nb{Config.NB}.csv'
oof_df_tmp = (
    oof_df
    .drop(columns=[Config.target])
    .set_axis([Config.row_id, f'nb{Config.NB}'], axis=1)
)
oof_df_tmp.to_csv(oof_output_path, index=False)
oof_df_tmp

Unnamed: 0,index,nb205
0,0,10.0
1,1,8.0
2,2,8.0
3,3,10.0
4,4,3.0
...,...,...
4041,4041,8.0
4042,4042,10.0
4043,4043,8.0
4044,4044,10.0


In [14]:
# Confusion matrix of the out-of-fold predictions, normalised over the true labels
# (normalize='true'), drawn as an annotated heatmap.
cm = confusion_matrix(
    y_true=oof_df[Config.target],
    y_pred=oof_df['prediction'],
    normalize='true',
)

# Axis tick labels for the 11 classes.
names = [f'Target_{i}' for i in range(11)]

fig = ff.create_annotated_heatmap(z=cm, x=names, y=names)
fig.update_layout(xaxis_title='Pred Label', yaxis_title='True Label')
fig.show()

In [29]:
# Lollipop chart of the `top` features ranked by gain importance averaged over folds.
top = 50

# Average over the per-fold columns only, so the result does not depend on whatever
# other columns (e.g. an 'avg' left by an earlier run of this cell) the frame holds.
fold_columns = [col for col in feature_importance_df.columns
                if str(col).startswith('Importance_Fold')]
feature_importance_df['avg'] = feature_importance_df[fold_columns].mean(axis=1)
feature_importance_top = feature_importance_df['avg'].nlargest(top).sort_values(ascending=True)

pal=sns.color_palette("YlGnBu_r", top).as_hex()
pal_reversed = pal[::-1]  # reversed once instead of on every loop iteration

fig=go.Figure()
# Iterate over the values positionally: the Series is indexed by feature name, and
# integer keys in Series[...] are deprecated as positions in recent pandas versions.
for i, importance in enumerate(feature_importance_top):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=importance,
                       line_color=pal_reversed[i],opacity=0.8,line_width=4))

fig.add_trace(go.Scatter(x=feature_importance_top, y=feature_importance_top.index, mode='markers',
                         marker_color=pal_reversed, marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.0f}<extra></extra>'))

fig.update_layout(template=plotly_template,title=f'LGBM Feature Importance<br>Top {top}',
                  margin=dict(l=300,t=80),
                  xaxis=dict(title='Importance', zeroline=False),
                  yaxis_showgrid=False, height=1000, width=800)
fig.show()

In [21]:
feature_importance_top

popularity_bigger_liveness                1126.323485
liveness_bigger_instrumentalness          1171.554492
region_A                                  1185.781283
popularity_bigger_positiveness            1213.871846
popularity_bigger_loudness                1244.416965
positiveness_bigger_energy                1361.047917
duration_ms_bigger_energy                 1377.527354
acousticness_bigger_speechiness           1400.314439
danceability_bigger_loudness              1518.423155
popularity_bigger_instrumentalness        1600.115276
positiveness                              1627.759378
positiveness_bigger_instrumentalness      1678.478603
tempo_int                                 1817.052714
acousticness_bigger_liveness              1921.625901
liveness_add_tempo_int                    1924.067120
energy_add_liveness                       1995.491530
acousticness_bigger_danceability          2015.224224
danceability_bigger_liveness              2050.873530
liveness_bigger_speechiness 

In [16]:
# Overlay the class distribution of the test predictions on that of the training
# labels; both histograms are normalised to probabilities.
prediction_hist = go.Histogram(
    x=test_df[Config.target],
    name=f'Prediction',
    histnorm='probability',
    marker={'color': color_palette['Bin'][0]},
)
train_hist = go.Histogram(
    x=df_train[Config.target],
    name=f'Train',
    histnorm='probability',
    marker={'color': color_palette['Bin'][1]},
    opacity=0.5,
)

fig = go.Figure(layout=plotly_template['layout'])
for histogram in (prediction_hist, train_hist):
    fig.add_trace(histogram)

fig.update_layout(
    width=700,
    barmode='overlay',
    title='Prediction Distribution',
    uniformtext_mode='hide',
    uniformtext_minsize=15,
)

fig.show()

In [None]:
# Cast the predicted labels to integers. The column is addressed through
# Config.target (instead of a hard-coded name) so this cell keeps working if the
# target column is renamed in the config.
test_df = test_df.astype({Config.target: int})
test_df.info()

In [None]:
test_df[Config.target].describe()

In [None]:
Config.NB

In [None]:
# Write the submission file named after this notebook, without a header row or index column.
submission_path = f'{Config.submission_dir}nb{Config.NB}.csv'
test_df.to_csv(submission_path, header=False, index=False)

## 検証メモ