# Music Genre Prediction

In [1]:
class Config:
    """Notebook-wide settings: experiment ids, data locations, CV setup and column names."""

    # --- experiment identifiers ---
    # Id of this notebook; used in the names of the prediction files it writes.
    NB = '203'
    # Id of the preprocessing notebook whose pickled train/test frames are loaded.
    dataset_NB = '102'
    # Ids of earlier notebooks whose predictions are added as stacking features,
    # e.g. ['212', '213', '214']. False disables stacking.
    stacking_NB = False

    # --- column names ---
    row_id = 'index'
    target = 'genre'

    # --- cross-validation ---
    n_folds = 5
    random_seed = 42

    # --- data directories (relative paths) ---
    raw_data_dir = '../data/raw/'
    processed_data_dir = '../data/processed/'
    interim_dir = '../data/interim/'
    submission_dir = '../data/submission/'

## Import libraries

In [2]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Shared plotly layout: dark theme, fixed font and a default 1000x500 canvas.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}

# Named colour lists for plots: a two-colour set, a five-colour set,
# and a ten-colour set that is the five-colour set repeated twice.
color_palette = dict(
    Bin=['#016CC9', '#E876A3'],
    Cat5=['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
)
color_palette['Cat10'] = color_palette['Cat5'] * 2

In [4]:
import random
import joblib
import itertools
from itertools import combinations
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

from sklearn.metrics import roc_auc_score, roc_curve, auc, f1_score, confusion_matrix
import scipy.stats as stats
from lightgbm import LGBMClassifier, early_stopping

## Load and check data

In [5]:
# Train/test frames produced by the preprocessing notebook `Config.dataset_NB`,
# stored as zip-compressed pickles.
processed_prefix = f'{Config.processed_data_dir}nb{Config.dataset_NB}'
df_train = pd.read_pickle(f'{processed_prefix}_train.pkl', compression='zip')
df_test = pd.read_pickle(f'{processed_prefix}_test.pkl', compression='zip')

# The sample submission file has no header row.
submission = pd.read_csv(f'{Config.raw_data_dir}sample_submit.csv', header=None)

df_train.shape

(4046, 40)

In [6]:
# Preview the first five rows of the training frame.
df_train.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_int,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,duration_long,PCA1,PCA2,PCA3,PCA4,PCA5
0,0,10.0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-41047.028616,-18.457388,30.293345,-5.422681,-0.104217
1,1,8.0,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,66351.971231,-40.207097,-28.864875,-1.711876,0.778478
2,2,3.0,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,76,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-44916.030091,57.856874,-1.052937,0.660017,-0.671912
3,3,10.0,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,192,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,58950.971693,-56.536491,-4.939926,-1.791299,-0.026038
4,4,3.0,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,35206.970466,15.167407,-16.097608,-4.069444,-0.008173


In [7]:
# List every column with its non-null count and dtype.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4046 entries, 0 to 4045
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             4046 non-null   int64  
 1   genre             4046 non-null   float64
 2   popularity        4046 non-null   int64  
 3   duration_ms       4046 non-null   int64  
 4   acousticness      4046 non-null   float64
 5   positiveness      4046 non-null   float64
 6   danceability      4046 non-null   float64
 7   loudness          4046 non-null   float64
 8   energy            4046 non-null   float64
 9   liveness          4046 non-null   float64
 10  speechiness       4046 non-null   float64
 11  instrumentalness  4046 non-null   float64
 12  tempo_int         4046 non-null   int64  
 13  region_A          4046 non-null   float64
 14  region_B          4046 non-null   float64
 15  region_C          4046 non-null   float64
 16  region_D          4046 non-null   float64


## Stacking Setting

In [8]:
# Optionally append the predictions of earlier notebooks as extra (stacking) features.
if Config.stacking_NB is False:
    print('stacking is not setting.')
else:
    for i in Config.stacking_NB:
        # Out-of-fold predictions of notebook `i`; written with a header row and an `nb{i}` column.
        df_train_NB = pd.read_csv(Config.interim_dir + f'nb{i}.csv')

        # Submission files are written WITHOUT a header row, so the column names must be
        # supplied here. With the default header handling the first prediction would be
        # consumed as the header and the target column lookup below would fail.
        df_test_NB = pd.read_csv(
            Config.submission_dir + f'nb{i}.csv',
            header=None,
            names=[Config.row_id, Config.target],
        )

        # Both frames carry a default RangeIndex, so this aligns row by row.
        df_train[f'nb{i}'] = df_train_NB[f'nb{i}']

        # Assign positionally so the result does not depend on how `df_test` is indexed.
        # A length mismatch raises instead of silently producing NaN.
        # NOTE(review): assumes the submission rows are in the same order as `df_test` - confirm.
        df_test[f'nb{i}'] = df_test_NB[Config.target].to_numpy()

    # A bare expression inside a branch is not rendered by Jupyter; display explicitly.
    display(df_test.head())

stacking is not setting.


## Training

In [9]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to both generators; its string form is stored
            in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

In [35]:
# Get feature list
# Every column except the row identifier and the target is used as a model input.
features = [col for col in df_train.columns if col not in [Config.row_id, Config.target]]

# Get parameter list
# LightGBM parameters handed to `lgb.train` in the training cell.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    # Matches the 11 class labels (0-10) enumerated by the down-sampling strategy.
    'num_class': 11,
    'metric': 'multi_logloss',
    'seed': Config.random_seed,
    # Upper bound on boosting rounds; the early-stopping callback below is expected
    # to end training well before this.
    'n_estimators': 20000,
    'max_depth': -1,
    'num_leaves': 16,
    'learning_rate': 0.01,
    #'feature_fraction': 0.20,
    #'bagging_freq': 10,
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect
    # when bagging_freq is non-zero; with bagging_freq commented out this is presumably
    # inert - confirm whether bagging is intended.
    'bagging_fraction': 0.95,
    'n_jobs': -1,
    #'lambda_l2': 2,
    'min_data_in_leaf': 5,
}

# Stop after 50 rounds without validation improvement; log the metrics every 50 rounds.
callbacks = [lgb.early_stopping(50), lgb.log_evaluation(50)]

# Down Sampling ratio
# Multiplier applied to the smallest class count when capping each class in the training cell.
down_sampling_ratio = 1

In [36]:
# Takes roughly 10 seconds to run.
#
# Stratified K-fold training of a LightGBM multiclass model on a down-sampled training set.
# Produces:
#   oof_df                - row id, true genre and out-of-fold predicted genre for every train row
#   test_df               - row id and majority-vote predicted genre for every test row
#   feature_importance_df - gain importance of every feature, one column per fold

# Seed the Python / NumPy global RNGs so the whole run is reproducible.
seed_everything(Config.random_seed)

# Early-stopping patience and logging period, in boosting rounds.
early_stopping_rounds = 50
log_period = 50

# Classes allowed to keep more samples than the rest when down-sampling:
# label -> multiplier on the per-class cap (every other class uses 1).
class_cap_multiplier = {8: 2, 10: 2}

# One column of predicted labels per fold for the test set
test_predictions = np.zeros((len(df_test), Config.n_folds))

# Out-of-fold predicted labels for the training set
oof_predictions = np.zeros(len(df_train))

feature_importance_df = pd.DataFrame(index=features)
y_valids, val_preds = [], []

kfold = StratifiedKFold(n_splits=Config.n_folds, shuffle=True, random_state=Config.random_seed)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(df_train, df_train[Config.target])):

    print(' ')
    print('-'*50)
    print(f'Training fold {fold+1} with {len(features)} features...')

    X_train, X_val = df_train[features].iloc[train_idx], df_train[features].iloc[valid_idx]
    y_train, y_val = df_train[Config.target].iloc[train_idx], df_train[Config.target].iloc[valid_idx]

    # Down Sampling: cap every class at (smallest class count * ratio * class multiplier),
    # never asking for more rows than the class actually has.
    class_counts = y_train.value_counts().sort_index()
    smallest_class = class_counts.min()
    sampling_strategy = {
        int(label): int(min(
            count,
            smallest_class * down_sampling_ratio * class_cap_multiplier.get(int(label), 1),
        ))
        for label, count in class_counts.items()
    }

    sampler = RandomUnderSampler(random_state=Config.random_seed, replacement=True, sampling_strategy=sampling_strategy)
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    print(f'X_train : {len(X_train)}')

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_val, y_val)

    # Build fresh callbacks for every fold instead of reusing one shared list: the
    # early-stopping callback tracks the best score internally, and some LightGBM releases
    # do not reset that state between `lgb.train` calls, which would let one fold's best
    # score cut the next fold short.
    fold_callbacks = [lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(log_period)]

    model = lgb.train(
        params=params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid'],
        callbacks=fold_callbacks,
    )
    print(f'================================== training {fold+1} fin. ==================================')

    # Predict validation data: class probabilities -> most likely class label
    print('================================== validation-data predicting ... ==================================')
    val_pred = np.argmax(model.predict(X_val, num_iteration=model.best_iteration), axis=1)
    oof_predictions[valid_idx] = val_pred

    # Predict test data
    print('================================== test-data predicting ... ==================================')
    test_pred = np.argmax(model.predict(df_test[features], num_iteration=model.best_iteration), axis=1)
    test_predictions[:, fold] = test_pred

    # save results
    y_valids.append(y_val)
    val_preds.append(val_pred)
    feature_importance_df["Importance_Fold"+str(fold+1)] = model.feature_importance(importance_type='gain')

    # Compute fold metric (macro F1 on this fold's validation rows)
    score = f1_score(y_val, val_pred, average='macro')

    print(f'Fold {fold+1} CV result')
    print(f'metric : {score}')

    del X_train, X_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

# Compute out of folds metric
oof_predictions = pd.DataFrame(data={'prediction': oof_predictions})
y_true = pd.DataFrame(data={Config.target: df_train[Config.target]})

print(' ')
print('-'*50)
print(f'TOTAL socre : {f1_score(df_train[Config.target], oof_predictions["prediction"], average="macro")}')
print('-'*50)

# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({Config.row_id: df_train[Config.row_id], Config.target: df_train[Config.target], 'prediction': oof_predictions['prediction']})

# Create a dataframe to store test prediction:
# majority vote over the per-fold labels (ties resolve to the smallest label).
test_predictions = stats.mode(test_predictions, axis=1)[0].reshape(-1)

test_df = pd.DataFrame({Config.row_id: df_test[Config.row_id], Config.target: test_predictions})

 
--------------------------------------------------
Training fold 1 with 38 features...
X_train : 325
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1600
[LightGBM] [Info] Number of data points in the train set: 325, number of used features: 30
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -1.871802
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -1.871802
Training until validation scores don't improve for 50 rounds
[50]	train's multi_logloss: 0.939455	valid's multi_logloss:

In [27]:
# Preview the out-of-fold frame: row id, true genre and predicted genre.
oof_df.head()

Unnamed: 0,index,genre,prediction
0,0,10.0,10.0
1,1,8.0,10.0
2,2,3.0,8.0
3,3,10.0,10.0
4,4,3.0,2.0


In [28]:
# Save results
# Keep only the row id and the out-of-fold prediction, name the prediction column after
# this notebook, and write it to the interim directory (with a header row, without the index).
oof_df_tmp = oof_df[[Config.row_id, 'prediction']].rename(columns={'prediction': f'nb{Config.NB}'})
oof_df_tmp.to_csv(f'{Config.interim_dir}nb{Config.NB}.csv', index=False)
oof_df_tmp

Unnamed: 0,index,nb203
0,0,10.0
1,1,10.0
2,2,8.0
3,3,10.0
4,4,2.0
...,...,...
4041,4041,10.0
4042,4042,7.0
4043,4043,8.0
4044,4044,10.0


In [29]:
# Confusion matrix of the out-of-fold predictions, normalised over the true labels
# (each row sums to 1).
cm = confusion_matrix(oof_df[Config.target], oof_df['prediction'], normalize='true')

# One axis label per class in the matrix rather than a hard-coded class count.
names = [f'Target_{i}' for i in range(cm.shape[0])]

# Annotate with values rounded to two decimals; the raw normalised floats are too long
# to read inside the cells. Cell colours still use the unrounded values.
fig = ff.create_annotated_heatmap(cm, x=names, y=names, annotation_text=np.round(cm, 2).tolist())
fig.update_layout(
    yaxis_title='True Label',
    xaxis_title='Pred Label',
)
fig.show()

In [34]:
# Lollipop chart of the `top` features by average gain importance across folds.
top = 50

# Average over the per-fold columns only, so a previously computed 'avg' column
# is never part of its own mean when this cell is re-run.
fold_columns = [col for col in feature_importance_df.columns if col.startswith('Importance_Fold')]
feature_importance_df['avg'] = feature_importance_df[fold_columns].mean(axis=1)
feature_importance_top = feature_importance_df['avg'].nlargest(top).sort_values(ascending=True)

# Reversed palette with at least one colour per plotted feature. The minimum of 65 keeps
# the original colours; growing beyond it avoids an IndexError when `top` exceeds 65.
n_shown = len(feature_importance_top)
pal = sns.color_palette("YlGnBu", max(65, n_shown)).as_hex()[::-1]

fig = go.Figure()
for i in range(n_shown):
    # `.iloc` because the Series is indexed by feature name; positional `[]` access on a
    # label-indexed Series is deprecated in recent pandas.
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feature_importance_top.iloc[i],
                       line_color=pal[i], opacity=0.8, line_width=4))

fig.add_trace(go.Scatter(x=feature_importance_top, y=feature_importance_top.index, mode='markers',
                         marker_color=pal[:n_shown], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.0f}<extra></extra>'))

fig.update_layout(template=plotly_template, title=f'LGBM Feature Importance<br>Top {top}',
                  margin=dict(l=150, t=80),
                  xaxis=dict(title='Importance', zeroline=False),
                  yaxis_showgrid=False, height=1000, width=800)
fig.show()

In [None]:
# Overlay the class distribution of the test predictions on that of the training labels,
# both normalised to probabilities.
prediction_hist = go.Histogram(
    x=test_df[Config.target],
    name='Prediction',
    histnorm='probability',
    marker={'color': color_palette['Bin'][0]},
)
train_hist = go.Histogram(
    x=df_train[Config.target],
    name='Train',
    histnorm='probability',
    marker={'color': color_palette['Bin'][1]},
    opacity=0.5,
)

fig = go.Figure(data=[prediction_hist, train_hist], layout=plotly_template['layout'])
fig.update_layout(
    title='Prediction Distribution',
    barmode='overlay',
    uniformtext_minsize=15,
    uniformtext_mode='hide',
    width=700,
)

fig.show()

In [None]:
# The majority vote yields float labels; cast the target column to int before export.
# Uses Config.target rather than a hard-coded column name, like the rest of the notebook.
test_df = test_df.astype({Config.target: int})
test_df.info()

In [None]:
# Summary statistics of the predicted target column.
test_df[Config.target].describe()

In [None]:
# Show the notebook id that is used in the output file names.
Config.NB

In [None]:
# Write the submission file for this notebook id, without the DataFrame index and without a header row.
test_df.to_csv(Config.submission_dir + f'nb{Config.NB}.csv', index=False, header=False)

## 検証メモ