# loading packages

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
from keras.utils import to_categorical

# static settings (file paths)

In [None]:
# Single place to repoint the notebook at a different dataset location.
DATASET_DIR = '/content/drive/MyDrive/student_cup_2021/dataset'
TRAIN_PATH = DATASET_DIR + '/train.csv'
TEST_PATH = DATASET_DIR + '/test.csv'

# Base file name for saved models; the CV loop prefixes it with the fold number.
MODEL_PATH = 'model.h5'

# loading data

In [None]:
# Read both CSV splits in one pass; order matches (TRAIN_PATH, TEST_PATH).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Quick sanity check on the row/column counts of each split.
shape_report = 'df_train shape: {0}, df_test shape: {1}'
print(shape_report.format(df_train.shape, df_test.shape))

df_train shape: (4046, 14), df_test shape: (4046, 13)


# hand-labeling

In [None]:
# How the rules below were found: for each categorical feature `col`, inspect
#   df_train.groupby("col")["target"].value_counts()
# and keep the feature values that map to a single genre.

# Sentinel genre for test rows that no hand-labelling rule matched.
# Later cells select labelled rows with `genre != -100`, so keep this value.
UNLABELED_GENRE = -100

# (column, value, genre) triples, applied in order: when a row matches several
# rules, the rule listed last wins.
HAND_LABEL_RULES = [
    ('popularity', 2, 10),
    ('popularity', 82, 0),
    ('tempo', '41-52', 1),
    ('region', 'region_M', 7),
    ('popularity', 5, 10),
]

df_test['genre'] = UNLABELED_GENRE

for rule_column, rule_value, rule_genre in HAND_LABEL_RULES:
    df_test.loc[df_test[rule_column] == rule_value, 'genre'] = rule_genre

# preparing data

In [None]:
# Stack train rows on top of test rows so both get identical preprocessing,
# and discard the 'index' column so it is not used as a feature.
df = pd.concat([df_train, df_test]).drop(columns=['index'])

combined_shape = df.shape
print('df shape: {0}'.format(combined_shape))

df shape: (8092, 13)


# preprocessing

In [None]:
def _min_max_scale(series):
  """Linearly rescale ``series`` so its minimum maps to 0 and its maximum to 1."""
  low = series.min()
  high = series.max()
  return (series - low) / (high - low)


# --- missing values ---
# Fill NaNs with the column mean. `numeric_only=True` is needed because the
# frame still holds the string columns 'tempo' and 'region' at this point;
# a mean over non-numeric columns raises on pandas >= 2.0.
df = df.fillna(df.mean(numeric_only=True))

# --- popularity ---
# min-max scaling to [0, 1]
df['popularity'] = _min_max_scale(df['popularity'])

# --- duration_ms ---
# log scaling followed by min-max scaling to [0, 1].
# The scaling step must DIVIDE by the range; subtracting the range does not
# normalise the column. The base of the logarithm is irrelevant here because
# a change of base is a constant factor that cancels in the min-max ratio.
df['duration_ms'] = _min_max_scale(np.log(df['duration_ms']))

# --- loudness ---
# e ** loudness: non-positive loudness values land in (0, 1]
# (the original note documents the observed range as [-37.82, 0]).
df['loudness'] = np.exp(df['loudness'])

# --- tempo, region ---
# One-hot encode each categorical column and append the indicator columns at
# the end of the frame (tempo indicators first, then region indicators).
# dtype is pinned to uint8 so `.values` on the mixed frame stays numeric even
# on pandas versions where get_dummies defaults to bool.
for categorical_col in ('tempo', 'region'):
  indicator_cols = pd.get_dummies(df[categorical_col], dtype=np.uint8)
  df = pd.concat([df.drop(columns=[categorical_col]), indicator_cols], axis=1)

print('df shape: {0}'.format(df.shape))

df shape: (8092, 44)


# creating data for training 

In [None]:
# Labelled rows: every train row plus the test rows that received a
# hand-assigned genre (unlabelled test rows carry the -100 sentinel).
train_data = df[df['genre'] != -100]

X_train = train_data.drop(['genre'], axis=1).values
cY_train = to_categorical(train_data['genre'])

# `df` is df_train stacked on top of df_test, so the test rows start right
# after the last train row. Derive the offset instead of hard-coding it.
n_train_rows = len(df_train)
X_test = df.drop(['genre'], axis=1).iloc[n_train_rows:, :].values

print('train data shape: {0}, X_train shape: {1}, cY_train shape: {2}'.format(train_data.shape, X_train.shape, cY_train.shape))
print('X_test shape: {0}'.format(X_test.shape))

train data shape: (4076, 44), X_train shape: (4076, 43), cY_train shape: (4076, 11)
X_test shape: (4046, 43)


# creating model

In [None]:
!pip install tensorflow_addons
import tensorflow_addons as tfa

from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, LeakyReLU
from keras.optimizers import Adam

def create_model():
  """Build and compile the feed-forward genre classifier.

  Layers: Dense(100) -> BatchNormalization -> LeakyReLU -> Dense(50) ->
  BatchNormalization -> LeakyReLU -> Dense(n_classes, softmax).
  The input width and the number of classes are read from the notebook-level
  arrays ``X_train`` and ``cY_train``.

  Returns:
    A compiled ``Sequential`` model (Adam optimizer, focal loss, accuracy).
  """
  n_features = X_train.shape[1]
  n_classes = cY_train.shape[1]

  model = Sequential()

  model.add(Dense(100, input_shape=(n_features,)))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha=0.01))

  model.add(Dense(50))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha=0.01))

  model.add(Dense(n_classes, activation='softmax'))

  # `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
  # NOTE(review): a sigmoid focal loss is applied to softmax outputs here;
  # confirm this pairing is intentional.
  model.compile(optimizer=Adam(learning_rate=1e-3),
                loss=tfa.losses.SigmoidFocalCrossEntropy(),
                metrics=['accuracy'],)

  return model



# cv for ensemble

In [None]:
from keras.callbacks import Callback
from sklearn.metrics import f1_score
import numpy as np

class CustomCallback(Callback):
  """Checkpoint the model whenever validation macro-F1 improves.

  After every epoch the model predicts on the held-out validation set; if the
  macro-averaged F1 beats the best value seen so far, the model is saved to
  ``model_path``, overwriting the previous checkpoint.

  Args:
    model: the Keras model being trained.
    x_val: validation features.
    cy_val: one-hot validation labels aligned with ``x_val``.
    model_path: file path the best model is written to.
  """

  def __init__(self, model, x_val, cy_val, model_path):
    super().__init__()
    # Register the model through the base-class hook rather than assigning
    # `self.model` directly, so this works whether `model` is a plain
    # attribute or a property on the installed Keras version.
    self.set_model(model)
    self.x_val = x_val
    self.cy_val = cy_val
    self.model_path = model_path
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; with a start of 0 a fold whose F1 never rises above 0 would
    # leave no file for the caller to load.
    self.max = float('-inf')

  def on_epoch_end(self, epoch, logs=None):
    """Score the validation set and save the model on a new best macro-F1."""
    true_classes = np.argmax(self.cy_val, axis=1)
    # verbose=0 keeps the per-epoch prediction from printing a progress bar.
    predicted_classes = np.argmax(self.model.predict(self.x_val, verbose=0), axis=1)
    score = f1_score(true_classes, predicted_classes, average='macro')
    if self.max < score:
      self.max = score
      self.model.save(self.model_path)

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import f1_score

def cross_val_score_for_ensemble(X_train, Y_train, epochs, batch_size, n_splits=10):
  """Train one model per stratified fold and keep each fold's best checkpoint.

  For every fold a fresh model is built with ``create_model()``, trained with
  a ``CustomCallback`` that checkpoints on validation macro-F1, and the saved
  checkpoint is then reloaded and scored on the fold's validation split.

  Args:
    X_train: feature array, indexable with integer index arrays.
    Y_train: one-hot label array aligned row-by-row with ``X_train``.
    epochs: number of training epochs per fold.
    batch_size: mini-batch size passed to ``fit``.
    n_splits: number of stratified folds (default 10).

  Returns:
    A tuple ``(f1_macro, model_list)``: the macro-F1 of each reloaded
    checkpoint on its own validation split, and the reloaded models, both in
    fold order.
  """
  skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)

  # Stratify and slice on the labels that were passed in, not on a
  # notebook-level global, so the function honours its `Y_train` argument.
  class_ids = Y_train.argmax(axis=1)

  f1_macro = []
  model_list = []

  for num, (train_idx, val_idx) in enumerate(skf.split(X_train, class_ids)):
    train_data = X_train[train_idx]
    train_labels = Y_train[train_idx]
    val_data = X_train[val_idx]
    val_labels = Y_train[val_idx]

    model = create_model()
    # One checkpoint file per fold: "<fold>_<MODEL_PATH>".
    model_path = str(num)+'_'+MODEL_PATH
    callbacks_list = [CustomCallback(model, val_data, val_labels, model_path),]
    model.fit(train_data, train_labels,
              epochs=epochs,
              batch_size=batch_size,
              callbacks=callbacks_list,
              verbose=0,
              validation_data=(val_data, val_labels))

    # Score the reloaded checkpoint, not the last-epoch weights still held
    # by `model`.
    best_model = load_model(model_path)
    model_list.append(best_model)

    score = f1_score(np.argmax(val_labels, axis=1), np.argmax(best_model.predict(val_data), axis=1), average='macro')
    f1_macro.append(score)
    print('f1 macro: {0:.3f}'.format(score))

  return f1_macro, model_list

In [None]:
from time import time

# Training hyper-parameters shared by every fold.
EPOCHS = 500
BATCH_SIZE = 32

# Wall-clock the whole cross-validation run.
start_time = time()
f1_macro, model_list = cross_val_score_for_ensemble(
    X_train, cY_train, epochs=EPOCHS, batch_size=BATCH_SIZE, n_splits=10)
elapsed_time = time() - start_time

# Report the duration in hours and the mean macro-F1 over the folds.
elapsed_hours = elapsed_time / 3600
mean_fold_f1 = np.mean(f1_macro)
print('Elapsed time: {0:.3f} hrs'.format(elapsed_hours))
print('f1 macro for cv: {0:.3f}'.format(mean_fold_f1))

f1 macro: 0.645
f1 macro: 0.606
f1 macro: 0.669
f1 macro: 0.598
f1 macro: 0.583
f1 macro: 0.648
f1 macro: 0.541
f1 macro: 0.586
f1 macro: 0.550
f1 macro: 0.603
Elapsed time: 0.457 hrs
f1 macro for cv: 0.603


# submission

In [None]:
import numpy as np

# Soft-voting ensemble: average the class probabilities of EVERY fold model,
# then pick the most probable class per row. Iterating over `model_list`
# (instead of spelling out indices by hand) guarantees no fold is skipped.
# NOTE(review): if a fold is meant to be excluded on purpose, filter
# `model_list` explicitly here and record the reason.
fold_probabilities = [fold_model.predict(X_test) for fold_model in model_list]
predicted = np.argmax(np.mean(fold_probabilities, axis=0), axis=1)

# Pair each prediction with the test row id; the index is reset so the two
# frames line up by position.
df_sub = pd.concat([df_test['index'].reset_index(drop=True), pd.DataFrame(predicted, columns=['predicted']).astype(int)], axis=1)
# df_sub.to_csv('submission.csv', index=False, header=False)

df_sub.head()

In [None]:
# import numpy as np

# preds = model_list[0].predict(X_test)*f1_macro[0]/np.sum(f1_macro)

# for i in range(1, 10):
#   preds += model_list[i].predict(X_test)*f1_macro[i]/np.sum(f1_macro)

# preds = np.argmax(preds, axis=1)

# df_sub = pd.concat([df_test['index'].reset_index(drop=True), pd.DataFrame(preds, columns=['predicted']).astype(int)], axis=1)

# df_sub.head()

In [None]:
# for ge, ind in zip(df_test[df_test['genre'] != -100]['genre'], df_test[df_test['genre'] != -100]['index']):
#   df_sub.loc[df_sub['index'] == ind, 'predicted'] = ge

# df_sub.to_csv('submission.csv', index=False, header=False)

# df_sub.head()