# static

In [1]:
# TRAIN_PATH = '/content/drive/MyDrive/student_cup_2021/dataset/train.csv'
# TEST_PATH = '/content/drive/MyDrive/student_cup_2021/dataset/test.csv'

TRAIN_PATH = '/content/drive/MyDrive/student_cup_2021/dataset/new_train.csv'
TEST_PATH = '/content/drive/MyDrive/student_cup_2021/dataset/new_test.csv'

MODEL_PATH = 'model.h5'

# class

In [2]:
from keras.callbacks import Callback
from sklearn.metrics import f1_score
import numpy as np

class CustomCallback(Callback):

  def __init__(self, model, x_val, cy_val, model_path):
    self.model = model
    self.x_val = x_val
    self.cy_val = cy_val
    self.model_path = model_path
    self.max = 0
  
  def on_epoch_end(self, epoch, logs=None):
    score = f1_score(np.argmax(self.cy_val, axis=1), np.argmax(self.model.predict(self.x_val), axis=1), average='macro')
    if self.max < score:
      self.max = score
      self.model.save(self.model_path)

# fucntion

## creating model

In [3]:
!pip install tensorflow_addons
import tensorflow_addons as tfa

from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, LeakyReLU
from keras.optimizers import Adam

def create_model():

  model = Sequential()

  model.add(Dense(100, input_shape=(X_train.shape[1],)))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha=0.01))

  model.add(Dense(50))
  model.add(BatchNormalization())
  model.add(LeakyReLU(alpha=0.01))
  
  model.add(Dense(cY_train.shape[1], activation='softmax'))
 
  model.compile(optimizer=Adam(lr=1e-3),
                loss = tfa.losses.SigmoidFocalCrossEntropy(),
                metrics=['accuracy'],)
  
  return model

Collecting tensorflow_addons
[?25l  Downloading https://files.pythonhosted.org/packages/74/e3/56d2fe76f0bb7c88ed9b2a6a557e25e83e252aec08f13de34369cd850a0b/tensorflow_addons-0.12.1-cp37-cp37m-manylinux2010_x86_64.whl (703kB)
[K     |▌                               | 10kB 12.2MB/s eta 0:00:01[K     |█                               | 20kB 17.8MB/s eta 0:00:01[K     |█▍                              | 30kB 12.0MB/s eta 0:00:01[K     |█▉                              | 40kB 9.3MB/s eta 0:00:01[K     |██▎                             | 51kB 7.3MB/s eta 0:00:01[K     |██▉                             | 61kB 7.1MB/s eta 0:00:01[K     |███▎                            | 71kB 7.2MB/s eta 0:00:01[K     |███▊                            | 81kB 7.8MB/s eta 0:00:01[K     |████▏                           | 92kB 7.9MB/s eta 0:00:01[K     |████▋                           | 102kB 7.9MB/s eta 0:00:01[K     |█████▏                          | 112kB 7.9MB/s eta 0:00:01[K     |█████▋      

## cross validation

In [4]:
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def cross_val_score_for_keras(X_train, cY_train, epochs, batch_size, n_splits=10):

  skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)

  acc = []
  f1_macro = []

  for train_idx, val_idx in skf.split(X_train, cY_train.argmax(axis=1)):
    # creating dataset #
    train_data = X_train[train_idx]
    train_labels = cY_train[train_idx] 
    val_data = X_train[val_idx],
    val_labels = cY_train[val_idx]
    # creating model, save & load the best model #
    model = create_model()
    callbacks_list = [CustomCallback(model, val_data, val_labels, MODEL_PATH),]
    history = model.fit(train_data, train_labels, 
                        epochs=epochs, 
                        batch_size=batch_size, 
                        verbose=0,
                        callbacks=callbacks_list,
                        validation_data=(val_data, val_labels))
    model = load_model(MODEL_PATH)
    # calculate the metrics #
    acc_score = accuracy_score(np.argmax(val_labels, axis=1), np.argmax(model.predict(val_data), axis=1))
    acc.append(acc_score)
    score = f1_score(np.argmax(val_labels, axis=1), np.argmax(model.predict(val_data), axis=1), average='macro')
    f1_macro.append(score)

    print('accuracy: {0:.3f}, f1 macro: {1:.3f}'.format(acc_score, score))
    break

  return acc, f1_macro

## cross validation and ensemble

In [5]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import f1_score

def cross_val_score_for_ensemble(X_train, Y_train, epochs, batch_size, n_splits=10):

  skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)

  loss=[]
  acc=[]
  f1_macro = []
  model_list = []

  num = 0

  for train_idx, val_idx in skf.split(X_train, cY_train.argmax(axis=1)):
    train_data = X_train[train_idx]
    train_labels = cY_train[train_idx]
    val_data = X_train[val_idx]
    val_labels = cY_train[val_idx]

    model = create_model()
    model_path = str(num)+'_'+MODEL_PATH
    callbacks_list = [CustomCallback(model, val_data, val_labels, model_path),]
    history = model.fit(train_data, train_labels, 
                        epochs=epochs, 
                        batch_size=batch_size, 
                        callbacks=callbacks_list, 
                        verbose=0, 
                        validation_data=(val_data, val_labels))
    model_list.append(load_model(model_path))

    score = f1_score(np.argmax(val_labels, axis=1), np.argmax(model_list[num].predict(val_data), axis=1), average='macro')
    f1_macro.append(score)
    print('f1 macro: {0:.3f}'.format(score))

    num += 1

  return f1_macro, model_list

# Loading data

In [6]:
import pandas as pd

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

df = pd.concat([df_train.drop('genre', axis=1), df_test], axis=0)

print('df_train shape: {0}, df_test shape: {1}, df shape: {2}'.format(df_train.shape, df_test.shape, df.shape))
df_train.head()

df_train shape: (4046, 15), df_test shape: (4046, 14), df shape: (8092, 14)


Unnamed: 0.1,Unnamed: 0,index,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,genre
0,0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,10
1,1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I,8
2,2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E,3
3,3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,10
4,4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown,3


# Preprocessing

## index

In [7]:
df = df.drop('index', axis=1)

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 13)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


## popularity

In [8]:
df['popularity_temp'] = 'popularity_'+df['popularity'].astype(str) 
df = pd.concat([df, pd.get_dummies(df['popularity_temp'])], axis=1)
df = df.drop(['popularity_temp'], axis=1)

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 96)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,...,popularity_48,popularity_49,popularity_5,popularity_50,popularity_51,popularity_52,popularity_53,popularity_54,popularity_55,popularity_56,popularity_57,popularity_58,popularity_59,popularity_6,popularity_60,popularity_61,popularity_62,popularity_63,popularity_64,popularity_65,popularity_66,popularity_67,popularity_68,popularity_69,popularity_7,popularity_70,popularity_71,popularity_72,popularity_73,popularity_74,popularity_75,popularity_76,popularity_77,popularity_78,popularity_79,popularity_8,popularity_80,popularity_81,popularity_82,popularity_9
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## duration_ms

In [9]:
import numpy as np

df['duration_ms_log'] = np.log10(df['duration_ms'])

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 97)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,...,popularity_49,popularity_5,popularity_50,popularity_51,popularity_52,popularity_53,popularity_54,popularity_55,popularity_56,popularity_57,popularity_58,popularity_59,popularity_6,popularity_60,popularity_61,popularity_62,popularity_63,popularity_64,popularity_65,popularity_66,popularity_67,popularity_68,popularity_69,popularity_7,popularity_70,popularity_71,popularity_72,popularity_73,popularity_74,popularity_75,popularity_76,popularity_77,popularity_78,popularity_79,popularity_8,popularity_80,popularity_81,popularity_82,popularity_9,duration_ms_log
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.303399
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.489245
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.294962
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.478699
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.443025


## loudness

In [10]:
df['loudness_10'] = 10**df['loudness']

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 98)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,...,popularity_5,popularity_50,popularity_51,popularity_52,popularity_53,popularity_54,popularity_55,popularity_56,popularity_57,popularity_58,popularity_59,popularity_6,popularity_60,popularity_61,popularity_62,popularity_63,popularity_64,popularity_65,popularity_66,popularity_67,popularity_68,popularity_69,popularity_7,popularity_70,popularity_71,popularity_72,popularity_73,popularity_74,popularity_75,popularity_76,popularity_77,popularity_78,popularity_79,popularity_8,popularity_80,popularity_81,popularity_82,popularity_9,duration_ms_log,loudness_10
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.303399,0.01303611
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.489245,2.841221e-06
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.294962,5.550475e-10
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.478699,8.151028e-06
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.443025,0.0001164406


## tempo

In [11]:
df['tempo_max'] = df['tempo'].str.split('-').str.get(1).astype(int) # max in tempo
df = pd.concat([df, pd.get_dummies(df['tempo'])], axis=1)     # binning
df = df.drop(['tempo'], axis=1)

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 110)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,region,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,popularity_33,...,popularity_61,popularity_62,popularity_63,popularity_64,popularity_65,popularity_66,popularity_67,popularity_68,popularity_69,popularity_7,popularity_70,popularity_71,popularity_72,popularity_73,popularity_74,popularity_75,popularity_76,popularity_77,popularity_78,popularity_79,popularity_8,popularity_80,popularity_81,popularity_82,popularity_9,duration_ms_log,loudness_10,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,region_H,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.303399,0.01303611,152,0,1,0,0,0,0,0,0,0,0,0,0
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,region_I,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.489245,2.841221e-06,176,0,0,1,0,0,0,0,0,0,0,0,0
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,region_E,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.294962,5.550475e-10,76,0,0,0,0,0,0,0,0,0,1,0,0
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,region_C,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.478699,8.151028e-06,192,0,0,0,1,0,0,0,0,0,0,0,0
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,unknown,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.443025,0.0001164406,120,0,0,0,0,0,0,0,0,0,0,0,1


## region

In [12]:
df = pd.concat([df, pd.get_dummies(df['region'])], axis=1)
df = df.drop(['region'], axis=1)

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 130)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,popularity_33,popularity_34,...,popularity_80,popularity_81,popularity_82,popularity_9,duration_ms_log,loudness_10,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.303399,0.01303611,152,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.489245,2.841221e-06,176,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.294962,5.550475e-10,76,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.478699,8.151028e-06,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.443025,0.0001164406,120,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## missing value

In [13]:
# col_list = ['acousticness', 'positiveness', 'danceability', 'energy', 'liveness', 'speechiness', 'instrumentalness']
# for col in col_list:
#   df[col+'_nan'] = df[col].isna().astype(int)

# df = df.fillna(df.mean())

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 130)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,popularity_33,popularity_34,...,popularity_80,popularity_81,popularity_82,popularity_9,duration_ms_log,loudness_10,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.303399,0.01303611,152,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.489245,2.841221e-06,176,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.294962,5.550475e-10,76,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.478699,8.151028e-06,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.443025,0.0001164406,120,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## popularity *

In [14]:
col_list = ['acousticness', 'positiveness', 'danceability', 'energy', 'liveness', 'speechiness', 'instrumentalness', 'loudness_10']
for col in col_list:
  df['popularity*'+col] = df['popularity']*df[col]

print('df shape: {0}'.format(df.shape))
df.head()

df shape: (8092, 138)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,popularity_33,popularity_34,...,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,popularity*acousticness,popularity*positiveness,popularity*danceability,popularity*energy,popularity*liveness,popularity*speechiness,popularity*instrumentalness,popularity*loudness_10
0,0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1.240922,1.729719,2.066255,9.833101,3.99925,4.291191,9.777725,0.1433972
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,6.991994,23.91286,38.256613,60.334189,13.378515,11.143274,8.549819,0.0001960443
2,2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21.34604,11.411802,19.678603,18.91711,9.337269,15.869452,7.158207,2.386704e-08
3,3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7.455009,11.048972,16.046026,39.091677,16.966103,10.200453,7.892969,0.0003667962
4,4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,10.871059,44.321924,47.337288,37.058511,9.651383,12.681803,12.883737,0.006637115


## standardization

In [15]:
df.describe()

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,popularity_33,popularity_34,...,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,popularity*acousticness,popularity*positiveness,popularity*danceability,popularity*energy,popularity*liveness,popularity*speechiness,popularity*instrumentalness,popularity*loudness_10
count,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,...,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0,8092.0
mean,4045.5,41.107143,241994.8,0.340824,0.466998,0.50108,-7.676095,0.606151,0.267525,0.20037,0.215541,0.004449,0.003089,0.004572,0.004572,0.005067,0.004943,0.005067,0.00655,0.007538,0.010257,0.009021,0.009516,0.002966,0.011246,0.010381,0.01001,0.013594,0.010751,0.011987,0.011987,0.014212,0.014335,0.015942,0.003337,0.018413,0.01693,0.019155,0.01693,0.020143,...,0.345156,0.107143,0.03176,0.012481,0.000618,0.000247,0.000989,0.003584,0.029783,0.183144,0.282625,0.002224,0.083169,0.021132,0.050544,0.167202,0.036456,0.01347,0.047578,0.19439,0.007168,0.057093,0.027805,0.000371,0.009268,0.035838,0.08218,0.009145,0.010504,0.015818,0.042635,0.086011,14.539187,19.486881,21.153785,24.407908,10.868763,8.260528,8.478083,0.3239022
std,2336.103522,16.135588,81817.82,0.238695,0.223599,0.160348,4.049943,0.200901,0.156326,0.085047,0.15585,0.066555,0.055501,0.067469,0.067469,0.071005,0.070138,0.071005,0.08067,0.086501,0.100763,0.094557,0.097089,0.054383,0.105454,0.101361,0.099554,0.115804,0.103136,0.108834,0.108834,0.118369,0.118876,0.125258,0.057671,0.134449,0.129018,0.137077,0.129018,0.140499,...,0.475448,0.309314,0.175371,0.111028,0.024851,0.01572,0.031429,0.059761,0.169997,0.386808,0.450303,0.047114,0.276154,0.143833,0.219078,0.373179,0.187433,0.115284,0.212884,0.395754,0.084363,0.232035,0.164425,0.019252,0.095831,0.185897,0.274656,0.095196,0.101957,0.124779,0.202045,0.280397,12.333897,12.395443,11.391534,12.318465,7.795321,5.181686,6.929087,2.959956
min,0.0,0.0,5826.0,0.0,0.0,0.0,-37.820457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2022.75,31.0,203105.8,0.147911,0.28292,0.390211,-9.76915,0.461737,0.16874,0.149483,0.14373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.296034,9.812512,12.538182,15.395709,5.937573,4.982271,5.012983,5.472414e-09
50%,4045.5,42.0,235738.5,0.249037,0.449434,0.50895,-7.14534,0.635327,0.219955,0.184954,0.172111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.412903,17.557973,20.494183,23.583811,9.048621,7.357502,7.158962,2.601698e-06
75%,6068.25,52.0,272485.5,0.508564,0.64487,0.616651,-4.906935,0.770725,0.321921,0.22753,0.205584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.07728,27.466832,28.693451,32.444304,13.554561,10.294577,9.641747,0.0004336488
max,8091.0,82.0,2135773.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,78.0,77.405601,75.802197,68.77264,73.08435,63.0,73.136868,66.0


In [16]:
for col in df.columns:
  if col != 'genre' and df[col].max() > 1.1:
    print(col)
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

print('df shape: {0}'.format(df.shape))
df.head()

Unnamed: 0
popularity
duration_ms
duration_ms_log
tempo_max
popularity*acousticness
popularity*positiveness
popularity*danceability
popularity*energy
popularity*liveness
popularity*speechiness
popularity*instrumentalness
popularity*loudness_10
df shape: (8092, 138)


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,popularity_0,popularity_1,popularity_10,popularity_11,popularity_12,popularity_13,popularity_14,popularity_15,popularity_16,popularity_17,popularity_18,popularity_19,popularity_2,popularity_20,popularity_21,popularity_22,popularity_23,popularity_24,popularity_25,popularity_26,popularity_27,popularity_28,popularity_29,popularity_3,popularity_30,popularity_31,popularity_32,popularity_33,popularity_34,...,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,popularity*acousticness,popularity*positiveness,popularity*danceability,popularity*energy,popularity*liveness,popularity*speechiness,popularity*instrumentalness,popularity*loudness_10
0,0.0,0.134146,0.091677,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.015909,0.022346,0.027259,0.14298,0.054721,0.068114,0.133691,0.002172685
1,0.000124,0.841463,0.142101,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.089641,0.308929,0.50469,0.877299,0.183056,0.176877,0.116902,2.970368e-06
2,0.000247,0.52439,0.089861,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.273667,0.147429,0.259605,0.275067,0.12776,0.251896,0.097874,3.616218e-10
3,0.000371,0.54878,0.138626,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.095577,0.142741,0.211683,0.568419,0.232144,0.161912,0.107921,5.557519e-06
4,0.000494,0.695122,0.127478,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.139373,0.572593,0.624484,0.538855,0.132058,0.201298,0.176159,0.0001005623


# data

In [17]:
from keras.utils import to_categorical

X_train = df.iloc[:4046, :].values
X_test = df.iloc[4046:, :].values
cY_train = to_categorical(df_train['genre'])

print('X_train shape: {0}, cY_train shape: {1}'.format(X_train.shape, cY_train.shape,))

X_train shape: (4046, 138), cY_train shape: (4046, 11)


# cross validation

|index|notes|consideration|accuracy|f1 score|LB|
|:--:|:--:|:--|:--:|:--:|:--:|
|0|baseline| duration_ms_log loudness_10が利いてる?!|0.654*|0.639*||
|1|[0]+missing value -> one hot encoding|[0]と比べてaccuracyが高いことから悲観する必要はない気がする.<br>中間層増やすか, monitor=val_accuracyで精度が向上する可能性あり.|0.659*|0.580*||
|2|[0]+popularity*|popularityの貢献度が図りしれない.<br>他にもpopularityと組み合わせて特徴量を増やす手段があるかもしれない.|0.694*|0.665*||
|3|[1]+lr=1e-5|欠損値はあまり結果に影響しないまでか,邪魔をするまである.|0.617*|0.553*||
|4|[2] monitor=val_accuracy|val_lossとval_accuracyの違いは分からないので,両方試すしかない|0.694*|0.637*||
|5|[2]+137-200-100-100-11|多分epochsが足りない|0.662*|0.628*||
|6|[5]+epochs=1000|3層がbestだね|0.644*|0.582*||
|7|[2]+CustomCallbacks|変わらないが,安心感あり|0.681*|0.657* - 0.629|0.613|
|8|[7]+lr=1e-5+epochs=1000|変化が激しいのはなぜ?-> epochsが足りない?!|0.647*|0.614*||
|9|[7]+lr=1e-4|間違えてるで（笑）|0.672*|0.673* - 0.601|0.|
|10|[7]+lr=2e-5|||0.564|0.|
|11|[7]+lr=1e-5+epochs=1500||||0.|

## cv

In [18]:
# from time import time

# EPOCHS = 500
# BATCH_SIZE = 4

# start_time = time()
# acc, f1_macro = cross_val_score_for_keras(X_train, cY_train, EPOCHS, BATCH_SIZE, 10)
# elapsed_time = time() - start_time

# print('Elapsed time: {0:.3f} m'.format(elapsed_time / 60))
# print('accuracy: {0:.3f}, f1 macro: {1:.3f} with (CV=10)'.format(np.mean(acc), np.mean(f1_macro)))

## cv for ensemble

In [19]:
from time import time

EPOCHS = 500
BATCH_SIZE = 4

start_time = time()
f1_macro, model_list = cross_val_score_for_ensemble(X_train, cY_train, EPOCHS, BATCH_SIZE, 10)
elapsed_time = time() - start_time

print('Elapsed time: {0:.3f} hrs'.format(elapsed_time / 3600))
print('f1 macro for cv: {0:.3f}'.format(np.mean(f1_macro)))

f1 macro: 0.652
f1 macro: 0.628
f1 macro: 0.590
f1 macro: 0.621
f1 macro: 0.614
f1 macro: 0.657
f1 macro: 0.649
f1 macro: 0.612
f1 macro: 0.590
f1 macro: 0.615
Elapsed time: 2.712 hrs
f1 macro for cv: 0.623


In [20]:
# model = create_model()
# history = model.fit(X_train, cY_train, 
#                     epochs=500, 
#                     batch_size=4,
#                     verbose=1,)

In [21]:
# import matplotlib.pyplot as plt

# acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# epochs = range(1, len(acc) + 1)

# plt.plot(epochs, acc, 'b', label='Training acc')
# plt.plot(epochs, val_acc, 'b', color='orange', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.legend()

# plt.show()

# plt.plot(epochs, loss, 'b', label='Training loss')
# plt.plot(epochs, val_loss, 'b', color='orange', label='Validation loss')
# plt.title('Training and validation loss')
# plt.legend()

# plt.show()

# submission


## 相加平均

In [27]:
import numpy as np

preds = model_list[0].predict(X_test)*f1_macro[0]/np.sum(f1_macro)

for i in range(1, 10):
  preds += model_list[i].predict(X_test)*f1_macro[i]/np.sum(f1_macro)

preds = np.argmax(preds, axis=1)

df_sub = pd.concat([df_test['index'].reset_index(drop=True), pd.DataFrame(preds, columns=['predicted']).astype(int)], axis=1)
df_sub.to_csv('submission.csv', index=False, header=False)

df_sub.head()

Unnamed: 0,index,predicted
0,4046,7
1,4047,10
2,4048,10
3,4049,10
4,4050,8


In [28]:
import numpy as np

predicted =  np.argmax(np.mean([model_list[0].predict(X_test),
                                model_list[1].predict(X_test),
                                model_list[2].predict(X_test),
                                model_list[3].predict(X_test),
                                model_list[4].predict(X_test),
                                model_list[5].predict(X_test),
                                model_list[6].predict(X_test),
                                model_list[7].predict(X_test),
                                model_list[8].predict(X_test),
                                model_list[9].predict(X_test),], axis=0), axis=1)

df_sub = pd.concat([df_test['index'].reset_index(drop=True), pd.DataFrame(predicted, columns=['predicted']).astype(int)], axis=1)
df_sub.to_csv('am_submission.csv', index=False, header=False)

df_sub.head()

Unnamed: 0,index,predicted
0,4046,7
1,4047,10
2,4048,10
3,4049,10
4,4050,8


## 調和平均

In [23]:
# from scipy import stats

# predicted =  np.argmax(stats.hmean([model_list[0].predict(X_test),
#                                     model_list[2].predict(X_test),
#                                     model_list[3].predict(X_test),
#                                     model_list[4].predict(X_test),
#                                     model_list[5].predict(X_test),
#                                     model_list[6].predict(X_test),
#                                     model_list[7].predict(X_test),
#                                     model_list[8].predict(X_test),
#                                     model_list[9].predict(X_test),], axis=0), axis=1)

# df_sub = pd.concat([df_test['index'].reset_index(drop=True), pd.DataFrame(predicted, columns=['predicted']).astype(int)], axis=1)
# df_sub.to_csv('/content/drive/MyDrive/student_cup_2021/dataset/hm_submission.csv', index=False, header=False)

# df_sub.head()

# hold-out

In [24]:
# from sklearn.model_selection import train_test_split
# from keras.callbacks import EarlyStopping

# train_data, val_data, train_labels, val_labels = train_test_split(X_train, cY_train, test_size=0.2, shuffle=False, random_state=2021)

# model = create_model()
# history = model.fit(X_train, cY_train, 
#                     epochs=10, 
#                     batch_size=4,
#                     callbacks=[CustomCallback(model, val_data, val_labels)],
#                     verbose=1,
#                     validation_split=0.2)

# Colab

```javascript
function ClickConnect(){ 
console.log("Working"); 
document.querySelector("#comments > span").click()
}
setInterval(ClickConnect,500000)
```