# static

In [1]:
# File the cross-validation helper checkpoints the best model to and reloads it from.
MODEL_PATH = 'model.h5'

# Dataset CSV locations; only TRAIN_PATH is read by the cells shown in this notebook.
TEST_PATH = '/content/drive/MyDrive/student_cup_2021/dataset/test.csv'
TRAIN_PATH = '/content/drive/MyDrive/student_cup_2021/dataset/train.csv'

# function

## genre

In [2]:
def prep_genre(data):
  """Collapse a genre code into one of three coarse labels.

  Code 10 becomes 'rock', code 8 becomes 'pop', and every other value
  becomes 'others'.
  """
  if data == 10:
    return 'rock'
  return 'pop' if data == 8 else 'others'

## creating model

In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

def create_model(input_dim=None, n_classes=None, learning_rate=1e-5):
  """Build and compile a small dense softmax classifier.

  The network has two ReLU hidden layers (100 and 50 units) followed by a
  softmax output layer, and is compiled with Adam, categorical cross-entropy
  loss and the accuracy metric.

  Args:
    input_dim: Number of input features. When None, the notebook-level
      ``X_train.shape[1]`` is used (the original behaviour).
    n_classes: Number of output units. When None, the notebook-level
      ``cY_train.shape[1]`` is used (the original behaviour).
    learning_rate: Learning rate handed to the Adam optimizer.

  Returns:
    The compiled ``Sequential`` model.
  """
  # Fall back to the notebook globals so existing zero-argument calls keep working.
  if input_dim is None:
    input_dim = X_train.shape[1]
  if n_classes is None:
    n_classes = cY_train.shape[1]

  model = Sequential()

  model.add(Dense(100, activation='relu', input_shape=(input_dim,)))
  model.add(Dense(50, activation='relu'))
  model.add(Dense(n_classes, activation='softmax'))

  # `lr` is a deprecated alias that newer Keras releases reject;
  # `learning_rate` is the supported keyword.
  model.compile(optimizer=Adam(learning_rate=learning_rate),
                loss='categorical_crossentropy',
                metrics=['accuracy'],)

  return model

## cross validation

In [4]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import accuracy_score, f1_score

def cross_val_score_for_keras(X_train, cY_train, epochs, batch_size, n_splits=10):
  """Run stratified k-fold cross validation of the model built by ``create_model``.

  For every fold a fresh model is trained, the model with the lowest
  validation loss is checkpointed to ``MODEL_PATH`` and reloaded, and accuracy
  and macro F1 are computed on that fold's validation split. A per-fold
  summary line is printed.

  Note that the reported scores are measured on the same split that selected
  the checkpoint, so they can be optimistic. ``create_model()`` is called
  without arguments, so the network is sized from the notebook-level
  ``X_train`` / ``cY_train`` rather than from the arguments passed here.

  Args:
    X_train: Feature array; rows are selected with the fold indices.
    cY_train: One-hot label array; ``argmax(axis=1)`` supplies the class ids
      used for stratification and scoring.
    epochs: Number of training epochs per fold.
    batch_size: Mini-batch size passed to ``model.fit``.
    n_splits: Number of folds.

  Returns:
    Tuple ``(acc, f1_macro)`` of per-fold accuracy and macro-F1 lists.
  """
  skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)

  acc = []
  f1_macro = []

  for train_idx, val_idx in skf.split(X_train, cY_train.argmax(axis=1)):
    # creating dataset #
    train_data = X_train[train_idx]
    train_labels = cY_train[train_idx]
    # Plain array: the original trailing comma wrapped this in a 1-tuple.
    val_data = X_train[val_idx]
    val_labels = cY_train[val_idx]
    # creating model, save & load the best model #
    model = create_model()
    callbacks_list = [ModelCheckpoint(filepath=MODEL_PATH, monitor='val_loss', save_best_only=True, mode='min'),]
    model.fit(train_data, train_labels,
              epochs=epochs,
              batch_size=batch_size,
              verbose=0,
              callbacks=callbacks_list,
              validation_data=(val_data, val_labels))
    model = load_model(MODEL_PATH)
    # calculate the metrics (predict once and reuse for both scores) #
    y_true = np.argmax(val_labels, axis=1)
    y_pred = np.argmax(model.predict(val_data), axis=1)
    acc_score = accuracy_score(y_true, y_pred)
    acc.append(acc_score)
    score = f1_score(y_true, y_pred, average='macro')
    f1_macro.append(score)

    print('accuracy: {0:.3f}, f1 macro: {1:.3f}'.format(acc_score, score))

  return acc, f1_macro

# Loading data

In [5]:
import pandas as pd

# Read the training table from the configured CSV path.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_PATH)

# Report the frame's shape, then show its first five rows as the cell output.
print('df_train shape: {0}'.format(df_train.shape))
df_train.head(5)

df_train shape: (4046, 14)


Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


# Preprocessing

## index

In [6]:
# Remove the 'index' column so it is not carried into the feature matrix
# (X_train is later built from every column except 'genre').
df_train = df_train.drop(columns=['index'])

print('df_train shape: {0}'.format(df_train.shape))
df_train.head(5)

df_train shape: (4046, 13)


Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


## genre

In [7]:
# Collapse the numeric genre codes into 'rock' / 'pop' / 'others' element-wise.
genre_labels = df_train['genre'].map(prep_genre)
df_train['genre'] = genre_labels

print('df_train shape: {0}'.format(df_train.shape))
df_train.head(5)

df_train shape: (4046, 13)


Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,rock,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,pop,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,others,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,rock,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,others,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


In [8]:
from sklearn.preprocessing import LabelEncoder

# Turn the string genre labels into integer class ids; in the output below
# 'others' -> 0, 'pop' -> 1, 'rock' -> 2.
le = LabelEncoder()
le.fit(df_train['genre'])
df_train['genre'] = le.transform(df_train['genre'])

print('df_train shape: {0}'.format(df_train.shape))
df_train.head(5)

df_train shape: (4046, 13)


Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,2,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,0,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,2,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,0,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


## tempo

In [9]:
# 'tempo' holds a "low-high" range string; keep its upper bound as a numeric feature.
df_train['tempo_max'] = df_train['tempo'].str.split('-').str.get(1).astype(int) # max in tempo
# One 0/1 indicator column per tempo range. The dtype is pinned because
# pandas >= 2.0 defaults to bool, which would change the displayed values and
# make the later `.values` call produce an object array.
df_train = pd.concat([df_train, pd.get_dummies(df_train['tempo'], dtype='uint8')], axis=1)     # binning
df_train = df_train.drop(['tempo'], axis=1)

print('df_train shape: {0}'.format(df_train.shape))
df_train.head()

df_train shape: (4046, 25)


Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,region,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120
0,2,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,region_H,152,0,1,0,0,0,0,0,0,0,0,0,0
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,region_I,176,0,0,1,0,0,0,0,0,0,0,0,0
2,0,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,region_E,76,0,0,0,0,0,0,0,0,0,1,0,0
3,2,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,region_C,192,0,0,0,1,0,0,0,0,0,0,0,0
4,0,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,unknown,120,0,0,0,0,0,0,0,0,0,0,0,1


## region

In [10]:
# One 0/1 indicator column per region value (including 'unknown'). The dtype
# is pinned because pandas >= 2.0 defaults to bool, which would change the
# displayed values and make the later `.values` call produce an object array.
df_train = pd.concat([df_train, pd.get_dummies(df_train['region'], dtype='uint8')], axis=1)
df_train = df_train.drop(['region'], axis=1)

print('df_train shape: {0}'.format(df_train.shape))
df_train.head()

df_train shape: (4046, 45)


Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown
0,2,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,152,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,176,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,76,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,120,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## missing value

In [11]:
col_list = ['acousticness', 'positiveness', 'danceability', 'energy', 'liveness', 'speechiness', 'instrumentalness']

# Record, per listed column, a 0/1 flag marking rows whose value is missing.
# The flags are computed before imputation so they reflect the raw data.
nan_flags = {col + '_nan': df_train[col].isna().astype(int) for col in col_list}
df_train = df_train.assign(**nan_flags)

# Replace every remaining missing value with its column mean.
column_means = df_train.mean()
df_train = df_train.fillna(column_means)

print('df_train shape: {0}'.format(df_train.shape))
df_train.head(5)

df_train shape: (4046, 52)


Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,acousticness_nan,positiveness_nan,danceability_nan,energy_nan,liveness_nan,speechiness_nan,instrumentalness_nan
0,2,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,152,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,176,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,76,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,192,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,120,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


## normalization (min-max scaling)

In [12]:
# Summary statistics for every column; presumably inspected to see which
# features are not already within [0, 1] before the scaling cell below.
df_train.describe()

Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,acousticness_nan,positiveness_nan,danceability_nan,energy_nan,liveness_nan,speechiness_nan,instrumentalness_nan
count,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0,4046.0
mean,0.98344,41.056105,242141.0,0.346455,0.4641,0.504347,-7.715659,0.603663,0.265986,0.198655,0.214336,134.53213,0.002224,0.349728,0.103312,0.031389,0.012852,0.000989,0.000247,0.001236,0.003213,0.023233,0.185615,0.285961,0.003213,0.080326,0.020761,0.049184,0.175482,0.033613,0.014829,0.044736,0.185863,0.006179,0.056846,0.027929,0.000741,0.010875,0.034108,0.086258,0.008898,0.012358,0.014335,0.042017,0.091448,0.0,0.002472,0.001977,0.0,0.000741,0.001977,0.000247
std,0.823014,16.165708,85202.41,0.241004,0.224774,0.158258,4.10964,0.20102,0.155712,0.083474,0.154262,30.432382,0.047117,0.476943,0.304403,0.174388,0.112651,0.031431,0.015721,0.035136,0.0566,0.150661,0.388844,0.451927,0.0566,0.271831,0.142602,0.21628,0.380426,0.180254,0.120885,0.206748,0.389044,0.078373,0.231577,0.164789,0.027223,0.103727,0.181528,0.280779,0.093919,0.110491,0.118883,0.200652,0.288281,0.0,0.04966,0.044428,0.0,0.027223,0.044428,0.015721
min,0.0,0.0,5998.0,0.0,0.0,0.013839,-37.820457,0.003383,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,31.0,204442.0,0.149705,0.276496,0.393138,-9.775363,0.462137,0.168575,0.148849,0.143298,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,42.0,235873.5,0.250711,0.450598,0.510664,-7.18946,0.634078,0.218596,0.183322,0.171709,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,52.0,272402.0,0.523088,0.644149,0.617067,-4.876553,0.768768,0.31771,0.224863,0.20547,152.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,82.0,2135773.0,1.0,0.989661,1.0,0.0,1.0,1.0,0.886806,1.0,220.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0


In [13]:
col_list = ['popularity', 'duration_ms', 'loudness', 'tempo_max']

# Min-max scale each listed column to [0, 1] using its own minimum and maximum.
for col in col_list:
  col_min = df_train[col].min()
  col_range = df_train[col].max() - col_min
  df_train[col] = (df_train[col] - col_min) / col_range

print('df_train shape: {0}'.format(df_train.shape))
df_train.head(5)

df_train shape: (4046, 52)


Unnamed: 0,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo_max,0-40,121-152,153-176,177-192,193-208,209-220,41-50,51-56,57-63,64-76,77-96,97-120,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,region_K,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown,acousticness_nan,positiveness_nan,danceability_nan,energy_nan,liveness_nan,speechiness_nan,instrumentalness_nan
0,2,0.134146,0.091604,0.112811,0.157247,0.187841,0.950163,0.893918,0.363568,0.390108,0.888884,0.622222,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0.841463,0.142031,0.101333,0.346563,0.554444,0.853347,0.874409,0.193892,0.161497,0.12391,0.755556,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0.52439,0.089787,0.49642,0.265391,0.457642,0.755273,0.439933,0.217146,0.369057,0.16647,0.2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,0.54878,0.138556,0.165667,0.245533,0.356578,0.865449,0.868704,0.377025,0.226677,0.175399,0.844444,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0.695122,0.127408,0.19072,0.777578,0.830479,0.895985,0.650149,0.169323,0.222488,0.22603,0.444444,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


# data

In [14]:
from keras.utils import to_categorical

# Feature matrix: every column except the target. The explicit float cast
# guarantees a numeric array even when the indicator columns are bool
# (pandas >= 2.0 `get_dummies` default), where plain `.values` would yield an
# object array that Keras cannot convert to a tensor.
X_train = df_train.drop(['genre'], axis=1).values.astype(float)
# One-hot encode the integer genre ids for the softmax / categorical cross-entropy model.
cY_train = to_categorical(df_train['genre'])

print('X_train shape: {0}, cY_train shape: {1}'.format(X_train.shape, cY_train.shape,))

X_train shape: (4046, 51), cY_train shape: (4046, 3)


# cross validation

|notes|accuracy|f1 score|
|:--:|:--:|:--:|
|10-8-others|0.655|0.655|

In [15]:
from time import time

EPOCHS = 500
BATCH_SIZE = 4

# Time the whole 10-fold run; the helper prints one score line per fold.
start_time = time()
acc, f1_macro = cross_val_score_for_keras(
    X_train, cY_train, epochs=EPOCHS, batch_size=BATCH_SIZE, n_splits=10)
elapsed_time = time() - start_time

# Report wall-clock minutes and the mean of the per-fold scores.
elapsed_minutes = elapsed_time / 60
print('Elapsed time: {0:.3f} m'.format(elapsed_minutes))
print('accuracy: {0:.3f}, f1 macro: {1:.3f} with (CV=10)'.format(np.mean(acc), np.mean(f1_macro)))

accuracy: 0.674, f1 macro: 0.675
accuracy: 0.657, f1 macro: 0.658
accuracy: 0.679, f1 macro: 0.678
accuracy: 0.637, f1 macro: 0.637
accuracy: 0.667, f1 macro: 0.666
accuracy: 0.672, f1 macro: 0.672
accuracy: 0.653, f1 macro: 0.653
accuracy: 0.661, f1 macro: 0.661
accuracy: 0.599, f1 macro: 0.601
accuracy: 0.651, f1 macro: 0.652
Elapsed time: 86.998 m
accuracy: 0.655, f1 macro: 0.655 with (CV=10)


# hold out

In [16]:
# from sklearn.utils import class_weight

# model = create_model()
# history = model.fit(X_train, cY_train, 
#                     epochs=500, 
#                     batch_size=4,
#                     verbose=1,
#                     validation_split=0.2)

In [17]:
# import matplotlib.pyplot as plt

# acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# epochs = range(1, len(acc) + 1)

# plt.plot(epochs, acc, 'b', label='Training acc')
# plt.plot(epochs, val_acc, 'b', color='orange', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.legend()

# plt.show()

# plt.plot(epochs, loss, 'b', label='Training loss')
# plt.plot(epochs, val_loss, 'b', color='orange', label='Validation loss')
# plt.title('Training and validation loss')
# plt.legend()

# plt.show()