# DNN Models
- Based on the 3rd place solution
- Make different DNN models by changing only the random seed value
- Ensemble the models using the power mean

### Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
%matplotlib inline

import time
import sys
import gc
import pickle
import platform
import os
import random
from tqdm import tqdm

# For scaling
from sklearn.preprocessing import StandardScaler

# For DNN modeling
import tensorflow as tf
import keras.backend as K

# Tensorflow warning off (TF 1.x only: this branch uses the tf.logging API).
# The major version is compared as an integer; a plain string comparison such
# as `tf.__version__ < '2'` would misclassify a major version of 10 or above.
if int(tf.__version__.split('.')[0]) < 2:
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)

import keras
from keras import backend as K
from keras.layers import * #Input, Dense
from keras.models import * #Model
from keras.optimizers import *
from keras.initializers import *
from keras.regularizers import *
from keras.utils.np_utils import *
from keras.utils.vis_utils import * #model_to_dot
from IPython.display import Image

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


### Load Data

In [2]:
# Read the two inputs of the notebook: `test` from a cp949-encoded CSV file
# and `df` from a pickled pandas object.
# NOTE(review): unpickling executes code embedded in the file, so 'data.pkl'
# must come from a trusted source.
test = pd.read_csv('test.csv', encoding='cp949')
df = pd.read_pickle('data.pkl')

In [1]:
# df.info()

### Split Data

In [6]:
# NOTE(review): every right-hand side below is a literal `...` (Ellipsis)
# placeholder in this copy of the notebook; the actual expressions that build
# the train / validation / test splits are not shown here.
X_train = ...
y_train = ...
X_valid = ...
y_valid = ...
X_test = ...

In [8]:
# Display the shapes of the training and test feature matrices.
X_train.shape, X_test.shape

((118130, 42), (13481, 42))

### Train & Evaluate the Model

#### Scale Data

In [9]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to the training, validation and test splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_valid, X_test = (
    scaler.transform(split) for split in (X_train, X_valid, X_test)
)

#### Build Models
Create multiple DNN models by changing only the random seed.

In [10]:
# RMSE function evaluated while the model is being trained
import keras.backend as K
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_pred` and `y_true`.

    Built from Keras backend ops (square, mean, sqrt) so it can be passed to
    `model.compile(metrics=[...])`.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Run this before modelling to obtain the same results on every run.
def get_reproducible_model(seed=1):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Despite its name, this function neither builds nor returns a model; it
    only fixes the global random seeds.

    Args:
        seed: Integer seed applied to all three generators. Defaults to 1,
            the value that used to be hard-coded.

    Returns:
        None.
    """
    random.seed(seed)           # Python
    np.random.seed(seed)        # NumPy
    # TensorFlow: the seeding function differs between 1.x and 2.x. Compare the
    # major version as an integer, because comparing only the first character
    # of the version string breaks for a major version of 10 or above.
    if int(tf.__version__.split('.')[0]) < 2:
        tf.set_random_seed(seed)
    else:
        tf.random.set_seed(seed)

In [None]:
# Function that picks a batch size
def _gpu_available():
    """Return True when Keras/TensorFlow reports a GPU device, else False."""
    try:
        from keras import backend as K
        # Private helper of the Keras TensorFlow backend; it may be missing,
        # in which case the TensorFlow device list below is used instead.
        return bool(K.tensorflow_backend._get_available_gpus())
    except Exception:
        pass
    try:
        from tensorflow.python.client import device_lib
        return any(device.device_type == 'GPU'
                   for device in device_lib.list_local_devices())
    except Exception:
        # Detection is best effort: if neither probe works, assume CPU only.
        return False


def _memory_used_percent():
    """Return the percentage of system memory in use, or None if unknown."""
    try:
        import psutil
        return float(psutil.virtual_memory().percent)
    except Exception:
        # psutil is an optional dependency; without it the memory rule is skipped.
        return None


def _batch_size_for_model(total_params, has_gpu, cpu_count):
    """Map parameter count and hardware to a batch size (default 16)."""
    size = 16
    if cpu_count > 15:
        # Machines with 16+ cores are only tuned when a GPU is present.
        if has_gpu:
            if total_params < 1000000:
                size = 64
            elif total_params < 2000000:
                size = 32
            elif total_params < 10000000:
                size = 16
            else:
                size = 8
    else:
        if total_params < 500000:
            size = 64
        elif total_params > 5000000:
            size = 8
    if total_params > 100000000:
        size = 1
    return size


def FindBatchSize(model):
    """Suggest a training batch size for `model`.

    Starts from 16, adjusts for the model's parameter count together with GPU
    availability and CPU core count, then lowers the value when system memory
    usage is high (>75% -> 8, >85% -> 4, >90% -> 2). Models with more than
    100,000,000 parameters always get a batch size of 1. Every probe is best
    effort: a failing probe skips its rule instead of raising.

    Args:
        model: Object exposing `count_params()`, i.e. a model architecture
            that is yet to be trained.

    Returns:
        int: The suggested batch size (also printed).
    """
    import os

    batch_size = 16

    try:
        total_params = int(model.count_params())
    except Exception:
        # The parameter count is unavailable: skip the size-based rules.
        total_params = None

    if total_params is not None:
        # os.cpu_count() may return None; treat that as a small machine.
        batch_size = _batch_size_for_model(
            total_params, _gpu_available(), os.cpu_count() or 1)

    memory_percent = _memory_used_percent()
    if memory_percent is not None:
        if memory_percent > 90.0:
            batch_size = 2
        elif memory_percent > 85.0:
            batch_size = 4
        elif memory_percent > 75.0:
            batch_size = 8

    # Very large models always use a batch size of 1, whatever the memory load.
    if total_params is not None and total_params > 100000000:
        batch_size = 1

    print("Batch Size:  " + str(batch_size))
    return batch_size

In [None]:
# NOTE(review): in the visible notebook `model` is first assigned inside the
# training loop further down, so this call only works once a model exists.
FindBatchSize(model)

In [11]:
# Create the folder in which the prediction files will be stored
folder = 'Ensemble'
os.makedirs(folder, exist_ok=True)

In [12]:
# Train 10 DNNs that differ only in their random seed and write one prediction
# file per model into `folder`.

# A bare expression is only rendered when it is the last statement of a cell,
# so output produced inside the loop has to go through display().
from IPython.display import display

# The TensorFlow seeding function differs between 1.x and 2.x; decide once,
# comparing the major version as an integer.
_tf_major = int(tf.__version__.split('.')[0])

for i in tqdm(range(10)):
    # Drop the backend state left by the previous iteration so that models
    # built in earlier iterations do not accumulate in memory.
    K.clear_session()

    # Draw a fresh seed and apply it to Python, NumPy and TensorFlow
    SEED = np.random.randint(1, 10000)
    random.seed(SEED)
    np.random.seed(SEED)
    if _tf_major < 2:
        tf.set_random_seed(SEED)
    else:
        tf.random.set_seed(SEED)

    # Define the NN architecture: three Dense+Dropout stages (64, 32, 16
    # units), each followed by a linear Dense layer added back to its input.
    # Named `inputs` so the builtin `input` is not shadowed.
    inputs = Input(shape=(X_train.shape[1],))
    x = Dense(64, activation='elu')(inputs)
    x = Dropout(0.3)(x)
    x1 = Dense(64)(x)
    x = Add()([x1, x])
    x = Dense(32, activation='elu')(x)
    x = Dropout(0.3)(x)
    x1 = Dense(32)(x)
    x = Add()([x1, x])
    x = Dense(16, activation='elu')(x)
    x = Dropout(0.3)(x)
    x1 = Dense(16)(x)
    x = Add()([x1, x])
    output = Dense(1)(x)
    model = Model(inputs, output)

    # Choose the optimizer and the cost function
    model.compile(loss='mse', optimizer='adam', metrics=[rmse])

    # Explain the model: text summary plus a rendered graph
    model.summary()
    display(Image(model_to_dot(model, show_shapes=True, show_layer_names=False).create(prog='dot', format='png')))

    # Train the model
    # NOTE(review): patience (200) is not smaller than epochs (200), so this
    # callback can never stop training early; lower patience if early stopping
    # is intended.
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=200)]
    hist = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=2048, epochs=200,
                     callbacks=callbacks, shuffle=False, verbose=0)

    # Plot the training and validation loss curves
    plt.plot(hist.history['loss'], label="train loss")
    plt.plot(hist.history["val_loss"], label="validation loss")
    plt.legend()
    plt.xlabel('epoch')
    plt.title("Loss")
    plt.show()

    # Make submissions
    submission = pd.DataFrame({
        "xx": test.item_id,
        # predictions are limited to the range 0 to 20
        "yy": model.predict(X_test).clip(0, 20).flatten()
    })
    t = pd.Timestamp.now()
    fname = f"{folder}/dnn_submission_{t.month:02}{t.day:02}_s{SEED:05}.csv"
    submission.to_csv(fname, index=False)

100%|██████████| 10/10 [16:54<00:00, 101.48s/it]


#### Ensemble Models 
Ensemble the generated DNN models by taking their power mean.

In [13]:
# Combine every two-column CSV found in `folder` (id column first, prediction
# column second) into one table, then take the power mean of the predictions.
nf = 0      # number of prediction files merged so far
key = None  # name of the id column, taken from the first file that is read
for f in sorted(os.listdir(folder)):  # sorted: os.listdir order is unspecified
    if os.path.splitext(f)[-1] != '.csv':
        continue
    s = pd.read_csv(os.path.join(folder, f))
    if len(s.columns) != 2:
        continue
    if key is None:
        # Use whatever the id column is called in the files instead of a
        # hard-coded name that may not match what was written.
        key = s.columns[0]
    # Give each prediction column a unique name. Merging many frames that all
    # share one prediction column name makes pandas' automatic _x/_y suffixes
    # collide after a few merges.
    s.columns = [key, f"pred_{nf}"]
    slist = s if nf == 0 else pd.merge(slist, s, on=key)
    nf += 1

p = 4.5  # performance depends on this value (p=1: arithmetic mean, p>1: power mean)
if nf >= 2:
    # Power mean over the nf prediction columns: (mean(x_i ** p)) ** (1 / p).
    # skipna=False keeps a missing prediction visible as NaN in the result.
    pred = (slist.iloc[:, 1:] ** p).mean(axis=1, skipna=False) ** (1 / p)

    submission = pd.DataFrame({'xx': slist[key], 'yy': pred})
    t = pd.Timestamp.now()
    fname = f"p{p}mean_submission_{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"
    submission.to_csv(fname, index=False)

# End