### 최종 결과물 : Catboost + NN Ensemble (Public : 5.226109375, Private : 8.73413)
### 목차
#### 0. Library
#### 1. Feature Engineering
##### 1-1) Catboost Data
- 데이터 원본 => 최초데이터 형태로 복구
- Mean Target Encoding
##### 1-2) NN Data
- Step1
    - 데이터 원본
    - 격자공간 정보 기반 외부 데이터 병합
- Step2
    - Step1 Data -> 일부 Feature 제외 Onehot Encoding -> Deep Auto Encoder
    - Step1 Data -> Mean Target Encoding -> Polynomial Features
#### 2. Catboost Modeling
- Catboost Data
- Catboost Regressor에 통째로 넣어 예측
#### 3. NN Modeling
- NN Data
- Fully Connected Network 구성
#### 4. Ensemble & Submission

## 0. Library

In [1]:
## Loading & Save Data
import pandas as pd
import numpy as np
from glob import glob
import os

## print log
import traceback

## Feature Generation

## NN Modeling & DAE
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
tf.compat.v1.enable_eager_execution()
import tensorflow.keras.backend as K

# Restrict CUDA to device "0" for this process.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth on every GPU TensorFlow can see.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Print the error and keep going instead of aborting the notebook.
        print(e)

# Others
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


In [2]:
def return_warning_messages(e, step) : 
    """
    Print an error report that includes the current traceback.

    Parameters :
        e    : the error (message) to show
        step : label of the step in which the error happened
    Output :
        prints the step, the error and traceback.format_exc() between
        two '=' separator lines; returns nothing
    """
    separator = '=' * 10
    report_lines = [
        separator,
        f"[ERROR : {step}] Unexpected error : {e}",
        traceback.format_exc(),
        separator,
    ]
    print('\n'.join(report_lines) + '\n')

## 1. Feature Engineering

### 1-0) Function for both data

In [3]:
def load_data(path): 
    """
    Load the train/test data either from raw CSV files or from a pickle bundle.

    Parameters :
        path : location of the data
            list / tuple -> [train_csv_path, test_csv_path]; both files are
                            read as cp949-encoded CSV and their columns renamed
            anything else -> path of a pickle file holding
                            (train, test, describe)
    Returns :
        (train, test)            when a CSV path pair is given
        (train, test, describe)  when a pickle path is given
    """
    if isinstance(path, (list, tuple)) : 
        train_path, test_path = path[0], path[1]
        train = pd.read_csv(train_path, encoding = 'cp949')
        test = pd.read_csv(test_path, encoding = 'cp949')

        # The train file carries one extra column (the target) after the shared ones.
        kor_columns = ['index', '송장인번호', '수하인번호', '카테고리_대', '카테고리_중']
        train.columns = kor_columns + ['운송장_건수']
        test.columns = kor_columns
        return train, test

    import pickle
    # NOTE: unpickling can execute arbitrary code; only load files this project wrote.
    with open(path, 'rb') as f : 
        train, test, describe = pickle.load(f)
    return train, test, describe
            

In [4]:
def mean_target_encoding(train, test, columns) : 
    """
    Mean target encoding: replace each category by the mean of the target
    ('운송장_건수') observed for that category in the train data.

    Pros : no artificial ordering as with label encoding, no dimensionality
           blow-up as with one-hot encoding.
    Cons : needs periodic refreshing in production, must be applied carefully
           to avoid data leakage, can overfit the train data.

    Parameters :
        train    : frame the category means are computed from
        test     : frame that receives the same mapping
        columns  : names of the columns to encode
    Returns :
        train : the same train frame with a 'Mean_<column>' column added per input column
        test  : the same test frame with a 'Mean_<column>' column added per input column
    """
    target = '운송장_건수'
    # The target has many outliers, so categories unseen in train fall back to the median.
    fallback = train[target].median()
    for source in columns :
        encoded = 'Mean_' + source
        lookup = train.groupby(source)[target].mean().to_dict()
        train[encoded] = train[source].map(lookup)
        test[encoded] = test[source].map(lookup).fillna(fallback)
    return train, test

In [5]:
def save_data(train, test, describe, path) : 
    """
    Pickle [train, test, describe] to `path` so the modeling stage can load it later.

    Parameters : 
        train    : train data to store
        test     : test data to store
        describe : free-form metadata stored next to the data
        path     : destination file; its parent directory is created when
                   missing and an existing file is replaced
    """
    import pickle

    parent = os.path.dirname(path)
    if parent :
        os.makedirs(parent, exist_ok=True)

    # Test the path itself: os.listdir() yields bare file names, which never
    # equal a path such as './data/X.pkl'.
    if os.path.isfile(path) :
        os.remove(path)
        print('Update Data')

    with open(path, 'wb') as f :
        pickle.dump([train, test, describe], f)

In [6]:
def save_prediction(sample, submission_path, pred, y_train) : 
    """
    Clip the predictions from below and write them as a submission CSV.

    Parameters : 
        sample              : submission template; its 'INVC_CONT' column is overwritten
        submission_path     : CSV file to write (parent directory is created when missing)
        pred                : predicted values (numpy array or pandas Series); not modified
        y_train             : train target, used only for post-processing:
                              predictions below y_train.min() are raised to that minimum
    """
    out_dir = os.path.dirname(submission_path)
    if out_dir and not os.path.isdir(out_dir) :
        os.makedirs(out_dir, exist_ok=True)
        print('Made paths in submissions')

    minimum = y_train.min()
    # Work on a copy so the caller's prediction array is left untouched.
    pred = pred.copy()
    pred[pred < minimum] = minimum

    summary = pd.DataFrame(pred).describe()
    try :
        display(summary)   # rich rendering, only defined inside IPython / Jupyter
    except NameError :
        print(summary)

    sample['INVC_CONT'] = pred
    sample.to_csv(submission_path, index=False)
    print(f'=== Prediction Saved At : {submission_path} ===')

### 1-1 ) Catboost Data

In [7]:
def cat_clear_data(train, test) : 
    """
    Restore the sender/receiver codes to their original (coarser) form.

    Each value of '송장인번호' and '수하인번호' is converted to a string, cut to its
    first five characters and padded with seven '0' characters. Both frames
    are modified in place and returned.
    """
    def _to_base_code(value) :
        return str(value)[:5] + '0' * 7

    # Series.map is used per column because DataFrame.applymap is deprecated
    # in recent pandas releases.
    for frame in (train, test) :
        for column in ('송장인번호', '수하인번호') :
            frame[column] = frame[column].map(_to_base_code)
    return train, test

In [8]:
def __catboost_engineering_main__(save_path) : 
    """
    Build the CatBoost dataset and pickle it to `save_path`.

    Stages: (1) load the raw CSV files, (2) restore the original code format,
    (3) add mean target encodings, (4) save. A failing stage is reported via
    return_warning_messages with its stage number and the function returns 1;
    on success nothing is returned.
    """
    try : 
        step = 1

        # STEP 1 : load the raw data
        try : 
            raw_paths = ['./data/train_df.csv', './data/test_df.csv']
            train, test = load_data(raw_paths)
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1
        else :
            step += 1

        # STEP 2 : bring the codes back to their initial form
        try : 
            train, test = cat_clear_data(train, test)
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1
        else :
            step += 1

        # STEP 3 : mean target encoding on every feature column
        try : 
            feature_columns = train.columns.drop(['index', '운송장_건수']).tolist()
            train, test = mean_target_encoding(train, test, feature_columns)
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1
        else :
            step += 1

        # STEP 4 : persist the result
        try : 
            save_data(
                train,
                test,
                'Data Type : Pandas DataFrame\nRaw Data + Mean Target Encoding',
                path = save_path,
            )
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1

    except Exception as e: 
        return_warning_messages(e, step = 0)
        return 1

### 1-2) NN Data

In [9]:
def NN_clear_data(train, test) :
    """
    Attach the 50m grid data published by the Korea Research Institute for
    Human Settlements to the train/test frames.
    Data link : https://www.bigdata-region.kr/#/dataset/0ad3c882-f7ee-4faf-970d-00c53cb65a84

    The merged frames are cached as ./data/NN_train.csv and ./data/NN_test.csv.
    When both cache files already exist the arguments are ignored and the
    cached frames are returned.

    Parameters : 
        train : raw train frame; grid columns are added to it in place
        test  : raw test frame; grid columns are added to it in place
    Returns :
        train, test : the frames re-read from the cache files
    Raises :
        FileNotFoundError : no grid CSV is available under ./data/50m/
    """
    from glob import glob
    from datetime import datetime

    train_cache = './data/NN_train.csv'
    test_cache = './data/NN_test.csv'

    if not (os.path.isfile(train_cache) and os.path.isfile(test_cache)) : 
        print('=== Installing New Data Initiated === / ', datetime.today())
        grid_paths = glob('./data/50m/*.csv')
        if not grid_paths : 
            # Fail with a clear message instead of crashing later on an empty frame.
            raise FileNotFoundError('50M 격자공간 데이터가 존재하지 않습니다.\n 데이터를 다운받아주세요')

        # Every file is cast to str before a single concat, so the lookup keys
        # are strings and the frame is not rebuilt once per file.
        grid_50m = pd.concat([pd.read_csv(path).astype('str') for path in grid_paths], axis = 0)
        grid_ids = grid_50m['격자공간고유번호']
        lookups = {
            'name' : dict(zip(grid_ids, grid_50m['격자공간명'])),
            'code' : dict(zip(grid_ids, grid_50m['시군구코드'])),
            'sigu' : dict(zip(grid_ids, grid_50m['시군구명'])),
        }

        for frame in (train, test) :
            for prefix in ('송장인', '수하인') :
                keys = frame[prefix + '번호'].astype('str')
                for suffix, lookup in lookups.items() :
                    frame[f'{prefix}_{suffix}'] = keys.map(lookup)

        train.to_csv(train_cache, index=False)
        test.to_csv(test_cache, index= False)
        print('=== Installing New Data Finishied === / ', datetime.today())
    else : 
        print('=== NN Data Exist ===')
    train = pd.read_csv(train_cache)
    test = pd.read_csv(test_cache)
    return train, test

In [10]:
def polynomial_features(train, test, columns) : 
    """
    Generate degree-2 polynomial features with sklearn's PolynomialFeatures
    (products and squares of the selected columns, computed row by row).
    Reference : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
    
    Parameters : 
        train    : frame the transformer is fitted on and applied to
        test     : frame the fitted transformer is applied to
        columns  : names of the columns to expand
    Returns : 
        train_poly : DataFrame of polynomial features for train
        test_poly  : DataFrame of polynomial features for test
    """
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(degree=2)
    train_values = poly.fit_transform(train[columns])

    # get_feature_names was removed in newer scikit-learn releases; prefer the
    # replacement and fall back for old installations.
    if hasattr(poly, 'get_feature_names_out') :
        new_columns = list(poly.get_feature_names_out(columns))
    else :
        new_columns = poly.get_feature_names(columns)

    train_poly = pd.DataFrame(train_values, columns = new_columns)
    # Only transform the test data; the transformer is fitted on train alone.
    test_poly = pd.DataFrame(poly.transform(test[columns]), columns = new_columns)
    return train_poly, test_poly

In [11]:
def onehot_encoding(train, test ,columns, need_df = False) : 
    """
    One-hot encode the selected columns with an encoder fitted on train.
    Categories that appear only in test are ignored (encoded as all zeros).

    Parameters : 
        train    : frame the encoder is fitted on and applied to
        test     : frame the fitted encoder is applied to
        columns  : names of the columns to encode
        need_df  : True -> return DataFrames, False -> return dense numpy arrays
    Returns : 
        train_onehot : one-hot encoded train values
        test_onehot  : one-hot encoded test values
    """
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_onehot = encoder.fit_transform(train[columns]).toarray()
    test_onehot  = encoder.transform(test[columns]).toarray()

    if need_df : 
        # get_feature_names was removed in newer scikit-learn releases; prefer
        # the replacement and fall back for old installations.
        if hasattr(encoder, 'get_feature_names_out') :
            onehot_columns = list(encoder.get_feature_names_out(columns))
        else :
            onehot_columns = encoder.get_feature_names(columns)
        train_onehot = pd.DataFrame(train_onehot, columns = onehot_columns)
        test_onehot = pd.DataFrame(test_onehot, columns = onehot_columns)
    return train_onehot, test_onehot

In [12]:
def create_auto_model(input_dim) : 
    """
    Build and compile the fully connected auto encoder.

    Hidden widths are input_dim//2, //2, //4, //8, //4, //2, each as
    Dense -> BatchNormalization -> ReLU -> Dropout(0.2); the output layer is a
    sigmoid Dense of width input_dim. Compiled with adam and mse loss.
    """
    def dense_block(tensor, units) : 
        tensor = Dense(
            units,
            activity_regularizer=tf.keras.regularizers.l1(1e-6),
            kernel_initializer = 'he_normal',
        )(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = ReLU()(tensor)
        return Dropout(0.2)(tensor)

    half, quarter, eighth = input_dim // 2, input_dim // 4, input_dim // 8

    auto_in = Input(shape = (input_dim,))
    hidden_state = auto_in
    for units in (half, half, quarter, eighth, quarter, half) :
        hidden_state = dense_block(hidden_state, units)
    auto_out = Dense(input_dim, activation = 'sigmoid')(hidden_state)

    auto_model = Model(auto_in, auto_out)
    auto_model.compile(optimizer = 'adam', loss = 'mse')
    return auto_model

def get_callbacks(model_save_path, patience) : 
    """
    Create the callbacks used for training and clear any stale checkpoint.

    Parameters : 
        model_save_path  : file the best weights are checkpointed to; an
                           existing file at this path is deleted first
        patience         : early-stopping patience; the learning-rate
                           reduction uses patience // 3
    Returns : 
        callbacks : [ReduceLROnPlateau, EarlyStopping, ModelCheckpoint],
                    all monitoring 'val_loss'
    """
    from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
    os.makedirs('models', exist_ok=True)
    checkpoint_dir = os.path.dirname(model_save_path)
    if checkpoint_dir :
        os.makedirs(checkpoint_dir, exist_ok=True)
    if os.path.isfile(model_save_path) :
        os.remove(model_save_path)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=patience//3,verbose = 0, min_delta=1e-7)
    early = EarlyStopping(monitor = 'val_loss', patience = patience, verbose = 0)
    # `mode` (not `model`) is the ModelCheckpoint argument that selects minimisation.
    mck = ModelCheckpoint(filepath=model_save_path,  monitor='val_loss', save_best_only=True, verbose = 0, mode = 'min')
    callbacks = [reduce_lr, early, mck]
    return callbacks

In [13]:
def return_encoded_data(train_onehot, test_onehot, model_save_path, patience) : 
    """
    Train the auto encoder with 5-fold cross validation and return the
    fold-averaged model outputs for the train and test matrices.

    Parameters : 
        train_onehot     : one-hot encoded train matrix (numpy array)
        test_onehot      : one-hot encoded test matrix (numpy array)
        model_save_path  : checkpoint file reused by every fold
        patience         : early-stopping patience passed to get_callbacks
    Returns : 
        train_AE, test_AE : mean over folds of auto_model.predict(...) for the
                            full train / test matrices
    """
    from sklearn.model_selection import KFold
    from datetime import datetime
    print('=== AUTO ENCODING STARTED ===  /', datetime.today())
    n_splits = 5
    new_trains = [] ; new_tests = []
    kf = KFold(n_splits=n_splits, shuffle = True)
    loss = 0
    for train_idx, valid_idx in kf.split(np.arange(train_onehot.shape[0])) :
        # Fresh callbacks per fold: a reused ModelCheckpoint keeps the best
        # val_loss of earlier folds, so a later fold might never save its own
        # weights and would then load another fold's checkpoint.
        callbacks = get_callbacks(model_save_path, patience)
        tr_onehot = train_onehot[train_idx]; val_onehot = train_onehot[valid_idx]
        auto_model = create_auto_model(tr_onehot.shape[1])
        history = auto_model.fit(tr_onehot, tr_onehot,
                    epochs = 20000,
                    batch_size = max(1, tr_onehot.shape[0]//10),
                    validation_data=(val_onehot, val_onehot),
                    callbacks = callbacks,
                    verbose = 0
                    )
        # Restore the best checkpoint of this fold before predicting.
        auto_model.load_weights(model_save_path)
        new_trains.append(auto_model.predict(train_onehot))
        new_tests.append(auto_model.predict(test_onehot))
        loss += min(history.history['val_loss'])
    train_AE = np.mean(new_trains, axis = 0)
    test_AE = np.mean(new_tests, axis = 0)

    print(train_AE.shape, test_AE.shape, loss / n_splits)
    print('=== AUTO ENCODING ENDED ===  /', datetime.today())
    return train_AE, test_AE

In [14]:
def __NN_engineering_main__(save_path, need_AE_update) : 
    """
    Build the NN dataset and pickle it to `save_path`.

    Stages: load raw CSV -> merge grid data -> mean target encoding ->
    polynomial features -> (one-hot encoding -> auto encoder, or reuse the
    auto-encoder output stored in `save_path` when need_AE_update is falsy)
    -> save. A failing stage is reported via return_warning_messages and the
    function returns 1; on success nothing is returned.
    """
    try : 
        step = 1
        # Columns fed to both the mean target encoding and the one-hot encoding.
        categorical_columns = ['송장인번호', '카테고리_대', '카테고리_중', 
                        '송장인_name' ,'송장인_sigu', '송장인_code',
                        '수하인_sigu', '수하인_code']

        try :  # STEP 1 : Reset Data
            train, test = load_data(['./data/train_df.csv', './data/test_df.csv'])
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1
        else :
            step += 1

        try : # STEP 2 : Merge Grid Data
            train, test = NN_clear_data(train, test)
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1
        else :
            step += 1

        try : # STEP 3 : Mean Target Encoding
            train, test = mean_target_encoding(train, test, list(categorical_columns))
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1
        else :
            step += 1

        try : # STEP 4 : Polynomial Features
            mean_columns = list(train.filter(like = 'Mean_').columns)
            train_poly, test_poly = polynomial_features(train, test, mean_columns)
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1
        else :
            step += 1

        if not need_AE_update : 
            # Reuse the auto-encoder output stored by a previous run.
            (_, train_AE, _), (_, test_AE), _ = load_data(save_path)
        else : 
            try : # STEP 5 : Onehot Encoding
                train_onehot, test_onehot = onehot_encoding(train, test, list(categorical_columns))
            except Exception as e :
                return_warning_messages(e, step = step)
                return 1
            else :
                step += 1

            try : # STEP 6 : Auto Encoding
                train_AE, test_AE = return_encoded_data(
                    train_onehot, test_onehot, './models/AUTO_ENCODER.h5', 15
                )
            except Exception as e :
                return_warning_messages(e, step = step)
                return 1
            else :
                step += 1

        try : # STEP 7 : Save Data
            describe = 'Data Type : Numpy Array\nMean Target Encoding -> Polynomial Features & Onehot Encoding -> Deep Auto Encoding'
            train_bundle = [train_poly, train_AE, train['운송장_건수'].values]
            test_bundle = [test_poly, test_AE]
            save_data(train_bundle, test_bundle, describe, path = save_path) 
        except Exception as e :
            return_warning_messages(e, step = step)
            return 1

    except Exception as e: 
        return_warning_messages(e, step = 0)
        return 1


## 2. Catboost Modeling

In [15]:
def set_catboost_modeling_data(train, test) : 
    """
    Split the CatBoost frames into model inputs.

    Returns X_train (train without 'index' and the target), y_train (target as
    float32), X_test (test without 'index'), the list of categorical column
    names and the submission template read from ./data/sample_submission.csv.
    """
    target = '운송장_건수'
    X_train = train.drop(['index', target], axis = 1)
    y_train = train[target].astype('float32')
    X_test  = test.drop(['index'], axis = 1)
    cat_columns = ['송장인번호', '수하인번호', '카테고리_대', '카테고리_중']
    sample = pd.read_csv('./data/sample_submission.csv')
    return X_train, y_train, X_test, cat_columns, sample

def catboost_modeling(X_train, y_train, X_test, cat_columns) : 
    """
    Fit a default CatBoostRegressor on the train data (with `cat_columns` as
    categorical features, silent training) and return its predictions for
    X_test. Start and end timestamps are printed.
    """
    from datetime import datetime
    started_at = datetime.today()
    print('=== Modeling Started   ===  / Initialized at : ', started_at)
    from catboost import CatBoostRegressor
    regressor = CatBoostRegressor()
    regressor.fit(X_train, y_train, silent = True, cat_features=cat_columns)
    pred = regressor.predict(X_test)
    finished_at = datetime.today()
    print('=== Modeling Finishied === /  Finishied   at : ', finished_at)
    return pred

In [16]:
def __catboost_main__(data_path, need_update) : 
    """
    Run the CatBoost pipeline end to end.

    Steps: (1) build the dataset when it is missing or `need_update` is truthy,
    then load it, (2) split it into model inputs, (3) train and predict,
    (4) write ./submissions/CATBOOST.csv.

    Parameters : 
        data_path   : pickle file holding the CatBoost dataset
        need_update : rebuild the dataset even if `data_path` already exists
    Returns : 
        1 when any step failed (the failure is printed with its step number),
        nothing on success
    """
    from datetime import datetime
    print('!! CATBOOST MAIN STARTED !!  /  ', datetime.today())
    print(str(datetime.today()), 'STARTED')
    print('')
    try : 
        step = 1
        try : # STEP 1 : Load Data
            if (not os.path.isfile(data_path)) or need_update : 
                print('=== UPDATE DATA ===')
                # The builder signals failure with a truthy return code; stop
                # here rather than loading a stale or missing pickle.
                if __catboost_engineering_main__(data_path) :
                    raise RuntimeError('Feature engineering failed; see the error log above.')
                print(str(datetime.today()), 'UPDATED')
                print('')
            train, test, describe = load_data(data_path)
            print(describe)
            step += 1
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1

        try : # STEP 2 : Set Modeling Data
            X_train, y_train, X_test, cat_columns, sample = set_catboost_modeling_data(train, test)
            step += 1
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1

        try : # STEP 3 : Modeling
            prediction = catboost_modeling(X_train, y_train, X_test, cat_columns)
            step += 1
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1
        
        try : # STEP 4 : Save Prediction Data
            submission_path = './submissions/CATBOOST.csv'
            save_prediction(sample, submission_path, prediction, y_train)
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1
        print('')
        print('!! CATBOOST MAIN ENDED !!  /  ', datetime.today())

    except Exception as e : 
        return_warning_messages(e, step = 0)
        return 1

In [17]:
# Pickle file that stores the engineered CatBoost dataset.
CAT_data_path = './data/CAT_DATA.pkl'
# True -> rebuild the dataset even when the pickle already exists.
need_update = True
__catboost_main__(data_path = CAT_data_path, need_update = need_update)

!! CATBOOST MAIN STARTED !!  /   2021-12-16 20:18:19.984146
2021-12-16 20:18:19.984146 STARTED

=== UPDATE DATA ===
2021-12-16 20:18:20.085660 UPDATED

Data Type : Pandas DataFrame
Raw Data + Mean Target Encoding
=== Modeling Started   ===  / Initialized at :  2021-12-16 20:18:20.103611
=== Modeling Finishied === /  Finishied   at :  2021-12-16 20:18:47.269928


Unnamed: 0,0
count,4640.0
mean,4.807993
std,3.584526
min,3.0
25%,3.991876
50%,4.355627
75%,5.061352
max,161.186954


=== Prediction Saved At : ./submissions/CATBOOST.csv ===

!! CATBOOST MAIN ENDED !!  /   2021-12-16 20:18:47.298750


## 3. NN Modeling

In [18]:
def set_NN_modeling_data(train, test) : 
    """
    Assemble the NN inputs from the pickled bundles.

    train is [poly_frame, AE_array, target_array] and test is
    [poly_frame, AE_array]; the frame and the array are joined column-wise.
    Returns X_train, y_train (float32), X_test and the submission template
    read from ./data/sample_submission.csv. The three shapes are printed.
    """
    def join_features(bundle) : 
        poly_frame, encoded = bundle[0], bundle[1]
        return np.concatenate([poly_frame.to_numpy(), encoded], axis = 1)

    X_train = join_features(train)
    y_train = train[2].astype('float32')
    X_test  = join_features(test)
    print(X_train.shape, y_train.shape, X_test.shape)
    sample = pd.read_csv('./data/sample_submission.csv')
    return X_train, y_train, X_test, sample

In [19]:
def NN_modeling(X_train, y_train, X_test, patience, total_epoch) : 
    """
    Train the fully connected network with repeated 5-fold cross validation
    and return an averaged test prediction.

    Each of the `total_epoch` rounds runs a freshly shuffled 5-fold split and
    trains one model per fold. Across all rounds, the five fold models with
    the lowest validation loss are kept and their test predictions averaged.

    Parameters : 
        X_train, y_train : train features / target (numpy arrays)
        X_test           : test features (numpy array)
        patience         : early-stopping patience passed to get_callbacks
        total_epoch      : number of cross-validation rounds
    Returns : 
        prediction : 1-D array, mean of the five best fold predictions
    """
    def fc_layer(x, unit, dr) : 
        # Dense -> BatchNormalization -> LeakyReLU -> Dropout
        x = Dense(unit, kernel_initializer = 'he_normal')(x)
        x = BatchNormalization()(x)
        x = LeakyReLU()(x)
        x = Dropout(dr)(x)
        return x

    def rmse(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

    def create_model():
        # The shape must be a tuple; the original passed a bare int because of
        # a misplaced comma.
        inp = Input(shape = (X_train.shape[1],))
        fc=  fc_layer(inp, 4196, 0.2)
        fc=  fc_layer(fc, 1024, 0.2)
        fc=  fc_layer(fc, 256, 0.2)
        fc = fc_layer(fc, 64, 0.2)
        fc = fc_layer(fc, 16, 0.2)
        fc = fc_layer(fc, 4, 0.2)
        out = Dense(1, activation = 'relu')(fc)

        model = Model(inp, out)
        optimizer = tf.keras.optimizers.Adam(0.1)
        model.compile(optimizer=optimizer, loss=rmse)
        return model

    def kf_NN_modeling() : 
        # One round of 5-fold training; returns [test_prediction, best_val_loss] per fold.
        from sklearn.model_selection import KFold
        for k in range(5) :
            print(f'    ===== FOLD : {k} =====', end = ' ')
        print('')
        rs = np.random.randint(0,1234578,1)[0]
        kf = KFold(n_splits=5, shuffle=True, random_state = rs)
        fold = 0
        preds = []
        for train_idx, valid_idx in kf.split(np.arange(X_train.shape[0])) : 
            X_tr = X_train[train_idx] ; X_val = X_train[valid_idx]
            y_tr = y_train[train_idx] ; y_val = y_train[valid_idx]
            model_save_path = f'./models/model_{fold}.h5'
            callbacks = get_callbacks(model_save_path, patience)
            model = create_model()
            history = model.fit(X_tr, y_tr,
                                epochs = 20000,
                                batch_size = X_tr.shape[0]//5,
                                validation_data=(X_val, y_val),
                                callbacks = callbacks,
                                verbose = 0
                                )
            score = min(history.history['val_loss'])
            print('    ',round(score,16), end = '   ')
            # Predict with the best checkpoint of this fold.
            model.load_weights(model_save_path)
            pred = model.predict(X_test).flatten()
            preds.append([pred, score])
            fold += 1
        print('')
        return preds

    def run_round(epoch) : 
        # Named differently from the enclosing function to avoid shadowing it.
        print('')
        print(f'========== EPOCH : {epoch} ==========')
        preds = kf_NN_modeling()
        return preds

    total_predictions = []
    for epoch in range(total_epoch) : 
        total_predictions.extend(run_round(epoch))
    predictions = sorted(total_predictions, key = lambda x: x[1])[:5]
    prediction = np.mean([x[0] for x in predictions], axis= 0)
    return prediction

In [20]:
def __NN_main__(data_path, need_update, need_AE_update) : 
    """
    Run the NN pipeline end to end.

    Steps: (1) build the dataset when it is missing or `need_update` is truthy,
    then load it, (2) assemble the model inputs, (3) train and predict,
    (4) write ./submissions/DAE.csv.

    Parameters : 
        data_path      : pickle file holding the NN dataset
        need_update    : rebuild the dataset even if `data_path` already exists
        need_AE_update : forwarded to the dataset builder; retrain the auto
                         encoder instead of reusing its stored output
    Returns : 
        1 when any step failed (the failure is printed with its step number),
        nothing on success
    """
    from datetime import datetime
    print('!! NN MAIN STARTED !!  /  ', datetime.today())
    print(str(datetime.today()), 'STARTED')
    print('')
    try : 
        step = 1
        try : # STEP 1 : Load Data
            if (not os.path.isfile(data_path)) or need_update : 
                print('=== UPDATE DATA ===')
                # The builder signals failure with a truthy return code; stop
                # here rather than loading a stale or missing pickle.
                if __NN_engineering_main__(data_path, need_AE_update = need_AE_update) :
                    raise RuntimeError('Feature engineering failed; see the error log above.')
                print(str(datetime.today()), 'UPDATED')
                print('')
            train, test, describe = load_data(data_path)
            print(describe)
            step += 1
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1

        try : # STEP 2 : Set Modeling Data
            X_train, y_train, X_test, sample = set_NN_modeling_data(train, test)
            step += 1
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1

        try : # STEP 3 : Modeling
            prediction = NN_modeling(X_train, y_train, X_test, patience=100, total_epoch=5)
            step += 1
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1

        try : # STEP 4 : Save Prediction Data
            submission_path = './submissions/DAE.csv'
            save_prediction(sample, submission_path, prediction, y_train)
        except Exception as e : 
            return_warning_messages(e, step = step)
            return 1

        print('')
        print('!! NN MAIN ENDED !!  /  ', datetime.today())

    except Exception as e : 
        return_warning_messages(e, step = 0)
        return 1

In [21]:
# Pickle file that stores the engineered NN dataset.
NN_data_path = './data/NN_DATA.pkl'
# True -> rebuild the dataset even when the pickle already exists.
need_update = True
# True -> retrain the auto encoder instead of reusing the stored output.
need_AE_update = True
__NN_main__(NN_data_path, need_update, need_AE_update)

!! NN MAIN STARTED !!  /   2021-12-16 20:18:47.575688
2021-12-16 20:18:47.575688 STARTED

=== UPDATE DATA ===
=== NN Data Exist ===
=== AUTO ENCODING STARTED ===  / 2021-12-16 20:18:48.750207
(32000, 6704) (4640, 6704) 0.0025652787648141385
=== AUTO ENCODING ENDED ===  / 2021-12-16 20:21:38.224983
2021-12-16 20:21:40.113645 UPDATED

Data Type : Numpy Array
Mean Target Encoding -> Polynomial Features & Onehot Encoding -> Deep Auto Encoding
(32000, 6749) (32000,) (4640, 6749)

    ===== FOLD : 0 =====     ===== FOLD : 1 =====     ===== FOLD : 2 =====     ===== FOLD : 3 =====     ===== FOLD : 4 ===== 
     4.709329605102539        4.5456929206848145        4.493783950805664        4.832345962524414        4.24008321762085   

    ===== FOLD : 0 =====     ===== FOLD : 1 =====     ===== FOLD : 2 =====     ===== FOLD : 3 =====     ===== FOLD : 4 ===== 
     4.746572971343994        4.689569473266602        4.3002729415893555        4.4574761390686035        4.388387680053711   

    ===== FO

Unnamed: 0,0
count,4640.0
mean,4.719107
std,4.458767
min,3.0
25%,3.739633
50%,3.893864
75%,4.781611
max,165.709824


=== Prediction Saved At : ./submissions/DAE.csv ===

!! NN MAIN ENDED !!  /   2021-12-16 21:31:13.064512


## 4. Ensemble

In [22]:
# Load the two single-model submissions. The trailing numbers are the scores
# noted by the author (presumably public leaderboard scores - confirm).
cat_pred = pd.read_csv('./submissions/CATBOOST.csv')['INVC_CONT'] # 5.3587728343
dae_pred = pd.read_csv('./submissions/DAE.csv')['INVC_CONT'] # 5.3398171479
# The two submissions above have low correlation with each other, so only
# these two are ensembled.

# Weighted blend: 70% CatBoost, 30% NN.
final = cat_pred * 0.7 + dae_pred * 0.3
sample = pd.read_csv('./data/sample_submission.csv')
sample['INVC_CONT'] = final
sample.to_csv('./submissions/JayHongPred.csv', index= False) # 5.226109375