In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split


# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score

# Modeling
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import plotly 
import plotly.express as px

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import shap
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean

import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
from pycaret.regression import *

In [2]:
!pip install pytorch-tabnet==3.1.1

Collecting pytorch-tabnet==3.1.1
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [3]:
import torch
from torch import nn
from pytorch_tabnet.tab_model  import TabNetRegressor

In [4]:
# 학습용과 평가용(제출용) 데이터를 읽어들인다.
train = pd.read_csv('train_df.csv',encoding='cp949')
test = pd.read_csv('test_df.csv',encoding='cp949')

# 분석에 필요없는 INDEX 필드를 데이터에서 제거
#train_index = train['index']
#test_index = test['index']
#del train['index'], test['index']

# 학습용 정답 데이터를 읽는다.
target = pd.read_csv('train_df.csv',encoding='cp949').INVC_CONT

#submission 데이터를 읽는다.
submission = pd.read_csv('sample_submission.csv')

In [5]:
train = pd.read_csv('features_et.csv')
test = pd.read_csv('features_et_te.csv')

In [6]:
train_X = train.to_numpy()
train_y = target.to_numpy().reshape(-1,1)
test_X = test.to_numpy()

In [7]:
kf = KFold(n_splits=6, random_state=42, shuffle=True)
preds = []
scores  = []

for trn_idx,val_idx in kf.split(train_X):
    X_trn = train_X[trn_idx]
    y_trn = train_y[trn_idx]
    X_val = train_X[val_idx]
    y_val = train_y[val_idx]

    model = TabNetRegressor(verbose = 1,seed = 42,optimizer_fn=torch.optim.AdamW)
    model.fit(X_train=X_trn, y_train=y_trn,
              eval_set=[(X_val, y_val)],
            patience=100, max_epochs=2000,
            batch_size=64,
            loss_fn=torch.nn.L1Loss(),
            eval_metric=['rmse'])
    scores.append(model.best_cost)
    
    pred = model.predict(test_X)
    preds.append(pred)

Device used : cuda
epoch 0  | loss: 38.63876| val_0_rmse: 5.94566 |  0:00:03s
epoch 1  | loss: 33.02122| val_0_rmse: 5.75869 |  0:00:05s
epoch 2  | loss: 32.5847 | val_0_rmse: 5.75757 |  0:00:07s
epoch 3  | loss: 32.56509| val_0_rmse: 5.75817 |  0:00:09s
epoch 4  | loss: 32.31202| val_0_rmse: 5.71246 |  0:00:11s
epoch 5  | loss: 32.11876| val_0_rmse: 5.70801 |  0:00:12s
epoch 6  | loss: 31.42359| val_0_rmse: 5.66841 |  0:00:14s
epoch 7  | loss: 30.93017| val_0_rmse: 5.74344 |  0:00:16s
epoch 8  | loss: 32.32989| val_0_rmse: 5.73805 |  0:00:18s
epoch 9  | loss: 31.94672| val_0_rmse: 5.69667 |  0:00:20s
epoch 10 | loss: 31.66386| val_0_rmse: 5.67398 |  0:00:22s
epoch 11 | loss: 31.03555| val_0_rmse: 5.62332 |  0:00:24s
epoch 12 | loss: 31.10368| val_0_rmse: 5.67195 |  0:00:25s
epoch 13 | loss: 30.31261| val_0_rmse: 5.2523  |  0:00:27s
epoch 14 | loss: 30.80437| val_0_rmse: 5.69357 |  0:00:29s
epoch 15 | loss: 31.53989| val_0_rmse: 5.64699 |  0:00:31s
epoch 16 | loss: 31.6166 | val_0_rmse

epoch 5  | loss: 30.96807| val_0_rmse: 6.3158  |  0:00:09s
epoch 6  | loss: 30.88725| val_0_rmse: 6.31785 |  0:00:10s
epoch 7  | loss: 30.83792| val_0_rmse: 6.31953 |  0:00:12s
epoch 8  | loss: 30.84706| val_0_rmse: 6.31384 |  0:00:13s
epoch 9  | loss: 30.73923| val_0_rmse: 6.32408 |  0:00:15s
epoch 10 | loss: 30.66523| val_0_rmse: 6.32913 |  0:00:16s
epoch 11 | loss: 30.8401 | val_0_rmse: 6.29761 |  0:00:18s
epoch 12 | loss: 30.31352| val_0_rmse: 6.13831 |  0:00:19s
epoch 13 | loss: 30.21986| val_0_rmse: 6.10396 |  0:00:21s
epoch 14 | loss: 29.34055| val_0_rmse: 5.9587  |  0:00:22s
epoch 15 | loss: 29.76388| val_0_rmse: 5.73552 |  0:00:24s
epoch 16 | loss: 28.08286| val_0_rmse: 6.79316 |  0:00:26s
epoch 17 | loss: 28.01536| val_0_rmse: 7.61731 |  0:00:27s
epoch 18 | loss: 27.25531| val_0_rmse: 6.86083 |  0:00:29s
epoch 19 | loss: 27.60079| val_0_rmse: 7.48538 |  0:00:30s
epoch 20 | loss: 27.71956| val_0_rmse: 8.18966 |  0:00:32s
epoch 21 | loss: 27.82282| val_0_rmse: 11.95132|  0:00:3

epoch 144| loss: 16.43768| val_0_rmse: 5.90751 |  0:03:44s
epoch 145| loss: 15.90844| val_0_rmse: 5.8904  |  0:03:45s
epoch 146| loss: 15.99705| val_0_rmse: 6.18104 |  0:03:47s
epoch 147| loss: 16.2037 | val_0_rmse: 6.21808 |  0:03:48s
epoch 148| loss: 16.74183| val_0_rmse: 5.65638 |  0:03:50s
epoch 149| loss: 15.72635| val_0_rmse: 5.84368 |  0:03:51s
epoch 150| loss: 15.6065 | val_0_rmse: 6.1166  |  0:03:53s
epoch 151| loss: 15.51168| val_0_rmse: 5.85997 |  0:03:55s
epoch 152| loss: 15.90664| val_0_rmse: 6.33948 |  0:03:56s
epoch 153| loss: 15.79686| val_0_rmse: 6.23249 |  0:03:58s
epoch 154| loss: 15.44348| val_0_rmse: 6.30465 |  0:03:59s
epoch 155| loss: 15.09158| val_0_rmse: 6.25818 |  0:04:01s
epoch 156| loss: 15.22827| val_0_rmse: 6.03463 |  0:04:03s
epoch 157| loss: 15.23907| val_0_rmse: 5.83927 |  0:04:04s
epoch 158| loss: 15.67555| val_0_rmse: 5.97131 |  0:04:06s
epoch 159| loss: 15.68277| val_0_rmse: 5.63837 |  0:04:09s
epoch 160| loss: 15.6992 | val_0_rmse: 5.77984 |  0:04:1

epoch 283| loss: 14.12546| val_0_rmse: 5.92412 |  0:07:21s
epoch 284| loss: 14.03401| val_0_rmse: 5.67506 |  0:07:23s
epoch 285| loss: 13.37414| val_0_rmse: 5.69373 |  0:07:24s
epoch 286| loss: 13.82267| val_0_rmse: 5.61672 |  0:07:26s
epoch 287| loss: 13.58601| val_0_rmse: 5.68231 |  0:07:27s
epoch 288| loss: 13.0584 | val_0_rmse: 6.18573 |  0:07:29s
epoch 289| loss: 13.50504| val_0_rmse: 5.77001 |  0:07:30s
epoch 290| loss: 13.27618| val_0_rmse: 5.60527 |  0:07:32s
epoch 291| loss: 13.19237| val_0_rmse: 5.5991  |  0:07:34s
epoch 292| loss: 14.5101 | val_0_rmse: 5.63643 |  0:07:35s
epoch 293| loss: 13.22756| val_0_rmse: 5.61188 |  0:07:37s
epoch 294| loss: 13.27797| val_0_rmse: 5.68228 |  0:07:39s
epoch 295| loss: 13.19413| val_0_rmse: 6.12153 |  0:07:40s

Early stopping occurred at epoch 295 with best_epoch = 195 and best_val_0_rmse = 5.46322
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 39.48304| val_0_rmse: 5.63332 |  0:00:01s
epoch 1  | l

epoch 124| loss: 23.54721| val_0_rmse: 5.211   |  0:03:16s
epoch 125| loss: 23.09514| val_0_rmse: 5.15283 |  0:03:17s
epoch 126| loss: 23.58389| val_0_rmse: 5.39439 |  0:03:19s
epoch 127| loss: 22.90205| val_0_rmse: 5.2152  |  0:03:20s
epoch 128| loss: 22.57493| val_0_rmse: 5.20197 |  0:03:22s
epoch 129| loss: 23.88725| val_0_rmse: 4.98325 |  0:03:24s
epoch 130| loss: 22.67992| val_0_rmse: 5.15041 |  0:03:25s
epoch 131| loss: 23.62937| val_0_rmse: 5.10508 |  0:03:27s
epoch 132| loss: 23.21912| val_0_rmse: 5.31859 |  0:03:28s
epoch 133| loss: 23.09734| val_0_rmse: 5.48629 |  0:03:30s
epoch 134| loss: 23.83915| val_0_rmse: 5.20526 |  0:03:31s
epoch 135| loss: 23.089  | val_0_rmse: 5.21307 |  0:03:33s
epoch 136| loss: 23.05025| val_0_rmse: 5.34298 |  0:03:34s
epoch 137| loss: 22.17997| val_0_rmse: 5.05584 |  0:03:36s
epoch 138| loss: 22.47887| val_0_rmse: 5.09094 |  0:03:38s

Early stopping occurred at epoch 138 with best_epoch = 38 and best_val_0_rmse = 4.96635
Best weights from best epo

epoch 122| loss: 21.9082 | val_0_rmse: 5.5115  |  0:03:14s
epoch 123| loss: 22.26185| val_0_rmse: 5.50257 |  0:03:16s
epoch 124| loss: 22.10553| val_0_rmse: 6.02198 |  0:03:17s
epoch 125| loss: 21.72828| val_0_rmse: 5.57854 |  0:03:19s
epoch 126| loss: 21.84701| val_0_rmse: 5.82412 |  0:03:20s
epoch 127| loss: 23.80427| val_0_rmse: 5.6965  |  0:03:22s
epoch 128| loss: 22.23243| val_0_rmse: 5.36463 |  0:03:25s
epoch 129| loss: 22.81393| val_0_rmse: 5.64195 |  0:03:26s
epoch 130| loss: 21.85403| val_0_rmse: 5.43133 |  0:03:28s
epoch 131| loss: 22.04533| val_0_rmse: 5.61534 |  0:03:29s
epoch 132| loss: 21.86069| val_0_rmse: 5.42601 |  0:03:31s
epoch 133| loss: 21.64346| val_0_rmse: 5.89744 |  0:03:32s
epoch 134| loss: 21.64691| val_0_rmse: 5.44321 |  0:03:34s
epoch 135| loss: 21.52245| val_0_rmse: 5.92571 |  0:03:36s
epoch 136| loss: 21.02506| val_0_rmse: 5.54594 |  0:03:37s
epoch 137| loss: 21.11033| val_0_rmse: 5.49637 |  0:03:39s
epoch 138| loss: 21.63645| val_0_rmse: 5.72901 |  0:03:4

epoch 261| loss: 16.91714| val_0_rmse: 5.47904 |  0:06:56s
epoch 262| loss: 16.39789| val_0_rmse: 5.47624 |  0:06:58s
epoch 263| loss: 16.65622| val_0_rmse: 5.66361 |  0:07:00s
epoch 264| loss: 16.3538 | val_0_rmse: 5.62106 |  0:07:02s
epoch 265| loss: 16.09951| val_0_rmse: 5.39377 |  0:07:03s
epoch 266| loss: 17.06457| val_0_rmse: 5.53143 |  0:07:05s
epoch 267| loss: 16.77481| val_0_rmse: 5.43013 |  0:07:06s
epoch 268| loss: 15.78496| val_0_rmse: 5.4588  |  0:07:08s
epoch 269| loss: 15.97909| val_0_rmse: 5.57146 |  0:07:10s
epoch 270| loss: 16.45875| val_0_rmse: 5.52852 |  0:07:11s
epoch 271| loss: 15.90471| val_0_rmse: 5.6233  |  0:07:13s
epoch 272| loss: 15.49151| val_0_rmse: 5.49403 |  0:07:15s
epoch 273| loss: 15.97958| val_0_rmse: 5.61567 |  0:07:16s
epoch 274| loss: 15.72062| val_0_rmse: 5.3824  |  0:07:18s
epoch 275| loss: 16.1351 | val_0_rmse: 5.79712 |  0:07:19s
epoch 276| loss: 15.97913| val_0_rmse: 5.30966 |  0:07:21s
epoch 277| loss: 15.46293| val_0_rmse: 5.43629 |  0:07:2

epoch 70 | loss: 24.43632| val_0_rmse: 11.7435 |  0:01:55s
epoch 71 | loss: 24.13204| val_0_rmse: 7.29376 |  0:01:57s
epoch 72 | loss: 25.35416| val_0_rmse: 13.49255|  0:01:58s
epoch 73 | loss: 24.07581| val_0_rmse: 10.87104|  0:02:00s
epoch 74 | loss: 23.77952| val_0_rmse: 14.28664|  0:02:02s
epoch 75 | loss: 24.09335| val_0_rmse: 9.05211 |  0:02:03s
epoch 76 | loss: 28.01848| val_0_rmse: 12.15513|  0:02:05s
epoch 77 | loss: 24.43864| val_0_rmse: 11.39962|  0:02:06s
epoch 78 | loss: 25.19678| val_0_rmse: 8.02502 |  0:02:08s
epoch 79 | loss: 26.74268| val_0_rmse: 8.69332 |  0:02:10s
epoch 80 | loss: 24.04098| val_0_rmse: 8.87847 |  0:02:11s
epoch 81 | loss: 24.39861| val_0_rmse: 8.72882 |  0:02:13s
epoch 82 | loss: 24.03823| val_0_rmse: 9.73214 |  0:02:14s
epoch 83 | loss: 24.23382| val_0_rmse: 9.43924 |  0:02:16s
epoch 84 | loss: 23.82863| val_0_rmse: 6.18996 |  0:02:17s
epoch 85 | loss: 23.86263| val_0_rmse: 7.41489 |  0:02:19s
epoch 86 | loss: 24.166  | val_0_rmse: 15.55031|  0:02:2

epoch 90 | loss: 26.89409| val_0_rmse: 4.48007 |  0:02:28s
epoch 91 | loss: 25.18936| val_0_rmse: 4.40262 |  0:02:29s
epoch 92 | loss: 25.2436 | val_0_rmse: 4.40918 |  0:02:31s
epoch 93 | loss: 24.78737| val_0_rmse: 4.33394 |  0:02:32s
epoch 94 | loss: 24.8146 | val_0_rmse: 4.40953 |  0:02:34s
epoch 95 | loss: 29.4731 | val_0_rmse: 4.56586 |  0:02:35s
epoch 96 | loss: 24.68756| val_0_rmse: 4.3432  |  0:02:37s
epoch 97 | loss: 24.40473| val_0_rmse: 4.34169 |  0:02:38s
epoch 98 | loss: 24.67722| val_0_rmse: 4.28576 |  0:02:40s
epoch 99 | loss: 24.06585| val_0_rmse: 4.38894 |  0:02:41s
epoch 100| loss: 27.243  | val_0_rmse: 4.37471 |  0:02:43s
epoch 101| loss: 24.79027| val_0_rmse: 4.37162 |  0:02:45s
epoch 102| loss: 24.1622 | val_0_rmse: 4.4542  |  0:02:46s
epoch 103| loss: 23.14824| val_0_rmse: 4.37423 |  0:02:48s
epoch 104| loss: 23.43972| val_0_rmse: 4.35376 |  0:02:49s
epoch 105| loss: 23.71032| val_0_rmse: 4.32009 |  0:02:51s
epoch 106| loss: 23.32776| val_0_rmse: 4.35191 |  0:02:5

In [13]:
scores

[4.747778828965679,
 5.463217920818146,
 4.966349743353065,
 5.245580746471708,
 5.838333111460074,
 4.192740682467957]

In [8]:
print("CV 평균 점수 : %.5f" % np.mean(scores,axis=0) )

CV 평균 점수 : 5.07567


In [9]:
final_pred = np.mean(preds,axis = 0)

In [10]:
submission['INVC_CONT'] = final_pred
submission

Unnamed: 0,index,INVC_CONT
0,32000,4.461727
1,32001,5.110055
2,32002,4.613761
3,32003,4.496495
4,32004,4.434270
...,...,...
4635,36635,4.555692
4636,36636,4.456085
4637,36637,4.570349
4638,36638,4.428309


In [11]:
submission.describe()

Unnamed: 0,index,INVC_CONT
count,4640.0,4640.0
mean,34319.5,4.49244
std,1339.596954,3.543576
min,32000.0,3.502838
25%,33159.75,3.85616
50%,34319.5,4.413731
75%,35479.25,4.707259
max,36639.0,177.33844


In [12]:
submission.to_csv('tabnet.csv',index = False)