# read

In [1]:
import pandas as pd
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('./data/train_df.csv', encoding='cp949')
test  = pd.read_csv('./data/test_df.csv', encoding='cp949')
sub   = pd.read_csv('./data/sample_submission.csv', encoding='cp949')

In [2]:
train.shape, test.shape, sub.shape

((32000, 6), (4640, 5), (4640, 2))

In [3]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,3,1154500002014200,5011000315087400,식품,농산물,10
4,4,1165000021008300,5011000177051200,식품,가공식품,3


In [4]:
test.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,32000,5013000043028400,1165000021097200,식품,농산물
1,32001,5013000044016100,1154500002066400,식품,농산물
2,32002,5013000205030200,4139000102013200,식품,농산물
3,32003,5013000205030200,4221000040093400,식품,농산물
4,32004,5013000268011400,2726000004017100,식품,농산물


# preprocessing

In [5]:
large_label = dict(train.groupby('DL_GD_LCLS_NM').mean().sort_values('INVC_CONT')[['INVC_CONT']].reset_index().values)
mid_label   = dict(train.groupby('DL_GD_MCLS_NM').mean().sort_values('INVC_CONT')[['INVC_CONT']].reset_index().values)

train['DL_GD_LCLS_NM'] = train['DL_GD_LCLS_NM'].map(large_label)
test['DL_GD_LCLS_NM']  = test['DL_GD_LCLS_NM'].map(large_label)

train['DL_GD_MCLS_NM'] = train['DL_GD_MCLS_NM'].map(mid_label)
test['DL_GD_MCLS_NM']  = test['DL_GD_MCLS_NM'].map(mid_label)

In [6]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,6.678694,3.672897,3
1,1,1135000009051200,5011000178037300,5.407921,4.444134,3
2,2,1135000030093100,5011000265091400,6.678694,10.375479,9
3,3,1154500002014200,5011000315087400,4.658195,4.297401,10
4,4,1165000021008300,5011000177051200,4.658195,6.807151,3


In [7]:
# 격자공간고유번호 슬라이싱

for i in [3, 4, 5, 6, 7, 8, 9, ]:
    train['send_' + str(i)] = train['SEND_SPG_INNB'].astype('str').str[:i].astype('int64')
    test['send_' + str(i)]  = test['SEND_SPG_INNB'].astype('str').str[:i].astype('int64')
    
    train['rec_' + str(i)] = train['REC_SPG_INNB'].astype('str').str[:i].astype('int64')
    test['rec_' + str(i)]  = test['REC_SPG_INNB'].astype('str').str[:i].astype('int64')

In [8]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,send_3,rec_3,send_4,rec_4,send_5,rec_5,send_6,rec_6,send_7,rec_7,send_8,rec_8,send_9,rec_9
0,0,1129000014045300,5011000220046300,6.678694,3.672897,3,112,501,1129,5011,11290,50110,112900,501100,1129000,5011000,11290000,50110002,112900001,501100022
1,1,1135000009051200,5011000178037300,5.407921,4.444134,3,113,501,1135,5011,11350,50110,113500,501100,1135000,5011000,11350000,50110001,113500000,501100017
2,2,1135000030093100,5011000265091400,6.678694,10.375479,9,113,501,1135,5011,11350,50110,113500,501100,1135000,5011000,11350000,50110002,113500003,501100026
3,3,1154500002014200,5011000315087400,4.658195,4.297401,10,115,501,1154,5011,11545,50110,115450,501100,1154500,5011000,11545000,50110003,115450000,501100031
4,4,1165000021008300,5011000177051200,4.658195,6.807151,3,116,501,1165,5011,11650,50110,116500,501100,1165000,5011000,11650000,50110001,116500002,501100017


In [9]:
train.columns[6:]

Index(['send_3', 'rec_3', 'send_4', 'rec_4', 'send_5', 'rec_5', 'send_6',
       'rec_6', 'send_7', 'rec_7', 'send_8', 'rec_8', 'send_9', 'rec_9'],
      dtype='object')

In [10]:
for col in train.columns[6:]:
    
    dict = {}
    for i, j in enumerate(train.groupby(col).mean().sort_values('INVC_CONT')[['INVC_CONT']].index):
        dict[j] = i
    
    train[col] = train[col].map(dict)
    test[col]  = test[col].map(dict)

In [11]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,send_3,rec_3,send_4,rec_4,send_5,rec_5,send_6,rec_6,send_7,rec_7,send_8,rec_8,send_9,rec_9
0,0,1129000014045300,5011000220046300,6.678694,3.672897,3,13,63,46,208,45,229,45,229,55,237,177,843,518,2141
1,1,1135000009051200,5011000178037300,5.407921,4.444134,3,37,63,161,208,176,229,176,229,181,237,397,749,654,2648
2,2,1135000030093100,5011000265091400,6.678694,10.375479,9,37,63,161,208,176,229,176,229,181,237,397,843,1071,2734
3,3,1154500002014200,5011000315087400,4.658195,4.297401,10,23,63,79,208,91,229,91,229,93,237,251,834,625,2925
4,4,1165000021008300,5011000177051200,4.658195,6.807151,3,32,63,96,208,109,229,109,229,115,237,297,749,340,2648


# encoding

In [12]:
train.drop(['index', 'SEND_SPG_INNB', 'REC_SPG_INNB'], axis=1, inplace=True)
test.drop(['index', 'SEND_SPG_INNB', 'REC_SPG_INNB'], axis=1, inplace=True)

In [13]:
train.head()

Unnamed: 0,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,send_3,rec_3,send_4,rec_4,send_5,rec_5,send_6,rec_6,send_7,rec_7,send_8,rec_8,send_9,rec_9
0,6.678694,3.672897,3,13,63,46,208,45,229,45,229,55,237,177,843,518,2141
1,5.407921,4.444134,3,37,63,161,208,176,229,176,229,181,237,397,749,654,2648
2,6.678694,10.375479,9,37,63,161,208,176,229,176,229,181,237,397,843,1071,2734
3,4.658195,4.297401,10,23,63,79,208,91,229,91,229,93,237,251,834,625,2925
4,4.658195,6.807151,3,32,63,96,208,109,229,109,229,115,237,297,749,340,2648


In [14]:
train.corr().loc[
    ['send_3', 'send_4', 'send_5', 'send_6', 'send_7', 'send_8', 'send_9'], 
    ['rec_3', 'rec_4', 'rec_5', 'rec_6', 'rec_7', 'rec_8', 'rec_9']
]

Unnamed: 0,rec_3,rec_4,rec_5,rec_6,rec_7,rec_8,rec_9
send_3,0.609402,0.534885,0.521025,0.521025,0.517344,0.489246,0.389902
send_4,0.534475,0.470289,0.457705,0.457705,0.454563,0.430187,0.347539
send_5,0.520898,0.458421,0.446113,0.446113,0.443069,0.419082,0.339077
send_6,0.520898,0.458421,0.446113,0.446113,0.443069,0.419082,0.339077
send_7,0.493384,0.434042,0.421596,0.421596,0.418835,0.396999,0.324399
send_8,0.322799,0.289771,0.283621,0.283621,0.281925,0.267111,0.237355
send_9,0.232629,0.213998,0.210824,0.210824,0.209973,0.202874,0.198872


# modeling

In [15]:
X = train.drop('INVC_CONT', axis=1)
y = train['INVC_CONT']

In [16]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

model = CatBoostRegressor(random_state=0, random_strength=0)

param = {
    
}

gscv = GridSearchCV(
    estimator=model, param_grid=param, scoring='neg_root_mean_squared_error', 
    refit=True, cv=5
)

gscv.fit(X, y)

Learning rate set to 0.068341
0:	learn: 5.3803207	total: 179ms	remaining: 2m 58s
1:	learn: 5.3161334	total: 214ms	remaining: 1m 46s
2:	learn: 5.2607088	total: 246ms	remaining: 1m 21s
3:	learn: 5.2108525	total: 278ms	remaining: 1m 9s
4:	learn: 5.1629084	total: 318ms	remaining: 1m 3s
5:	learn: 5.1176822	total: 360ms	remaining: 59.7s
6:	learn: 5.0782452	total: 395ms	remaining: 56s
7:	learn: 5.0382064	total: 436ms	remaining: 54.1s
8:	learn: 5.0031911	total: 466ms	remaining: 51.3s
9:	learn: 4.9714880	total: 597ms	remaining: 59.1s
10:	learn: 4.9392340	total: 673ms	remaining: 1m
11:	learn: 4.9078302	total: 753ms	remaining: 1m 2s
12:	learn: 4.8794642	total: 856ms	remaining: 1m 4s
13:	learn: 4.8545371	total: 920ms	remaining: 1m 4s
14:	learn: 4.8326003	total: 988ms	remaining: 1m 4s
15:	learn: 4.8115437	total: 1.03s	remaining: 1m 3s
16:	learn: 4.7919634	total: 1.09s	remaining: 1m 3s
17:	learn: 4.7735668	total: 1.14s	remaining: 1m 2s
18:	learn: 4.7568460	total: 1.22s	remaining: 1m 3s
19:	learn: 4.

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001E4292923D0>,
             param_grid={}, scoring='neg_root_mean_squared_error')

In [17]:
print(gscv.best_params_)
print(-gscv.best_score_)

{}
5.105242645050244


In [18]:
pred = gscv.predict(test)
sub['INVC_CONT'] = pred
sub.head()

Unnamed: 0,index,INVC_CONT
0,32000,7.915326
1,32001,5.503204
2,32002,3.81103
3,32003,4.396259
4,32004,5.158318


In [21]:
# sub.to_csv('./sub/logistics_05.csv', index=False)