# read

In [56]:
import pandas as pd; pd.set_option('display.max_rows', None)
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('train_df.csv', encoding='cp949')
test  = pd.read_csv('test_df.csv', encoding='cp949')
sub   = pd.read_csv('sample_submission.csv', encoding='cp949')

In [57]:
train.shape, test.shape, sub.shape

((32000, 6), (4640, 5), (4640, 2))

In [58]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,3,1154500002014200,5011000315087400,식품,농산물,10
4,4,1165000021008300,5011000177051200,식품,가공식품,3


In [59]:
test.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,32000,5013000043028400,1165000021097200,식품,농산물
1,32001,5013000044016100,1154500002066400,식품,농산물
2,32002,5013000205030200,4139000102013200,식품,농산물
3,32003,5013000205030200,4221000040093400,식품,농산물
4,32004,5013000268011400,2726000004017100,식품,농산물


# preprocessing

In [60]:
train['ag'] = 0
test['ag']  = 0

train.loc[train['DL_GD_MCLS_NM'] == '농산물', 'ag'] = 1
test.loc[test['DL_GD_MCLS_NM'] == '농산물', 'ag'] = 1

In [61]:
large_label = dict(train.groupby('DL_GD_LCLS_NM').mean().sort_values('INVC_CONT')[['INVC_CONT']].reset_index().values)
mid_label   = dict(train.groupby('DL_GD_MCLS_NM').mean().sort_values('INVC_CONT')[['INVC_CONT']].reset_index().values)

train['DL_GD_LCLS_NM'] = train['DL_GD_LCLS_NM'].map(large_label)
test['DL_GD_LCLS_NM']  = test['DL_GD_LCLS_NM'].map(large_label)

train['DL_GD_MCLS_NM'] = train['DL_GD_MCLS_NM'].map(mid_label)
test['DL_GD_MCLS_NM']  = test['DL_GD_MCLS_NM'].map(mid_label)

In [62]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag
0,0,1129000014045300,5011000220046300,6.678694,3.672897,3,0
1,1,1135000009051200,5011000178037300,5.407921,4.444134,3,0
2,2,1135000030093100,5011000265091400,6.678694,10.375479,9,0
3,3,1154500002014200,5011000315087400,4.658195,4.297401,10,1
4,4,1165000021008300,5011000177051200,4.658195,6.807151,3,0


In [63]:
# 격자공간고유번호 슬라이싱

for i in [3, 4, 5, 6, 7, 8, 9]:
    train['send_' + str(i)] = train['SEND_SPG_INNB'].astype('str').str[:i].astype('int64')
    test['send_' + str(i)]  = test['SEND_SPG_INNB'].astype('str').str[:i].astype('int64')
    
    train['rec_' + str(i)] = train['REC_SPG_INNB'].astype('str').str[:i].astype('int64')
    test['rec_' + str(i)]  = test['REC_SPG_INNB'].astype('str').str[:i].astype('int64')

In [64]:
temp_send = train.groupby('SEND_SPG_INNB').min()
temp_rec = train.groupby('REC_SPG_INNB').min()

send_idx = temp_send[temp_send['INVC_CONT'] > 10].index
rec_idx = temp_rec[temp_rec['INVC_CONT'] > 10].index

In [65]:
train['plus'] = 0
test['plus'] = 0

train.loc[(train['SEND_SPG_INNB'].isin(send_idx)) | (train['REC_SPG_INNB'].isin(send_idx)), 'plus'] = 1
test.loc[(test['SEND_SPG_INNB'].isin(send_idx)) | (test['REC_SPG_INNB'].isin(send_idx)), 'plus'] = 1

In [66]:
train['eq'] = 0
test['eq']  = 0

train.loc[train['SEND_SPG_INNB'] == train['REC_SPG_INNB'], 'eq'] = 1
test.loc[test['SEND_SPG_INNB'] == test['REC_SPG_INNB'], 'eq'] = 1

In [67]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag,send_3,rec_3,send_4,...,send_6,rec_6,send_7,rec_7,send_8,rec_8,send_9,rec_9,plus,eq
0,0,1129000014045300,5011000220046300,6.678694,3.672897,3,0,112,501,1129,...,112900,501100,1129000,5011000,11290000,50110002,112900001,501100022,0,0
1,1,1135000009051200,5011000178037300,5.407921,4.444134,3,0,113,501,1135,...,113500,501100,1135000,5011000,11350000,50110001,113500000,501100017,0,0
2,2,1135000030093100,5011000265091400,6.678694,10.375479,9,0,113,501,1135,...,113500,501100,1135000,5011000,11350000,50110002,113500003,501100026,0,0
3,3,1154500002014200,5011000315087400,4.658195,4.297401,10,1,115,501,1154,...,115450,501100,1154500,5011000,11545000,50110003,115450000,501100031,1,0
4,4,1165000021008300,5011000177051200,4.658195,6.807151,3,0,116,501,1165,...,116500,501100,1165000,5011000,11650000,50110001,116500002,501100017,0,0


# encoding

In [68]:
train.drop(['index', 'SEND_SPG_INNB', 'REC_SPG_INNB'], axis=1, inplace=True)
test.drop(['index', 'SEND_SPG_INNB', 'REC_SPG_INNB'], axis=1, inplace=True)

In [69]:
train.head()

Unnamed: 0,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag,send_3,rec_3,send_4,rec_4,send_5,rec_5,send_6,rec_6,send_7,rec_7,send_8,rec_8,send_9,rec_9,plus,eq
0,6.678694,3.672897,3,0,112,501,1129,5011,11290,50110,112900,501100,1129000,5011000,11290000,50110002,112900001,501100022,0,0
1,5.407921,4.444134,3,0,113,501,1135,5011,11350,50110,113500,501100,1135000,5011000,11350000,50110001,113500000,501100017,0,0
2,6.678694,10.375479,9,0,113,501,1135,5011,11350,50110,113500,501100,1135000,5011000,11350000,50110002,113500003,501100026,0,0
3,4.658195,4.297401,10,1,115,501,1154,5011,11545,50110,115450,501100,1154500,5011000,11545000,50110003,115450000,501100031,1,0
4,4.658195,6.807151,3,0,116,501,1165,5011,11650,50110,116500,501100,1165000,5011000,11650000,50110001,116500002,501100017,0,0


# modeling

In [70]:
X = train.drop('INVC_CONT', axis=1)
y = train['INVC_CONT']

In [71]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

model = CatBoostRegressor(random_state=0, random_strength=0)

param = {
    
}

gscv = GridSearchCV(
    estimator=model, param_grid=param, scoring='neg_root_mean_squared_error', 
    refit=True, cv=5
)

gscv.fit(X, y)

Learning rate set to 0.069513
0:	learn: 5.4012051	total: 153ms	remaining: 2m 32s
1:	learn: 5.3601065	total: 159ms	remaining: 1m 19s
2:	learn: 5.3232678	total: 166ms	remaining: 55s
3:	learn: 5.2893287	total: 171ms	remaining: 42.6s
4:	learn: 5.2518353	total: 179ms	remaining: 35.7s
5:	learn: 5.2225545	total: 185ms	remaining: 30.6s
6:	learn: 5.1965277	total: 191ms	remaining: 27.2s
7:	learn: 5.1727739	total: 198ms	remaining: 24.5s
8:	learn: 5.1505386	total: 206ms	remaining: 22.6s
9:	learn: 5.1294854	total: 211ms	remaining: 20.9s
10:	learn: 5.1056816	total: 217ms	remaining: 19.5s
11:	learn: 5.0882853	total: 222ms	remaining: 18.3s
12:	learn: 5.0723277	total: 228ms	remaining: 17.3s
13:	learn: 5.0577625	total: 233ms	remaining: 16.4s
14:	learn: 5.0444443	total: 240ms	remaining: 15.7s
15:	learn: 5.0276893	total: 246ms	remaining: 15.1s
16:	learn: 5.0161058	total: 251ms	remaining: 14.5s
17:	learn: 5.0053550	total: 257ms	remaining: 14s
18:	learn: 4.9951787	total: 262ms	remaining: 13.5s
19:	learn: 4.

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001C25858F7F0>,
             param_grid={}, scoring='neg_root_mean_squared_error')

In [72]:
print(gscv.best_params_)
print(-gscv.best_score_)

{}
5.5226032488842


In [73]:
pred = gscv.predict(test)
sub['INVC_CONT'] = pred

In [74]:
sub.head()

Unnamed: 0,index,INVC_CONT
0,32000,4.442609
1,32001,4.578589
2,32002,5.24477
3,32003,5.627938
4,32004,5.059235


In [76]:
sub.to_csv('./sub/jh.csv', index=False)