# read

In [1]:
import pandas as pd; pd.set_option('display.max_rows', 50, 'display.max_columns', None)
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('./data/train_df.csv', encoding='cp949')
test  = pd.read_csv('./data/test_df.csv', encoding='cp949')
sub   = pd.read_csv('./data/sample_submission.csv', encoding='cp949')

In [2]:
train.shape, test.shape, sub.shape

((32000, 6), (4640, 5), (4640, 2))

In [3]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,3,1154500002014200,5011000315087400,식품,농산물,10
4,4,1165000021008300,5011000177051200,식품,가공식품,3


In [4]:
test.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,32000,5013000043028400,1165000021097200,식품,농산물
1,32001,5013000044016100,1154500002066400,식품,농산물
2,32002,5013000205030200,4139000102013200,식품,농산물
3,32003,5013000205030200,4221000040093400,식품,농산물
4,32004,5013000268011400,2726000004017100,식품,농산물


# preprocessing

In [6]:
train['equal'] = 0
test['equal']  = 0

train.loc[train['SEND_SPG_INNB'] == train['REC_SPG_INNB'], 'equal'] = 1
test.loc[test['SEND_SPG_INNB'] == test['REC_SPG_INNB'], 'equal'] = 1

In [7]:
# 격자공간고유번호 슬라이싱
slice_range = range(3, 11)

for i in slice_range:
    train['send_' + str(i)] = train['SEND_SPG_INNB'].astype('str').str[:i].astype('int64')
    test['send_' + str(i)]  = test['SEND_SPG_INNB'].astype('str').str[:i].astype('int64')

for i in slice_range:
    train['rec_' + str(i)] = train['REC_SPG_INNB'].astype('str').str[:i].astype('int64')
    test['rec_' + str(i)]  = test['REC_SPG_INNB'].astype('str').str[:i].astype('int64')

In [18]:
threshold = 40

temp_send = train.groupby('SEND_SPG_INNB').min()
temp_rec = train.groupby('REC_SPG_INNB').min()

send_idx = temp_send[temp_send['INVC_CONT'] >= threshold].index
rec_idx  = temp_rec[temp_rec['INVC_CONT'] >= threshold].index

# encoding

In [8]:
large_label = dict(train.groupby('DL_GD_LCLS_NM').mean().sort_values('INVC_CONT')[['INVC_CONT']].reset_index().values)
mid_label   = dict(train.groupby('DL_GD_MCLS_NM').mean().sort_values('INVC_CONT')[['INVC_CONT']].reset_index().values)

train['large_mean'] = train['DL_GD_LCLS_NM'].map(large_label)
test['large_mean']  = test['DL_GD_LCLS_NM'].map(large_label)

train['mid_mean'] = train['DL_GD_MCLS_NM'].map(mid_label)
test['mid_mean']  = test['DL_GD_MCLS_NM'].map(mid_label)

In [9]:
train = pd.get_dummies(train)
test  = pd.get_dummies(test)

In [31]:
train.drop(['index', 'SEND_SPG_INNB', 'REC_SPG_INNB'], axis=1, inplace=True)
test.drop(['index', 'SEND_SPG_INNB', 'REC_SPG_INNB'], axis=1, inplace=True)

In [33]:
train.head()

Unnamed: 0,INVC_CONT,equal,send_3,send_4,send_5,send_6,send_7,send_8,send_9,send_10,rec_3,rec_4,rec_5,rec_6,rec_7,rec_8,rec_9,rec_10,large_mean,mid_mean,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,DL_GD_MCLS_NM_건강용품,DL_GD_MCLS_NM_과자,DL_GD_MCLS_NM_기타디지털/가전,DL_GD_MCLS_NM_기타식품,DL_GD_MCLS_NM_기타패션의류,DL_GD_MCLS_NM_냉동/간편조리식품,DL_GD_MCLS_NM_농산물,DL_GD_MCLS_NM_문구/사무용품,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,3,0,112,1129,11290,112900,1129000,11290000,112900001,1129000014,501,5011,50110,501100,5011000,50110002,501100022,5011000220,6.678694,3.672897,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,3,0,113,1135,11350,113500,1135000,11350000,113500000,1135000009,501,5011,50110,501100,5011000,50110001,501100017,5011000178,5.407921,4.444134,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,9,0,113,1135,11350,113500,1135000,11350000,113500003,1135000030,501,5011,50110,501100,5011000,50110002,501100026,5011000265,6.678694,10.375479,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10,0,115,1154,11545,115450,1154500,11545000,115450000,1154500002,501,5011,50110,501100,5011000,50110003,501100031,5011000315,4.658195,4.297401,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,3,0,116,1165,11650,116500,1165000,11650000,116500002,1165000021,501,5011,50110,501100,5011000,50110001,501100017,5011000177,4.658195,6.807151,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# modeling

In [34]:
X = train.drop('INVC_CONT', axis=1)
y = train['INVC_CONT']

In [52]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

model = CatBoostRegressor(random_state=0, random_strength=0)

param = {
    
}

gscv = GridSearchCV(
    estimator=model, param_grid=param, scoring='neg_root_mean_squared_error', 
    refit=True, cv=5
)
gscv.fit(X, y)
print(gscv.best_params_)
print(-gscv.best_score_)

# model.fit(X, y)
# model.feature_importances_

Learning rate set to 0.068341
0:	learn: 5.4330044	total: 51.1ms	remaining: 51s
1:	learn: 5.4204212	total: 137ms	remaining: 1m 8s
2:	learn: 5.3941134	total: 186ms	remaining: 1m 1s
3:	learn: 5.3843802	total: 253ms	remaining: 1m 2s
4:	learn: 5.3760872	total: 338ms	remaining: 1m 7s
5:	learn: 5.3677599	total: 402ms	remaining: 1m 6s
6:	learn: 5.3608017	total: 473ms	remaining: 1m 7s
7:	learn: 5.3538067	total: 519ms	remaining: 1m 4s
8:	learn: 5.3485681	total: 597ms	remaining: 1m 5s
9:	learn: 5.3434325	total: 686ms	remaining: 1m 7s
10:	learn: 5.3386693	total: 831ms	remaining: 1m 14s
11:	learn: 5.3343347	total: 895ms	remaining: 1m 13s
12:	learn: 5.3304566	total: 989ms	remaining: 1m 15s
13:	learn: 5.3112481	total: 1.13s	remaining: 1m 19s
14:	learn: 5.3077660	total: 1.26s	remaining: 1m 22s
15:	learn: 5.2899322	total: 1.31s	remaining: 1m 20s
16:	learn: 5.2865131	total: 1.36s	remaining: 1m 18s
17:	learn: 5.2832685	total: 1.44s	remaining: 1m 18s
18:	learn: 5.2682372	total: 1.48s	remaining: 1m 16s
19:

In [51]:
# pd.DataFrame({'col': X.columns, 'fi': model.feature_importances_})

In [55]:
pred = gscv.predict(test)
sub['INVC_CONT'] = pred

In [56]:
sub.head()

Unnamed: 0,index,INVC_CONT
0,32000,4.449998
1,32001,4.757657
2,32002,5.450021
3,32003,5.779702
4,32004,4.744702


In [57]:
# sub.to_csv('./sub/logistics_07.csv', index=False)