# Read data

In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training set, the test set and the submission template.
# All three files are read with the cp949 encoding.
train, test, sub = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in (
        './data/train_df.csv',
        './data/test_df.csv',
        './data/sample_submission.csv',
    )
)

In [2]:
# (rows, columns) of the train, test and submission frames, in that order.
tuple(frame.shape for frame in (train, test, sub))

((32000, 6), (4640, 5), (4640, 2))

In [3]:
# Preview the first rows of the training data as loaded from disk.
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,3,1154500002014200,5011000315087400,식품,농산물,10
4,4,1165000021008300,5011000177051200,식품,가공식품,3


In [4]:
# Preview the first rows of the test data as loaded from disk.
test.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,32000,5013000043028400,1165000021097200,식품,농산물
1,32001,5013000044016100,1154500002066400,식품,농산물
2,32002,5013000205030200,4139000102013200,식품,농산물
3,32003,5013000205030200,4221000040093400,식품,농산물
4,32004,5013000268011400,2726000004017100,식품,농산물


# preprocessing

In [5]:
# Binary indicator column 'ag': 1 where the mid-level category label is
# '농산물', 0 everywhere else. Built from the raw label, so this must run
# before the category columns are replaced by numbers.
train['ag'] = train['DL_GD_MCLS_NM'].eq('농산물').astype('int64')
test['ag'] = test['DL_GD_MCLS_NM'].eq('농산물').astype('int64')

In [6]:
# Mean (target) encoding of the two category columns: every label is replaced
# by the average INVC_CONT observed for that label in the training data.
#
# INVC_CONT is selected *before* aggregating. Calling .mean() on the whole
# grouped frame also tries to average the other (text) category column, which
# raises TypeError on pandas >= 2.0 where numeric_only defaults to False.
large_label = train.groupby('DL_GD_LCLS_NM')['INVC_CONT'].mean().to_dict()
mid_label   = train.groupby('DL_GD_MCLS_NM')['INVC_CONT'].mean().to_dict()

# The mappings are learned on train only and applied to both frames.
# NOTE: a label that appears in test but not in train maps to NaN.
train['DL_GD_LCLS_NM'] = train['DL_GD_LCLS_NM'].map(large_label)
test['DL_GD_LCLS_NM']  = test['DL_GD_LCLS_NM'].map(large_label)

train['DL_GD_MCLS_NM'] = train['DL_GD_MCLS_NM'].map(mid_label)
test['DL_GD_MCLS_NM']  = test['DL_GD_MCLS_NM'].map(mid_label)

In [7]:
# Check the training frame after the category columns were mean-encoded.
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag
0,0,1129000014045300,5011000220046300,6.678694,3.672897,3,0
1,1,1135000009051200,5011000178037300,5.407921,4.444134,3,0
2,2,1135000030093100,5011000265091400,6.678694,10.375479,9,0
3,3,1154500002014200,5011000315087400,4.658195,4.297401,10,1
4,4,1165000021008300,5011000177051200,4.658195,6.807151,3,0


In [8]:
# Slice the grid-cell unique numbers: for every prefix length in slice_range,
# add a column holding the leading digits of the sender / receiver ID as int64.
slice_range = range(3, 11)

# All 'send_*' columns are created first, then all 'rec_*' columns, so the
# resulting column order is send_3..send_10, rec_3..rec_10 in both frames.
for prefix, id_col in (('send_', 'SEND_SPG_INNB'), ('rec_', 'REC_SPG_INNB')):
    # Convert each ID column to text once instead of once per prefix length.
    train_ids = train[id_col].astype('str')
    test_ids = test[id_col].astype('str')

    for i in slice_range:
        train[prefix + str(i)] = train_ids.str[:i].astype('int64')
        test[prefix + str(i)] = test_ids.str[:i].astype('int64')

In [9]:
# Binary indicator column 'eq': 1 when sender and receiver grid-cell IDs are
# identical, 0 otherwise.
train['eq'] = train['SEND_SPG_INNB'].eq(train['REC_SPG_INNB']).astype('int64')
test['eq'] = test['SEND_SPG_INNB'].eq(test['REC_SPG_INNB']).astype('int64')

In [10]:
# Find sender / receiver IDs whose *minimum* INVC_CONT in the training data is
# at least threshold1, i.e. every training row for that ID reaches the threshold.
threshold1 = 20

temp_send1 = train.groupby(by='SEND_SPG_INNB').min()
temp_rec1 = train.groupby(by='REC_SPG_INNB').min()

send_idx1 = temp_send1.loc[temp_send1['INVC_CONT'].ge(threshold1)].index
rec_idx1 = temp_rec1.loc[temp_rec1['INVC_CONT'].ge(threshold1)].index

# Display the training rows that involve any such sender or receiver.
train.loc[
    train['SEND_SPG_INNB'].isin(send_idx1)
    | train['REC_SPG_INNB'].isin(rec_idx1)
]

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag,send_3,send_4,send_5,send_6,send_7,send_8,send_9,send_10,rec_3,rec_4,rec_5,rec_6,rec_7,rec_8,rec_9,rec_10,eq
158,158,5011000319046400,2635000026058300,4.658195,4.297401,33,1,501,5011,50110,501100,5011000,50110003,501100031,5011000319,263,2635,26350,263500,2635000,26350000,263500002,2635000026,0
162,162,5011000373058300,4615000790058400,4.658195,5.780338,36,0,501,5011,50110,501100,5011000,50110003,501100037,5011000373,461,4615,46150,461500,4615000,46150007,461500079,4615000790,0
202,202,5011000767075300,2729000007038300,4.658195,8.196172,25,0,501,5011,50110,501100,5011000,50110007,501100076,5011000767,272,2729,27290,272900,2729000,27290000,272900000,2729000007,0
605,605,1111000025032200,5011000373070400,6.678694,10.375479,37,0,111,1111,11110,111100,1111000,11110000,111100002,1111000025,501,5011,50110,501100,5011000,50110003,501100037,5011000373,0
647,647,4159000310093100,5011000494074200,4.658195,6.130548,22,0,415,4159,41590,415900,4159000,41590003,415900031,4159000310,501,5011,50110,501100,5011000,50110004,501100049,5011000494,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31504,31504,1117000009037100,5011000315059300,6.547718,6.547718,24,0,111,1117,11170,111700,1117000,11170000,111700000,1117000009,501,5011,50110,501100,5011000,50110003,501100031,5011000315,0
31675,31675,5011001053094200,4613000323074400,4.658195,4.297401,20,1,501,5011,50110,501100,5011001,50110010,501100105,5011001053,461,4613,46130,461300,4613000,46130003,461300032,4613000323,0
31915,31915,5011000137023400,2638000022073100,4.658195,6.807151,50,0,501,5011,50110,501100,5011000,50110001,501100013,5011000137,263,2638,26380,263800,2638000,26380000,263800002,2638000022,0
31950,31950,5011000319076400,5011000319076400,4.658195,5.647691,25,0,501,5011,50110,501100,5011000,50110003,501100031,5011000319,501,5011,50110,501100,5011000,50110003,501100031,5011000319,1


In [11]:
# Binary indicator column 'outlier1': 1 when the row's sender is in send_idx1
# or its receiver is in rec_idx1, 0 otherwise. The ID sets come from the
# training data and are applied to both frames.
train['outlier1'] = (
    train['SEND_SPG_INNB'].isin(send_idx1)
    | train['REC_SPG_INNB'].isin(rec_idx1)
).astype('int64')
test['outlier1'] = (
    test['SEND_SPG_INNB'].isin(send_idx1)
    | test['REC_SPG_INNB'].isin(rec_idx1)
).astype('int64')

In [12]:
# Same lookup as for threshold1, with a stricter cut-off: sender / receiver IDs
# whose minimum INVC_CONT in the training data is at least threshold2.
threshold2 = 100

temp_send2 = train.groupby(by='SEND_SPG_INNB').min()
temp_rec2 = train.groupby(by='REC_SPG_INNB').min()

send_idx2 = temp_send2.loc[temp_send2['INVC_CONT'].ge(threshold2)].index
rec_idx2 = temp_rec2.loc[temp_rec2['INVC_CONT'].ge(threshold2)].index

# Display the training rows that involve any such sender or receiver.
train.loc[
    train['SEND_SPG_INNB'].isin(send_idx2)
    | train['REC_SPG_INNB'].isin(rec_idx2)
]

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag,send_3,send_4,send_5,send_6,send_7,send_8,send_9,send_10,rec_3,rec_4,rec_5,rec_6,rec_7,rec_8,rec_9,rec_10,eq,outlier1
4634,4634,4420000529077100,5013000684033400,4.658195,5.745115,100,0,442,4420,44200,442000,4420000,44200005,442000052,4420000529,501,5013,50130,501300,5013000,50130006,501300068,5013000684,0,1
5509,5509,2638000028098100,5011001115013100,4.658195,5.007264,100,0,263,2638,26380,263800,2638000,26380000,263800002,2638000028,501,5011,50110,501100,5011001,50110011,501100111,5011001115,0,1
5958,5958,5011000375010100,1165000036095300,4.658195,5.745115,105,0,501,5011,50110,501100,5011000,50110003,501100037,5011000375,116,1165,11650,116500,1165000,11650000,116500003,1165000036,0,1
7281,7281,4119900009063100,5011000309057300,5.407921,5.634146,123,0,411,4119,41199,411990,4119900,41199000,411990000,4119900009,501,5011,50110,501100,5011000,50110003,501100030,5011000309,0,1
7639,7639,4776000723095200,5011000177073400,4.658195,6.130548,108,0,477,4776,47760,477600,4776000,47760007,477600072,4776000723,501,5011,50110,501100,5011000,50110001,501100017,5011000177,0,1
7774,7774,5013000055030400,4183000267092400,4.658195,4.297401,164,1,501,5013,50130,501300,5013000,50130000,501300005,5013000055,418,4183,41830,418300,4183000,41830002,418300026,4183000267,0,1
27227,27227,4117300018005400,5011000313002100,6.678694,10.375479,160,0,411,4117,41173,411730,4117300,41173000,411730001,4117300018,501,5011,50110,501100,5011000,50110003,501100031,5011000313,0,1


In [13]:
# Binary indicator column 'outlier2': 1 when the row's sender is in send_idx2
# or its receiver is in rec_idx2, 0 otherwise. The ID sets come from the
# training data and are applied to both frames.
train['outlier2'] = (
    train['SEND_SPG_INNB'].isin(send_idx2)
    | train['REC_SPG_INNB'].isin(rec_idx2)
).astype('int64')
test['outlier2'] = (
    test['SEND_SPG_INNB'].isin(send_idx2)
    | test['REC_SPG_INNB'].isin(rec_idx2)
).astype('int64')

In [14]:
# Check the training frame with all engineered columns added.
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag,send_3,send_4,send_5,send_6,send_7,send_8,send_9,send_10,rec_3,rec_4,rec_5,rec_6,rec_7,rec_8,rec_9,rec_10,eq,outlier1,outlier2
0,0,1129000014045300,5011000220046300,6.678694,3.672897,3,0,112,1129,11290,112900,1129000,11290000,112900001,1129000014,501,5011,50110,501100,5011000,50110002,501100022,5011000220,0,0,0
1,1,1135000009051200,5011000178037300,5.407921,4.444134,3,0,113,1135,11350,113500,1135000,11350000,113500000,1135000009,501,5011,50110,501100,5011000,50110001,501100017,5011000178,0,0,0
2,2,1135000030093100,5011000265091400,6.678694,10.375479,9,0,113,1135,11350,113500,1135000,11350000,113500003,1135000030,501,5011,50110,501100,5011000,50110002,501100026,5011000265,0,0,0
3,3,1154500002014200,5011000315087400,4.658195,4.297401,10,1,115,1154,11545,115450,1154500,11545000,115450000,1154500002,501,5011,50110,501100,5011000,50110003,501100031,5011000315,0,0,0
4,4,1165000021008300,5011000177051200,4.658195,6.807151,3,0,116,1165,11650,116500,1165000,11650000,116500002,1165000021,501,5011,50110,501100,5011000,50110001,501100017,5011000177,0,0,0


# encoding (drop raw identifier columns)

In [15]:
# Remove the row id and the full sender / receiver IDs from both frames,
# in place. Their prefix slices and derived flags stay.
for frame in (train, test):
    frame.drop(columns=['index', 'SEND_SPG_INNB', 'REC_SPG_INNB'], inplace=True)

In [16]:
# Check the training frame after the identifier columns were dropped.
train.head()

Unnamed: 0,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,ag,send_3,send_4,send_5,send_6,send_7,send_8,send_9,send_10,rec_3,rec_4,rec_5,rec_6,rec_7,rec_8,rec_9,rec_10,eq,outlier1,outlier2
0,6.678694,3.672897,3,0,112,1129,11290,112900,1129000,11290000,112900001,1129000014,501,5011,50110,501100,5011000,50110002,501100022,5011000220,0,0,0
1,5.407921,4.444134,3,0,113,1135,11350,113500,1135000,11350000,113500000,1135000009,501,5011,50110,501100,5011000,50110001,501100017,5011000178,0,0,0
2,6.678694,10.375479,9,0,113,1135,11350,113500,1135000,11350000,113500003,1135000030,501,5011,50110,501100,5011000,50110002,501100026,5011000265,0,0,0
3,4.658195,4.297401,10,1,115,1154,11545,115450,1154500,11545000,115450000,1154500002,501,5011,50110,501100,5011000,50110003,501100031,5011000315,0,0,0
4,4.658195,6.807151,3,0,116,1165,11650,116500,1165000,11650000,116500002,1165000021,501,5011,50110,501100,5011000,50110001,501100017,5011000177,0,0,0


# modeling

In [17]:
# Separate the target column (INVC_CONT) from the feature columns.
y = train['INVC_CONT']
X = train.drop(columns=['INVC_CONT'])

In [18]:
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Fit one CatBoost model on all features, then tabulate each column next to
# its feature importance as reported by the fitted model.
model = CatBoostRegressor(random_strength=0, random_state=0)
model.fit(X, y)
pd.DataFrame(
    {
        'col': X.columns,
        'fi': model.feature_importances_,
    }
)

Learning rate set to 0.070793
0:	learn: 5.6198367	total: 238ms	remaining: 3m 58s
1:	learn: 5.5009886	total: 316ms	remaining: 2m 37s
2:	learn: 5.3944098	total: 381ms	remaining: 2m 6s
3:	learn: 5.3007309	total: 456ms	remaining: 1m 53s
4:	learn: 5.2155852	total: 541ms	remaining: 1m 47s
5:	learn: 5.1392624	total: 610ms	remaining: 1m 41s
6:	learn: 5.0724930	total: 670ms	remaining: 1m 35s
7:	learn: 5.0116907	total: 734ms	remaining: 1m 31s
8:	learn: 4.9580359	total: 783ms	remaining: 1m 26s
9:	learn: 4.9109876	total: 836ms	remaining: 1m 22s
10:	learn: 4.8681906	total: 934ms	remaining: 1m 23s
11:	learn: 4.8287136	total: 1.13s	remaining: 1m 32s
12:	learn: 4.7950965	total: 1.25s	remaining: 1m 35s
13:	learn: 4.7635951	total: 1.32s	remaining: 1m 32s
14:	learn: 4.7357500	total: 1.41s	remaining: 1m 32s
15:	learn: 4.7115296	total: 1.5s	remaining: 1m 32s
16:	learn: 4.6890212	total: 1.67s	remaining: 1m 36s
17:	learn: 4.6689091	total: 1.77s	remaining: 1m 36s
18:	learn: 4.6503234	total: 1.84s	remaining: 1

Unnamed: 0,col,fi
0,DL_GD_LCLS_NM,2.325698
1,DL_GD_MCLS_NM,12.924685
2,ag,0.080074
3,send_3,0.637367
4,send_4,7.771393
5,send_5,7.019136
6,send_6,0.0
7,send_7,4.667894
8,send_8,5.438084
9,send_9,1.987746


In [None]:
# drop_cols = ['ag', 'send_3', 'send_6', 'rec_3', 'rec_5', 'rec_6', 'rec_7', 'rec_10', 'eq']
# X.drop(drop_cols, axis=1, inplace=True)
# X.head()

In [None]:
# model = CatBoostRegressor(random_state=0, random_strength=0)
# model.fit(X, y)
# pd.DataFrame({'col':X.columns, 'fi':model.feature_importances_})

In [19]:
from sklearn.model_selection import GridSearchCV

model = CatBoostRegressor(random_strength=0, random_state=0)

# No hyper-parameters are searched: the grid is left empty, so the search is
# used for its 5-fold cross-validated score and the refit on the full data.
param = {}

gscv = GridSearchCV(
    estimator=model,
    param_grid=param,
    scoring='neg_root_mean_squared_error',
    cv=5,
    refit=True,
)
gscv.fit(X, y)

# The scorer is the negated RMSE, so flip the sign to print the RMSE itself.
print(-gscv.best_score_)

Learning rate set to 0.068341
0:	learn: 5.3203587	total: 98.4ms	remaining: 1m 38s
1:	learn: 5.2066252	total: 291ms	remaining: 2m 25s
2:	learn: 5.1039978	total: 430ms	remaining: 2m 22s
3:	learn: 5.0115177	total: 686ms	remaining: 2m 50s
4:	learn: 4.9290796	total: 779ms	remaining: 2m 35s
5:	learn: 4.8532462	total: 856ms	remaining: 2m 21s
6:	learn: 4.7862283	total: 935ms	remaining: 2m 12s
7:	learn: 4.7231493	total: 1.08s	remaining: 2m 14s
8:	learn: 4.6679803	total: 1.2s	remaining: 2m 12s
9:	learn: 4.6181303	total: 1.32s	remaining: 2m 10s
10:	learn: 4.5732058	total: 1.49s	remaining: 2m 13s
11:	learn: 4.5315077	total: 1.62s	remaining: 2m 13s
12:	learn: 4.4947970	total: 1.9s	remaining: 2m 24s
13:	learn: 4.4624972	total: 2.13s	remaining: 2m 30s
14:	learn: 4.4329030	total: 2.31s	remaining: 2m 31s
15:	learn: 4.4063863	total: 2.44s	remaining: 2m 30s
16:	learn: 4.3820667	total: 2.6s	remaining: 2m 30s
17:	learn: 4.3594998	total: 2.76s	remaining: 2m 30s
18:	learn: 4.3390718	total: 2.84s	remaining: 2

In [20]:
# Predict with the refit estimator and store the result in the submission frame.
# The test frame is restricted to the columns the model was trained on, in the
# same order, so prediction keeps working if features are removed from X.
pred = gscv.predict(test[X.columns])
sub['INVC_CONT'] = pred

In [21]:
# Preview the first rows of the submission frame with the predictions filled in.
sub.head()

Unnamed: 0,index,INVC_CONT
0,32000,4.390872
1,32001,4.412203
2,32002,4.516655
3,32003,4.788102
4,32004,4.513901


In [22]:
# sub.to_csv('./sub/logistics_10.csv', index=False)