# read

In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training set, the test set and the submission template.
# All three files are opened with the cp949 codec.
csv_paths = (
    './data/train_df.csv',
    './data/test_df.csv',
    './data/sample_submission.csv',
)
train, test, sub = (pd.read_csv(path, encoding='cp949') for path in csv_paths)

In [2]:
# (rows, columns) of each loaded frame, in the order train / test / sub.
tuple(frame.shape for frame in (train, test, sub))

((32000, 6), (4640, 5), (4640, 2))

In [3]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,3,1154500002014200,5011000315087400,식품,농산물,10
4,4,1165000021008300,5011000177051200,식품,가공식품,3


# 격자공간고유번호 슬라이싱

In [4]:
# Turn the sender / receiver grid IDs into integer prefix features:
# `send_i` / `rec_i` hold the first `i` characters of the ID, for i = 1..15.
# The string conversion is done once per (frame, ID column) instead of once
# per prefix length, and the send / rec passes share one loop.
# Column creation order is unchanged (send_1..send_15, then rec_1..rec_15),
# which matters because later cells select columns by position.
MAX_PREFIX_LEN = 15

for prefix, id_col in (('send', 'SEND_SPG_INNB'), ('rec', 'REC_SPG_INNB')):
    for frame in (train, test):
        id_text = frame[id_col].astype('str')
        for i in range(1, MAX_PREFIX_LEN + 1):
            frame[f'{prefix}_{i}'] = id_text.str[:i].astype('int64')

In [5]:
# Number of distinct values in every column from position 6 onwards
# (the prefix features appended after the six original columns).
for col, n_unique in train.iloc[:, 6:].nunique().items():
    print(f'{col} : {n_unique}')

send_1 : 5
send_2 : 17
send_3 : 64
send_4 : 213
send_5 : 234
send_6 : 234
send_7 : 240
send_8 : 511
send_9 : 1161
send_10 : 2069
send_11 : 2082
send_12 : 3081
send_13 : 3581
send_14 : 3684
send_15 : 3684
rec_1 : 5
rec_2 : 17
rec_3 : 65
rec_4 : 229
rec_5 : 251
rec_6 : 251
rec_7 : 268
rec_8 : 979
rec_9 : 3353
rec_10 : 7739
rec_11 : 7945
rec_12 : 19039
rec_13 : 26039
rec_14 : 27826
rec_15 : 27826


In [6]:
# Discard prefix columns whose unique-value counts are similar
# (author's selection; prefix lengths are listed per ID column).
send_lengths_to_drop = [1, 2, 4, 5, 6, 10, 14, 15]
rec_lengths_to_drop = [1, 2, 4, 5, 6, 10, 13, 14, 15]

drop_cols = (
    [f'send_{n}' for n in send_lengths_to_drop]
    + [f'rec_{n}' for n in rec_lengths_to_drop]
)

# 유사지역 운송

In [7]:
# For each prefix length, count the rows whose sender and receiver prefixes
# are equal, separately for train and test.
for i in range(1, 16):
    send_col, rec_col = f'send_{i}', f'rec_{i}'
    train_sim = int((train[send_col] == train[rec_col]).sum())
    test_sim = int((test[send_col] == test[rec_col]).sum())
    print(f'send_rec_{i} : train {train_sim}, test {test_sim}')

send_rec_1 : train 177, test 28
send_rec_2 : train 177, test 28
send_rec_3 : train 177, test 28
send_rec_4 : train 132, test 19
send_rec_5 : train 132, test 19
send_rec_6 : train 132, test 19
send_rec_7 : train 129, test 17
send_rec_8 : train 68, test 12
send_rec_9 : train 56, test 6
send_rec_10 : train 49, test 5
send_rec_11 : train 49, test 5
send_rec_12 : train 39, test 2
send_rec_13 : train 39, test 2
send_rec_14 : train 37, test 2
send_rec_15 : train 37, test 2


In [8]:
# Binary flag per chosen prefix length: 1 when the sender and receiver share
# the same first `i` ID digits, otherwise 0.
for i in [3, 7, 8, 10]:
    flag_col = f'sim_INNB_{i}'
    for frame in (train, test):
        same_prefix = frame[f'send_{i}'] == frame[f'rec_{i}']
        frame[flag_col] = same_prefix.astype('int64')

In [9]:
# Correlation between every column from `sim_INNB_3` to the last column.
first_flag_pos = train.columns.get_loc('sim_INNB_3')
train.iloc[:, first_flag_pos:].corr()

Unnamed: 0,sim_INNB_3,sim_INNB_7,sim_INNB_8,sim_INNB_10
sim_INNB_3,1.0,0.853063,0.618764,0.525097
sim_INNB_7,0.853063,1.0,0.725345,0.615544
sim_INNB_8,0.618764,0.725345,1.0,0.848622
sim_INNB_10,0.525097,0.615544,0.848622,1.0


# 지역별 운송 빈도 변수

In [10]:
# Dump the value frequencies of each column between position 6 and the last
# four columns (i.e. skipping the four sim_INNB_* flags at the end).
prefix_cols = train.columns[6:-4]
for col in prefix_cols:
    counts = train[col].value_counts()
    print(f'##### {col} value_counts', counts, '-----------------------------', sep='\n')

##### send_1 value_counts
5    25344
4     4984
1      859
2      550
3      263
Name: send_1, dtype: int64
-----------------------------
##### send_2 value_counts
50    25344
41     3950
11      859
30      231
26      187
27      185
43      184
44      175
47      171
45      163
28      132
46      124
42      118
48       99
29       46
31       28
36        4
Name: send_2, dtype: int64
-----------------------------
##### send_3 value_counts
501    25344
414     1887
415      909
412      358
413      352
       ...  
361        4
458        4
422        3
469        2
312        1
Name: send_3, Length: 64, dtype: int64
-----------------------------
##### send_4 value_counts
5011    14003
5013    11341
4148     1203
4159      395
4141      363
        ...  
4792        1
4129        1
4777        1
4277        1
4672        1
Name: send_4, Length: 213, dtype: int64
-----------------------------
##### send_5 value_counts
50110    14003
50130    11341
41480     1203
41590      395
4

In [11]:
# Frequency of each 2-digit sender prefix in the training data.
train['send_2'].value_counts()

50    25344
41     3950
11      859
30      231
26      187
27      185
43      184
44      175
47      171
45      163
28      132
46      124
42      118
48       99
29       46
31       28
36        4
Name: send_2, dtype: int64

In [12]:
# Ordinal frequency code for the 2-digit sender prefix: the three most common
# prefixes get 1..3 in ascending order of frequency (11 < 41 < 50); every
# other prefix is unmapped and becomes 0.
send_freq2_map = {11: 1, 41: 2, 50: 3}

# Assign the filled result directly. `frame[col].fillna(..., inplace=True)`
# is chained in-place assignment, which does not update the frame under
# pandas Copy-on-Write.
train['send_freq_2'] = train['send_2'].map(send_freq2_map).fillna(0)
test['send_freq_2']  = test['send_2'].map(send_freq2_map).fillna(0)

In [13]:
# Frequency of each 2-digit receiver prefix in the training data.
train['rec_2'].value_counts()

41    7061
50    6833
11    4984
26    1548
28    1419
48    1397
44    1193
47    1138
45     942
27     911
46     885
42     874
43     770
30     682
29     636
31     544
36     183
Name: rec_2, dtype: int64

In [14]:
# Ordinal frequency code for the 2-digit receiver prefix: 3 for the two most
# common prefixes (41, 50), 2 for prefix 11, 1 for the next five
# (26, 28, 48, 44, 47); every other prefix is unmapped and becomes 0.
rec_freq2_map = {47: 1, 44: 1, 48: 1, 28: 1, 26: 1, 11: 2, 50: 3, 41: 3}

# Assign the filled result directly. `frame[col].fillna(..., inplace=True)`
# is chained in-place assignment, which does not update the frame under
# pandas Copy-on-Write.
train['rec_freq_2'] = train['rec_2'].map(rec_freq2_map).fillna(0)
test['rec_freq_2']  = test['rec_2'].map(rec_freq2_map).fillna(0)

In [15]:
# Check how many training rows fall into each receiver frequency code.
train['rec_freq_2'].value_counts()

3.0    13894
1.0     6695
0.0     6427
2.0     4984
Name: rec_freq_2, dtype: int64

In [16]:
# Cast both frequency codes to int64 in train and test.
freq_cols = ['send_freq_2', 'rec_freq_2']
for frame in (train, test):
    frame[freq_cols] = frame[freq_cols].astype('int64')

# 분류별 target 평균

In [17]:
# Target-mean encoding: replace each large / mid category name with the mean
# INVC_CONT of that category in the training data.
# Only the target column is aggregated. Averaging the whole frame also tries
# to average the other (string) category column, which raises TypeError on
# pandas >= 2.0.
# Both lookups are built before either column is overwritten.
large_label = train.groupby('DL_GD_LCLS_NM')['INVC_CONT'].mean().to_dict()
mid_label   = train.groupby('DL_GD_MCLS_NM')['INVC_CONT'].mean().to_dict()

# NOTE(review): a category present only in `test` maps to NaN here.
# NOTE(review): this cell overwrites its own inputs, so re-running it without
# reloading the data maps the already-encoded floats to NaN.
train['DL_GD_LCLS_NM'] = train['DL_GD_LCLS_NM'].map(large_label)
test['DL_GD_LCLS_NM']  = test['DL_GD_LCLS_NM'].map(large_label)

train['DL_GD_MCLS_NM'] = train['DL_GD_MCLS_NM'].map(mid_label)
test['DL_GD_MCLS_NM']  = test['DL_GD_MCLS_NM'].map(mid_label)

# drop

In [18]:
# Remove the discarded prefix columns plus the row index and the raw grid IDs.
# The cell is safe to re-run: dict.fromkeys de-duplicates the list while
# keeping its order, and errors='ignore' skips columns already removed by a
# previous run (it also skips misspelled names, so keep the list in sync with
# the feature cells).
raw_id_cols = ['index', 'SEND_SPG_INNB', 'REC_SPG_INNB']
drop_cols = list(dict.fromkeys(drop_cols + raw_id_cols))

train = train.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')

In [19]:
# Preview the training frame after encoding and column removal.
train.head()

Unnamed: 0,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,send_3,send_7,send_8,send_9,send_11,send_12,send_13,rec_3,rec_7,rec_8,rec_9,rec_11,rec_12,sim_INNB_3,sim_INNB_7,sim_INNB_8,sim_INNB_10,send_freq_2,rec_freq_2
0,6.678694,3.672897,3,112,1129000,11290000,112900001,11290000140,112900001404,1129000014045,501,5011000,50110002,501100022,50110002200,501100022004,0,0,0,0,1,3
1,5.407921,4.444134,3,113,1135000,11350000,113500000,11350000090,113500000905,1135000009051,501,5011000,50110001,501100017,50110001780,501100017803,0,0,0,0,1,3
2,6.678694,10.375479,9,113,1135000,11350000,113500003,11350000300,113500003009,1135000030093,501,5011000,50110002,501100026,50110002650,501100026509,0,0,0,0,1,3
3,4.658195,4.297401,10,115,1154500,11545000,115450000,11545000020,115450000201,1154500002014,501,5011000,50110003,501100031,50110003150,501100031508,0,0,0,0,1,3
4,4.658195,6.807151,3,116,1165000,11650000,116500002,11650000210,116500002100,1165000021008,501,5011000,50110001,501100017,50110001770,501100017705,0,0,0,0,1,3


# corr

In [20]:
# Correlation matrix of the columns remaining in the training frame.
train.corr()

Unnamed: 0,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT,send_3,send_7,send_8,send_9,send_11,send_12,send_13,rec_3,rec_7,rec_8,rec_9,rec_11,rec_12,sim_INNB_3,sim_INNB_7,sim_INNB_8,sim_INNB_10,send_freq_2,rec_freq_2
DL_GD_LCLS_NM,1.0,0.342224,0.06219,-0.451482,-0.451231,-0.451231,-0.451231,-0.451231,-0.451231,-0.451231,0.203205,0.202902,0.202902,0.202902,0.202902,0.202902,-0.004301,-0.000561,-0.004987,-0.011335,-0.370076,0.242896
DL_GD_MCLS_NM,0.342224,1.0,0.181723,-0.375542,-0.375489,-0.375489,-0.375489,-0.375489,-0.375489,-0.375489,0.234644,0.234332,0.234332,0.234332,0.234332,0.234332,0.059245,0.062619,0.041706,0.039097,-0.420906,0.228823
INVC_CONT,0.06219,0.181723,1.0,-0.058765,-0.058702,-0.058702,-0.058702,-0.058702,-0.058702,-0.058702,0.063267,0.063201,0.063201,0.063201,0.063201,0.063201,0.015316,0.013114,0.0172,0.018531,-0.088158,0.036048
send_3,-0.451482,-0.375542,-0.058765,1.0,0.999997,0.999997,0.999997,0.999997,0.999997,0.999997,-0.376118,-0.375639,-0.375639,-0.375639,-0.375639,-0.375639,0.028495,0.024308,0.017632,0.014963,0.733644,-0.377887
send_7,-0.451231,-0.375489,-0.058702,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,-0.375697,-0.375218,-0.375218,-0.375218,-0.375218,-0.375218,0.028429,0.024229,0.017583,0.014924,0.733314,-0.377442
send_8,-0.451231,-0.375489,-0.058702,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,-0.375697,-0.375218,-0.375218,-0.375218,-0.375218,-0.375218,0.028429,0.024229,0.017583,0.014924,0.733314,-0.377442
send_9,-0.451231,-0.375489,-0.058702,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,-0.375697,-0.375218,-0.375218,-0.375218,-0.375218,-0.375218,0.028429,0.024229,0.017583,0.014924,0.733314,-0.377442
send_11,-0.451231,-0.375489,-0.058702,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,-0.375697,-0.375218,-0.375218,-0.375218,-0.375218,-0.375218,0.028429,0.024229,0.017583,0.014924,0.733314,-0.377442
send_12,-0.451231,-0.375489,-0.058702,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,-0.375697,-0.375218,-0.375218,-0.375218,-0.375218,-0.375218,0.028429,0.024229,0.017583,0.014924,0.733314,-0.377442
send_13,-0.451231,-0.375489,-0.058702,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,-0.375697,-0.375218,-0.375218,-0.375218,-0.375218,-0.375218,0.028429,0.024229,0.017583,0.014924,0.733314,-0.377442


# modeling

In [21]:
# Separate the target (INVC_CONT) from the feature matrix.
y = train['INVC_CONT']
X = train.drop(columns='INVC_CONT')

In [22]:
from catboost import CatBoostRegressor

# Fit one CatBoost regressor on the full training set and list the
# per-feature importance it reports.
# NOTE(review): random_strength=0 with a fixed random_state is presumably
# meant to make runs repeatable — confirm against the CatBoost docs.
model = CatBoostRegressor(random_strength=0, random_state=0)
model.fit(X, y)

importance = pd.DataFrame({'col': X.columns, 'fi': model.feature_importances_})
importance

Learning rate set to 0.070793
0:	learn: 5.7066495	total: 218ms	remaining: 3m 37s
1:	learn: 5.6652039	total: 291ms	remaining: 2m 25s
2:	learn: 5.6228657	total: 391ms	remaining: 2m 9s
3:	learn: 5.5828851	total: 518ms	remaining: 2m 8s
4:	learn: 5.5727241	total: 581ms	remaining: 1m 55s
5:	learn: 5.5425647	total: 684ms	remaining: 1m 53s
6:	learn: 5.5273188	total: 795ms	remaining: 1m 52s
7:	learn: 5.5194925	total: 860ms	remaining: 1m 46s
8:	learn: 5.5126339	total: 941ms	remaining: 1m 43s
9:	learn: 5.5064809	total: 1.02s	remaining: 1m 40s
10:	learn: 5.4822679	total: 1.11s	remaining: 1m 40s
11:	learn: 5.4600456	total: 1.19s	remaining: 1m 37s
12:	learn: 5.4544370	total: 1.24s	remaining: 1m 34s
13:	learn: 5.4495532	total: 1.29s	remaining: 1m 31s
14:	learn: 5.4453512	total: 1.38s	remaining: 1m 30s
15:	learn: 5.4247405	total: 1.45s	remaining: 1m 29s
16:	learn: 5.4211896	total: 1.58s	remaining: 1m 31s
17:	learn: 5.4028130	total: 1.67s	remaining: 1m 31s
18:	learn: 5.3993099	total: 1.77s	remaining: 1

Unnamed: 0,col,fi
0,DL_GD_LCLS_NM,3.557438
1,DL_GD_MCLS_NM,21.657096
2,send_3,1.943295
3,send_7,11.25767
4,send_8,4.629009
5,send_9,4.808652
6,send_11,7.120496
7,send_12,18.154612
8,send_13,8.066726
9,rec_3,0.713813


In [23]:
from sklearn.model_selection import GridSearchCV

# 10-fold cross-validation of the default CatBoost regressor. The parameter
# grid is empty, so a single configuration is scored (negative RMSE) and then
# refit on all of X, y.
# NOTE(review): DL_GD_LCLS_NM / DL_GD_MCLS_NM were target-encoded on the full
# training set before this split, so the fold scores may be optimistic.
model = CatBoostRegressor(random_strength=0, random_state=0)

gscv = GridSearchCV(
    model,
    {},
    cv=10,
    refit=True,
    scoring='neg_root_mean_squared_error',
)

gscv.fit(X, y)

Learning rate set to 0.069625
0:	learn: 5.6856166	total: 28ms	remaining: 28s
1:	learn: 5.6458038	total: 64ms	remaining: 32s
2:	learn: 5.6342396	total: 78.3ms	remaining: 26s
3:	learn: 5.5995231	total: 106ms	remaining: 26.4s
4:	learn: 5.5899137	total: 124ms	remaining: 24.7s
5:	learn: 5.5812388	total: 140ms	remaining: 23.1s
6:	learn: 5.5741217	total: 160ms	remaining: 22.6s
7:	learn: 5.5674673	total: 185ms	remaining: 23s
8:	learn: 5.5613511	total: 203ms	remaining: 22.3s
9:	learn: 5.5329037	total: 237ms	remaining: 23.5s
10:	learn: 5.5279223	total: 259ms	remaining: 23.3s
11:	learn: 5.5230545	total: 354ms	remaining: 29.2s
12:	learn: 5.5026358	total: 477ms	remaining: 36.2s
13:	learn: 5.4982951	total: 511ms	remaining: 36s
14:	learn: 5.4941049	total: 531ms	remaining: 34.8s
15:	learn: 5.4882077	total: 562ms	remaining: 34.5s
16:	learn: 5.4852540	total: 616ms	remaining: 35.6s
17:	learn: 5.4814385	total: 646ms	remaining: 35.2s
18:	learn: 5.4598082	total: 660ms	remaining: 34.1s
19:	learn: 5.4569406	t

GridSearchCV(cv=10,
             estimator=<catboost.core.CatBoostRegressor object at 0x000002C14AEF8280>,
             param_grid={}, scoring='neg_root_mean_squared_error')

In [24]:
# The scorer is negative RMSE, so flip the sign to read the mean CV RMSE.
-gscv.best_score_

5.382404705764441

# sub

In [25]:
# Predict with the refit estimator. The test columns are selected in the same
# order as the training matrix X, so the features line up with what the model
# was fitted on rather than relying on `test` having that order already.
sub['INVC_CONT'] = gscv.predict(test[X.columns])

# Earlier submission file, loaded for a side-by-side comparison.
best = pd.read_csv('./sub/logistics_06.csv')

In [27]:
# Put the new predictions (INVC_CONT_x) next to the earlier submission's
# (INVC_CONT_y), matched on `index`, and show the first 20 rows.
sub.merge(best, on='index', how='left').head(20)

Unnamed: 0,index,INVC_CONT_x,INVC_CONT_y
0,32000,4.751713,4.528327
1,32001,4.881143,4.738477
2,32002,5.283379,5.040101
3,32003,6.02241,5.868895
4,32004,5.570852,4.840474
5,32005,6.231268,6.048482
6,32006,5.126988,4.982499
7,32007,4.976855,4.913057
8,32008,5.699517,5.277072
9,32009,4.83756,5.274227


In [28]:
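# Export is disabled so a full re-run does not overwrite an existing
# submission file; uncomment the next line to write the new one.
# sub.to_csv('./sub/logistics_11.csv', index=False)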