In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder

pd.set_option('display.max_rows', None)

train = pd.read_csv('/content/drive/MyDrive/물류예측/train_df.csv', encoding="cp949")
test = pd.read_csv('/content/drive/MyDrive/물류예측/test_df.csv', encoding="cp949")

submission = pd.read_csv('/content/drive/MyDrive/물류예측/sample_submission.csv', encoding="cp949")

In [3]:
train.shape, test.shape

((32000, 6), (4640, 5))

In [4]:
df = pd.concat([train, test], axis=0)

In [5]:
df.shape

(36640, 6)

In [6]:
df.columns = ['index', 'send_num', 'receive_num', "cate_big", "cate_mid","target"]

In [7]:
df.head()

Unnamed: 0,index,send_num,receive_num,cate_big,cate_mid,target
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0
3,3,1154500002014200,5011000315087400,식품,농산물,10.0
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0


In [8]:
df['send_num'].nunique(), df['receive_num'].nunique()

(3983, 31413)

In [9]:
df['send_num_2'] = df["send_num"].astype('str').str[:2].astype('int64')
df['send_num_3'] = df["send_num"].astype('str').str[:3].astype('int64')
df['send_num_4'] = df['send_num'].astype('str').str[:4].astype('int64')
# df['send_num_5'] = df['send_num'].astype('str').str[:5].astype('int64')

df['send_num_7'] = df['send_num'].astype('str').str[:7].astype('int64')
df['send_num_8'] = df['send_num'].astype('str').str[:8].astype('int64')
df['send_num_9'] = df['send_num'].astype('str').str[:9].astype('int64')


In [10]:
df['rec_num_2'] = df["receive_num"].astype('str').str[:2].astype('int64')
df['rec_num_3'] = df["receive_num"].astype('str').str[:3].astype('int64')
df['rec_num_4'] = df['receive_num'].astype('str').str[:4].astype('int64')
# df['rec_num_5'] = df['receive_num'].astype('str').str[:5].astype('int64')

df['rec_num_7'] = df['receive_num'].astype('str').str[:7].astype('int64')
df['rec_num_8'] = df['receive_num'].astype('str').str[:8].astype('int64')
df['rec_num_9'] = df['receive_num'].astype('str').str[:9].astype('int64')


In [11]:
df.head()

Unnamed: 0,index,send_num,receive_num,cate_big,cate_mid,target,send_num_2,send_num_3,send_num_4,send_num_7,send_num_8,send_num_9,rec_num_2,rec_num_3,rec_num_4,rec_num_7,rec_num_8,rec_num_9
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0,11,112,1129,1129000,11290000,112900001,50,501,5011,5011000,50110002,501100022
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0,11,113,1135,1135000,11350000,113500000,50,501,5011,5011000,50110001,501100017
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0,11,113,1135,1135000,11350000,113500003,50,501,5011,5011000,50110002,501100026
3,3,1154500002014200,5011000315087400,식품,농산물,10.0,11,115,1154,1154500,11545000,115450000,50,501,5011,5011000,50110003,501100031
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0,11,116,1165,1165000,11650000,116500002,50,501,5011,5011000,50110001,501100017


In [12]:
df.groupby(by='rec_num_3')['target'].count().sort_values(ascending=False)

rec_num_3
501    6833
411    2032
412    1481
414    1174
115    1003
415     982
282     927
116     923
112     821
413     813
451     752
114     645
113     642
471     636
272     600
481     548
117     533
421     503
442     488
431     476
291     468
262     464
416     462
263     460
281     459
441     457
461     442
111     417
301     408
482     396
311     303
472     284
264     277
302     274
437     272
483     265
468     221
271     193
427     189
361     183
292     168
265     157
448     153
457     130
488     129
422     123
317     121
312     120
277     118
261     118
418     117
467     110
447      95
478      92
477      78
462      76
267      72
487      59
428      59
479      48
469      36
452      36
287      33
458      24
438      22
Name: target, dtype: int64

In [13]:
# df['rec_num_5'].nunique()   # 5 ,6 유니크 갯수가 같음 

In [14]:
# 같은 곳에서 같은 곳으로 보내는

same_idx = df.loc[df['send_num'] == df['receive_num'], 'index'].index
df['same'] = 0

for idx in same_idx:
    df.loc[idx, 'same'] = 1

In [15]:
# 제주도 내의 물류

df.loc[(df['send_num_3'] == 501) & (df['rec_num_3'] == 501), '도내물류' ] = 1

In [16]:
df['도내물류'].fillna(0, inplace=True)
df['도내물류'].value_counts()

0.0    36435
1.0      205
Name: 도내물류, dtype: int64

In [17]:
# df['도내물류'].value_counts()

In [18]:
# 도외물류

df.loc[~((df['send_num_3'] == 501) | (df['rec_num_3'] == 501)), '도외물류' ] = 1
df['도외물류'].fillna(0, inplace=True)

In [19]:
df['도외물류'].value_counts()

0.0    36640
Name: 도외물류, dtype: int64

In [20]:
#경기도 물류

df.loc[(df['send_num_2'] == 41) | (df['rec_num_2'] == 41), '경기도물류' ] = 1
df['경기도물류'].fillna(0, inplace=True)

In [21]:
df['경기도물류'].value_counts()

0.0    24035
1.0    12605
Name: 경기도물류, dtype: int64

In [22]:
df[df['cate_big']== '식품']['cate_mid'].value_counts() #/ df['cate_mid'].count() #31785 = 식품

농산물          25401
음료            1293
수산            1075
가공식품          1057
기타식품           873
건강식품           739
축산             479
냉동/간편조리식품      468
과자             400
Name: cate_mid, dtype: int64

농산물 가중치

In [23]:
#  aa = (df[df['cate_mid']== '농산물']['cate_mid'].count() /  df[df['cate_big']== '식품']['cate_big'].count()) + 1
#  aa

In [24]:
df['gri'] = df['cate_mid'].apply(lambda x: 1.2 if x=='농산물' else 0)

In [25]:
df.head()

Unnamed: 0,index,send_num,receive_num,cate_big,cate_mid,target,send_num_2,send_num_3,send_num_4,send_num_7,send_num_8,send_num_9,rec_num_2,rec_num_3,rec_num_4,rec_num_7,rec_num_8,rec_num_9,same,도내물류,도외물류,경기도물류,gri
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0,11,112,1129,1129000,11290000,112900001,50,501,5011,5011000,50110002,501100022,0,0.0,0.0,0.0,0.0
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0,11,113,1135,1135000,11350000,113500000,50,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0,11,113,1135,1135000,11350000,113500003,50,501,5011,5011000,50110002,501100026,0,0.0,0.0,0.0,0.0
3,3,1154500002014200,5011000315087400,식품,농산물,10.0,11,115,1154,1154500,11545000,115450000,50,501,5011,5011000,50110003,501100031,0,0.0,0.0,0.0,1.2
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0,11,116,1165,1165000,11650000,116500002,50,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0


In [26]:
df['cate_big'].unique()

array(['패션의류', '생활/건강', '식품', '화장품/미용', '여행/문화', '디지털/가전'], dtype=object)

In [27]:
# df[df['cate_big']== '패션의류']['cate_mid'].value_counts()

In [28]:
# df[df['cate_big']== '생활/건강']['cate_mid'].value_counts()

In [29]:
# bb = ( df[df['cate_mid']== '음반']['cate_mid'].count() /  df[df['cate_big']== '생활/건강']['cate_big'].count()) +1

In [30]:
# df['music'] = df['cate_mid'].apply(lambda x: bb if x=='음반' else 0)

In [31]:
# df[df['cate_big']== '화장품/미용']['cate_mid'].value_counts()

In [32]:
# df[df['cate_big']== '여행/문화']['cate_mid'].value_counts()

In [33]:
# df[df['cate_big']== '디지털/가전']['cate_mid'].value_counts()

In [34]:
df.drop(['index','send_num','receive_num', 'send_num_2', 'rec_num_2'], axis=1, inplace=True)

In [35]:
# df['send_4'] = df['SEND_SPG_INNB'].astype(str).apply(lambda x: x[:4])
# df['rec_4'] = df['REC_SPG_INNB'].astype(str).apply(lambda x: x[:4])

# df["send_4"] = df["send_4"].astype(int)
# df["rec_4"] = df["rec_4"].astype(int)

# fig, ax = plt.subplots(figsize=(10,10))
# sns.heatmap(df.corr(), annot=True, fmt=".4f", cmap="Blues")

In [36]:
df.head()

Unnamed: 0,cate_big,cate_mid,target,send_num_3,send_num_4,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,rec_num_7,rec_num_8,rec_num_9,same,도내물류,도외물류,경기도물류,gri
0,패션의류,상의,3.0,112,1129,1129000,11290000,112900001,501,5011,5011000,50110002,501100022,0,0.0,0.0,0.0,0.0
1,생활/건강,반려동물,3.0,113,1135,1135000,11350000,113500000,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0
2,패션의류,기타패션의류,9.0,113,1135,1135000,11350000,113500003,501,5011,5011000,50110002,501100026,0,0.0,0.0,0.0,0.0
3,식품,농산물,10.0,115,1154,1154500,11545000,115450000,501,5011,5011000,50110003,501100031,0,0.0,0.0,0.0,1.2
4,식품,가공식품,3.0,116,1165,1165000,11650000,116500002,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0


배송양이 많으면 업자 아니면 개인택배 가중치

In [37]:
# df.groupby(by='send_num_3')['target'].count().sort_values(ascending=False)

# # 501    25344

In [38]:
# # df.groupby(by='send_num_4')['target'].count().sort_values(ascending=False)

# # send_num_4
# 5011    14003
# 5013    11341

In [39]:
# df.groupby(by='send_num_5')['target'].count().sort_values(ascending=False)
# send_num_5
# 50110    14003
# 50130    11341
# 41480     1203


In [40]:
# df.groupby(by='send_num_7')['target'].count().sort_values(ascending=False)
# send_num_7
# 5011000    13133
# 5013000    11341
# 4148000     1203
# 5011001      870

In [41]:
# df.groupby(by='send_num_8')['target'].count().sort_values(ascending=False)

# send_num_8
# 50110000    4270
# 50130007    3912
# 50130006    3194
# 50130008    2332
# 50110003    1808
# 50110001    1791
# 50110004    1585
# 50110002    1551
# 41480006    1122
# 50110005    1062
# 50110010     699
# 50110006     629

물류이동이 많은 지역

In [42]:
# big = df.groupby(by='send_num_8')['target'].count().sort_values(ascending=False)[:2].to_dict().keys()
big = [501]

In [43]:


# same_idx = df.loc[df['rec_num_3'].isin(big), 'send_num_8'].index
# df['제주도'] = 0

# for idx in same_idx:
#     df.loc[idx, '제주도'] = 1

스케일링 

In [44]:
df.head()

Unnamed: 0,cate_big,cate_mid,target,send_num_3,send_num_4,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,rec_num_7,rec_num_8,rec_num_9,same,도내물류,도외물류,경기도물류,gri
0,패션의류,상의,3.0,112,1129,1129000,11290000,112900001,501,5011,5011000,50110002,501100022,0,0.0,0.0,0.0,0.0
1,생활/건강,반려동물,3.0,113,1135,1135000,11350000,113500000,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0
2,패션의류,기타패션의류,9.0,113,1135,1135000,11350000,113500003,501,5011,5011000,50110002,501100026,0,0.0,0.0,0.0,0.0
3,식품,농산물,10.0,115,1154,1154500,11545000,115450000,501,5011,5011000,50110003,501100031,0,0.0,0.0,0.0,1.2
4,식품,가공식품,3.0,116,1165,1165000,11650000,116500002,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0


In [45]:
# from sklearn.preprocessing import MinMaxScaler
# sc = MinMaxScaler()
# df2 = df.copy()

# df[['send_num_3','send_num_4','send_num_5','send_num_7','rec_num_3','rec_num_4','rec_num_5','rec_num_7']] = sc.fit_transform(df2[['send_num_3','send_num_4','send_num_5','send_num_7','rec_num_3','rec_num_4','rec_num_5','rec_num_7']])

In [46]:
df.head()

Unnamed: 0,cate_big,cate_mid,target,send_num_3,send_num_4,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,rec_num_7,rec_num_8,rec_num_9,same,도내물류,도외물류,경기도물류,gri
0,패션의류,상의,3.0,112,1129,1129000,11290000,112900001,501,5011,5011000,50110002,501100022,0,0.0,0.0,0.0,0.0
1,생활/건강,반려동물,3.0,113,1135,1135000,11350000,113500000,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0
2,패션의류,기타패션의류,9.0,113,1135,1135000,11350000,113500003,501,5011,5011000,50110002,501100026,0,0.0,0.0,0.0,0.0
3,식품,농산물,10.0,115,1154,1154500,11545000,115450000,501,5011,5011000,50110003,501100031,0,0.0,0.0,0.0,1.2
4,식품,가공식품,3.0,116,1165,1165000,11650000,116500002,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0


## 데이터 전처리

In [47]:
train = df[~df['target'].isna()]
test  = df[df['target'].isna()]

test.drop('target', axis=1, inplace=True)

print(df.shape, train.shape, test.shape)

(36640, 18) (32000, 18) (4640, 17)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [48]:
train['target'] = train['target'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [49]:
train.head()

Unnamed: 0,cate_big,cate_mid,target,send_num_3,send_num_4,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,rec_num_7,rec_num_8,rec_num_9,same,도내물류,도외물류,경기도물류,gri
0,패션의류,상의,3,112,1129,1129000,11290000,112900001,501,5011,5011000,50110002,501100022,0,0.0,0.0,0.0,0.0
1,생활/건강,반려동물,3,113,1135,1135000,11350000,113500000,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0
2,패션의류,기타패션의류,9,113,1135,1135000,11350000,113500003,501,5011,5011000,50110002,501100026,0,0.0,0.0,0.0,0.0
3,식품,농산물,10,115,1154,1154500,11545000,115450000,501,5011,5011000,50110003,501100031,0,0.0,0.0,0.0,1.2
4,식품,가공식품,3,116,1165,1165000,11650000,116500002,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0


In [50]:
# # scaling
# scaler = StandardScaler()

# scaler.fit(train)

In [51]:
# 원핫 인코딩
train_one = pd.get_dummies(train)
test_one = pd.get_dummies(test)

In [52]:
pd.set_option('display.max_rows', None)
train_one.head()

Unnamed: 0,target,send_num_3,send_num_4,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,rec_num_7,rec_num_8,rec_num_9,same,도내물류,도외물류,경기도물류,gri,cate_big_디지털/가전,cate_big_생활/건강,cate_big_식품,cate_big_여행/문화,cate_big_패션의류,cate_big_화장품/미용,cate_mid_가공식품,cate_mid_건강식품,cate_mid_건강용품,cate_mid_과자,cate_mid_기타디지털/가전,cate_mid_기타식품,cate_mid_기타패션의류,cate_mid_냉동/간편조리식품,cate_mid_농산물,cate_mid_문구/사무용품,cate_mid_문화컨텐츠,cate_mid_반려동물,cate_mid_상의,cate_mid_생활용품,cate_mid_수산,cate_mid_스킨케어,cate_mid_음료,cate_mid_음반,cate_mid_주방용품,cate_mid_축산
0,3,112,1129,1129000,11290000,112900001,501,5011,5011000,50110002,501100022,0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,3,113,1135,1135000,11350000,113500000,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,9,113,1135,1135000,11350000,113500003,501,5011,5011000,50110002,501100026,0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10,115,1154,1154500,11545000,115450000,501,5011,5011000,50110003,501100031,0,0.0,0.0,0.0,1.2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,3,116,1165,1165000,11650000,116500002,501,5011,5011000,50110001,501100017,0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 모델링

In [53]:
!pip install catboost



In [54]:
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

In [55]:
np.random.seed(27)
tf.random.set_seed(27)

In [57]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor 
train_Y = train_one['target']
train_X = train_one.drop('target',axis = 1)
test_X = test_one

#모델 정의
model = CatBoostRegressor(random_seed=27)

In [58]:
model.fit(train_X, train_Y)

Learning rate set to 0.070793
0:	learn: 5.7117262	total: 65.7ms	remaining: 1m 5s
1:	learn: 5.6821637	total: 84.6ms	remaining: 42.2s
2:	learn: 5.6476881	total: 103ms	remaining: 34.1s
3:	learn: 5.6348664	total: 118ms	remaining: 29.3s
4:	learn: 5.6007685	total: 135ms	remaining: 26.9s
5:	learn: 5.5697545	total: 149ms	remaining: 24.7s
6:	learn: 5.5416723	total: 168ms	remaining: 23.8s
7:	learn: 5.5166839	total: 187ms	remaining: 23.2s
8:	learn: 5.4930637	total: 200ms	remaining: 22s
9:	learn: 5.4886384	total: 209ms	remaining: 20.7s
10:	learn: 5.4827179	total: 217ms	remaining: 19.5s
11:	learn: 5.4764407	total: 226ms	remaining: 18.6s
12:	learn: 5.4663168	total: 237ms	remaining: 18s
13:	learn: 5.4491017	total: 244ms	remaining: 17.2s
14:	learn: 5.4457917	total: 253ms	remaining: 16.6s
15:	learn: 5.4422999	total: 260ms	remaining: 16s
16:	learn: 5.4374832	total: 277ms	remaining: 16s
17:	learn: 5.4351236	total: 284ms	remaining: 15.5s
18:	learn: 5.4308590	total: 291ms	remaining: 15s
19:	learn: 5.425876

<catboost.core.CatBoostRegressor at 0x7f2a527a8710>

In [59]:
# test 데이터 예측
pred = model.predict(test_X)

In [None]:
pred[:10]

## 정답파일 생성

In [None]:
submission.drop('target', axis=1, inplace=True)

In [60]:
submission['INVC_CONT'] = pred

In [61]:
submission.to_csv('/content/drive/MyDrive/물류예측/baseline_금욜3.csv',index = False)

In [None]:
sub = pd.read_csv('baseline.csv')

In [None]:
sub.head()

In [None]:
from sklearn.model_selection import kolds

In [None]:
cat_kfold