In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder

pd.set_option('display.max_rows', None)

train = pd.read_csv('./data/train_df.csv', encoding="cp949")
test = pd.read_csv('./data/test_df.csv', encoding="cp949")

submission = pd.read_csv('./data/sample_submission.csv', encoding="cp949")

In [33]:
train.shape, test.shape

((32000, 6), (4640, 5))

In [34]:
df = pd.concat([train, test], axis=0)

In [35]:
df.shape

(36640, 6)

In [36]:
df.columns = ['index', 'send_num', 'receive_num', "cate_big", "cate_mid","target"]

In [37]:
df.head()

Unnamed: 0,index,send_num,receive_num,cate_big,cate_mid,target
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0
3,3,1154500002014200,5011000315087400,식품,농산물,10.0
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0


In [38]:
df['send_num'].nunique(), df['receive_num'].nunique()

(3983, 31413)

In [39]:
df['send_num_3'] = df["send_num"].astype('str').str[:3].astype('int64')
df['send_num_4'] = df['send_num'].astype('str').str[:4].astype('int64')
df['send_num_5'] = df['send_num'].astype('str').str[:5].astype('int64')
df['send_num_6'] = df['send_num'].astype('str').str[:6].astype('int64')
df['send_num_7'] = df['send_num'].astype('str').str[:7].astype('int64')
df['send_num_8'] = df['send_num'].astype('str').str[:8].astype('int64')
df['send_num_9'] = df['send_num'].astype('str').str[:9].astype('int64')
# df['send_num_10'] = df['send_num'].astype('str').str[:10].astype('int64')

In [40]:
df['rec_num_3'] = df["receive_num"].astype('str').str[:3].astype('int64')
df['rec_num_4'] = df['receive_num'].astype('str').str[:4].astype('int64')
df['rec_num_5'] = df['receive_num'].astype('str').str[:5].astype('int64')
df['rec_num_6'] = df['receive_num'].astype('str').str[:6].astype('int64')
df['rec_num_7'] = df['receive_num'].astype('str').str[:7].astype('int64')
df['rec_num_8'] = df['receive_num'].astype('str').str[:8].astype('int64')
df['rec_num_9'] = df['receive_num'].astype('str').str[:9].astype('int64')
# df['rec_num_10'] = df['receive_num'].astype('str').str[:10].astype('int64')

In [41]:
df.head()

Unnamed: 0,index,send_num,receive_num,cate_big,cate_mid,target,send_num_3,send_num_4,send_num_5,send_num_6,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,rec_num_5,rec_num_6,rec_num_7,rec_num_8,rec_num_9
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0,112,1129,11290,112900,1129000,11290000,112900001,501,5011,50110,501100,5011000,50110002,501100022
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0,113,1135,11350,113500,1135000,11350000,113500000,501,5011,50110,501100,5011000,50110001,501100017
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0,113,1135,11350,113500,1135000,11350000,113500003,501,5011,50110,501100,5011000,50110002,501100026
3,3,1154500002014200,5011000315087400,식품,농산물,10.0,115,1154,11545,115450,1154500,11545000,115450000,501,5011,50110,501100,5011000,50110003,501100031
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0,116,1165,11650,116500,1165000,11650000,116500002,501,5011,50110,501100,5011000,50110001,501100017


In [42]:
# 같은 곳에서 같은 곳으로 보내는

same_idx = df.loc[df['send_num'] == df['receive_num'], 'index'].index
df['same'] = 0

for idx in same_idx:
    df.loc[idx, 'same'] = 1

In [43]:
df['gri'] = df['cate_mid'].apply(lambda x: 1.2 if x=='농산물' else 0)

In [44]:
df.head()

Unnamed: 0,index,send_num,receive_num,cate_big,cate_mid,target,send_num_3,send_num_4,send_num_5,send_num_6,...,send_num_9,rec_num_3,rec_num_4,rec_num_5,rec_num_6,rec_num_7,rec_num_8,rec_num_9,same,gri
0,0,1129000014045300,5011000220046300,패션의류,상의,3.0,112,1129,11290,112900,...,112900001,501,5011,50110,501100,5011000,50110002,501100022,0,0.0
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3.0,113,1135,11350,113500,...,113500000,501,5011,50110,501100,5011000,50110001,501100017,0,0.0
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9.0,113,1135,11350,113500,...,113500003,501,5011,50110,501100,5011000,50110002,501100026,0,0.0
3,3,1154500002014200,5011000315087400,식품,농산물,10.0,115,1154,11545,115450,...,115450000,501,5011,50110,501100,5011000,50110003,501100031,0,1.2
4,4,1165000021008300,5011000177051200,식품,가공식품,3.0,116,1165,11650,116500,...,116500002,501,5011,50110,501100,5011000,50110001,501100017,0,0.0


In [45]:
df.drop(['index','send_num','receive_num'], axis=1, inplace=True)

In [46]:
# df['send_4'] = df['SEND_SPG_INNB'].astype(str).apply(lambda x: x[:4])
# df['rec_4'] = df['REC_SPG_INNB'].astype(str).apply(lambda x: x[:4])

# df["send_4"] = df["send_4"].astype(int)
# df["rec_4"] = df["rec_4"].astype(int)

# fig, ax = plt.subplots(figsize=(10,10))
# sns.heatmap(df.corr(), annot=True, fmt=".4f", cmap="Blues")

## 데이터 전처리

In [47]:
train = df[~df['target'].isna()]
test  = df[df['target'].isna()]

test.drop('target', axis=1, inplace=True)

print(df.shape, train.shape, test.shape)

(36640, 19) (32000, 19) (4640, 18)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [48]:
train['target'] = train['target'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['target'] = train['target'].astype('int64')


In [49]:
train.head()

Unnamed: 0,cate_big,cate_mid,target,send_num_3,send_num_4,send_num_5,send_num_6,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,rec_num_5,rec_num_6,rec_num_7,rec_num_8,rec_num_9,same,gri
0,패션의류,상의,3,112,1129,11290,112900,1129000,11290000,112900001,501,5011,50110,501100,5011000,50110002,501100022,0,0.0
1,생활/건강,반려동물,3,113,1135,11350,113500,1135000,11350000,113500000,501,5011,50110,501100,5011000,50110001,501100017,0,0.0
2,패션의류,기타패션의류,9,113,1135,11350,113500,1135000,11350000,113500003,501,5011,50110,501100,5011000,50110002,501100026,0,0.0
3,식품,농산물,10,115,1154,11545,115450,1154500,11545000,115450000,501,5011,50110,501100,5011000,50110003,501100031,0,1.2
4,식품,가공식품,3,116,1165,11650,116500,1165000,11650000,116500002,501,5011,50110,501100,5011000,50110001,501100017,0,0.0


In [50]:
# # scaling
# scaler = StandardScaler()

# scaler.fit(train)

In [51]:
# 원핫 인코딩
train_one = pd.get_dummies(train)
test_one = pd.get_dummies(test)

In [52]:
pd.set_option('display.max_rows', None)
train_one.head()

Unnamed: 0,target,send_num_3,send_num_4,send_num_5,send_num_6,send_num_7,send_num_8,send_num_9,rec_num_3,rec_num_4,...,cate_mid_문화컨텐츠,cate_mid_반려동물,cate_mid_상의,cate_mid_생활용품,cate_mid_수산,cate_mid_스킨케어,cate_mid_음료,cate_mid_음반,cate_mid_주방용품,cate_mid_축산
0,3,112,1129,11290,112900,1129000,11290000,112900001,501,5011,...,0,0,1,0,0,0,0,0,0,0
1,3,113,1135,11350,113500,1135000,11350000,113500000,501,5011,...,0,1,0,0,0,0,0,0,0,0
2,9,113,1135,11350,113500,1135000,11350000,113500003,501,5011,...,0,0,0,0,0,0,0,0,0,0
3,10,115,1154,11545,115450,1154500,11545000,115450000,501,5011,...,0,0,0,0,0,0,0,0,0,0
4,3,116,1165,11650,116500,1165000,11650000,116500002,501,5011,...,0,0,0,0,0,0,0,0,0,0


## 모델링

In [53]:
!pip install catboost



You should consider upgrading via the 'C:\AI\pythonProject\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [54]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor 
train_Y = train_one['target']
train_X = train_one.drop('target',axis = 1)


#모델 정의
model = CatBoostRegressor()

In [55]:
# 모델 학습
model.fit(train_X,train_Y)

Learning rate set to 0.07233
0:	learn: 5.7071446	total: 54.3ms	remaining: 54.2s
1:	learn: 5.6693488	total: 60.6ms	remaining: 30.3s
2:	learn: 5.6432339	total: 66.5ms	remaining: 22.1s
3:	learn: 5.6086525	total: 72.5ms	remaining: 18s
4:	learn: 5.5763458	total: 78.6ms	remaining: 15.6s
5:	learn: 5.5493845	total: 84.2ms	remaining: 13.9s
6:	learn: 5.5429216	total: 90.1ms	remaining: 12.8s
7:	learn: 5.5338421	total: 95.8ms	remaining: 11.9s
8:	learn: 5.5141340	total: 102ms	remaining: 11.2s
9:	learn: 5.5085137	total: 107ms	remaining: 10.6s
10:	learn: 5.5035780	total: 113ms	remaining: 10.1s
11:	learn: 5.4825562	total: 119ms	remaining: 9.82s
12:	learn: 5.4629921	total: 125ms	remaining: 9.49s
13:	learn: 5.4461433	total: 130ms	remaining: 9.19s
14:	learn: 5.4310535	total: 137ms	remaining: 8.98s
15:	learn: 5.4280065	total: 142ms	remaining: 8.71s
16:	learn: 5.4227596	total: 148ms	remaining: 8.53s
17:	learn: 5.4173963	total: 154ms	remaining: 8.41s
18:	learn: 5.4134176	total: 160ms	remaining: 8.25s
19:	le

<catboost.core.CatBoostRegressor at 0x1ac76f39fa0>

In [56]:
# test 데이터 예측
pred = model.predict(test_one)

In [57]:
pred[:10]

array([4.41883886, 4.65175691, 5.41410257, 5.6195816 , 4.83057832,
       6.13038935, 4.94797233, 4.87830994, 5.53403184, 5.4602541 ])

## 정답파일 생성

In [58]:
submission['INVC_CONT'] = pred

In [59]:
submission.to_csv('baseline.csv',index = False)

In [60]:
sub = pd.read_csv('baseline.csv')

In [61]:
sub.head()

Unnamed: 0,index,INVC_CONT
0,32000,4.418839
1,32001,4.651757
2,32002,5.414103
3,32003,5.619582
4,32004,4.830578
