# read

In [330]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder # LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error

np.random.seed(0)

train = pd.read_csv('./train_df.csv', encoding='cp949')
test  = pd.read_csv('./test_df.csv', encoding='cp949')
sub   = pd.read_csv('./sample_submission.csv', encoding='cp949')

In [331]:
train.shape, test.shape, sub.shape

((32000, 6), (4640, 5), (4640, 2))

In [332]:
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,0,1129000014045300,5011000220046300,패션의류,상의,3
1,1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,3,1154500002014200,5011000315087400,식품,농산물,10
4,4,1165000021008300,5011000177051200,식품,가공식품,3


In [333]:
test.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,32000,5013000043028400,1165000021097200,식품,농산물
1,32001,5013000044016100,1154500002066400,식품,농산물
2,32002,5013000205030200,4139000102013200,식품,농산물
3,32003,5013000205030200,4221000040093400,식품,농산물
4,32004,5013000268011400,2726000004017100,식품,농산물


In [334]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          32000 non-null  int64 
 1   SEND_SPG_INNB  32000 non-null  int64 
 2   REC_SPG_INNB   32000 non-null  int64 
 3   DL_GD_LCLS_NM  32000 non-null  object
 4   DL_GD_MCLS_NM  32000 non-null  object
 5   INVC_CONT      32000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 1.5+ MB


In [335]:
train.isnull().sum()

index            0
SEND_SPG_INNB    0
REC_SPG_INNB     0
DL_GD_LCLS_NM    0
DL_GD_MCLS_NM    0
INVC_CONT        0
dtype: int64

In [336]:
test.isnull().sum()

index            0
SEND_SPG_INNB    0
REC_SPG_INNB     0
DL_GD_LCLS_NM    0
DL_GD_MCLS_NM    0
dtype: int64

In [337]:
train.groupby('DL_GD_MCLS_NM').count()[['SEND_SPG_INNB']].sort_values('SEND_SPG_INNB', ascending=False)

Unnamed: 0_level_0,SEND_SPG_INNB
DL_GD_MCLS_NM,Unnamed: 1_level_1
농산물,22162
문화컨텐츠,1192
음료,1126
수산,931
가공식품,923
기타식품,766
건강식품,651
음반,635
축산,418
냉동/간편조리식품,413


In [338]:
train.groupby('SEND_SPG_INNB').count()[['REC_SPG_INNB']].sort_values('REC_SPG_INNB', ascending=False)

Unnamed: 0_level_0,REC_SPG_INNB
SEND_SPG_INNB,Unnamed: 1_level_1
5011000078068400,4227
5013000610049100,1994
5013000731055200,1335
5011000137030100,1206
5013000821028200,773
...,...
4418000453076200,1
4418000539016100,1
4420000034070100,1
4420000054076100,1


In [339]:
train[train['SEND_SPG_INNB']==5011000078068400]

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
83,83,5011000078068400,1123000014015100,식품,농산물,3
84,84,5011000078068400,1147000007073300,식품,농산물,3
85,85,5011000078068400,1150000022009400,식품,농산물,3
86,86,5011000078068400,1156000026072400,식품,농산물,16
87,87,5011000078068400,2617000008051100,식품,농산물,4
...,...,...,...,...,...,...
31909,31909,5011000078068400,4711300727081200,식품,농산물,3
31910,31910,5011000078068400,4825000448063100,식품,농산물,5
31911,31911,5011000078068400,4827000063066100,식품,농산물,3
31912,31912,5011000078068400,4831000335042100,식품,농산물,3


In [340]:
train['SEND_SPG_INNB'].sort_values(ascending=False).tail(15) #11:5

24152    1111000023068400
12168    1111000023068400
31155    1111000023059400
10978    1111000023043300
3676     1111000023022100
1527     1111000022091400
13062    1111000022015400
30574    1111000022002100
7566     1111000019099100
7565     1111000019090400
28455    1111000019090400
22950    1111000019090400
5187     1111000019090400
8458     1111000017015300
16521    1111000015066100
Name: SEND_SPG_INNB, dtype: int64

In [341]:
# train['send_11'] = train['SEND_SPG_INNB'].astype(str).apply(lambda x: x[:11])
train['send_11'] = train["SEND_SPG_INNB"].astype('str').str[:11].astype('int64')
test['send_11'] = test["SEND_SPG_INNB"].astype('str').str[:11].astype('int64')

train['send_11']

0        11290000140
1        11350000090
2        11350000300
3        11545000020
4        11650000210
            ...     
31995    50110010600
31996    50110010950
31997    50110011080
31998    50110011150
31999    50110011160
Name: send_11, Length: 32000, dtype: int64

In [342]:
train['send_back_5'] = train["SEND_SPG_INNB"].astype('str').str[-5:].astype('int64')
test['send_back_5'] = test["SEND_SPG_INNB"].astype('str').str[-5:].astype('int64')

train['send_back_5']

0        45300
1        51200
2        93100
3        14200
4         8300
         ...  
31995    63300
31996    42400
31997    36200
31998    11400
31999    66400
Name: send_back_5, Length: 32000, dtype: int64

In [343]:
train['rec_11'] = train["REC_SPG_INNB"].astype('str').str[:11].astype('int64')
test['rec_11'] = test["REC_SPG_INNB"].astype('str').str[:11].astype('int64')

train['rec_11']

0        50110002200
1        50110001780
2        50110002650
3        50110003150
4        50110001770
            ...     
31995    26350000260
31996    11680000170
31997    41197000080
31998    11320000150
31999    47190005940
Name: rec_11, Length: 32000, dtype: int64

In [344]:
train['rec_back_5'] = train["REC_SPG_INNB"].astype('str').str[-5:].astype('int64')
test['rec_back_5'] = test["REC_SPG_INNB"].astype('str').str[-5:].astype('int64')

train['rec_back_5']

0        46300
1        37300
2        91400
3        87400
4        51200
         ...  
31995    53400
31996     2200
31997    12100
31998    85100
31999    22400
Name: rec_back_5, Length: 32000, dtype: int64

In [345]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          32000 non-null  int64 
 1   SEND_SPG_INNB  32000 non-null  int64 
 2   REC_SPG_INNB   32000 non-null  int64 
 3   DL_GD_LCLS_NM  32000 non-null  object
 4   DL_GD_MCLS_NM  32000 non-null  object
 5   INVC_CONT      32000 non-null  int64 
 6   send_11        32000 non-null  int64 
 7   send_back_5    32000 non-null  int64 
 8   rec_11         32000 non-null  int64 
 9   rec_back_5     32000 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 2.4+ MB


# Encoding

In [346]:
# encoder = OrdinalEncoder()
# train[['DL_GD_LCLS_NM', 'DL_GD_MCLS_NM']] = encoder.fit_transform(train[['DL_GD_LCLS_NM', 'DL_GD_MCLS_NM']])

train = pd.get_dummies(train)
test  = pd.get_dummies(test)

# Drop

In [347]:
train.drop(['SEND_SPG_INNB', 'REC_SPG_INNB', 'index'], axis=1, inplace=True)
test.drop(['SEND_SPG_INNB', 'REC_SPG_INNB', 'index'], axis=1, inplace=True)

In [348]:
# fig = plt.subplots(figsize=(10, 10))
# sns.heatmap(train.corr(), fmt='.4f', annot=True, cmap='Blues')

# Scaler

In [349]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

In [350]:
scaler= MinMaxScaler()

In [351]:
scaler_idx = ['send_11', 'send_back_5', 'rec_11', 'rec_back_5']

for idx in scaler_idx:
    train[idx] = scaler.fit_transform(train[idx].values.reshape(-1, 1))
    test[idx] = scaler.fit_transform(test[idx].values.reshape(-1, 1))
    

In [352]:
train

Unnamed: 0,INVC_CONT,send_11,send_back_5,rec_11,rec_back_5,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,...,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,3,0.004613,0.455186,0.999487,0.465257,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,3,0.006151,0.514602,0.999487,0.374622,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,9,0.006151,0.936556,0.999487,0.919436,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,10,0.011148,0.141994,0.999487,0.879154,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,0.013839,0.082578,0.999487,0.514602,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31995,6,0.999487,0.636455,0.390569,0.536757,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31996,5,0.999487,0.425982,0.014608,0.021148,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31997,9,0.999487,0.363545,0.771066,0.120846,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31998,3,0.999487,0.113797,0.005382,0.855992,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# modeling

In [353]:
!pip install catboost



You should consider upgrading via the 'C:\AI\pythonProject\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [355]:
from catboost import CatBoostRegressor 

train_X = train.drop('INVC_CONT',axis = 1)
train_Y = train['INVC_CONT']

model = CatBoostRegressor()
model.fit(train_X,train_Y)

pred = model.predict(test)

Learning rate set to 0.07233
0:	learn: 5.7026449	total: 164ms	remaining: 2m 43s
1:	learn: 5.6622051	total: 168ms	remaining: 1m 23s
2:	learn: 5.6228348	total: 172ms	remaining: 57.1s
3:	learn: 5.5880701	total: 176ms	remaining: 43.9s
4:	learn: 5.5696601	total: 181ms	remaining: 36s
5:	learn: 5.5396244	total: 185ms	remaining: 30.7s
6:	learn: 5.5160423	total: 190ms	remaining: 26.9s
7:	learn: 5.4923444	total: 194ms	remaining: 24s
8:	learn: 5.4842118	total: 197ms	remaining: 21.7s
9:	learn: 5.4747014	total: 202ms	remaining: 20s
10:	learn: 5.4679547	total: 206ms	remaining: 18.5s
11:	learn: 5.4481818	total: 210ms	remaining: 17.3s
12:	learn: 5.4374424	total: 215ms	remaining: 16.3s
13:	learn: 5.4323711	total: 220ms	remaining: 15.5s
14:	learn: 5.4243646	total: 224ms	remaining: 14.7s
15:	learn: 5.4142218	total: 229ms	remaining: 14.1s
16:	learn: 5.4089961	total: 233ms	remaining: 13.5s
17:	learn: 5.4025815	total: 238ms	remaining: 13s
18:	learn: 5.3860985	total: 242ms	remaining: 12.5s
19:	learn: 5.38125

In [356]:
pred[:10]

array([5.32508747, 5.38376626, 4.67061546, 7.03629872, 5.46972553,
       6.57465457, 5.86555743, 5.71647889, 5.38712686, 5.35295716])

In [322]:
# X = train.drop('INVC_CONT', axis=1)
# y = np.log1p(train['INVC_CONT'])

In [323]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
# X_train.shape, X_val.shape, y_train.shape, y_val.shape

((25600, 30), (6400, 30), (25600,), (6400,))

In [324]:
# models = [DecisionTreeRegressor(), RandomForestRegressor(), LinearRegression(), Ridge(), Lasso(), ElasticNet(), XGBRegressor(), LGBMRegressor()]

# for model in models:
#     model.fit(X_train, y_train)
#     pred = model.predict(X_val)
    
#     y_val_exp = np.expm1(y_val)
    
#     rmse = mean_squared_error(y_val_exp, pred, squared=False)
    
#     print(model.__class__.__name__)
#     print('RMSE:', rmse)
#     print(pred[:5], y_val_exp.values[:5])

DecisionTreeRegressor
RMSE: 6.512784674492238
[1.38629436 1.38629436 1.94591015 ... 1.61734342 2.39789527 2.07944154] 31330     3.0
3514      3.0
12363     3.0
25927    29.0
31886     3.0
         ... 
21938     3.0
27919     3.0
8500      5.0
8792      3.0
12996     4.0
Name: INVC_CONT, Length: 6400, dtype: float64
RandomForestRegressor
RMSE: 6.501287416791143
[1.51416809 1.80436654 1.59605366 ... 1.64109196 2.54089013 1.66897034] 31330     3.0
3514      3.0
12363     3.0
25927    29.0
31886     3.0
         ... 
21938     3.0
27919     3.0
8500      5.0
8792      3.0
12996     4.0
Name: INVC_CONT, Length: 6400, dtype: float64
LinearRegression
RMSE: 6.550587695369566
[1.59863281 1.74658203 1.56982422 ... 1.65380859 1.94433594 1.57373047] 31330     3.0
3514      3.0
12363     3.0
25927    29.0
31886     3.0
         ... 
21938     3.0
27919     3.0
8500      5.0
8792      3.0
12996     4.0
Name: INVC_CONT, Length: 6400, dtype: float64
Ridge
RMSE: 6.550721206173661
[1.58867324 1.7477834

In [325]:
# boosting_model = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=1000,
#                  subsample=1.0, criterion='friedman_mse',  #{'friedman_mse', 'mse', 'mae'}
#                           random_state=1414, alpha=0.9)

# bagging_model = BaggingRegressor(base_estimator=boosting_model,  # None = DecisionTreeRegressor.
#                  n_estimators=1000,
#                  max_samples=1.0,
#                  max_features=1.0,
#                  bootstrap=True,
#                  oob_score=False,
#                  random_state=1414)
# bagging_model.fit(X, y)

In [326]:
# baseline + LabelEncoder: 5.350150025017202
# baseline : 5.273774537653919

In [357]:
# pred = model.predict(test)
sub['INVC_CONT'] = pred

In [358]:
sub.head(15)

Unnamed: 0,index,INVC_CONT
0,32000,5.325087
1,32001,5.383766
2,32002,4.670615
3,32003,7.036299
4,32004,5.469726
5,32005,6.574655
6,32006,5.865557
7,32007,5.716479
8,32008,5.387127
9,32009,5.352957


In [329]:
aaaaaaaaaaaaaa

NameError: name 'aaaaaaaaaaaaaa' is not defined

In [359]:
sub.to_csv('./sub/submission_v04.csv', index=False)