# Read the data

In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be
# accessed through ordinary file paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator.
np.random.seed(0)

# Read the training, test and sample-submission CSVs (all cp949-encoded),
# in that order, and bind them to train / test / sub.
train, test, sub = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/input/train_df.csv',
        '/content/drive/MyDrive/Colab Notebooks/input/test_df.csv',
        '/content/drive/MyDrive/Colab Notebooks/input/sample_submission.csv',
    )
)

In [6]:
# Remove the 'index' column from both frames.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed column.
train = train.drop(columns='index', errors='ignore')
test = test.drop(columns='index', errors='ignore')

In [7]:
# (rows, columns) of the train, test and submission frames, in that order.
tuple(frame.shape for frame in (train, test, sub))

((32000, 5), (4640, 4), (4640, 2))

In [8]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,1129000014045300,5011000220046300,패션의류,상의,3
1,1135000009051200,5011000178037300,생활/건강,반려동물,3
2,1135000030093100,5011000265091400,패션의류,기타패션의류,9
3,1154500002014200,5011000315087400,식품,농산물,10
4,1165000021008300,5011000177051200,식품,가공식품,3


In [9]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,5013000043028400,1165000021097200,식품,농산물
1,5013000044016100,1154500002066400,식품,농산물
2,5013000205030200,4139000102013200,식품,농산물
3,5013000205030200,4221000040093400,식품,농산물
4,5013000268011400,2726000004017100,식품,농산물


In [10]:
# Print column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   SEND_SPG_INNB  32000 non-null  int64 
 1   REC_SPG_INNB   32000 non-null  int64 
 2   DL_GD_LCLS_NM  32000 non-null  object
 3   DL_GD_MCLS_NM  32000 non-null  object
 4   INVC_CONT      32000 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 1.2+ MB


In [11]:
# Count missing values per column in the training frame.
train.isna().sum()

SEND_SPG_INNB    0
REC_SPG_INNB     0
DL_GD_LCLS_NM    0
DL_GD_MCLS_NM    0
INVC_CONT        0
dtype: int64

In [12]:
# Count missing values per column in the test frame.
test.isna().sum()

SEND_SPG_INNB    0
REC_SPG_INNB     0
DL_GD_LCLS_NM    0
DL_GD_MCLS_NM    0
dtype: int64

# Encoding: map category names to ordinal codes ranked by total INVC_CONT

In [21]:
# Ordinal code for each DL_GD_LCLS_NM category: categories are ranked by their
# total INVC_CONT over `train`, ascending, so the smallest total gets code 0.
# Only the target column is aggregated; summing the whole frame also
# aggregated the ID columns and the other category column, none of which is
# used for the ranking.
# NOTE(review): the ranking is derived from the target over all of `train`,
# including any rows later held out for validation.
large_totals = train.groupby('DL_GD_LCLS_NM')['INVC_CONT'].sum().sort_values()
large_map = {category: rank for rank, category in enumerate(large_totals.index)}
large_map

{'디지털/가전': 1, '생활/건강': 4, '식품': 5, '여행/문화': 3, '패션의류': 2, '화장품/미용': 0}

In [22]:
# Ordinal code for each DL_GD_MCLS_NM category: categories are ranked by their
# total INVC_CONT over `train`, ascending, so the smallest total gets code 0.
# Only the target column is aggregated; summing the whole frame also
# aggregated the ID columns and the other category column, none of which is
# used for the ranking.
# NOTE(review): the ranking is derived from the target over all of `train`,
# including any rows later held out for validation.
small_totals = train.groupby('DL_GD_MCLS_NM')['INVC_CONT'].sum().sort_values()
small_map = {category: rank for rank, category in enumerate(small_totals.index)}
small_map

{'가공식품': 17,
 '건강식품': 13,
 '건강용품': 6,
 '과자': 8,
 '기타디지털/가전': 3,
 '기타식품': 14,
 '기타패션의류': 11,
 '냉동/간편조리식품': 9,
 '농산물': 19,
 '문구/사무용품': 2,
 '문화컨텐츠': 16,
 '반려동물': 4,
 '상의': 1,
 '생활용품': 5,
 '수산': 15,
 '스킨케어': 0,
 '음료': 18,
 '음반': 10,
 '주방용품': 7,
 '축산': 12}

In [23]:
# Replace category names with their ordinal codes in both frames.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# (numeric) column through a name-keyed dict would find no matching keys and
# silently turn every value into NaN.
# NOTE: a category name that has no key in the map becomes NaN.
for frame in (train, test):
    for column, code_map in (('DL_GD_LCLS_NM', large_map), ('DL_GD_MCLS_NM', small_map)):
        if not pd.api.types.is_numeric_dtype(frame[column]):
            frame[column] = frame[column].map(code_map)

In [24]:
# Preview the training frame again to inspect the two category columns after mapping.
train.head()

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM,INVC_CONT
0,1129000014045300,5011000220046300,2,1,3
1,1135000009051200,5011000178037300,4,4,3
2,1135000030093100,5011000265091400,2,11,9
3,1154500002014200,5011000315087400,5,19,10
4,1165000021008300,5011000177051200,5,17,3


# Modeling: LightGBM baseline

In [25]:
# Separate the target series from the remaining feature columns.
y = train['INVC_CONT']
X = train.drop(columns=['INVC_CONT'])

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Shapes of the four resulting pieces, in the order they were unpacked.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((25600, 4), (6400, 4), (25600,), (6400,))

In [27]:
from lightgbm import LGBMRegressor

# Fit a LightGBM regressor with its default constructor arguments on the
# training split (fit returns the estimator itself), then predict the
# validation rows.
model = LGBMRegressor().fit(X_train, y_train)
y_pred = model.predict(X_val)

In [28]:
from sklearn.metrics import mean_squared_error

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6, where passing it raises TypeError; this form works on old and new
# releases alike.
np.sqrt(mean_squared_error(y_val, y_pred))

5.2667544918229465

In [29]:
# Predict on the test frame and store the predictions in the submission's
# INVC_CONT column; `pred` keeps a reference to the raw prediction array.
pred = sub['INVC_CONT'] = model.predict(test)

In [30]:
# Preview the first rows of the submission frame after filling in INVC_CONT.
sub.head()

Unnamed: 0,index,INVC_CONT
0,32000,4.85523
1,32001,5.054846
2,32002,4.877966
3,32003,7.112384
4,32004,5.688473


In [31]:
# Write the submission frame to Drive, omitting the DataFrame's row index.
submission_path = '/content/drive/MyDrive/Colab Notebooks/sub/baseline_01.csv'
sub.to_csv(submission_path, index=False)