## 제주 빅데이터 경진대회
## _____ (팀명)
## 2020년 월 일 (제출날짜)

1. 본 코드는 대회 참가를 돕고자 단순 예시를 작성한 것으로 참고용으로 사용바랍니다.
2. 본 코드는 자유롭게 수정하여 사용 할 수 있습니다.

## 1. 라이브러리 가져오기
## Import Library

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder

In [2]:
print('Pandas : %s'%(pd.__version__))
print('Numpy : %s'%(np.__version__))
print('Scikit-Learn : %s'%(sklearn.__version__))
!python --version

Pandas : 1.0.5
Numpy : 1.18.5
Scikit-Learn : 0.23.1


Python 3.6.10 :: Anaconda, Inc.


## 2. 데이터 전처리
## Data Cleansing & Pre-Processing  

In [2]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [3]:
# 날짜 처리
data = pd.read_csv('data/201901-202003.csv')
data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [4]:
# 데이터 정제
df = data.copy()

# season
df.loc[(df['month']>=3)&(df['month']<=5) ,'season'] = '봄'
df.loc[(df['month']>=6)&(df['month']<=8) ,'season'] = '여름'
df.loc[(df['month']>=9)&(df['month']<=11) ,'season'] = '가을'
df.loc[(df['month']>11),'season'] = '겨울'
df.loc[(df['month']<=2),'season'] = '겨울'

# visitor
df.loc[(df['CARD_SIDO_NM'] == df['HOM_SIDO_NM']), 'visitor'] = '거주자'
df.loc[(df['CARD_SIDO_NM'] != df['HOM_SIDO_NM']), 'visitor'] = '여행객'


# 취소/단골
df['gap']= df['CNT'] - df['CSTMR_CNT']
df.loc[df['gap'] <0,'mark'] = '취소있음'
df.loc[df['gap'] ==0,'mark'] = '고객다양'
df.loc[df['gap'] >0,'mark'] = '단골있음'
    
    
# 시군구 drop
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM', 'gap'], axis=1)

df.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,year,month,season,visitor,mark
0,강원,건강보조식품 소매업,강원,20s,1,1,4,311200,4,2019,1,겨울,거주자,고객다양
1,강원,건강보조식품 소매업,강원,30s,1,2,7,1374500,8,2019,1,겨울,거주자,단골있음
2,강원,건강보조식품 소매업,강원,30s,2,2,6,818700,6,2019,1,겨울,거주자,고객다양
3,강원,건강보조식품 소매업,강원,40s,1,3,4,1717000,5,2019,1,겨울,거주자,단골있음
4,강원,건강보조식품 소매업,강원,40s,1,4,3,1047300,3,2019,1,겨울,거주자,고객다양


In [5]:
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD','FLC', 'year', 'month',
           'season','visitor','mark']


df = df.groupby(columns).sum().reset_index(drop=False)

In [6]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

## 3. 탐색적 자료분석
## Exploratory Data Analysis

In [7]:
# 입력하세요.

## 4. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling  

In [7]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)

In [8]:
x = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
y = np.log1p(train_num['AMT'])

In [9]:
k = int(len(x)*0.9)

In [10]:
x_train = x[:k]
y_train = y[:k]
x_val = x[k:]
y_val = y[k:]

## 5. 모델 학습 및 검증
## Model Tuning & Evaluation

In [12]:
import catboost as cb
from catboost import Pool, datasets, CatBoostRegressor

In [17]:
cb_model = CatBoostRegressor(iterations=5000,           # 반복횟수
                           learning_rate=0.1,           # 러닝레이트
                           l2_leaf_reg=3.5,             # L2 정규화 계수
                           depth=8,                     # 트리 깊이
                           rsm=0.98,                    # Random subspace method.
                           metric_period=1000,
                           loss_function= 'MultiRMSE',      
                           eval_metric= 'MultiRMSE',     # 성능 측정
                           use_best_model=True,
                           random_seed=42               # 랜덤시드 고정
                          ) 

In [18]:
cb_model.fit(x_train,y_train,eval_set=(x_val,y_val))

0:	learn: 12.4634787	test: 12.4703335	best: 12.4703335 (0)	total: 595ms	remaining: 49m 35s
1000:	learn: 1.0048027	test: 1.0096809	best: 1.0096809 (1000)	total: 11m 9s	remaining: 44m 33s
2000:	learn: 0.8990818	test: 0.9064287	best: 0.9064287 (2000)	total: 22m 45s	remaining: 34m 6s
3000:	learn: 0.8500161	test: 0.8604828	best: 0.8604828 (3000)	total: 35m 17s	remaining: 23m 30s
4000:	learn: 0.8200369	test: 0.8332364	best: 0.8332364 (4000)	total: 48m 25s	remaining: 12m 5s
4999:	learn: 0.7997902	test: 0.8158042	best: 0.8158042 (4999)	total: 1h 2m 47s	remaining: 0us

bestTest = 0.8158041866
bestIteration = 4999



<catboost.core.CatBoostRegressor at 0x241711c1080>

In [11]:
import sklearn.externals
import joblib

file_name = 'data/catboost_8_MultiRMSE.pkl' 
# joblib.dump(cb_model, file_name) 

In [12]:
cb_model = joblib.load(file_name) 

In [16]:
import lightgbm as lgb

In [17]:
train_ds = lgb.Dataset(x_train, label=y_train)
val_ds = lgb.Dataset(x_val, label=y_val)

In [18]:
params = {
            'max_depth': -1,
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'custom',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [19]:
def rmsle_1(y_pred, data):
    y_true = np.array(data.get_label())
    score= np.sqrt(np.square(np.log1p(y_pred + 1) - np.log1p(y_true + 1)).mean())
    return 'rmsle', score, False

In [20]:
lgb_model = lgb.train(params,
                   train_ds,
                   20000,
                   val_ds,
                   verbose_eval = 1000,
                   early_stopping_rounds = 1000,
                   feval = rmsle_1
                 )

Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's rmsle: 0.0702949
[2000]	valid_0's rmsle: 0.0640567
[3000]	valid_0's rmsle: 0.0608572
[4000]	valid_0's rmsle: 0.0587271
[5000]	valid_0's rmsle: 0.057203
[6000]	valid_0's rmsle: 0.0561045
[7000]	valid_0's rmsle: 0.0552409
[8000]	valid_0's rmsle: 0.0544067
[9000]	valid_0's rmsle: 0.0537998
[10000]	valid_0's rmsle: 0.0532534
[11000]	valid_0's rmsle: 0.0527649
[12000]	valid_0's rmsle: 0.0523281
[13000]	valid_0's rmsle: 0.0519961
[14000]	valid_0's rmsle: 0.0516879
[15000]	valid_0's rmsle: 0.0514468
[16000]	valid_0's rmsle: 0.0511671
[17000]	valid_0's rmsle: 0.0509466
[18000]	valid_0's rmsle: 0.0507484
[19000]	valid_0's rmsle: 0.0505638
[20000]	valid_0's rmsle: 0.0504011
Did not meet early stopping. Best iteration is:
[20000]	valid_0's rmsle: 0.0504011


In [13]:
file_name = 'data/lgb_model.pkl' 
#joblib.dump(lgb_model, file_name) 

In [14]:
lgb_model = joblib.load(file_name) 

In [35]:
import xgboost as xgb

In [40]:
train_ds = xgb.DMatrix(x_train, label=y_train)
val_ds = xgb.DMatrix(x_val, label=y_val)
watchlist = [(val_ds, 'eval'), (train_ds, 'train')]

In [54]:
param = {
            'booster': 'gbtree',
            'max_depth': 8,
            'objective': 'reg:squarederror',  # objective: loss func. 정의.
            'eta': 0.01,                      # learning rate(학습률) X의 움직임
            'colsample_bytree': 0.8,
            'colsample_bylevel': 0.9,
            'seed': 10
        }

In [55]:
xgb_model = xgb.train(params,
                  train_ds,
                  num_boost_round = 2500,
                  early_stopping_rounds = 1000,
                  verbose_eval = 1000,
                  evals = watchlist 
                  )

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	eval-rmse:12.68137	train-rmse:12.67356
Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.

Will train until train-rmse hasn't improved in 1000 rounds.
[1000]	eval-rmse:0.92626	train-rmse:0.91443
[2000]	eval-rmse:0.84365	train-rmse:0.82444
[2499]	eval-rmse:0.81952	train-rmse:0.79730


In [15]:
file_name = 'data/xgb_model.pkl' 
#joblib.dump(xgb_model, file_name) 

In [16]:
xgb_model = joblib.load(file_name) 

## 6. 결과 및 결언
## Conclusion & Discussion

In [None]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]
seasons       = df_num['season'].unique()
visitors      = df_num['visitor'].unique()
marks         = df_num['mark'].unique()


temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                        for year in years:
                            for month in months:
                                for season in seasons:
                                    for visitor in visitors:
                                        for mark in marks:
                                            temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month,season,visitor,mark])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=x.columns)

In [None]:
# 예측
cb_pred = cb_model.predict(temp)

In [None]:
lgb_pred = lgb_model.predict(temp)

In [None]:
xgb_pred = xgb_model.predict(temp)

In [None]:
ensemble = cb_pred*0.25 + lgb_pred*0.5 + xgb_pred*0.25

In [None]:
pred = np.expm1(ensemble)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [None]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [None]:
# 제출 파일 만들기
submission = pd.read_csv('data/submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
submission.to_csv('ensemble.csv', encoding='utf-8-sig')
submission.head()