In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [3]:
# 날짜 처리
data = pd.read_csv('data/201901-202003.csv')
data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [4]:
# 데이터 정제
df = data.copy()
df.loc[(df['month']>=3)&(df['month']<=5) ,'season'] = '봄'
df.loc[(df['month']>=6)&(df['month']<=8) ,'season'] = '여름'
df.loc[(df['month']>=9)&(df['month']<=11) ,'season'] = '가을'
df.loc[(df['month']>11),'season'] = '겨울'
df.loc[(df['month']<=2),'season'] = '겨울'

df.loc[(df['CARD_SIDO_NM'] == df['HOM_SIDO_NM']), 'visiter'] = '거주자'
df.loc[(df['CARD_SIDO_NM'] != df['HOM_SIDO_NM']), 'visiter'] = '여행객'

covid = ['관광 민예품 및 선물용품 소매업','그외 기타 분류안된 오락관련 서비스업','그외 기타 스포츠시설 운영업','기타 대형 종합 소매업','기타 수상오락 서비스업','기타 외국식 음식점업','기타 주점업','내항 여객 운송업','마사지업','면세점','버스 운송업','비알콜 음료점업','서양식 음식점업','스포츠 및 레크레이션 용품 임대업','여관업','여행사업','욕탕업','일반유흥 주점업','일식 음식점업','자동차 임대업','전시 및 행사 대행업','정기 항공 운송업','중식 음식점업','차량용 가스 충전업','차량용 주유소 운영업','체인화 편의점','택시 운송업','피자 햄버거 샌드위치 및 유사 음식점업','한식 음식점업','호텔업','화장품 및 방향제 소매업','휴양콘도 운영업']
df['covid'] = '코로나 영향 없음'
for virus in covid:
    df.loc[(df['STD_CLSS_NM'] == virus),'covid'] = '코로나 영향있음'


In [5]:
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

In [6]:
columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month','season','visiter','covid']
df = df.groupby(columns).sum().reset_index(drop=False)

In [7]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

In [8]:
train_num = df_num.sample(frac=1, random_state=0)

In [9]:
#train_num = train_num[train_num['year'] ==2020]

In [10]:
train_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,season,visiter,covid,CSTMR_CNT,AMT,CNT
149245,2,4,2,2,1,2,2019,4,2,0,1,5,275000,5
554956,8,26,11,1,1,2,2020,2,1,1,1,3,89300,6
918516,14,33,14,4,2,4,2019,5,2,0,1,10262,795920911,15027
425414,7,18,8,2,2,1,2019,10,0,1,1,120,2814110,164
640949,9,39,16,5,2,5,2020,1,1,1,1,9,349000,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359783,6,16,9,1,1,1,2019,12,1,1,1,382,3756020,530
152315,2,7,11,3,2,3,2020,3,2,1,1,3,229910,5
963395,15,19,9,5,1,5,2020,2,1,1,0,13,883000,15
117952,1,31,4,1,1,2,2019,7,3,1,1,3,107440,6


In [11]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)
x = train_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
y = np.log1p(train_num['AMT'])

In [12]:
k = int(len(x)*0.9)

In [13]:
x_train = x[:k]
y_train = y[:k]
x_val = x[k:]
y_val = y[k:]

In [14]:
import lightgbm as lgb

In [15]:
!pip install xgboost



In [16]:
train_ds = lgb.Dataset(x_train, label=y_train)
val_ds = lgb.Dataset(x_val, label=y_val)

In [17]:
params = {
            'max_depth': -1,
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

In [18]:
model = lgb.train(params,
                  train_ds,
                  5000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.65867
[200]	valid_0's rmse: 1.44281
[300]	valid_0's rmse: 1.33482
[400]	valid_0's rmse: 1.26224
[500]	valid_0's rmse: 1.20872
[600]	valid_0's rmse: 1.16131
[700]	valid_0's rmse: 1.12737
[800]	valid_0's rmse: 1.09831
[900]	valid_0's rmse: 1.07326
[1000]	valid_0's rmse: 1.05211
[1100]	valid_0's rmse: 1.03313
[1200]	valid_0's rmse: 1.01527
[1300]	valid_0's rmse: 0.99906
[1400]	valid_0's rmse: 0.986594
[1500]	valid_0's rmse: 0.974531
[1600]	valid_0's rmse: 0.962616
[1700]	valid_0's rmse: 0.951514
[1800]	valid_0's rmse: 0.939831
[1900]	valid_0's rmse: 0.928432
[2000]	valid_0's rmse: 0.91922
[2100]	valid_0's rmse: 0.91094
[2200]	valid_0's rmse: 0.902246
[2300]	valid_0's rmse: 0.893301
[2400]	valid_0's rmse: 0.886553
[2500]	valid_0's rmse: 0.879531
[2600]	valid_0's rmse: 0.873743
[2700]	valid_0's rmse: 0.867168
[2800]	valid_0's rmse: 0.860625
[2900]	valid_0's rmse: 0.854831
[3000]	valid_0's rmse: 0.850292
[3

In [19]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]
seasons          = df_num['season'].unique()
visiters          = df_num['visiter'].unique()
covids          = df_num['covid'].unique()


temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                        for year in years:
                            for month in months:
                                for season in seasons:
                                    for visiter in visiters:
                                        for covid in covids:
                                            temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month,season,visiter,covid])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=x.columns)

In [None]:
# 예측
pred = model.predict(temp)
pred = np.expm1(pred)
temp['AMT'] = np.round(pred, 0)
temp['REG_YYMM'] = temp['year']*100 + temp['month']
temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [None]:
# 디코딩 
temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [None]:
# 제출 파일 만들기
submission = pd.read_csv('submission.csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = submission.merge(temp, left_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], right_on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
submission.to_csv('add_featuers_submission.csv', encoding='utf-8-sig')
submission.head()