## Import

In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
import geopy.distance
%matplotlib inline

# EDA
from collections import Counter

# Preprocessing & Feature Engineering
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFE

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Utility
import os
import time
import random
import datetime as dt
from datetime import datetime
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean

import random
import re
from sklearn.metrics import *
import warnings
warnings.filterwarnings(action='ignore')

### Data Reconstruction

In [None]:
#큰 메모리 용량문제를 해결하기 위해 parquet 파일형식으로 변환

def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')
    
csv_to_parquet('./data/train.csv', 'train')
csv_to_parquet('./data/sales.csv', 'sales')
csv_to_parquet('./data/product_info.csv', 'product_info')
csv_to_parquet('./data/brand_keyword_cnt.csv', 'keyword')

In [None]:
train_data = pd.read_parquet('./data/train.parquet')
sales_data = pd.read_parquet('./data/sales.parquet')
product_info = pd.read_parquet('./data/product_info.parquet')
keyword = pd.read_parquet('./data/keyword.parquet')

In [None]:
#ML model 학습을 위해 데이터셋 재구성(train)
date_col = list(train_data.iloc[:,6:].columns) * 15890
ID_col = list(train_data['ID'].unique())
ID_col = [x for x in ID_col for i in range(459)]

pd_col = list(train_data['제품'])
pd_col = [x for x in pd_col for i in range(459)]

main_col = list(train_data['대분류'])
main_col = [x for x in main_col for i in range(459)]

middle_col = list(train_data['중분류'])
middle_col = [x for x in middle_col for i in range(459)]

sub_col = list(train_data['소분류'])
sub_col = [x for x in sub_col for i in range(459)]

brd_col = list(train_data['브랜드'])
brd_col = [x for x in brd_col for i in range(459)]

target_col = list(train_data.iloc[:,6:].stack().reset_index(drop=True))

train = pd.DataFrame({'date':date_col,
                      'ID':ID_col,
                      'pd_code':pd_col,
                      'main_code':main_col,
                      'middle_code':middle_col,
                      'sub_code':sub_col,
                      'brd_code':brd_col,
                      'target':target_col})
train.head()

In [None]:
#ML model 학습을 위해 데이터셋 재구성(sales)
date_col = list(sales_data.iloc[:,6:].columns) * 15890
ID_col = list(sales_data['ID'].unique())
ID_col = [x for x in ID_col for i in range(459)]

pd_col = list(sales_data['제품'])
pd_col = [x for x in pd_col for i in range(459)]

main_col = list(sales_data['대분류'])
main_col = [x for x in main_col for i in range(459)]

middle_col = list(sales_data['중분류'])
middle_col = [x for x in middle_col for i in range(459)]

sub_col = list(sales_data['소분류'])
sub_col = [x for x in sub_col for i in range(459)]

brd_col = list(sales_data['브랜드'])
brd_col = [x for x in brd_col for i in range(459)]

target_col = list(sales_data.iloc[:,6:].stack().reset_index(drop=True))

sales = pd.DataFrame({'date':date_col,
                      'ID':ID_col,
                      'pd_code':pd_col,
                      'main_code':main_col,
                      'middle_code':middle_col,
                      'sub_code':sub_col,
                      'brd_code':brd_col,
                      'target':target_col})
sales.head()

In [None]:
#ML model 학습을 위해 데이터셋 재구성(keyword)
keyword = keyword.fillna(-1.0)

date_col = list(keyword.iloc[:,1:].columns) * 3170

brd_col = list(keyword['브랜드'])
brd_col = [x for x in brd_col for i in range(459)]

target_col = list(keyword.iloc[:,1:].stack().reset_index(drop=True))

keyword = pd.DataFrame({'date':date_col,
                       'brd_code':brd_col,
                      'target':target_col})
keyword.head()

In [None]:
#제품특성 정보 삽입
product_info.rename(columns = {'제품':'pd_code','제품특성':'info'}, inplace = True)
train = pd.merge(train, product_info, how = 'left', on ='pd_code').fillna(value="정보없음")
train = train[['date','ID','pd_code','main_code','middle_code','sub_code','brd_code','info','target']]
train.head()

In [None]:
#판매금액, 검색키워드 merge
sales.rename(columns={'ID':'ID_2','target':'price_tot'}, inplace = True)
sales = sales[['ID_2','price_tot']]
train = pd.concat([train, sales], axis = 1).drop(columns=['ID_2'])

keyword.rename(columns={'target':'brd_kw'}, inplace = True)
train = pd.merge(train, keyword, how = 'left', on = ['date','brd_code'])

train.head()

In [None]:
#예측을 위한 임의의 test 데이터 셋 생성
submission = pd.read_csv('./data/sample_submission.csv')

date_col = list(submission.iloc[:,1:].columns) * 15890

ID_col = list(submission['ID'].unique())
ID_col = [x for x in ID_col for i in range(21)]

test = pd.DataFrame({'date':date_col,
                      'ID':ID_col})
test.head()

In [None]:
test_col = ['pd_code','main_code','middle_code','sub_code','brd_code','info']
for col in test_col:
    test[col] = train.groupby('ID').head(21).reset_index(drop=True)[col]
test.head()

In [None]:
display(train.head())
display(test.head())

In [None]:
#메모리 축소를 위해 데이터 형식 변환(parquet)
train.to_parquet('./data/ml_train.parquet')
test.to_parquet('./data/ml_test.parquet')

### Read Data

In [2]:
#2월~3월 사이 판매량이 급감하는 제품과 아닌 제품으로 나누어 학습
train_data = pd.read_csv(os.path.abspath("./data")+"/train.csv")
zero_id_lst = train_data[train_data.iloc[:,-41:-7].sum(axis=1)==0]['ID'].tolist()

In [3]:
train = pd.read_parquet("./data/ml_train.parquet")
test = pd.read_parquet("./data/ml_test.parquet")
submission = pd.read_csv("./submission/sample_submission.csv")

In [4]:
train.head()

Unnamed: 0,date,ID,pd_code,main_code,middle_code,sub_code,brd_code,info,target,price_tot,brd_kw
0,2022-01-01,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...,0,0,0.84131
1,2022-01-02,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...,0,0,0.91383
2,2022-01-03,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...,0,0,1.45053
3,2022-01-04,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...,0,0,2.42239
4,2022-01-05,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...,0,0,1.87119


In [5]:
train_1 = train.loc[train['ID'].isin(zero_id_lst),:].reset_index(drop=True)
train_2 = train.loc[~train['ID'].isin(zero_id_lst),:].reset_index(drop=True)

In [6]:
test.head()

Unnamed: 0,date,ID,pd_code,main_code,middle_code,sub_code,brd_code,info
0,2023-04-05,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...
1,2023-04-06,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...
2,2023-04-07,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...
3,2023-04-08,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...
4,2023-04-09,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,헤어타입:모든 모발용 제품형태:스프레이형 주요제품특징:머릿결개선 주요제품특징:흡수력...


In [7]:
test_1 = test.loc[test['ID'].isin(zero_id_lst),:]
index_1 = test_1.index
test_1 = test_1.reset_index(drop=True)

test_2 = test.loc[~test['ID'].isin(zero_id_lst),:]
index_2 = test_2.index
test_2 = test_2.reset_index(drop=True)

### DATA Split

In [8]:
#brd_kw 존재하지 않는 값 처리(데이터셋 구성할때 임의로 -1로 처리한 것)

train_1.loc[train_1['brd_kw'] == -1, "brd_kw"] = 0.0
train_2.loc[train_2['brd_kw'] == -1, "brd_kw"] = 0.0

In [9]:
#학습속도 및 용량문제로 인해 직전 3주 데이터만 학습으로 사용

train_clip1_1 = train_1.groupby('ID').tail(35).reset_index(drop=True).groupby('ID').head(14).reset_index(drop=True)
train_clip1_2 = train_1.groupby('ID').tail(35).reset_index(drop=True).groupby('ID').tail(7).reset_index(drop=True)
train_clip1 = pd.concat([train_clip1_1,train_clip1_2], axis=0).reset_index(drop=True)

train_clip2 = train_2.groupby('ID').tail(21).reset_index(drop=True)

### Preprocessing & Engineering

In [10]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['pd_code', 'main_code', 'middle_code', 'sub_code', 'brd_code','info']

for col in categorical_columns:
    label_encoder.fit(train_clip1[col])
    train_clip1[col] = label_encoder.transform(train_clip1[col])
    test_1[col] = label_encoder.transform(test_1[col])

for col in categorical_columns:
    label_encoder.fit(train_clip2[col])
    train_clip2[col] = label_encoder.transform(train_clip2[col])
    test_2[col] = label_encoder.transform(test_2[col])

In [12]:
# 시간 관련 변수들 생성
date = pd.to_datetime(train_clip1.date)
train_clip1['weekday'] = date.dt.weekday
train_clip1['month'] = date.dt.month
train_clip1['week'] = date.dt.weekofyear

date = pd.to_datetime(test_1.date)
test_1['weekday'] = date.dt.weekday
test_1['month'] = date.dt.month
test_1['week'] = date.dt.weekofyear

# 공휴일 변수 추가
train_clip1['holiday'] = train_clip1.apply(lambda x : 0 if x['weekday']<5 else 1, axis = 1)
test_1['holiday'] = test_1.apply(lambda x : 0 if x['weekday']<5 else 1, axis = 1)

# 상품별, 요일별 판매량 평균
target_mean = pd.pivot_table(train_clip1, values = 'target', index = ['ID', 'weekday'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train_clip1['target_mean_1'] = train_clip1.progress_apply(lambda x : target_mean.loc[(target_mean.ID == x['ID']) & (target_mean.weekday == x['weekday']) ,'target'].values[0], axis = 1)

tqdm.pandas()
test_1['target_mean_1'] = test_1.progress_apply(lambda x : target_mean.loc[(target_mean.ID == x['ID']) & (target_mean.weekday == x['weekday']) ,'target'].values[0], axis = 1)

# 상품별 요일별 판매량 표준편차
target_std = pd.pivot_table(train_clip1, values = 'target', index = ['ID', 'weekday'], aggfunc = np.std).reset_index()
tqdm.pandas()
train_clip1['target_std_1'] = train_clip1.progress_apply(lambda x : target_std.loc[(target_std.ID == x['ID']) & (target_std.weekday == x['weekday']) ,'target'].values[0], axis = 1)

tqdm.pandas()
test_1['target_std_1'] = test_1.progress_apply(lambda x : target_std.loc[(target_std.ID == x['ID']) & (target_std.weekday == x['weekday']) ,'target'].values[0], axis = 1)

# 개당 가격, 월별 평균 할인율
train_clip1['per_price'] = np.round((train_clip1['price_tot'] / train_clip1['target']).fillna(0),0)
normal_price = train_clip1.groupby('ID')['per_price'].agg({('normal_price','max')}).reset_index()
train_clip1 = train_clip1.merge(normal_price, how = 'left', on ='ID')
test_1 = test_1.merge(normal_price, how = 'left', on ='ID')
train_clip1['discount_rate'] = np.round(((train_clip1['normal_price'] - train_clip1['per_price']) / train_clip1['normal_price']).fillna(0), 2)
discount = train_clip1.groupby(['ID','month'])['discount_rate'].agg({('discount_mean','mean')}).reset_index()
train_clip1 = train_clip1.merge(discount, how = 'left', on =['ID','month'])
test_1 = test_1.merge(discount, how = 'left', on =['ID','month'])

# 시간변수 cycling transform
train_clip1['sin_month'] = -np.sin(2 * np.pi * train_clip1['month']/12.0)
train_clip1['cos_month'] = -np.cos(2 * np.pi * train_clip1['month']/12.0)
train_clip1['sin_weekday'] = -np.sin(2 * np.pi * (train_clip1['weekday']+1)/7.0)
train_clip1['cos_weekday'] = -np.cos(2 * np.pi * (train_clip1['weekday']+1)/7.0)

test_1['sin_month'] = -np.sin(2 * np.pi * test_1['month']/12.0)
test_1['cos_month'] = -np.cos(2 * np.pi * test_1['month']/12.0)
test_1['sin_weekday'] = -np.sin(2 * np.pi * (test_1['weekday']+1)/7.0)
test_1['cos_weekday'] = -np.cos(2 * np.pi * (test_1['weekday']+1)/7.0)

train_clip1.drop(columns=['month','weekday'], inplace=True)
test_1.drop(columns=['month','weekday'], inplace=True)
display(train_clip1.head())
display(test_1.head())

100%|████████████████████████████████████████████████████████████████████████| 309162/309162 [02:47<00:00, 1843.29it/s]
100%|████████████████████████████████████████████████████████████████████████| 309162/309162 [02:57<00:00, 1737.35it/s]
100%|████████████████████████████████████████████████████████████████████████| 309162/309162 [03:27<00:00, 1491.07it/s]
100%|████████████████████████████████████████████████████████████████████████| 309162/309162 [02:49<00:00, 1819.71it/s]


Unnamed: 0,date,ID,pd_code,main_code,middle_code,sub_code,brd_code,info,target,price_tot,...,target_mean_1,target_std_1,per_price,normal_price,discount_rate,discount_mean,sin_month,cos_month,sin_weekday,cos_weekday
0,2023-03-01,0,0,1,6,37,0,7699,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-6.123234000000001e-17,-0.4338837,0.900969
1,2023-03-02,0,0,1,6,37,0,7699,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-6.123234000000001e-17,0.4338837,0.900969
2,2023-03-03,0,0,1,6,37,0,7699,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-6.123234000000001e-17,0.9749279,0.222521
3,2023-03-04,0,0,1,6,37,0,7699,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-6.123234000000001e-17,0.7818315,-0.62349
4,2023-03-05,0,0,1,6,37,0,7699,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-6.123234000000001e-17,2.449294e-16,-1.0


Unnamed: 0,date,ID,pd_code,main_code,middle_code,sub_code,brd_code,info,week,holiday,target_mean_1,target_std_1,normal_price,discount_mean,sin_month,cos_month,sin_weekday,cos_weekday
0,2023-04-05,0,0,1,6,37,0,7699,14,0,0.0,0.0,0.0,0.0,-0.866025,0.5,-0.4338837,0.900969
1,2023-04-06,0,0,1,6,37,0,7699,14,0,0.0,0.0,0.0,0.0,-0.866025,0.5,0.4338837,0.900969
2,2023-04-07,0,0,1,6,37,0,7699,14,0,0.0,0.0,0.0,0.0,-0.866025,0.5,0.9749279,0.222521
3,2023-04-08,0,0,1,6,37,0,7699,14,1,0.0,0.0,0.0,0.0,-0.866025,0.5,0.7818315,-0.62349
4,2023-04-09,0,0,1,6,37,0,7699,14,1,0.0,0.0,0.0,0.0,-0.866025,0.5,2.449294e-16,-1.0


In [13]:
# 시간 관련 변수들 생성
date = pd.to_datetime(train_clip2.date)
train_clip2['weekday'] = date.dt.weekday
train_clip2['month'] = date.dt.month
train_clip2['week'] = date.dt.weekofyear

date = pd.to_datetime(test_2.date)
test_2['weekday'] = date.dt.weekday
test_2['month'] = date.dt.month
test_2['week'] = date.dt.weekofyear

# 공휴일 변수 추가
train_clip2['holiday'] = train_clip2.apply(lambda x : 0 if x['weekday']<5 else 1, axis = 1)
test_2['holiday'] = test_2.apply(lambda x : 0 if x['weekday']<5 else 1, axis = 1)

# 상품별, 요일별 판매량 평균
target_mean = pd.pivot_table(train_clip2, values = 'target', index = ['ID', 'weekday'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train_clip2['target_mean_1'] = train_clip2.progress_apply(lambda x : target_mean.loc[(target_mean.ID == x['ID']) & (target_mean.weekday == x['weekday']) ,'target'].values[0], axis = 1)

tqdm.pandas()
test_2['target_mean_1'] = test_2.progress_apply(lambda x : target_mean.loc[(target_mean.ID == x['ID']) & (target_mean.weekday == x['weekday']) ,'target'].values[0], axis = 1)

# 상품별 요일별 판매량 표준편차
target_std = pd.pivot_table(train_clip2, values = 'target', index = ['ID', 'weekday'], aggfunc = np.std).reset_index()
tqdm.pandas()
train_clip2['target_std_1'] = train_clip2.progress_apply(lambda x : target_std.loc[(target_std.ID == x['ID']) & (target_std.weekday == x['weekday']) ,'target'].values[0], axis = 1)

tqdm.pandas()
test_2['target_std_1'] = test_2.progress_apply(lambda x : target_std.loc[(target_std.ID == x['ID']) & (target_std.weekday == x['weekday']) ,'target'].values[0], axis = 1)

# 개당 가격, 월별 평균 할인율
train_clip2['per_price'] = np.round((train_clip2['price_tot'] / train_clip2['target']).fillna(0),0)
normal_price = train_clip2.groupby('ID')['per_price'].agg({('normal_price','max')}).reset_index()
train_clip2 = train_clip2.merge(normal_price, how = 'left', on ='ID')
test_2 = test_2.merge(normal_price, how = 'left', on ='ID')
train_clip2['discount_rate'] = np.round(((train_clip2['normal_price'] - train_clip2['per_price']) / train_clip2['normal_price']).fillna(0), 2)
discount = train_clip2.groupby(['ID','month'])['discount_rate'].agg({('discount_mean','mean')}).reset_index()
train_clip2 = train_clip2.merge(discount, how = 'left', on =['ID','month'])
test_2 = test_2.merge(discount, how = 'left', on =['ID','month'])

# 시간변수 cycling transform
train_clip2['sin_month'] = -np.sin(2 * np.pi * train_clip2['month']/12.0)
train_clip2['cos_month'] = -np.cos(2 * np.pi * train_clip2['month']/12.0)
train_clip2['sin_weekday'] = -np.sin(2 * np.pi * (train_clip2['weekday']+1)/7.0)
train_clip2['cos_weekday'] = -np.cos(2 * np.pi * (train_clip2['weekday']+1)/7.0)

test_2['sin_month'] = -np.sin(2 * np.pi * test_2['month']/12.0)
test_2['cos_month'] = -np.cos(2 * np.pi * test_2['month']/12.0)
test_2['sin_weekday'] = -np.sin(2 * np.pi * (test_2['weekday']+1)/7.0)
test_2['cos_weekday'] = -np.cos(2 * np.pi * (test_2['weekday']+1)/7.0)

train_clip2.drop(columns=['month','weekday'], inplace=True)
test_2.drop(columns=['month','weekday'], inplace=True)
display(train_clip2.head())
display(test_2.head())

100%|██████████████████████████████████████████████████████████████████████████| 24528/24528 [00:09<00:00, 2675.14it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24528/24528 [00:09<00:00, 2664.55it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24528/24528 [00:09<00:00, 2669.39it/s]
100%|██████████████████████████████████████████████████████████████████████████| 24528/24528 [00:09<00:00, 2588.36it/s]


Unnamed: 0,date,ID,pd_code,main_code,middle_code,sub_code,brd_code,info,target,price_tot,...,target_mean_1,target_std_1,per_price,normal_price,discount_rate,discount_mean,sin_month,cos_month,sin_weekday,cos_weekday
0,2023-03-15,85,0,0,0,0,0,144,0,0,...,1.333333,1.527525,0.0,23900.0,1.0,0.294118,-1.0,-6.123234000000001e-17,-0.4338837,0.900969
1,2023-03-16,85,0,0,0,0,0,144,1,23900,...,1.666667,1.154701,23900.0,23900.0,0.0,0.294118,-1.0,-6.123234000000001e-17,0.4338837,0.900969
2,2023-03-17,85,0,0,0,0,0,144,1,23900,...,1.0,0.0,23900.0,23900.0,0.0,0.294118,-1.0,-6.123234000000001e-17,0.9749279,0.222521
3,2023-03-18,85,0,0,0,0,0,144,0,0,...,0.666667,0.57735,0.0,23900.0,1.0,0.294118,-1.0,-6.123234000000001e-17,0.7818315,-0.62349
4,2023-03-19,85,0,0,0,0,0,144,0,0,...,0.666667,1.154701,0.0,23900.0,1.0,0.294118,-1.0,-6.123234000000001e-17,2.449294e-16,-1.0


Unnamed: 0,date,ID,pd_code,main_code,middle_code,sub_code,brd_code,info,week,holiday,target_mean_1,target_std_1,normal_price,discount_mean,sin_month,cos_month,sin_weekday,cos_weekday
0,2023-04-05,85,0,0,0,0,0,144,14,0,1.333333,1.527525,23900.0,0.0,-0.866025,0.5,-0.4338837,0.900969
1,2023-04-06,85,0,0,0,0,0,144,14,0,1.666667,1.154701,23900.0,0.0,-0.866025,0.5,0.4338837,0.900969
2,2023-04-07,85,0,0,0,0,0,144,14,0,1.0,0.0,23900.0,0.0,-0.866025,0.5,0.9749279,0.222521
3,2023-04-08,85,0,0,0,0,0,144,14,1,0.666667,0.57735,23900.0,0.0,-0.866025,0.5,0.7818315,-0.62349
4,2023-04-09,85,0,0,0,0,0,144,14,1,0.666667,1.154701,23900.0,0.0,-0.866025,0.5,2.449294e-16,-1.0


In [14]:
X_train_1 = train_clip1.drop(columns=['date', 'pd_code','target','price_tot','brd_kw','per_price','discount_rate'])
y_train_1 = train_clip1['target']
X_test_1 = test_1.drop(columns=['date', 'pd_code'])

X_train_2 = train_clip2.drop(columns=['date', 'pd_code','target','price_tot','brd_kw','per_price','discount_rate'])
y_train_2 = train_clip2['target']
X_test_2 = test_2.drop(columns=['date', 'pd_code'])

In [15]:
y_train_1 = y_train_1.astype(int)
y_train_2 = y_train_2.astype(int)

### Modeling

In [16]:
is_holdout = False
n_splits = 5

cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [17]:
#범주형 변수 설정
cat_features = ['ID','main_code','middle_code','sub_code','brd_code','info']

In [18]:
preds_1_1 = []

for tri, vai in cv.split(X_train_1): #KFold
    print("="*50)
    fit_params={"early_stopping_rounds":50, 
            "eval_metric" : 'mae', 
            "eval_set" : [(X_train_1.iloc[vai],y_train_1[vai])],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto',
            'categorical_feature' : cat_features
           }
    
    model = LGBMRegressor(num_leaves= 64, max_depth=16, 
                         random_state=42, 
                         silent=True, 
                         objective='regression',
                         metric='mae',
                         n_jobs=-1, 
                         n_estimators=10000,
                         colsample_bytree=0.9,
                         subsample=0.9,
                         learning_rate=0.03)

    model.fit(X_train_1.iloc[tri], y_train_1[tri], **fit_params)
    
    pred = model.predict(X_test_1)
    preds_1_1.append(pred)
    if is_holdout:
        break 

Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 0.434725
[200]	valid's l1: 0.218244
[300]	valid's l1: 0.196703
[400]	valid's l1: 0.189446
[500]	valid's l1: 0.18335
[600]	valid's l1: 0.180046
Early stopping, best iteration is:
[635]	valid's l1: 0.178652
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 0.434872
[200]	valid's l1: 0.212649
[300]	valid's l1: 0.20624
Early stopping, best iteration is:
[276]	valid's l1: 0.205382
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 0.338172
[200]	valid's l1: 0.134624
[300]	valid's l1: 0.130117
Early stopping, best iteration is:
[315]	valid's l1: 0.129319
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 0.402732
[200]	valid's l1: 0.211308
Early stopping, best iteration is:
[224]	valid's l1: 0.209446
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 0.333143
[200]	valid's l1: 0.150424
[300]	valid's l1:

In [19]:
preds_1_2 = []

for tri, vai in cv.split(X_train_1): #KFold
    print("="*50)
    fit_params={"early_stopping_rounds":50, 
            "eval_metric" : 'mae', 
            "eval_set" : [(X_train_1.iloc[vai],y_train_1[vai])],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto',
            'categorical_feature' : cat_features
           }
    
    model = LGBMRegressor(num_leaves= 64, max_depth=16, 
                         random_state=42, 
                         silent=True, 
                         objective='regression_l1',
                         metric='mae',
                         n_jobs=-1, 
                         n_estimators=10000,
                         colsample_bytree=0.9,
                         subsample=0.9,
                         learning_rate=0.03)

    model.fit(X_train_1.iloc[tri], y_train_1[tri], **fit_params)
    
    pred = model.predict(X_test_1)
    preds_1_2.append(pred)
    if is_holdout:
        break 

Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 1.49854
[200]	valid's l1: 0.683214
[300]	valid's l1: 0.547831
[400]	valid's l1: 0.498549
[500]	valid's l1: 0.472472
Early stopping, best iteration is:
[459]	valid's l1: 0.471326
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 1.44104
[200]	valid's l1: 0.593263
[300]	valid's l1: 0.55114
[400]	valid's l1: 0.533709
Early stopping, best iteration is:
[438]	valid's l1: 0.533529
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 0.883074
[200]	valid's l1: 0.588365
[300]	valid's l1: 0.554238
[400]	valid's l1: 0.529055
[500]	valid's l1: 0.507035
[600]	valid's l1: 0.496723
[700]	valid's l1: 0.496682
Early stopping, best iteration is:
[680]	valid's l1: 0.496464
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 1.0734
[200]	valid's l1: 0.459575
[300]	valid's l1: 0.367948
[400]	valid's l1: 0.330239
Early stopping, best iteration is:


In [20]:
preds_2_1 = []

for tri, vai in cv.split(X_train_2): #KFold
    print("="*50)
    fit_params={"early_stopping_rounds":50, 
            "eval_metric" : 'mae', 
            "eval_set" : [(X_train_2.iloc[vai],y_train_2[vai])],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto',
            'categorical_feature' : cat_features
           }
    
    model = LGBMRegressor(num_leaves= 64, max_depth=16, 
                         random_state=42, 
                         silent=True, 
                         objective='regression',
                         metric='mae',
                         n_jobs=-1, 
                         n_estimators=10000,
                         colsample_bytree=0.9,
                         subsample=0.9,
                         learning_rate=0.03)

    model.fit(X_train_2.iloc[tri], y_train_2[tri], **fit_params)
    
    pred = model.predict(X_test_2)
    preds_2_1.append(pred)
    if is_holdout:
        break 

Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 6.94065
[200]	valid's l1: 6.61745
[300]	valid's l1: 6.61788
Early stopping, best iteration is:
[270]	valid's l1: 6.59086
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 7.88657
[200]	valid's l1: 7.26491
Early stopping, best iteration is:
[156]	valid's l1: 7.2133
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 7.14467
[200]	valid's l1: 6.73812
Early stopping, best iteration is:
[217]	valid's l1: 6.71004
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 5.50261
[200]	valid's l1: 5.00725
[300]	valid's l1: 4.79714
Early stopping, best iteration is:
[335]	valid's l1: 4.73986
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 6.65862
[200]	valid's l1: 6.33228
Early stopping, best iteration is:
[167]	valid's l1: 6.28636


In [21]:
preds_2_2 = []

for tri, vai in cv.split(X_train_2): #KFold
    print("="*50)
    fit_params={"early_stopping_rounds":50, 
            "eval_metric" : 'mae', 
            "eval_set" : [(X_train_2.iloc[vai],y_train_2[vai])],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto',
            'categorical_feature' : cat_features
           }
    
    model = LGBMRegressor(num_leaves= 64, max_depth=16, 
                         random_state=42, 
                         silent=True, 
                         objective='regression_l1',
                         metric='mae',
                         n_jobs=-1, 
                         n_estimators=10000,
                         colsample_bytree=0.9,
                         subsample=0.9,
                         learning_rate=0.03)

    model.fit(X_train_2.iloc[tri], y_train_2[tri], **fit_params)
    
    pred = model.predict(X_test_2)
    preds_2_2.append(pred)
    if is_holdout:
        break 

Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 8.8442
[200]	valid's l1: 8.00197
[300]	valid's l1: 7.75299
[400]	valid's l1: 7.30111
[500]	valid's l1: 7.11212
[600]	valid's l1: 7.00218
[700]	valid's l1: 6.96553
[800]	valid's l1: 6.90682
[900]	valid's l1: 6.86336
[1000]	valid's l1: 6.83142
[1100]	valid's l1: 6.80652
[1200]	valid's l1: 6.7736
[1300]	valid's l1: 6.7289
[1400]	valid's l1: 6.67119
[1500]	valid's l1: 6.627
[1600]	valid's l1: 6.59887
[1700]	valid's l1: 6.5527
[1800]	valid's l1: 6.53965
Early stopping, best iteration is:
[1822]	valid's l1: 6.53845
Training until validation scores don't improve for 50 rounds
[100]	valid's l1: 8.00107
[200]	valid's l1: 7.24424
[300]	valid's l1: 6.88241
[400]	valid's l1: 6.71887
[500]	valid's l1: 6.51967
[600]	valid's l1: 6.48974
[700]	valid's l1: 6.47016
[800]	valid's l1: 6.41427
[900]	valid's l1: 6.35572
[1000]	valid's l1: 6.30601
[1100]	valid's l1: 6.15353
Early stopping, best iteration is:
[1146]	valid's l1: 6.1

In [22]:
#산술평균
preds_1_1 = np.mean(preds_1_1, axis = 0)
preds_1_2 = np.mean(preds_1_2, axis = 0)

#음수값 보정
preds_1_1[preds_1_1<0] = 0
preds_1_2[preds_1_2<0] = 0

#앙상블
preds_1 = preds_1_1 * 0.5 + preds_1_2 * 0.5

#산술평균
preds_2_1 = np.mean(preds_2_1, axis = 0)
preds_2_2 = np.mean(preds_2_2, axis = 0)

#음수값 보정
preds_2_1[preds_2_1<0] = 0
preds_2_2[preds_2_2<0] = 0

#앙상블
preds_2 = preds_2_1 * 0.5 + preds_2_2 * 0.5

In [23]:
test.loc[index_1, 'preds'] = preds_1
test.loc[index_2, 'preds'] = preds_2

preds = test['preds'].values

## Submission

In [24]:
submission

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15886,15886,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15887,15887,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15888,15888,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
#예측값 할당

for i in range(15890):
    for j in range(1,22):
        submission.iloc[i,j] = preds[21*i + j-1]

In [26]:
submission.iloc[:,1:] = np.round(submission.iloc[:,1:])
submission

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,3.0,2.0,0.0,0.0,2.0,0.0,1.0,3.0,...,0.0,2.0,0.0,1.0,3.0,2.0,0.0,0.0,2.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15886,15886,3.0,0.0,2.0,4.0,1.0,1.0,3.0,3.0,0.0,...,1.0,1.0,3.0,3.0,0.0,2.0,4.0,1.0,1.0,3.0
15887,15887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15888,15888,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [43]:
submission.to_csv("./submission/LGBM_단일.csv", index=False)