In [1]:
import pandas as pd
import numpy as np

In [2]:
# pd.set_option('display.float_format', lambda x: '%.5f' % x)
# np.set_printoptions(suppress=True)

In [3]:
url_train = "https://raw.githubusercontent.com/ZeusKwon/data-drive/main/%EC%A3%BC%EC%B0%A8%EC%88%98%EC%9A%94/train.csv"
url_test = "https://raw.githubusercontent.com/ZeusKwon/data-drive/main/%EC%A3%BC%EC%B0%A8%EC%88%98%EC%9A%94/test.csv"
url_submission = "https://raw.githubusercontent.com/ZeusKwon/data-drive/main/%EC%A3%BC%EC%B0%A8%EC%88%98%EC%9A%94/sample_submission.csv"
urlinfo = 'https://raw.githubusercontent.com/ZeusKwon/data-drive/main/%EC%A3%BC%EC%B0%A8%EC%88%98%EC%9A%94/age_gender_info.csv'


train_raw = pd.read_csv(url_train)
test_raw = pd.read_csv(url_test)
submission_raw = pd.read_csv(url_submission)

In [4]:
def engineering_train(df):
    # 칼럼명 변경
    df2 =  df.rename(columns={'단지코드' : 'id', 
                               '총세대수' : 'no_house',
                              '임대건물구분' : 'bd_type',
                              '지역' : 'state',
                              '공급유형' : 'supply_type',
                              '전용면적' : 'sqm',
                              '전용면적별세대수' : 'house_per_sqm',
                              '공가수' : 'no_empty',
                              '자격유형' : 'q_type',
                              '임대보증금' : 'deposit',
                              '임대료' : 'rent',
                              '도보 10분거리 내 지하철역 수(환승노선 수 반영)' : 'subway',
                              '도보 10분거리 내 버스정류장 수' : 'bus',
                              '단지내주차면수' : 'parkinglot',
                              '등록차량수' : 'cars'})
    
    # sqm -> 평수로 변경
    def pyeong(x):
        y = x*0.3025
        return y

    df3 = df2.copy()
    df3['sqp'] = pyeong(df3['sqm']).round()
    df3_columns = ['id', 'no_house', 'bd_type', 'state', 'supply_type', 'sqp',
       'house_per_sqm', 'no_empty', 'q_type', 'deposit', 'rent', 'subway',
       'bus', 'parkinglot', 'cars']
    df3 = df3[df3_columns]
    
    # 데이터 타입 숫자형으로
    df3['deposit'] = pd.to_numeric(df3['deposit'], errors='coerce')
    df3['rent'] = pd.to_numeric(df3['rent'], errors='coerce')
    
    # 지하철 / 버스 결측치 0으로 
    df4 = df3.copy()
    df4['subway'].fillna(0, inplace = True)
    df4['bus'].fillna(0, inplace = True)
    
    # 평수별 차량 보유 대수 계산
    df4['cars2'] = (df4['house_per_sqm'] / df4['no_house']) * df4['cars']
    df4['cars2_per_house'] = df4['cars2'] / df4['house_per_sqm']
    
    # 결측치 처리 (자격유형 A, H, K)
    null = df4[df4['rent'].isnull()]
    null_ahk = null[null['q_type'].isin(['A','H','K'])]
    sqp_dep_mean = df4.groupby(['q_type','sqp'])['deposit'].mean()
    sqp_rent_mean = df4.groupby(['q_type','sqp'])['rent'].mean()

    for i in range(0, len(null_ahk)):
        qt = null_ahk['q_type'].iloc[i]
        sqpn = null_ahk['sqp'].iloc[i]
        null_ahk['deposit'].iloc[i] = sqp_dep_mean.loc[qt, sqpn]
        null_ahk['rent'].iloc[i] = sqp_rent_mean.loc[qt, sqpn]
        
    df4.loc[df4['deposit'].isna(), 'deposit'] = null_ahk['deposit']
    df4.loc[df4['rent'].isna(), 'rent'] = null_ahk['rent']
        
    #결측치 처리 (자격유형 D)
    df4['deposit'] = df4['deposit'].fillna(0)
    df4['rent'] = df4['rent'].fillna(0)
    
    
    # 데이터 오류 행 드랍 (3번 문제)
    # https://www.dacon.io/competitions/official/235745/talkboard/403708?dtype=recent&page=1
    
    error_id_train =  ['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
    df_train_error = df4[df4.loc[:,'id'].isin(error_id_train) == True]
    df5 = df4.drop(df_train_error.index, axis = 0)
    
    

    
    
    # 분석에 사용 할 칼럼
    df5_columns = ['no_house', 'bd_type', 'state', 'supply_type', 'sqp',
       'house_per_sqm', 'no_empty', 'q_type', 'deposit', 'rent', 'subway',
       'bus', 'parkinglot', 'cars2']
    
    df5 = df5[df5_columns]
    

    
    return df5

In [5]:
train_temp = train_raw.copy()
train_temp =  train_temp.rename(columns={'단지코드' : 'id', 
                           '총세대수' : 'no_house',
                          '임대건물구분' : 'bd_type',
                          '지역' : 'state',
                          '공급유형' : 'supply_type',
                          '전용면적' : 'sqm',
                          '전용면적별세대수' : 'house_per_sqm',
                          '공가수' : 'no_empty',
                          '자격유형' : 'q_type',
                          '임대보증금' : 'deposit',
                          '임대료' : 'rent',
                          '도보 10분거리 내 지하철역 수(환승노선 수 반영)' : 'subway',
                          '도보 10분거리 내 버스정류장 수' : 'bus',
                          '단지내주차면수' : 'parkinglot',
                          '등록차량수' : 'cars'})
def pyeong(x):
    y = x*0.3025
    return y

train_temp['sqp'] = pyeong(train_temp['sqm']).round()
train_temp_columns = ['id', 'no_house', 'bd_type', 'state', 'supply_type', 'sqp',
   'house_per_sqm', 'no_empty', 'q_type', 'deposit', 'rent', 'subway',
   'bus', 'parkinglot', 'cars']
train_temp = train_temp[train_temp_columns]

train_temp['deposit'] = pd.to_numeric(train_temp['deposit'], errors='coerce')
train_temp['rent'] = pd.to_numeric(train_temp['rent'], errors='coerce')

null_temp = train_temp[train_temp['rent'].isnull()]
null_ahk_temp = null_temp[null_temp['q_type'].isin(['A','H','K'])]
sqp_dep_mean_temp = train_temp.groupby(['q_type','sqp'])['deposit'].mean()
sqp_rent_mean_temp = train_temp.groupby(['q_type','sqp'])['rent'].mean()

In [6]:
def engineering_test(df):
    
    df2 =  df.rename(columns={'단지코드' : 'id', 
                               '총세대수' : 'no_house',
                              '임대건물구분' : 'bd_type',
                              '지역' : 'state',
                              '공급유형' : 'supply_type',
                              '전용면적' : 'sqm',
                              '전용면적별세대수' : 'house_per_sqm',
                              '공가수' : 'no_empty',
                              '자격유형' : 'q_type',
                              '임대보증금' : 'deposit',
                              '임대료' : 'rent',
                              '도보 10분거리 내 지하철역 수(환승노선 수 반영)' : 'subway',
                              '도보 10분거리 내 버스정류장 수' : 'bus',
                              '단지내주차면수' : 'parkinglot',
                              })

    # sqm -> 평수로 변경
    def pyeong(x):
        y = x*0.3025
        return y

    df3 = df2.copy()
    df3['sqp'] = pyeong(df3['sqm']).round()
    df3_columns = ['id', 'no_house', 'bd_type', 'state', 'supply_type', 'sqp',
       'house_per_sqm', 'no_empty', 'q_type', 'deposit', 'rent', 'subway',
       'bus', 'parkinglot']
    df3 = df3[df3_columns]

    # 데이터 타입 숫자형으로
    df3['deposit'] = pd.to_numeric(df3['deposit'], errors='coerce')
    df3['rent'] = pd.to_numeric(df3['rent'], errors='coerce')
    
    # 지하철 / 버스 결측치 0으로 
    df4 = df3.copy()
    df4['subway'].fillna(0, inplace = True)
    df4['bus'].fillna(0, inplace = True)
    

    
    
        
    # 결측치 처리 (자격유형 A, H, K)
    
    null = df4[df4['rent'].isnull()]
    null_ahk = null[null['q_type'].isin(['A','H','K'])]

    for i in range(0, len(null_ahk)):
        qt = null_ahk['q_type'].iloc[i]
        sqpn = null_ahk['sqp'].iloc[i]
        null_ahk['deposit'].iloc[i] = sqp_dep_mean_temp.loc[qt, sqpn]
        null_ahk['rent'].iloc[i] = sqp_rent_mean_temp.loc[qt, sqpn]
        
    df4.loc[df4['deposit'].isna(), 'deposit'] = null_ahk['deposit']
    df4.loc[df4['rent'].isna(), 'rent'] = null_ahk['rent']
        
        
    #결측치 처리 (자격유형 D)
    df4['deposit'] = df4['deposit'].fillna(0)
    df4['rent'] = df4['rent'].fillna(0)
    
       # 데이터 오류 행 드랍 (3번 문제)
    # https://www.dacon.io/competitions/official/235745/talkboard/403708?dtype=recent&page=1
#     error_id_test = ['C2335', 'C1327']
#     df_test_error = df4[df4.loc[:,'id'].isin(error_id_test) == True]
#     df5 = df4.drop(df_test_error.index, axis = 0) 
    
    df5 = df4.copy()
    
    # 분석에 사용 할 칼럼
    
    df5_columns = ['id','no_house', 'bd_type', 'state', 'supply_type', 'sqp',
       'house_per_sqm', 'no_empty', 'q_type', 'deposit', 'rent', 'subway',
       'bus', 'parkinglot']
    
    df5 = df5[df5_columns]
    
    return df5

In [7]:
train_eng = engineering_train(train_raw)

from sklearn.model_selection import train_test_split

train, val = train_test_split(train_eng, train_size = 0.8, test_size = 0.2, random_state = 2)

target = 'cars2'
features = train.columns.difference([target], sort = False)

X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [8]:
from category_encoders import CatBoostEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import  mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [9]:
cat_enc = CatBoostEncoder()

X_train_encoded = cat_enc.fit_transform(X_train, y_train)
X_val_encoded = cat_enc.transform(X_val)

cat = CatBoostRegressor(iterations = 1000
                        , random_state = 42
                        , loss_function = 'MAE'
                       )

mae = make_scorer(mean_absolute_error, greater_is_better=False)


grids = {
#     "max_depth": [3,5,7], 
    "learning_rate" : [0.05, 0.07, 0.1]
    }                

fit_params = {"eval_set": [(X_val_encoded, y_val)], 
              "early_stopping_rounds": 50, 
              } 

reg = GridSearchCV(
    cat,
    param_grid=grids,
    cv=5,
    scoring=mae,
    verbose=1,
    n_jobs=-1
)

reg.fit(X_train_encoded, y_train, **fit_params)

print('최적 하이퍼파라미터: ', reg.best_params_)
print('mae_score: ', reg.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
0:	learn: 63.9475053	test: 64.2698720	best: 64.2698720 (0)	total: 154ms	remaining: 2m 33s
1:	learn: 61.4102811	test: 61.5832577	best: 61.5832577 (1)	total: 156ms	remaining: 1m 17s
2:	learn: 58.8725082	test: 59.0714887	best: 59.0714887 (2)	total: 158ms	remaining: 52.4s
3:	learn: 56.8052163	test: 57.0786974	best: 57.0786974 (3)	total: 160ms	remaining: 39.8s
4:	learn: 54.0327483	test: 54.2256348	best: 54.2256348 (4)	total: 162ms	remaining: 32.2s
5:	learn: 51.7670160	test: 51.8985806	best: 51.8985806 (5)	total: 164ms	remaining: 27.1s
6:	learn: 49.9504572	test: 50.0530978	best: 50.0530978 (6)	total: 166ms	remaining: 23.5s
7:	learn: 48.2588599	test: 48.2688115	best: 48.2688115 (7)	total: 168ms	remaining: 20.8s
8:	learn: 46.1896378	test: 46.1743015	best: 46.1743015 (8)	total: 170ms	remaining: 18.7s
9:	learn: 44.3773081	test: 44.1425170	best: 44.1425170 (9)	total: 172ms	remaining: 17.1s
10:	learn: 42.5796976	test: 42.1131249	best: 42.

160:	learn: 13.5651386	test: 16.1813680	best: 16.1813680 (160)	total: 493ms	remaining: 2.57s
161:	learn: 13.5360745	test: 16.1337749	best: 16.1337749 (161)	total: 495ms	remaining: 2.56s
162:	learn: 13.5286926	test: 16.1306259	best: 16.1306259 (162)	total: 498ms	remaining: 2.56s
163:	learn: 13.4773941	test: 16.0736619	best: 16.0736619 (163)	total: 501ms	remaining: 2.55s
164:	learn: 13.4581088	test: 16.0572514	best: 16.0572514 (164)	total: 503ms	remaining: 2.54s
165:	learn: 13.4224917	test: 16.0284948	best: 16.0284948 (165)	total: 506ms	remaining: 2.54s
166:	learn: 13.3898567	test: 15.9733337	best: 15.9733337 (166)	total: 510ms	remaining: 2.54s
167:	learn: 13.3771028	test: 15.9707145	best: 15.9707145 (167)	total: 515ms	remaining: 2.55s
168:	learn: 13.3666875	test: 15.9679762	best: 15.9679762 (168)	total: 517ms	remaining: 2.54s
169:	learn: 13.3308590	test: 15.9280388	best: 15.9280388 (169)	total: 520ms	remaining: 2.54s
170:	learn: 13.2501546	test: 15.8964912	best: 15.8964912 (170)	total: 

286:	learn: 11.1349015	test: 15.0729788	best: 15.0729788 (286)	total: 838ms	remaining: 2.08s
287:	learn: 11.1342353	test: 15.0719339	best: 15.0719339 (287)	total: 840ms	remaining: 2.08s
288:	learn: 11.1032173	test: 15.0416698	best: 15.0416698 (288)	total: 842ms	remaining: 2.07s
289:	learn: 11.1000876	test: 15.0418346	best: 15.0416698 (288)	total: 844ms	remaining: 2.07s
290:	learn: 11.0914084	test: 15.0343162	best: 15.0343162 (290)	total: 847ms	remaining: 2.06s
291:	learn: 11.0886582	test: 15.0342952	best: 15.0342952 (291)	total: 849ms	remaining: 2.06s
292:	learn: 11.0837404	test: 15.0287112	best: 15.0287112 (292)	total: 852ms	remaining: 2.06s
293:	learn: 11.0712670	test: 15.0147948	best: 15.0147948 (293)	total: 854ms	remaining: 2.05s
294:	learn: 11.0636926	test: 15.0146264	best: 15.0146264 (294)	total: 856ms	remaining: 2.05s
295:	learn: 11.0622791	test: 15.0147786	best: 15.0146264 (294)	total: 859ms	remaining: 2.04s
296:	learn: 11.0612494	test: 15.0142190	best: 15.0142190 (296)	total: 

395:	learn: 10.0265113	test: 14.6098069	best: 14.6098069 (395)	total: 1.18s	remaining: 1.8s
396:	learn: 10.0015543	test: 14.6075660	best: 14.6075660 (396)	total: 1.19s	remaining: 1.8s
397:	learn: 9.9723209	test: 14.5765065	best: 14.5765065 (397)	total: 1.19s	remaining: 1.8s
398:	learn: 9.9719843	test: 14.5759692	best: 14.5759692 (398)	total: 1.2s	remaining: 1.8s
399:	learn: 9.9701860	test: 14.5779384	best: 14.5759692 (398)	total: 1.2s	remaining: 1.8s
400:	learn: 9.9589539	test: 14.5683719	best: 14.5683719 (400)	total: 1.2s	remaining: 1.8s
401:	learn: 9.9550159	test: 14.5643601	best: 14.5643601 (401)	total: 1.21s	remaining: 1.79s
402:	learn: 9.9381278	test: 14.5523959	best: 14.5523959 (402)	total: 1.21s	remaining: 1.79s
403:	learn: 9.9155043	test: 14.5664830	best: 14.5523959 (402)	total: 1.21s	remaining: 1.79s
404:	learn: 9.9098342	test: 14.5690650	best: 14.5523959 (402)	total: 1.22s	remaining: 1.79s
405:	learn: 9.9071553	test: 14.5705267	best: 14.5523959 (402)	total: 1.22s	remaining: 1

512:	learn: 8.9800187	test: 14.1558043	best: 14.1558043 (512)	total: 1.52s	remaining: 1.44s
513:	learn: 8.9661491	test: 14.1617774	best: 14.1558043 (512)	total: 1.52s	remaining: 1.44s
514:	learn: 8.9582775	test: 14.1644076	best: 14.1558043 (512)	total: 1.52s	remaining: 1.44s
515:	learn: 8.9386897	test: 14.1839037	best: 14.1558043 (512)	total: 1.53s	remaining: 1.43s
516:	learn: 8.9302562	test: 14.1825752	best: 14.1558043 (512)	total: 1.53s	remaining: 1.43s
517:	learn: 8.9199104	test: 14.1866177	best: 14.1558043 (512)	total: 1.53s	remaining: 1.43s
518:	learn: 8.9168363	test: 14.1838074	best: 14.1558043 (512)	total: 1.54s	remaining: 1.42s
519:	learn: 8.9131505	test: 14.1771118	best: 14.1558043 (512)	total: 1.54s	remaining: 1.42s
520:	learn: 8.9043302	test: 14.1771471	best: 14.1558043 (512)	total: 1.54s	remaining: 1.42s
521:	learn: 8.9018681	test: 14.1747691	best: 14.1558043 (512)	total: 1.54s	remaining: 1.41s
522:	learn: 8.8971290	test: 14.1751142	best: 14.1558043 (512)	total: 1.55s	remai

655:	learn: 8.0516822	test: 13.9136424	best: 13.9067329 (637)	total: 1.86s	remaining: 976ms
656:	learn: 8.0463356	test: 13.9217430	best: 13.9067329 (637)	total: 1.86s	remaining: 973ms
657:	learn: 8.0452071	test: 13.9221742	best: 13.9067329 (637)	total: 1.86s	remaining: 970ms
658:	learn: 8.0388415	test: 13.9235138	best: 13.9067329 (637)	total: 1.87s	remaining: 967ms
659:	learn: 8.0334001	test: 13.9233159	best: 13.9067329 (637)	total: 1.87s	remaining: 963ms
660:	learn: 8.0284706	test: 13.8949022	best: 13.8949022 (660)	total: 1.87s	remaining: 960ms
661:	learn: 8.0212525	test: 13.8949325	best: 13.8949022 (660)	total: 1.87s	remaining: 957ms
662:	learn: 8.0157843	test: 13.9038737	best: 13.8949022 (660)	total: 1.88s	remaining: 954ms
663:	learn: 8.0112625	test: 13.9027921	best: 13.8949022 (660)	total: 1.88s	remaining: 951ms
664:	learn: 7.9958740	test: 13.9017761	best: 13.8949022 (660)	total: 1.88s	remaining: 948ms
665:	learn: 7.9955939	test: 13.9017933	best: 13.8949022 (660)	total: 1.88s	remai

820:	learn: 7.2956149	test: 13.6691013	best: 13.6691013 (820)	total: 2.21s	remaining: 481ms
821:	learn: 7.2944661	test: 13.6693742	best: 13.6691013 (820)	total: 2.21s	remaining: 478ms
822:	learn: 7.2915119	test: 13.6694753	best: 13.6691013 (820)	total: 2.21s	remaining: 476ms
823:	learn: 7.2902028	test: 13.6694268	best: 13.6691013 (820)	total: 2.21s	remaining: 473ms
824:	learn: 7.2883180	test: 13.6696544	best: 13.6691013 (820)	total: 2.21s	remaining: 470ms
825:	learn: 7.2864461	test: 13.6690736	best: 13.6690736 (825)	total: 2.22s	remaining: 467ms
826:	learn: 7.2856810	test: 13.6683613	best: 13.6683613 (826)	total: 2.22s	remaining: 464ms
827:	learn: 7.2730729	test: 13.6632344	best: 13.6632344 (827)	total: 2.22s	remaining: 462ms
828:	learn: 7.2708760	test: 13.6622905	best: 13.6622905 (828)	total: 2.22s	remaining: 459ms
829:	learn: 7.2689211	test: 13.6616556	best: 13.6616556 (829)	total: 2.23s	remaining: 456ms
830:	learn: 7.2664031	test: 13.6616401	best: 13.6616401 (830)	total: 2.23s	remai

985:	learn: 6.7778435	test: 13.5228358	best: 13.5204349 (980)	total: 2.55s	remaining: 36.2ms
986:	learn: 6.7766016	test: 13.5232371	best: 13.5204349 (980)	total: 2.55s	remaining: 33.6ms
987:	learn: 6.7704222	test: 13.5219183	best: 13.5204349 (980)	total: 2.56s	remaining: 31ms
988:	learn: 6.7690535	test: 13.5216575	best: 13.5204349 (980)	total: 2.56s	remaining: 28.5ms
989:	learn: 6.7678420	test: 13.5204018	best: 13.5204018 (989)	total: 2.56s	remaining: 25.9ms
990:	learn: 6.7666710	test: 13.5240120	best: 13.5204018 (989)	total: 2.56s	remaining: 23.3ms
991:	learn: 6.7650203	test: 13.5239827	best: 13.5204018 (989)	total: 2.56s	remaining: 20.7ms
992:	learn: 6.7644041	test: 13.5239514	best: 13.5204018 (989)	total: 2.57s	remaining: 18.1ms
993:	learn: 6.7642729	test: 13.5240313	best: 13.5204018 (989)	total: 2.57s	remaining: 15.5ms
994:	learn: 6.7633534	test: 13.5238890	best: 13.5204018 (989)	total: 2.57s	remaining: 12.9ms
995:	learn: 6.7623254	test: 13.5232995	best: 13.5204018 (989)	total: 2.5

In [10]:
y_pred_train = reg.predict(X_train_encoded)
y_pred_val = reg.predict(X_val_encoded)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_val = mean_absolute_error(y_val, y_pred_val)

print(mae_train)
print(mae_val)

6.767843058796586
13.520402708281429


In [13]:
X_train_val = train_eng[features]
y_train_val = train_eng[target]

cat_enc2 = CatBoostEncoder()
X_train_val_encoded = cat_enc2.fit_transform(X_train_val, y_train_val)

cat2 = CatBoostRegressor(iterations = 989
                        , random_state = 42
                        , loss_function = 'MAE'
                        , learning_rate = 0.07)

cat2.fit(X_train_val_encoded, y_train_val)

y_pred_train_val = cat2.predict(X_train_val_encoded)
mae_train_val = mean_absolute_error(y_train_val, y_pred_train_val)
print(mae_train_val)


0:	learn: 63.9882251	total: 2.85ms	remaining: 2.82s
1:	learn: 61.4353359	total: 5.64ms	remaining: 2.78s
2:	learn: 58.7960510	total: 8.13ms	remaining: 2.67s
3:	learn: 56.2172264	total: 10.3ms	remaining: 2.54s
4:	learn: 53.4931024	total: 12.7ms	remaining: 2.49s
5:	learn: 51.2810306	total: 14.9ms	remaining: 2.44s
6:	learn: 49.7336237	total: 17.1ms	remaining: 2.4s
7:	learn: 48.0560787	total: 19.5ms	remaining: 2.39s
8:	learn: 46.4830460	total: 21.8ms	remaining: 2.37s
9:	learn: 44.6194028	total: 24ms	remaining: 2.35s
10:	learn: 42.6775820	total: 26.4ms	remaining: 2.35s
11:	learn: 41.2116503	total: 28.7ms	remaining: 2.33s
12:	learn: 39.8822700	total: 31.1ms	remaining: 2.34s
13:	learn: 38.5336585	total: 33.5ms	remaining: 2.33s
14:	learn: 37.1685736	total: 36ms	remaining: 2.33s
15:	learn: 35.9586201	total: 38.6ms	remaining: 2.35s
16:	learn: 35.2407590	total: 41.3ms	remaining: 2.36s
17:	learn: 34.3581867	total: 43.6ms	remaining: 2.35s
18:	learn: 33.7182794	total: 45.7ms	remaining: 2.33s
19:	lear

164:	learn: 14.4992158	total: 357ms	remaining: 1.78s
165:	learn: 14.4841893	total: 359ms	remaining: 1.78s
166:	learn: 14.4731907	total: 361ms	remaining: 1.78s
167:	learn: 14.4307770	total: 363ms	remaining: 1.78s
168:	learn: 14.4144420	total: 366ms	remaining: 1.77s
169:	learn: 14.3834883	total: 369ms	remaining: 1.77s
170:	learn: 14.3361572	total: 371ms	remaining: 1.77s
171:	learn: 14.3005732	total: 373ms	remaining: 1.77s
172:	learn: 14.2864255	total: 376ms	remaining: 1.77s
173:	learn: 14.1879610	total: 378ms	remaining: 1.77s
174:	learn: 14.1730610	total: 381ms	remaining: 1.77s
175:	learn: 14.1440337	total: 383ms	remaining: 1.77s
176:	learn: 14.1201889	total: 385ms	remaining: 1.76s
177:	learn: 14.0488981	total: 387ms	remaining: 1.76s
178:	learn: 14.0340880	total: 389ms	remaining: 1.76s
179:	learn: 13.9754380	total: 391ms	remaining: 1.76s
180:	learn: 13.9081747	total: 394ms	remaining: 1.76s
181:	learn: 13.8099164	total: 396ms	remaining: 1.75s
182:	learn: 13.7415154	total: 398ms	remaining:

372:	learn: 11.1643380	total: 877ms	remaining: 1.45s
373:	learn: 11.1466584	total: 880ms	remaining: 1.45s
374:	learn: 11.0861808	total: 882ms	remaining: 1.44s
375:	learn: 11.0693396	total: 885ms	remaining: 1.44s
376:	learn: 11.0614221	total: 888ms	remaining: 1.44s
377:	learn: 11.0340462	total: 890ms	remaining: 1.44s
378:	learn: 11.0340398	total: 893ms	remaining: 1.44s
379:	learn: 11.0221950	total: 896ms	remaining: 1.44s
380:	learn: 11.0191043	total: 900ms	remaining: 1.44s
381:	learn: 10.9948363	total: 903ms	remaining: 1.44s
382:	learn: 10.9947564	total: 906ms	remaining: 1.43s
383:	learn: 10.9735364	total: 909ms	remaining: 1.43s
384:	learn: 10.9671705	total: 913ms	remaining: 1.43s
385:	learn: 10.9670045	total: 916ms	remaining: 1.43s
386:	learn: 10.9639536	total: 919ms	remaining: 1.43s
387:	learn: 10.9603579	total: 923ms	remaining: 1.43s
388:	learn: 10.9516090	total: 927ms	remaining: 1.43s
389:	learn: 10.9383251	total: 929ms	remaining: 1.43s
390:	learn: 10.9374931	total: 933ms	remaining:

575:	learn: 9.6749906	total: 1.4s	remaining: 1s
576:	learn: 9.6628621	total: 1.4s	remaining: 1s
577:	learn: 9.6434089	total: 1.4s	remaining: 999ms
578:	learn: 9.6429503	total: 1.41s	remaining: 996ms
579:	learn: 9.6402431	total: 1.41s	remaining: 994ms
580:	learn: 9.6387792	total: 1.41s	remaining: 992ms
581:	learn: 9.6378316	total: 1.42s	remaining: 990ms
582:	learn: 9.6202675	total: 1.42s	remaining: 988ms
583:	learn: 9.6202230	total: 1.42s	remaining: 985ms
584:	learn: 9.6098728	total: 1.42s	remaining: 983ms
585:	learn: 9.6098382	total: 1.43s	remaining: 980ms
586:	learn: 9.6042206	total: 1.43s	remaining: 978ms
587:	learn: 9.6021725	total: 1.43s	remaining: 975ms
588:	learn: 9.6019740	total: 1.43s	remaining: 973ms
589:	learn: 9.5940920	total: 1.44s	remaining: 970ms
590:	learn: 9.5902797	total: 1.44s	remaining: 968ms
591:	learn: 9.5716509	total: 1.44s	remaining: 965ms
592:	learn: 9.5687991	total: 1.44s	remaining: 963ms
593:	learn: 9.5687478	total: 1.44s	remaining: 961ms
594:	learn: 9.5628904

789:	learn: 8.2781758	total: 1.92s	remaining: 484ms
790:	learn: 8.2781021	total: 1.92s	remaining: 482ms
791:	learn: 8.2779006	total: 1.93s	remaining: 479ms
792:	learn: 8.2768619	total: 1.93s	remaining: 477ms
793:	learn: 8.2738894	total: 1.93s	remaining: 475ms
794:	learn: 8.2713821	total: 1.93s	remaining: 472ms
795:	learn: 8.2658775	total: 1.94s	remaining: 470ms
796:	learn: 8.2627377	total: 1.94s	remaining: 467ms
797:	learn: 8.2600206	total: 1.94s	remaining: 465ms
798:	learn: 8.2599688	total: 1.94s	remaining: 462ms
799:	learn: 8.2560535	total: 1.95s	remaining: 460ms
800:	learn: 8.2526711	total: 1.95s	remaining: 457ms
801:	learn: 8.2474211	total: 1.95s	remaining: 455ms
802:	learn: 8.2416666	total: 1.95s	remaining: 452ms
803:	learn: 8.2262940	total: 1.96s	remaining: 450ms
804:	learn: 8.2230774	total: 1.96s	remaining: 448ms
805:	learn: 8.2220271	total: 1.96s	remaining: 445ms
806:	learn: 8.2023776	total: 1.96s	remaining: 443ms
807:	learn: 8.1948980	total: 1.96s	remaining: 440ms
808:	learn: 

In [14]:
test = engineering_test(test_raw)

In [15]:
test_mask = submission_raw.code

In [16]:
test_feature = test.columns.difference(['id'], sort = False)

test2 = test[test_feature]


test2_encoded = cat_enc2.transform(test2)

In [17]:
y_pred_test = cat2.predict(test2_encoded)

In [18]:
test3 = test.copy()
test3['cars'] = y_pred_test

In [19]:
sub = test3.groupby('id')[['cars']].sum()
sub.reset_index(inplace = True)


In [20]:
submission = submission_raw.copy()

for i in range(0,len(submission)):
    n = submission.iloc[i,0]
    submission.iloc[i,1] = sub[sub['id'] == n].iloc[0,1]

submission.set_index('code', drop = True, inplace = True)
submission

Unnamed: 0_level_0,num
code,Unnamed: 1_level_1
C1072,620.834062
C1128,1267.581072
C1456,453.569547
C1840,495.531257
C1332,1179.440352
...,...
C2456,212.211770
C1266,295.984463
C2152,51.508282
C1267,298.740503


In [21]:
# submission.to_csv('hades_submission9.csv', encoding = 'utf-8')

### sub 5 
- catboostregressor
- gridsearchcv : learning rate [0.2, 0.3, 0.4] -> 0.2
- earlystopping : 50 -> iteration 679
- train mae : 1.4556999254502654
- val mae : 13.460767025881024
- **result : 110**


### sub 6
- catboostregressor
- gridsearchcv : learning rate [0.05, 0.1, 0.15, 0.2] -> 0.15
- earlystopping : 50 -> iteration 999 (earlystopping 발동 안됨)
- train mae : 1.1758222016501447
- val mae : 12.532559838766556
- **result : 114.5**

**근소하게나마 train / val mae 차이가 줄었는데 제출점수는 mae가 더 올랐다.**  
**=> 약간의 train/val 차이만으로 과적합을 줄였다고 판단할 수 없다**

### sub 7
- catboostregressor : loss_function(mae) 추가
- gridsearchcv : learning rate [0.1, 0.15, 0.2] -> 0.1
- earlystopping : 50 -> iteration 766
- train mae : 6.6701283058541545
- val mae : 15.069254846402911
- **result : 107.9**

### sub 8
- catboostregressor : loss_function(mae)
- gridsearchcv : learning rate [0.05, 0.07, 0.1] -> 0.07
- earlystopping : 50 -> iteration 989
- train mae : 6.767843058796586
- val mae : 13.520402708281429
- **result : **

### sub 9
- catboostregressor : iteration = 989, learningrate = 0.7, loss_function = mae
- sub8의 조건으로 train/val셋 합쳐서 다시 학습
- train_val set mae : 7.45386245065379
- **result : **