# Import

In [1]:
import os
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import glob
from tqdm import tqdm

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag

import warnings
warnings.filterwarnings("ignore")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) 

# Data Load

In [2]:
train_org = pd.read_csv('../open/train.csv').drop(columns='ID')
test_org = pd.read_csv('../open/test.csv').drop(columns='ID')

In [3]:
display(train_org)
print(train_org.shape)

Unnamed: 0,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO
0,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,...,상해없음,보행자,여,70세,중상,0,1,0,0,5
1,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,승용,...,상해없음,보행자,남,61세,경상,0,0,1,0,3
2,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,승용,...,상해없음,보행자,남,38세,경상,0,0,1,0,3
3,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,승용,...,상해없음,승용,남,36세,중상,0,1,0,0,5
4,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,승용,...,상해없음,승용,남,52세,경상,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,2021-12-31 19,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,승용,...,상해없음,이륜,남,28세,경상,0,0,1,0,3
39605,2021-12-31 19,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,승용,...,상해없음,승용,남,52세,경상,0,0,1,0,3
39606,2021-12-31 21,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,승용,...,중상,승용,남,73세,중상,0,2,0,0,10
39607,2021-12-31 22,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,승용,...,상해없음,승용,여,57세,경상,0,0,1,0,3


(39609, 22)


In [4]:
display(test_org)
print(test_org.shape)

Unnamed: 0,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형
0,2022-01-01 01,토요일,맑음,대구광역시 수성구 상동,교차로 - 교차로안,건조,차대사람
1,2022-01-01 01,토요일,맑음,대구광역시 수성구 지산동,단일로 - 기타,건조,차대사람
2,2022-01-01 04,토요일,맑음,대구광역시 수성구 수성동2가,교차로 - 교차로안,건조,차대차
3,2022-01-01 04,토요일,맑음,대구광역시 수성구 신매동,단일로 - 기타,건조,차대차
4,2022-01-01 06,토요일,맑음,대구광역시 달서구 감삼동,교차로 - 교차로안,건조,차대차
...,...,...,...,...,...,...,...
10958,2022-12-31 18,토요일,맑음,대구광역시 남구 대명동,단일로 - 터널,건조,차대차
10959,2022-12-31 18,토요일,맑음,대구광역시 수성구 시지동,단일로 - 기타,건조,차대차
10960,2022-12-31 20,토요일,맑음,대구광역시 수성구 연호동,단일로 - 기타,건조,차대차
10961,2022-12-31 20,토요일,맑음,대구광역시 수성구 범물동,교차로 - 교차로부근,건조,차대차


(10963, 7)


# Data merge

In [5]:
light_df = pd.read_csv('../open/external_open/대구 보안등 정보.csv', encoding='cp949')[['설치개수', '소재지지번주소']]

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])

light_df = light_df.groupby(['도시', '구', '동']).sum().reset_index()
light_df.reset_index(inplace=True, drop=True)

In [6]:
light_df

Unnamed: 0,도시,구,동,설치개수
0,대구광역시,남구,대명동,5377
1,대구광역시,남구,봉덕동,1424
2,대구광역시,남구,이천동,556
3,대구광역시,달서구,갈산동,349
4,대구광역시,달서구,감삼동,932
...,...,...,...,...
223,대구광역시,중구,태평로2가,38
224,대구광역시,중구,태평로3가,47
225,대구광역시,중구,포정동,18
226,대구광역시,중구,향촌동,28


In [7]:
child_area_df = pd.read_csv('../open/external_open/대구 어린이 보호 구역 정보.csv', encoding='cp949').drop_duplicates()[['소재지지번주소']]
child_area_df['cnt'] = 1

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

child_area_df = child_area_df.groupby(['도시', '구', '동']).sum().reset_index()
child_area_df.reset_index(inplace=True, drop=True)

In [8]:
child_area_df

Unnamed: 0,도시,구,동,cnt
0,대구광역시,남구,대명동,26
1,대구광역시,남구,봉덕동,8
2,대구광역시,남구,이천동,6
3,대구광역시,달성군,가창면,4
4,대구광역시,달성군,구지면,3
...,...,...,...,...
66,대구광역시,중구,봉산동,2
67,대구광역시,중구,삼덕동2가,1
68,대구광역시,중구,삼덕동3가,1
69,대구광역시,중구,서문로1가,1


In [9]:
parking_df = pd.read_csv('../open/external_open/대구 주차장 정보.csv', encoding='cp949')[['소재지지번주소', '급지구분']]
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

parking_df = parking_df.groupby(['도시', '구', '동']).sum().reset_index()
parking_df.reset_index(inplace=True, drop=True)

In [10]:
parking_df

Unnamed: 0,도시,구,동,급지구분_1,급지구분_2,급지구분_3
0,대구광역시,남구,대명동,20,1,0
1,대구광역시,남구,봉덕동,9,3,0
2,대구광역시,남구,이천동,3,0,0
3,대구광역시,달서구,갈산동,0,0,4
4,대구광역시,달서구,감삼동,0,1,3
...,...,...,...,...,...,...
131,대구광역시,중구,태평로2가,5,0,0
132,대구광역시,중구,태평로3가,1,0,0
133,대구광역시,중구,포정동,4,0,0
134,대구광역시,중구,향촌동,2,0,0


In [11]:
train_df = train_org.copy()
test_df = test_org.copy()

# Data Preprocessing

In [12]:
time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})' 

train_df[['연', '월', '일', '시간']] = train_org['사고일시'].str.extract(time_pattern)
train_df[['연', '월', '일', '시간']] = train_df[['연', '월', '일', '시간']].apply(pd.to_numeric) # 추출된 문자열을 수치화해줍니다 
train_df = train_df.drop(columns=['사고일시']) # 정보 추출이 완료된 '사고일시' 컬럼은 제거합니다 

# 해당 과정을 test_x에 대해서도 반복해줍니다 
test_df[['연', '월', '일', '시간']] = test_org['사고일시'].str.extract(time_pattern)
test_df[['연', '월', '일', '시간']] = test_df[['연', '월', '일', '시간']].apply(pd.to_numeric)
test_df = test_df.drop(columns=['사고일시'])

In [13]:
location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['도시', '구', '동']] = train_org['시군구'].str.extract(location_pattern)
train_df = train_df.drop(columns=['시군구'])

test_df[['도시', '구', '동']] = test_org['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

In [14]:
road_pattern = r'(.+) - (.+)'

train_df[['도로형태1', '도로형태2']] = train_org['도로형태'].str.extract(road_pattern)
train_df = train_df.drop(columns=['도로형태'])

test_df[['도로형태1', '도로형태2']] = test_org['도로형태'].str.extract(road_pattern)
test_df = test_df.drop(columns=['도로형태'])

In [15]:
# train_df와 test_df에, light_df와 child_area_df, parking_df를 merge하세요.
train_df = pd.merge(train_df, light_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, child_area_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, parking_df, how='left', on=['도시', '구', '동'])

test_df = pd.merge(test_df, light_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, child_area_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, parking_df, how='left', on=['도시', '구', '동'])

In [16]:
display(train_df)
print(train_df.shape)

Unnamed: 0,요일,기상상태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,가해운전자 연령,가해운전자 상해정도,...,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3
0,화요일,맑음,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,여,51세,상해없음,...,대구광역시,중구,대신동,단일로,기타,391.0,2.0,11.0,0.0,0.0
1,화요일,흐림,건조,차대사람,보도통행중,기타,승용,남,39세,상해없음,...,대구광역시,달서구,감삼동,단일로,기타,932.0,,0.0,1.0,3.0
2,화요일,맑음,건조,차대사람,차도통행중,안전운전불이행,승용,남,70세,상해없음,...,대구광역시,수성구,두산동,단일로,기타,473.0,5.0,,,
3,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,49세,상해없음,...,대구광역시,북구,복현동,단일로,기타,534.0,11.0,0.0,9.0,5.0
4,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,30세,상해없음,...,대구광역시,동구,신암동,단일로,기타,2057.0,,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,금요일,맑음,건조,차대차,측면충돌,신호위반,승용,여,52세,상해없음,...,대구광역시,수성구,수성동3가,교차로,교차로안,,1.0,,,
39605,금요일,맑음,건조,차대차,측면충돌,안전거리미확보,승용,여,60세,상해없음,...,대구광역시,달서구,상인동,단일로,기타,843.0,,0.0,0.0,5.0
39606,금요일,맑음,건조,차대차,측면충돌,교차로운행방법위반,승용,남,60세,중상,...,대구광역시,달서구,월성동,교차로,교차로안,164.0,,0.0,1.0,0.0
39607,금요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,40세,상해없음,...,대구광역시,달서구,장동,기타,기타,210.0,,0.0,0.0,1.0


(39609, 33)


In [17]:
display(test_df)
print(test_df.shape)

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3
0,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,상동,교차로,교차로안,700.0,5.0,,,
1,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,지산동,단일로,기타,,10.0,0.0,0.0,2.0
2,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,수성동2가,교차로,교차로안,,1.0,,,
3,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,신매동,단일로,기타,,7.0,0.0,2.0,1.0
4,토요일,맑음,건조,차대차,2022,1,1,6,대구광역시,달서구,감삼동,교차로,교차로안,932.0,,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10958,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,남구,대명동,단일로,터널,5377.0,26.0,20.0,1.0,0.0
10959,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,수성구,시지동,단일로,기타,,5.0,,,
10960,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,연호동,단일로,기타,,,,,
10961,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,범물동,교차로,교차로부근,,7.0,,,


(10963, 18)


In [18]:
test_x = test_df.copy()
train_x = train_df[test_x.columns].copy()
train_y = train_df['ECLO'].copy()

In [19]:
display(train_x)
print(train_x.shape)

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3
0,화요일,맑음,건조,차대사람,2019,1,1,0,대구광역시,중구,대신동,단일로,기타,391.0,2.0,11.0,0.0,0.0
1,화요일,흐림,건조,차대사람,2019,1,1,0,대구광역시,달서구,감삼동,단일로,기타,932.0,,0.0,1.0,3.0
2,화요일,맑음,건조,차대사람,2019,1,1,1,대구광역시,수성구,두산동,단일로,기타,473.0,5.0,,,
3,화요일,맑음,건조,차대차,2019,1,1,2,대구광역시,북구,복현동,단일로,기타,534.0,11.0,0.0,9.0,5.0
4,화요일,맑음,건조,차대차,2019,1,1,4,대구광역시,동구,신암동,단일로,기타,2057.0,,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,금요일,맑음,건조,차대차,2021,12,31,19,대구광역시,수성구,수성동3가,교차로,교차로안,,1.0,,,
39605,금요일,맑음,건조,차대차,2021,12,31,19,대구광역시,달서구,상인동,단일로,기타,843.0,,0.0,0.0,5.0
39606,금요일,맑음,건조,차대차,2021,12,31,21,대구광역시,달서구,월성동,교차로,교차로안,164.0,,0.0,1.0,0.0
39607,금요일,맑음,건조,차대차,2021,12,31,22,대구광역시,달서구,장동,기타,기타,210.0,,0.0,0.0,1.0


(39609, 18)


In [20]:
display(test_x)
print(test_x.shape)

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3
0,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,상동,교차로,교차로안,700.0,5.0,,,
1,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,지산동,단일로,기타,,10.0,0.0,0.0,2.0
2,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,수성동2가,교차로,교차로안,,1.0,,,
3,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,신매동,단일로,기타,,7.0,0.0,2.0,1.0
4,토요일,맑음,건조,차대차,2022,1,1,6,대구광역시,달서구,감삼동,교차로,교차로안,932.0,,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10958,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,남구,대명동,단일로,터널,5377.0,26.0,20.0,1.0,0.0
10959,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,수성구,시지동,단일로,기타,,5.0,,,
10960,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,연호동,단일로,기타,,,,,
10961,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,범물동,교차로,교차로부근,,7.0,,,


(10963, 18)


In [21]:
categorical_features = list(train_x.dtypes[train_x.dtypes == "object"].index)

In [22]:
display(categorical_features)

['요일', '기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']

In [23]:
for feature in tqdm(categorical_features, desc="Encoding features"):
    encoder = LabelEncoder()
    train_x[feature] = encoder.fit_transform(train_x[feature])
    for label in np.unique(test_x[feature]):
        if label not in encoder.classes_:
            encoder.classes_ = np.append(encoder.classes_, label)
    test_x[feature] = encoder.transform(test_x[feature])

Encoding features: 100%|█████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 76.52it/s]


In [24]:
train_x

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3
0,6,2,0,0,2019,1,1,0,0,7,40,2,5,391.0,2.0,11.0,0.0,0.0
1,6,5,0,0,2019,1,1,0,0,1,4,2,5,932.0,,0.0,1.0,3.0
2,6,2,0,0,2019,1,1,1,0,6,66,2,5,473.0,5.0,,,
3,6,2,0,1,2019,1,1,2,0,4,79,2,5,534.0,11.0,0.0,9.0,5.0
4,6,2,0,1,2019,1,1,4,0,3,129,2,5,2057.0,,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,2,0,1,2021,12,31,19,0,6,118,0,3,,1.0,,,
39605,0,2,0,1,2021,12,31,19,0,1,103,2,5,843.0,,0.0,0.0,5.0
39606,0,2,0,1,2021,12,31,21,0,1,144,0,3,164.0,,0.0,1.0,0.0
39607,0,2,0,1,2021,12,31,22,0,1,158,1,5,210.0,,0.0,0.0,1.0


In [25]:
test_x

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3
0,5,2,0,0,2022,1,1,1,0,6,99,0,3,700.0,5.0,,,
1,5,2,0,0,2022,1,1,1,0,6,168,2,5,,10.0,0.0,0.0,2.0
2,5,2,0,1,2022,1,1,4,0,6,117,0,3,,1.0,,,
3,5,2,0,1,2022,1,1,4,0,6,126,2,5,,7.0,0.0,2.0,1.0
4,5,2,0,1,2022,1,1,6,0,1,4,0,3,932.0,,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10958,5,2,0,1,2022,12,31,18,0,0,38,2,9,5377.0,26.0,20.0,1.0,0.0
10959,5,2,0,1,2022,12,31,18,0,6,123,2,5,,5.0,,,
10960,5,2,0,1,2022,12,31,20,0,6,134,2,5,,,,,
10961,5,2,0,1,2022,12,31,20,0,6,77,0,2,,7.0,,,


In [26]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)

In [27]:
train_x['ECLO'] = train_y

In [28]:
display(train_x)
print(train_x.shape)

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3,ECLO
0,6,2,0,0,2019,1,1,0,0,7,40,2,5,391.0,2.0,11.0,0.0,0.0,5
1,6,5,0,0,2019,1,1,0,0,1,4,2,5,932.0,0.0,0.0,1.0,3.0,3
2,6,2,0,0,2019,1,1,1,0,6,66,2,5,473.0,5.0,0.0,0.0,0.0,3
3,6,2,0,1,2019,1,1,2,0,4,79,2,5,534.0,11.0,0.0,9.0,5.0,5
4,6,2,0,1,2019,1,1,4,0,3,129,2,5,2057.0,0.0,0.0,1.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,2,0,1,2021,12,31,19,0,6,118,0,3,0.0,1.0,0.0,0.0,0.0,3
39605,0,2,0,1,2021,12,31,19,0,1,103,2,5,843.0,0.0,0.0,0.0,5.0,3
39606,0,2,0,1,2021,12,31,21,0,1,144,0,3,164.0,0.0,0.0,1.0,0.0,10
39607,0,2,0,1,2021,12,31,22,0,1,158,1,5,210.0,0.0,0.0,0.0,1.0,3


(39609, 19)


In [29]:
display(test_x)
print(test_x.shape)

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3
0,5,2,0,0,2022,1,1,1,0,6,99,0,3,700.0,5.0,0.0,0.0,0.0
1,5,2,0,0,2022,1,1,1,0,6,168,2,5,0.0,10.0,0.0,0.0,2.0
2,5,2,0,1,2022,1,1,4,0,6,117,0,3,0.0,1.0,0.0,0.0,0.0
3,5,2,0,1,2022,1,1,4,0,6,126,2,5,0.0,7.0,0.0,2.0,1.0
4,5,2,0,1,2022,1,1,6,0,1,4,0,3,932.0,0.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10958,5,2,0,1,2022,12,31,18,0,0,38,2,9,5377.0,26.0,20.0,1.0,0.0
10959,5,2,0,1,2022,12,31,18,0,6,123,2,5,0.0,5.0,0.0,0.0,0.0
10960,5,2,0,1,2022,12,31,20,0,6,134,2,5,0.0,0.0,0.0,0.0,0.0
10961,5,2,0,1,2022,12,31,20,0,6,77,0,2,0.0,7.0,0.0,0.0,0.0


(10963, 18)


# Autogluon

In [30]:
train_selection_feature = ['사고유형','시간','도로형태2','요일','연','노면상태','급지구분_2','도로형태1','기상상태','급지구분_3','ECLO']
test_selection_feature = ['사고유형','시간','도로형태2','요일','연','노면상태','급지구분_2','도로형태1','기상상태','급지구분_3']

In [31]:
train_x = train_x[train_selection_feature]
test_x = test_x[test_selection_feature]

In [32]:
train_data = TabularDataset(train_x)
test_data = TabularDataset(test_x)

In [33]:
label = 'ECLO'
eval_metric = 'root_mean_squared_error'

In [34]:
predictor = TabularPredictor(
    label=label, problem_type='regression', eval_metric=eval_metric
).fit(train_data, 
      presets='best_quality', 
      num_stack_levels=3,
      #excluded_model_types = excluded_model_types,
      num_gpus=1)

No path specified. Models will be saved in: "AutogluonModels\ag-20231123_101040\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231123_101040\"
AutoGluon Version:  0.8.2
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   235.04 GB / 999.46 GB (23.5%)
Train Data Rows:    39609
Train Data Columns: 10
Label Column: ECLO
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    54853.46 MB
	Train Data (Original)  Memory Usage: 2.22 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...


In [35]:
predictor.leaderboard(silent = True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L5,-3.102442,14.660993,2437.236627,0.000997,0.546978,5,True,42
1,WeightedEnsemble_L4,-3.104203,10.740999,1823.791136,0.000976,0.518235,4,True,32
2,CatBoost_BAG_L4,-3.104513,12.445552,1937.416762,0.022497,50.991641,4,True,36
3,NeuralNetFastAI_BAG_L4,-3.106147,12.895363,2045.537423,0.472308,159.112303,4,True,38
4,LightGBM_BAG_L4,-3.106574,12.507573,1892.532447,0.084518,6.107326,4,True,34
5,NeuralNetFastAI_BAG_L3,-3.107336,8.892703,1401.923204,0.454069,151.084058,3,True,28
6,LightGBMXT_BAG_L4,-3.107465,12.532079,1893.162561,0.109025,6.737441,4,True,33
7,LightGBMXT_BAG_L3,-3.109172,8.559201,1257.976105,0.120567,7.136959,3,True,23
8,CatBoost_BAG_L3,-3.109992,8.459634,1311.317507,0.021,60.478362,3,True,26
9,LightGBM_BAG_L3,-3.112753,8.509975,1256.140386,0.071341,5.30124,3,True,24


In [36]:
predictor.feature_importance(train_data) 

Computing feature importance via permutation shuffling for 10 features using 5000 rows with 5 shuffle sets...
	692.21s	= Expected runtime (138.44s per shuffle set)
	199.87s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
사고유형,0.071682,0.009879,4.2e-05,5,0.092023,0.051341
시간,0.068934,0.005436,5e-06,5,0.080126,0.057742
요일,0.067169,0.008762,3.4e-05,5,0.085209,0.049129
도로형태2,0.053202,0.003258,2e-06,5,0.059909,0.046494
도로형태1,0.046021,0.00433,9e-06,5,0.054936,0.037105
급지구분_2,0.042934,0.004475,1.4e-05,5,0.052148,0.033721
연,0.041678,0.008289,0.000178,5,0.058745,0.024611
급지구분_3,0.037551,0.005437,5.1e-05,5,0.048746,0.026355
노면상태,0.006904,0.004612,0.014321,5,0.016399,-0.002592
기상상태,0.004929,0.001623,0.001227,5,0.00827,0.001587


In [37]:
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(test_x, model=model_to_use)

In [38]:
sample_submission = pd.read_csv('../open/sample_submission.csv')
baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = model_pred.astype(int)
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,3
1,ACCIDENT_39610,3
2,ACCIDENT_39611,5
3,ACCIDENT_39612,4
4,ACCIDENT_39613,4
...,...,...
10958,ACCIDENT_50567,6
10959,ACCIDENT_50568,5
10960,ACCIDENT_50569,5
10961,ACCIDENT_50570,5


In [39]:
baseline_submission.to_csv('../Sub/autogluon_new_data_feature_selection.csv', index=False)

In [41]:
baseline_submission[baseline_submission['ECLO']<0]

Unnamed: 0,ID,ECLO


In [42]:
baseline_submission['ECLO'].unique()

array([3, 5, 4, 7, 6, 2, 9, 8])

In [43]:
train_x['ECLO'].unique()

array([ 5,  3,  6, 18,  9,  8, 15,  2,  1,  7,  4, 12, 13, 10, 11, 16, 40,
       22, 21, 17, 25, 14, 20, 27, 19, 24, 26, 23, 28, 31, 30, 47, 36, 33,
       34, 35, 37, 65, 56, 32, 57, 45, 52, 74, 39, 66], dtype=int64)