In [None]:
# Mount Google Drive at /content/drive; the data, model, and submission
# folders used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sklearn
import lightgbm
import catboost
import ngboost
import sys
import pandas as pd
import numpy as np
import platform


# Print the interpreter, library, and OS versions of this runtime,
# one "<label> : <value>" line per entry.
version_report = {
    "파이썬 버전": sys.version,
    "pandas 버전": pd.__version__,
    "numpy 버전": np.__version__,
    "sklearn 버전": sklearn.__version__,
    "lgbm 버전 확인": lightgbm.__version__,
    "catboost 버전": catboost.__version__,
    "ngboost 버전": ngboost.__version__,
    "OS": platform.platform(),
}
for label, value in version_report.items():
    print(f"{label} : {value}")

파이썬 버전 : 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
pandas 버전 : 1.5.3
numpy 버전 : 1.23.5
sklearn 버전 : 1.2.2
lgbm 버전 확인 : 4.1.0
catboost 버전 : 1.2.2
ngboost 버전 : 0.4.2
OS : Linux-5.15.120+-x86_64-with-glibc2.35


In [None]:
# Show the first lines of /proc/cpuinfo to record the CPU of this runtime.
!head /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2299.998
cache size	: 46080 KB
physical id	: 0


In [None]:
# Record the GPU, driver, and CUDA version reported by nvidia-smi.
!nvidia-smi

Wed Dec 13 11:39:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import pandas as pd
import numpy as np
import datetime
from tqdm.auto import tqdm
import joblib
import os
import torch
import random
import holidays
from sklearn.cluster import KMeans
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from ngboost import NGBRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED, seeds `random`, NumPy, and PyTorch (CPU, the current
    CUDA device, and all CUDA devices), then configures the cuDNN backend
    flags (deterministic on, benchmark off, enabled off).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python-level generators.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN backend flags.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False


# Default seed for the notebook (also used by the KMeans step further down).
SEED = 42
seed_everything(SEED)

In [None]:
def rmsle(y_actual, y_pred):
    """Root mean squared logarithmic error on raw-scale values.

    Args:
        y_actual: Array-like of true values (must support `+ 1`).
        y_pred: Array-like of predicted values (must support `+ 1`).

    Returns:
        sqrt(mean((log(y_pred + 1) - log(y_actual + 1)) ** 2)).
    """
    log_pred, log_actual = np.log(y_pred + 1), np.log(y_actual + 1)
    squared_errors = np.square(log_pred - log_actual)
    return np.sqrt(np.mean(squared_errors))

# scikit-learn scorer built from rmsle; greater_is_better=False declares it a
# loss, i.e. lower values are better.
rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [None]:
def rmsle_log(y_valid, pred):
    """RMSLE for targets and predictions that are given in log1p space.

    Both inputs are mapped back to the raw scale with expm1 before the
    mean squared logarithmic error is computed.

    Args:
        y_valid: True values in log1p space.
        pred: Predicted values in log1p space.

    Returns:
        Square root of sklearn's mean_squared_log_error on the raw scale.
    """
    actual, predicted = np.expm1(y_valid), np.expm1(pred)
    return np.sqrt(mean_squared_log_error(actual, predicted))


# scikit-learn scorer built from rmsle_log; declared as a loss (lower is better).
rmsle_log_score = make_scorer(rmsle_log, greater_is_better=False)

In [None]:
# Google Drive folders for the input data, the submissions, and the saved models.
DATA_PATH = "/content/drive/MyDrive/DACON 대구 교통사고 피해 예측/data/"
SUBMISSION_PATH = "/content/drive/MyDrive/DACON 대구 교통사고 피해 예측/submission/"
MODEL_PATH = "/content/drive/MyDrive/DACON 대구 교통사고 피해 예측/model/"

# Read the competition train/test tables and the countrywide accident table.
train_df, test_df, accident_df = (
    pd.read_csv(f"{DATA_PATH}{file_name}")
    for file_name in ("train.csv", "test.csv", "countrywide_accident.csv")
)

# Cell output: (rows, columns) of each table.
train_df.shape, test_df.shape, accident_df.shape

((39609, 23), (10963, 8), (602775, 23))

#### test set에는 없는 항목 삭제

In [None]:
# Weather: remove the "안개" rows from both training sources.
train_df = train_df.loc[train_df['기상상태'] != '안개'].copy()
accident_df = accident_df.loc[accident_df['기상상태'] != '안개'].copy()

# Countrywide table only: remove rows whose road surface is "해빙" or missing,
# and rows whose accident type is "철길건널목".
road_surface = accident_df['노면상태']
keep_rows = (
    (road_surface != '해빙')
    & road_surface.notna()
    & (accident_df['사고유형'] != '철길건널목')
)
accident_df = accident_df.loc[keep_rows].copy()

In [None]:
# Korean public-holiday calendar used for the holiday flag.
kr_holidays = holidays.KR()

# Regexes: three whitespace-separated address tokens, and "<a> - <b>" road types.
address_pat = r'(\S+) (\S+) (\S+)'
road_pat = r'(.+) - (.+)'


def add_base_features(frame):
    """Add time, location, and road-type columns to `frame` in place.

    - Parses '사고일시' to datetime and derives year, month, day, hour,
      day-of-week, and a 0/1 Korean-holiday flag.
    - Splits '시군구' into '시', '구', '동'.
    - Splits '도로형태' into '도로형태_1', '도로형태_2'.
    """
    # Time-related columns.
    frame['사고일시'] = pd.to_datetime(frame['사고일시'])
    stamp = frame['사고일시'].dt
    for column, attribute in (
        ('연', 'year'),
        ('월', 'month'),
        ('일', 'day'),
        ('시간', 'hour'),
        ('요일', 'day_of_week'),
    ):
        frame[column] = getattr(stamp, attribute)
    frame['공휴일'] = frame['사고일시'].apply(lambda x : int(x in kr_holidays))

    # Location-related columns.
    frame[['시', '구', '동']] = frame['시군구'].str.extract(address_pat)

    # Road-type columns.
    frame[['도로형태_1', '도로형태_2']] = frame['도로형태'].str.extract(road_pat)


for frame in (train_df, accident_df, test_df):
    add_base_features(frame)

# Keep only the listed special/metropolitan cities from the countrywide table.
city = ['서울특별시', '인천광역시', '광주광역시', '부산광역시', '울산광역시', '대전광역시']
mask = accident_df['시'].isin(city)
accident_df = accident_df.loc[mask, :].reset_index(drop=True)

# Column selections: the time/location keys are shared by all three lists.
key_cols = ['연', '월', '시간', '요일', '공휴일', '시', '구', '동']
eclo_cols = ['ID', 'ECLO'] + key_cols + ['사망자수', '중상자수','경상자수', '부상자수']
test_cols = ['ID'] + key_cols + ['기상상태', '도로형태_1', '도로형태_2', '노면상태', '사고유형']
train_cols = ['ID', 'ECLO'] + test_cols[1:]

# Casualty-count tables must be taken before the frames are narrowed below.
train_eclo = train_df[eclo_cols].copy()
accident_eclo = accident_df[eclo_cols].copy()

train_df = train_df[train_cols]
accident_df = accident_df[train_cols]
test_df = test_df[test_cols]

# Stack the Daegu training data on top of the countrywide data.
train_df = pd.concat([train_df, accident_df]).reset_index(drop=True)
train_eclo = pd.concat([train_eclo, accident_eclo]).reset_index(drop=True)

#### 사상자 관련 피쳐 추가

In [None]:
# Total casualties per accident: deaths + serious + minor + reported injuries.
casualty_cols = ['사망자수', '중상자수', '경상자수', '부상자수']
train_eclo['사상자'] = sum(train_eclo[col] for col in casualty_cols)

# Mean casualties for every (hour, day-of-week) pair.
# NOTE(review): computed on the full training table, which includes the rows
# later used as validation folds.
tmp_mean = train_eclo.groupby(['시간', '요일'], as_index=False)['사상자'].mean()

# Attach the mean as the '사상자' feature to both train and test.
merge_kwargs = dict(how='left', on=['시간', '요일'], suffixes=('', '_mean'))
train_df = train_df.merge(tmp_mean, **merge_kwargs)
test_df = test_df.merge(tmp_mean, **merge_kwargs)

#### 동별 군집분석을 통한 피쳐 추가

In [None]:
# Location key: city + district + neighbourhood concatenated.
for frame in (train_df, test_df):
    frame['시_구_동'] = frame['시'] + frame['구'] + frame['동']

# One row per location, one column per hour, values = mean ECLO (0 where absent).
hourly_eclo = train_df.pivot_table(
    values='ECLO', index=train_df['시_구_동'], columns='시간', aggfunc="mean"
).fillna(0)

# Cluster the locations into 8 groups by their hourly mean-ECLO profile.
cluster = KMeans(n_clusters=8, random_state=SEED, n_init='auto')
cluster.fit(hourly_eclo)
hourly_eclo['군집분석_1'] = cluster.predict(hourly_eclo)

# Attach the cluster id to train and test rows via the location key.
cluster_labels = hourly_eclo['군집분석_1']
train_df = train_df.merge(cluster_labels, how='left', on='시_구_동')
test_df = test_df.merge(cluster_labels, how='left', on='시_구_동')

In [None]:
# Print the total number of missing cells per frame, then replace them with 0.
for frame in (train_df, test_df):
    print(frame.isnull().sum().sum())

train_df, test_df = train_df.fillna(0), test_df.fillna(0)

0
0


In [None]:
# Bin the target so it can be used as the stratification label:
#   ECLO < 10        -> ECLO itself
#   10 <= ECLO < 70  -> rounded down to the nearest multiple of 10
#   ECLO >= 70       -> 70
eclo = train_df['ECLO']
train_df['ECLO_cat'] = np.select(
    [eclo >= 70, eclo >= 10, eclo < 10],
    [70, (eclo // 10) * 10, eclo],
    default=0,
)

In [None]:
# Columns treated as categorical even though they hold integers.
categorical_dtypes = dict.fromkeys(['월', '시간', '요일', '군집분석_1'], 'object')

# Feature frames: drop identifiers and target columns, then cast the categoricals.
train_ft = (
    train_df
    .drop(columns=['ID', 'ECLO', '시_구_동', 'ECLO_cat'])
    .astype(categorical_dtypes)
)
test_ft = (
    test_df
    .drop(columns=['ID', '시_구_동'])
    .astype(categorical_dtypes)
)

# Targets: raw ECLO, its log1p transform, and the binned version.
target = train_df['ECLO']
target_log = np.log1p(target).copy()
target_cat = train_df['ECLO_cat']

In [None]:
# Split the feature columns by dtype.
numeric_cols = train_ft.select_dtypes(exclude="object").columns.tolist()
category_cols = train_ft.select_dtypes(include="object").columns.tolist()

# Categoricals with at most 10 distinct values are one-hot encoded (group 1);
# the remaining ones are target-encoded (group 2).
cardinality = train_ft[category_cols].nunique()
mask = cardinality <= 10
category_cols_1 = cardinality[mask].index.tolist()
category_cols_2 = cardinality[~mask].index.tolist()

enc = OneHotEncoder(handle_unknown = 'ignore')


def _with_onehot(frame, encoded):
    """Return `frame` with the group-1 columns replaced by their one-hot columns."""
    onehot = pd.DataFrame(encoded.toarray(), columns=enc.get_feature_names_out())
    return pd.concat([frame, onehot], axis=1).drop(columns=category_cols_1)


# Encoder is fitted on the training data and only applied to the test data.
train_enc = _with_onehot(train_ft, enc.fit_transform(train_ft[category_cols_1]))
test_enc = _with_onehot(test_ft, enc.transform(test_ft[category_cols_1]))

# Target-encode each high-cardinality column against the raw ECLO target.
for col in category_cols_2:
    en = TargetEncoder(cols=[col])
    train_enc[col] = en.fit_transform(train_ft[col], target)
    test_enc[col] = en.transform(test_ft[col])

# Raw-categorical frames (train_ft/test_ft): standardise the numeric columns only.
scaler_ft = StandardScaler()
train_ft[numeric_cols] = scaler_ft.fit_transform(train_ft[numeric_cols])
test_ft[numeric_cols] = scaler_ft.transform(test_ft[numeric_cols])

# Fully encoded frames (train_enc/test_enc): standardise every column.
scaler_enc = StandardScaler()
train_enc[train_enc.columns] = scaler_enc.fit_transform(train_enc)
test_enc[test_enc.columns] = scaler_enc.transform(test_enc)

# One-hot columns excluded from the encoded feature set.
drop_cols = ['도로형태_2_미분류', '도로형태_2_주차장', '노면상태_침수', '군집분석_1_6']
train_enc, test_enc = (
    frame.drop(columns=drop_cols) for frame in (train_enc, test_enc)
)

In [None]:
# Hyper-parameters passed to LGBMRegressor.
lgbm_params = dict(
    n_estimators=729,
    max_depth=30,
    num_leaves=31,
    learning_rate=0.023386169098808,
    colsample_bytree=0.3131206580677619,
    subsample=0.7906188904981659,
    min_child_samples=75,
    reg_alpha=3.2018968473513207,
    reg_lambda=8.379074404773137,
)

# Hyper-parameters passed to CatBoostRegressor.
cat_params = dict(
    n_estimators=2612,
    depth=7,
    learning_rate=0.02607024758370768,
    min_data_in_leaf=3,
    l2_leaf_reg=9.699098521619943,
    one_hot_max_size=10,
)

# Hyper-parameters passed to NGBRegressor; the base learner is a depth-4 tree.
ngb_params = dict(
    n_estimators=570,
    learning_rate=0.040215545266902894,
    Base=DecisionTreeRegressor(criterion='friedman_mse', max_depth=4, random_state=42),
    natural_gradient=True,
    col_sample=0.7290071707509156,
    minibatch_frac=0.7712703489732423,
)

#### LGBM

In [None]:
# Train LightGBM on the encoded features with 10-fold stratified CV, once per
# seed, saving every fold model to <MODEL_PATH>lgbm_model/<seed>/.
SEED_list = [6, 59, 82]

# Folder for the saved models. makedirs(exist_ok=True) keeps the cell
# re-runnable: os.mkdir raised FileExistsError on every run after the first.
lgbm_path = f"{MODEL_PATH}lgbm_model"
os.makedirs(lgbm_path, exist_ok=True)

SEED_lgbm_scores = []  # mean validation RMSLE per seed
for seed in tqdm(SEED_list):

    # One sub-folder per seed.
    seed_lgbm_path = f"{lgbm_path}/{seed}"
    os.makedirs(seed_lgbm_path, exist_ok=True)

    seed_everything(seed)

    # Folds are stratified on the binned target; the model fits the log1p target.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    lgbm_scores = []
    for i, (tri , vai) in tqdm(enumerate(cv.split(train_enc, target_cat)), total=10):
        x_train = train_enc.iloc[tri]
        y_train = target_log.iloc[tri]

        x_valid = train_enc.iloc[vai]
        y_valid = target_log.iloc[vai]

        lgbm_model = LGBMRegressor(**lgbm_params, random_state=seed, objective='rmse', metric='rmse', verbosity=-1, n_jobs=-1)
        lgbm_model.fit(x_train, y_train)

        # Score on the raw scale (rmsle_log undoes the log1p transform).
        pred = lgbm_model.predict(x_valid)
        score = rmsle_log(y_valid,pred)
        lgbm_scores.append(score)

        # Persist the fold model as <seed>_<fold>_lgbm_model.pkl.
        joblib.dump(lgbm_model, f"{seed_lgbm_path}/{seed}_{i}_lgbm_model.pkl")

    SEED_lgbm_scores.append(np.mean(lgbm_scores))

#### Catboost

In [None]:
# Train CatBoost on the raw-categorical features with 10-fold stratified CV,
# once per seed, saving every fold model to <MODEL_PATH>cat_model/<seed>/.
SEED_list = [6, 59, 82]

# Folder for the saved models. makedirs(exist_ok=True) keeps the cell
# re-runnable: os.mkdir raised FileExistsError on every run after the first.
cat_path = f"{MODEL_PATH}cat_model"
os.makedirs(cat_path, exist_ok=True)

SEED_cat_scores = []  # mean validation RMSLE per seed
for seed in tqdm(SEED_list):

    # One sub-folder per seed.
    seed_cat_path = f"{cat_path}/{seed}"
    os.makedirs(seed_cat_path, exist_ok=True)

    seed_everything(seed)

    # Folds are stratified on the binned target; the model fits the log1p target.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    cat_scores = []
    for i, (tri , vai) in tqdm(enumerate(cv.split(train_ft, target_cat)), total=10):
        x_train = train_ft.iloc[tri]
        y_train = target_log.iloc[tri]

        x_valid = train_ft.iloc[vai]
        y_valid = target_log.iloc[vai]

        # CatBoost receives the categorical columns unencoded via cat_features.
        cat_model = CatBoostRegressor(**cat_params, random_state=seed, eval_metric='RMSE', cat_features=category_cols, task_type='GPU', verbose=False)
        cat_model.fit(x_train, y_train)

        # Score on the raw scale (rmsle_log undoes the log1p transform).
        pred = cat_model.predict(x_valid)
        score = rmsle_log(y_valid,pred)
        cat_scores.append(score)

        # Persist the fold model as <seed>_<fold>_cat_model.pkl.
        joblib.dump(cat_model, f"{seed_cat_path}/{seed}_{i}_cat_model.pkl")

    SEED_cat_scores.append(np.mean(cat_scores))

#### NGBoost

In [None]:
# Train NGBoost on the encoded features with 10-fold stratified CV, once per
# seed, saving every fold model to <MODEL_PATH>ngb_model/<seed>/.
SEED_list = [6, 59, 82]

# Folder for the saved models.
ngb_path = f"{MODEL_PATH}ngb_model"

SEED_ngb_scores = []  # mean validation RMSLE per seed
for seed in tqdm(SEED_list):

    # One sub-folder per seed. makedirs also creates ngb_path itself when it is
    # missing (its creation used to be commented out, so os.mkdir failed on a
    # clean model folder) and does not fail when the folder already exists.
    seed_ngb_path = f"{ngb_path}/{seed}"
    os.makedirs(seed_ngb_path, exist_ok=True)

    seed_everything(seed)

    # Folds are stratified on the binned target; split on the same frame that
    # is indexed below (train_enc).
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    ngb_scores = []
    for i, (tri , vai) in tqdm(enumerate(cv.split(train_enc, target_cat)), total=10):
        x_train = train_enc.iloc[tri]
        y_train = target_log.iloc[tri]

        x_valid = train_enc.iloc[vai]
        y_valid = target_log.iloc[vai]

        # Use the loop seed, not the global SEED: with random_state=SEED all
        # three "seed" runs shared the same model randomness.
        ngb_model = NGBRegressor(**ngb_params, random_state=seed, verbose=False, early_stopping_rounds=100)
        ngb_model.fit(x_train, y_train)

        # Score on the raw scale (rmsle_log undoes the log1p transform).
        pred = ngb_model.predict(x_valid)
        score = rmsle_log(y_valid,pred)
        ngb_scores.append(score)

        # Persist the fold model as <seed>_<fold>_ngb_model.pkl.
        joblib.dump(ngb_model, f"{seed_ngb_path}/{seed}_{i}_ngb_model.pkl")

    SEED_ngb_scores.append(np.mean(ngb_scores))

#### 모델별 예측 진행

In [None]:
SEED_list = [6, 59, 82]
N_FOLDS = 10  # must match n_splits used when the models were trained


def predict_seed_ensemble(model_dir, model_tag, features):
    """Predict with every saved fold model and average the folds per seed.

    Models are read from `<model_dir>/<seed>/<seed>_<fold>_<model_tag>_model.pkl`.
    The path is rebuilt from the seed on every iteration; the previous code
    reused the `seed_*_path` variable left over from training, which pointed at
    the last seed's folder only, so the other seeds' files were never found.

    Args:
        model_dir: Folder that contains one sub-folder per seed.
        model_tag: Model name used in the file names ("lgbm", "cat", "ngb").
        features: Test feature frame passed to `model.predict`.

    Returns:
        List with one array per seed: the fold-averaged predictions on the raw
        scale (expm1 of the model output).
    """
    seed_preds = []
    for seed in tqdm(SEED_list):
        seed_everything(seed)
        fold_preds = []
        for fold in range(N_FOLDS):
            # joblib.load unpickles the file: only load models written by this notebook.
            model = joblib.load(f"{model_dir}/{seed}/{seed}_{fold}_{model_tag}_model.pkl")
            # Models were trained on log1p(ECLO); map predictions back to ECLO.
            fold_preds.append(np.expm1(model.predict(features)))
        seed_preds.append(np.mean(fold_preds, axis=0))
    return seed_preds


# lgbm and ngb were trained on the encoded features, cat on the raw categoricals.
lgbm_seed_pred = predict_seed_ensemble(f"{MODEL_PATH}lgbm_model", "lgbm", test_enc)
cat_seed_pred = predict_seed_ensemble(f"{MODEL_PATH}cat_model", "cat", test_ft)
ngb_seed_pred = predict_seed_ensemble(f"{MODEL_PATH}ngb_model", "ngb", test_enc)

In [None]:
submission = pd.read_csv(f'{SUBMISSION_PATH}sample_submission.csv')

# Average the per-seed predictions of each model into its own column.
per_model_preds = {
    'lgbm_ECLO': lgbm_seed_pred,
    'cat_ECLO': cat_seed_pred,
    'ngb_ECLO': ngb_seed_pred,
}
for column, seed_preds in per_model_preds.items():
    submission[column] = np.mean(seed_preds, axis=0)

# Weighted blend: 0.4 CatBoost + 0.3 LightGBM + 0.3 NGBoost.
blend_weights = {'cat_ECLO': 0.4, 'lgbm_ECLO': 0.3, 'ngb_ECLO': 0.3}
submission['ECLO'] = sum(weight * submission[column] for column, weight in blend_weights.items())

# Keep only the first two columns of the file
# (presumably ID and ECLO from the sample file; verify against sample_submission.csv).
submission = submission.iloc[:, :2]

submission.to_csv(f"{SUBMISSION_PATH}seed_ensemble.csv", index=False)