In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler,RobustScaler
from category_encoders.target_encoder import TargetEncoder

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error,mean_squared_log_error,make_scorer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.samplers import TPESampler


import os
from xgboost import XGBRegressor 
import xgboost as xgb 
from tqdm import tqdm
from sklearn.model_selection import KFold


In [2]:
train = pd.read_csv("train2.csv").drop(['사망자수','중상자수','경상자수','부상자수'],axis=1)
test = pd.read_csv("test(통합본).csv")

In [3]:
def gu(address):
    return address.split()[0]
def dong(address):
    return address.split()[1]
def road_1(x):
    return x.split(' - ')[0]
def road_2(x):
    return x.split(' - ')[1]

In [4]:
for df in [train,test]:
    df['구'] = df['주소'].apply(gu)
    df['동'] = df['주소'].apply(dong)
    df.drop('주소',axis=1,inplace=True)
    df['도로구분1'] = df['도로형태'].apply(road_1)
    df['도로구분2'] = df['도로형태'].apply(road_2)
    df.drop('도로형태',axis=1,inplace=True)

In [5]:
train = train[['요일', '기상상태', '도로구분1','도로구분2', '노면상태', '사고유형', '구', '동', '연휴', '년도', '월', '일', '시간',
       '설치장소', '설치개수', 'CCTV설치대수', '급지구분_1', '급지구분_2', '급지구분_3', 'ECLO']]
test = test[['요일', '기상상태', '도로구분1','도로구분2', '노면상태', '사고유형', '구', '동', '연휴', '년도', '월', '일', '시간',
       '설치장소', '설치개수', 'CCTV설치대수', '급지구분_1', '급지구분_2', '급지구분_3']]

In [6]:
categorical_features = list(train.dtypes[train.dtypes == "object"].index)
display(categorical_features)

for i in categorical_features:
    le = LabelEncoder()
    le=le.fit(train[i]) 
    train[i]=le.transform(train[i])
    
    for case in np.unique(test[i]):
        if case not in le.classes_: 
            le.classes_ = np.append(le.classes_, case) 
    test[i]=le.transform(test[i])

['요일', '기상상태', '도로구분1', '도로구분2', '노면상태', '사고유형', '구', '동']

In [7]:
train_x = train.drop('ECLO',axis=1)
train_y = np.log(train['ECLO'])
test_x = test

In [8]:
rs = RobustScaler()
X_standard = rs.fit_transform(train_x)
test_standard = rs.transform(test_x.iloc[:,:])

In [9]:
X_train, X_valid , y_train , y_valid  = train_test_split(X_standard,train_y, shuffle=True,test_size=0.2)

In [10]:
def rmsle(actual_values,predicted_values):
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)

    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)

    difference = log_predict - log_actual
    difference = np.square(difference)
    mean_difference = difference.mean()
    score = np.sqrt(mean_difference)

    return score

def objective(trial, train_x, train_y, val_x, val_y):
    
    param = {
        'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'learning_rate': trial.suggest_float('learning_rate',0.0001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 4,20), # 8
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 50),
    }
    model =XGBRegressor(**param)  
    model.fit(train_x, train_y)
    preds = model.predict(val_x)
    score = rmsle(val_y, preds)
    
    return score

In [11]:
study =  optuna.create_study(study_name='Xgb', direction='minimize',sampler=TPESampler(seed=42) )
study.optimize(lambda trial: objective(trial,X_train,  y_train ,X_valid , y_valid ),n_trials=100, timeout = 1000 )
print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)

[I 2023-12-09 16:15:53,082] A new study created in memory with name: Xgb
[I 2023-12-09 16:15:56,017] Trial 0 finished with value: 0.27585284350887745 and parameters: {'lambda': 0.03807947176588889, 'alpha': 0.9507635921035062, 'colsample_bytree': 0.839196365086843, 'subsample': 0.759195090518222, 'learning_rate': 0.01568626218019941, 'n_estimators': 240, 'max_depth': 4, 'min_child_weight': 44}. Best is trial 0 with value: 0.27585284350887745.
[I 2023-12-09 16:16:00,240] Trial 1 finished with value: 0.28018932072527164 and parameters: {'lambda': 0.06051038616257767, 'alpha': 0.7083645052182495, 'colsample_bytree': 0.41235069657748147, 'subsample': 0.9819459112971965, 'learning_rate': 0.08326101981596214, 'n_estimators': 291, 'max_depth': 7, 'min_child_weight': 10}. Best is trial 0 with value: 0.27585284350887745.
[I 2023-12-09 16:16:06,296] Trial 2 finished with value: 0.28029673265264454 and parameters: {'lambda': 0.03111998205299424, 'alpha': 0.5252316752006057, 'colsample_bytree': 0.

[I 2023-12-09 16:21:40,514] Trial 22 finished with value: 0.2893232193247551 and parameters: {'lambda': 0.0467139907822923, 'alpha': 0.8732692860933504, 'colsample_bytree': 0.6678392576921028, 'subsample': 0.6885061433309765, 'learning_rate': 0.0022969779538975534, 'n_estimators': 504, 'max_depth': 6, 'min_child_weight': 42}. Best is trial 0 with value: 0.27585284350887745.
[I 2023-12-09 16:22:01,022] Trial 23 finished with value: 0.2855615970429135 and parameters: {'lambda': 0.040530686876425644, 'alpha': 0.991549382380117, 'colsample_bytree': 0.5617385677616494, 'subsample': 0.7592124760561969, 'learning_rate': 0.044732695568703226, 'n_estimators': 758, 'max_depth': 9, 'min_child_weight': 35}. Best is trial 0 with value: 0.27585284350887745.
[I 2023-12-09 16:22:04,613] Trial 24 finished with value: 0.2768561273972392 and parameters: {'lambda': 0.02257305227451984, 'alpha': 0.7776198450984917, 'colsample_bytree': 0.732843028879555, 'subsample': 0.8139566329840936, 'learning_rate': 0.0

[I 2023-12-09 16:23:46,557] Trial 44 finished with value: 0.27995650206738254 and parameters: {'lambda': 0.010626102166477178, 'alpha': 0.5466230449660341, 'colsample_bytree': 0.9524527922054138, 'subsample': 0.8540256896157712, 'learning_rate': 0.028117097563580752, 'n_estimators': 285, 'max_depth': 9, 'min_child_weight': 4}. Best is trial 27 with value: 0.27503797579693756.
[I 2023-12-09 16:23:53,018] Trial 45 finished with value: 0.27671623062692197 and parameters: {'lambda': 0.017012955146556635, 'alpha': 0.8336430359026353, 'colsample_bytree': 0.8843627063595672, 'subsample': 0.8996570345553732, 'learning_rate': 0.02116830517422659, 'n_estimators': 154, 'max_depth': 11, 'min_child_weight': 8}. Best is trial 27 with value: 0.27503797579693756.
[I 2023-12-09 16:23:56,248] Trial 46 finished with value: 0.27713720140424014 and parameters: {'lambda': 0.026163867753485157, 'alpha': 0.6639542895544751, 'colsample_bytree': 0.8350333193447442, 'subsample': 0.9724935232358467, 'learning_rat

[I 2023-12-09 16:26:23,402] Trial 66 finished with value: 0.27625571282446915 and parameters: {'lambda': 0.010207350892826438, 'alpha': 0.9015257560885556, 'colsample_bytree': 0.6703558503105629, 'subsample': 0.9417526489932739, 'learning_rate': 0.009807879379052416, 'n_estimators': 480, 'max_depth': 5, 'min_child_weight': 29}. Best is trial 58 with value: 0.27497521425158783.
[I 2023-12-09 16:26:27,848] Trial 67 finished with value: 0.2752715300123911 and parameters: {'lambda': 0.02055233680395401, 'alpha': 0.798612221837634, 'colsample_bytree': 0.7157215697856235, 'subsample': 0.9238877351725046, 'learning_rate': 0.007994968281537205, 'n_estimators': 392, 'max_depth': 4, 'min_child_weight': 34}. Best is trial 58 with value: 0.27497521425158783.
[I 2023-12-09 16:26:34,152] Trial 68 finished with value: 0.27502711139522285 and parameters: {'lambda': 0.027739302180147883, 'alpha': 0.7926830429571868, 'colsample_bytree': 0.7238547412870688, 'subsample': 0.920909647781203, 'learning_rate'

[I 2023-12-09 16:29:37,875] Trial 88 finished with value: 0.2747944916686184 and parameters: {'lambda': 0.018996125692128704, 'alpha': 0.9955756403984131, 'colsample_bytree': 0.8198962477380932, 'subsample': 0.9646293166177486, 'learning_rate': 0.0036701912549986218, 'n_estimators': 717, 'max_depth': 5, 'min_child_weight': 33}. Best is trial 88 with value: 0.2747944916686184.
[I 2023-12-09 16:29:47,976] Trial 89 finished with value: 0.27480578198646655 and parameters: {'lambda': 0.01882777008810925, 'alpha': 0.899813117107597, 'colsample_bytree': 0.7009185772037204, 'subsample': 0.96197040296493, 'learning_rate': 0.0030605975955611325, 'n_estimators': 734, 'max_depth': 5, 'min_child_weight': 33}. Best is trial 88 with value: 0.2747944916686184.
[I 2023-12-09 16:29:56,324] Trial 90 finished with value: 0.46697411865191696 and parameters: {'lambda': 0.013574571364662228, 'alpha': 0.9992021564683484, 'colsample_bytree': 0.699043379793506, 'subsample': 0.9669233718588048, 'learning_rate': 

Best trial: {'lambda': 0.02038364412442828, 'alpha': 0.9518984718482559, 'colsample_bytree': 0.8244013234312235, 'subsample': 0.9422387795962343, 'learning_rate': 0.002770450246681311, 'n_estimators': 883, 'max_depth': 5, 'min_child_weight': 30}
Best score: 0.274715148932495


In [12]:
model_xgb = XGBRegressor(**study.best_trial.params)
kf = KFold(n_splits=5, shuffle=True , random_state=42)

ensemble_predicts= []
scores =[]


for train_idx, val_idx in tqdm(kf.split(X_standard), total=5, desc="Processing folds"):
    X_t, X_val = X_standard[train_idx], X_standard[val_idx]
    y_t, y_val = train_y[train_idx], train_y[val_idx]
    
    # 두 모델 모두 학습
    model_xgb.fit(X_t, y_t)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    val_pred = model_xgb.predict(X_val)
    
    # Validation set에 대한 대회 평가 산식 계산 후 저장
    scores.append(rmsle(y_val, val_pred))
    
    # test 데이터셋에 대한 예측 수행 후 저장
    model_xgb_pred = np.exp(model_xgb.predict(test_standard))
    model_xgb_pred = np.where(model_xgb_pred < 0, 0, model_xgb_pred)
    
    ensemble_predicts.append(model_xgb_pred)

# K-fold 모든 예측의 평균을 계산하여 fold별 모델들의 앙상블 예측 생성
final_predictions = np.mean(ensemble_predicts, axis=0)

# 각 fold에서의 Validation Metric Score와 전체 평균 Validation Metric Score출력
print("Validation : RMSLE scores for each fold:", scores)
print("Validation : RMSLE:", np.mean(scores))


Processing folds: 100%|██████████████████████████████████████████████████████████████████| 5/5 [01:00<00:00, 12.18s/it]

Validation : RMSLE scores for each fold: [0.27649451142710235, 0.2807618284962091, 0.2751382161529211, 0.2648698341279104, 0.27359812753510987]
Validation : RMSLE: 0.27417250354785055





In [15]:
final_predictions

array([3.3672576, 3.2248642, 4.5092278, ..., 4.005776 , 3.98694  ,
       4.02114  ], dtype=float32)

In [16]:
submission = pd.read_csv('C:/Users/User/Desktop/대구교통사고예측/sample_submission.csv')
submission['ECLO'] = final_predictions
submission.to_csv('C:/Users/User/Desktop/대구교통사고예측/optuna_xgb2.csv',index=False)
submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,3.367258
1,ACCIDENT_39610,3.224864
2,ACCIDENT_39611,4.509228
3,ACCIDENT_39612,3.952304
4,ACCIDENT_39613,3.980799
...,...,...
10958,ACCIDENT_50567,3.866031
10959,ACCIDENT_50568,3.967485
10960,ACCIDENT_50569,4.005776
10961,ACCIDENT_50570,3.986940


In [17]:
min(final_predictions),max(final_predictions),np.mean(final_predictions)

(2.4403484, 4.7770643, 3.6671457)