**기본적인 데이터 처리(Baseline Code)**

**1. Feature Engineering**

In [2]:
import os
import random
import numpy as np

In [3]:
# Fix the random seeds so repeated runs give the same results.

def seed_everything(seed):
  """Seed the random sources used by this notebook.

  Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
  seeds both Python's ``random`` module and NumPy's global generator.

  Args:
    seed: Seed value passed unchanged to ``random.seed`` and ``np.random.seed``.
  """
  os.environ['PYTHONHASHSEED'] = str(seed)
  for seeder in (random.seed, np.random.seed):
    seeder(seed)


seed_everything(1504)

In [6]:
# Read the three input CSV files: training data, test data and the sample submission.

import pandas as pd

train_org, test_org, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [526]:
# Preview the first rows of the raw training data.
train_org.head()

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,상해없음,보행자,여,70세,중상,0,1,0,0,5
1,ACCIDENT_00001,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,...,상해없음,보행자,남,61세,경상,0,0,1,0,3
2,ACCIDENT_00002,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,...,상해없음,보행자,남,38세,경상,0,0,1,0,3
3,ACCIDENT_00003,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,36세,중상,0,1,0,0,5
4,ACCIDENT_00004,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,52세,경상,0,0,1,0,3


In [7]:
# Captures year, month, day and hour from strings such as '2019-01-01 00'.
time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})'


def _split_datetime(raw_frame):
    """Return a copy of ``raw_frame`` with '사고일시' expanded into numeric parts.

    The four groups captured by ``time_pattern`` are converted with
    ``pd.to_numeric`` and stored as the columns '연', '월', '일' and '시간';
    the source column '사고일시' is then dropped.
    """
    expanded = raw_frame.copy()
    captured = raw_frame['사고일시'].str.extract(time_pattern)
    expanded[['연', '월', '일', '시간']] = captured.apply(pd.to_numeric)
    return expanded.drop(columns=['사고일시'])


# The same transformation is applied to the training and the test data.
train_df = _split_datetime(train_org)
test_df = _split_datetime(test_org)

In [8]:
# Three whitespace-separated tokens, e.g. '대구광역시 중구 대신동'.
location_pattern = r'(\S+) (\S+) (\S+)'


def _split_location(frame, source):
    """Add '도시', '구', '동' to ``frame`` from ``source['시군구']`` and drop '시군구'.

    ``frame`` receives the three new columns in place; the returned frame is
    the result of dropping the original '시군구' column from it.
    """
    frame[['도시', '구', '동']] = source['시군구'].str.extract(location_pattern)
    return frame.drop(columns=['시군구'])


train_df = _split_location(train_df, train_org)
test_df = _split_location(test_df, test_org)

In [9]:
# Two parts separated by ' - ', e.g. '단일로 - 기타'.
road_pattern = r'(.+) - (.+)'


def _split_road_type(frame, source):
    """Add '도로형태1' and '도로형태2' to ``frame`` and drop '도로형태'.

    The two parts are captured from ``source['도로형태']`` with ``road_pattern``.
    ``frame`` receives the new columns in place; the returned frame is the
    result of dropping the original '도로형태' column from it.
    """
    frame[['도로형태1', '도로형태2']] = source['도로형태'].str.extract(road_pattern)
    return frame.drop(columns=['도로형태'])


train_df = _split_road_type(train_df, train_org)
test_df = _split_road_type(test_df, test_org)

In [10]:
# Features are every test column except 'ID'; the same columns are selected
# from the training frame. 'ECLO' is the prediction target.
train_y = train_df['ECLO'].copy()
test_x = test_df.drop(columns='ID').copy()
train_x = train_df.loc[:, test_x.columns].copy()

In [531]:
# Preview the selected feature columns before encoding.
train_x.head()

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2
0,화요일,맑음,건조,차대사람,2019,1,1,0,대구광역시,중구,대신동,단일로,기타
1,화요일,흐림,건조,차대사람,2019,1,1,0,대구광역시,달서구,감삼동,단일로,기타
2,화요일,맑음,건조,차대사람,2019,1,1,1,대구광역시,수성구,두산동,단일로,기타
3,화요일,맑음,건조,차대차,2019,1,1,2,대구광역시,북구,복현동,단일로,기타
4,화요일,맑음,건조,차대차,2019,1,1,4,대구광역시,동구,신암동,단일로,기타


In [11]:
from sklearn.preprocessing import LabelEncoder

# Every object-dtype column is treated as categorical.
categorical_features = [
    column for column, dtype in train_x.dtypes.items() if dtype == "object"
]

for column in categorical_features:
    # The encoder is fitted on the training values only.
    encoder = LabelEncoder().fit(train_x[column])
    train_x[column] = encoder.transform(train_x[column])

    # Values that occur only in the test data are appended to the encoder's
    # classes so that transform() can encode them as well.
    unseen = [
        value for value in np.unique(test_x[column])
        if value not in encoder.classes_
    ]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    test_x[column] = encoder.transform(test_x[column])

In [533]:
# Preview the features after label encoding.
train_x.head()

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2
0,6,2,0,0,2019,1,1,0,0,7,40,2,5
1,6,5,0,0,2019,1,1,0,0,1,4,2,5
2,6,2,0,0,2019,1,1,1,0,6,66,2,5
3,6,2,0,1,2019,1,1,2,0,4,79,2,5
4,6,2,0,1,2019,1,1,4,0,3,129,2,5


**직접 작성한 코드**

In [12]:
# Drop the explanatory variables that are not used for modelling.
# The same columns are removed from both the training and the test features.

train_x, test_x = (
    frame.drop(columns=['기상상태', '연', '월', '일', '도시', '구', '동', '도로형태2'])
    for frame in (train_x, test_x)
)

In [13]:
# One-hot encode the categorical feature columns.

train_x = pd.get_dummies(train_x, columns=['요일', '노면상태', '사고유형', '도로형태1'])
test_x = pd.get_dummies(test_x, columns=['요일', '노면상태', '사고유형', '도로형태1'])

# The two frames are encoded independently, so a category that appears in only
# one of them would leave train and test with different columns and make
# model.predict(test_x) fail. Align test to the training layout: dummy columns
# missing from test are added as all-zero, test-only columns are dropped, and
# the column order follows train_x.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

In [536]:
# Preview the features after one-hot encoding.
train_x.head()

Unnamed: 0,시간,요일_0,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6,노면상태_0,노면상태_1,...,노면상태_4,노면상태_5,사고유형_0,사고유형_1,사고유형_2,도로형태1_0,도로형태1_1,도로형태1_2,도로형태1_3,도로형태1_4
0,0,0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,0
3,2,0,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,1,0,0
4,4,0,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,1,0,0


**2. Predictive Modeling**

In [14]:
# Evaluation metric: RMSLE (Root Mean Squared Logarithmic Error)

def RMSLE(y_pred, y_true):
    """Return the root mean squared logarithmic error of two value collections.

    Both inputs are transformed with ``np.log1p``; the result is the square
    root of the mean squared difference of the transformed values.

    Args:
        y_pred: Predicted values (array-like accepted by ``np.log1p``).
        y_true: Observed values, same length as ``y_pred``.

    Returns:
        The RMSLE as a scalar.
    """
    log_gap = np.log1p(y_true) - np.log1p(y_pred)
    return np.sqrt(np.mean(log_gap ** 2))

In [15]:
# Hold-out split: a random 70% of the rows for fitting, the remaining rows for checking.

train_train_x = train_x.sample(frac=0.7, replace=False)
train_test_x = train_x.drop(index=train_train_x.index)

# The targets are split by the same row labels as the features.
train_train_y = train_y.loc[train_train_x.index]
train_test_y = train_y.drop(index=train_train_x.index)

In [16]:
# Inspect outliers: print how often each ECLO value occurs (ordered by value)
# and the percentage of rows whose ECLO is at most 11.

print(train_y.value_counts().sort_index())
print(f"ECLO가 11 이하인 데이터의 비율: {100*sum(train_y<=11)/len(train_y)}%")

1      2578
2       150
3     18675
4       818
5      6743
6      4967
7       263
8      1152
9      1528
10      611
11      394
12      611
13      194
14      145
15      283
16      101
17       78
18       89
19       36
20       39
21       33
22       17
23       17
24       18
25        7
26       12
27       11
28        6
30        5
31        4
32        3
33        2
34        3
35        2
36        1
37        3
39        1
40        1
45        1
47        1
52        1
56        1
57        1
65        1
66        1
74        1
Name: ECLO, dtype: int64
ECLO가 11 이하인 데이터의 비율: 95.63230578908833%


In [17]:
# Remove outliers: keep only the rows of the training split whose ECLO is at most 11.

filtered_data = train_train_y[train_train_y <= 11]
filtered_indices = filtered_data.index

filtered_train_x = train_train_x.loc[filtered_indices]

# Rebind the features together with the target so the pair stays row-aligned.
# Replacing only train_train_y would leave train_train_x with rows that no
# longer have a matching target.
train_train_x = filtered_train_x
train_train_y = filtered_data

In [18]:
# Machine Learning

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [542]:
# K-fold cross-validation: compare several regressors by their RMSLE.

import statistics as st

from sklearn.model_selection import KFold  # K-fold cross validation

kfold = KFold(n_splits=5)  # k=5: the data is split into 5 folds

models = [LinearRegression(),
         KNeighborsRegressor(n_neighbors=5),
         DecisionTreeRegressor(),
         RandomForestRegressor(n_estimators=100),
         LGBMRegressor(n_estimators=100, verbose=-1),
         XGBRegressor(random_state=200, n_estimators=50, max_depth=5, gamma=0),
         GradientBoostingRegressor()]

for model in models:
    fold_scores = []

    # Fold-local names are used on purpose: reusing train_train_x / train_test_x
    # (and prediction) here would overwrite the hold-out split and the results
    # prepared in other cells.
    for fit_index, val_index in kfold.split(train_x):

        fold_fit_x, fold_val_x = train_x.iloc[fit_index], train_x.iloc[val_index]
        fold_fit_y, fold_val_y = train_y.iloc[fit_index], train_y.iloc[val_index]

        # Refit the model on this fold's training part and score the held-out part.
        model.fit(fold_fit_x, fold_fit_y)
        fold_prediction = model.predict(fold_val_x)

        fold_scores.append(RMSLE(fold_prediction, fold_val_y))

    # Mean and sample standard deviation of the RMSLE over the 5 folds.
    print(f"{model}의 평균: {st.mean(fold_scores)}")
    print(f"{model}의 표준편차: {st.stdev(fold_scores)}")


LinearRegression()의 평균: 0.46068356010053335
LinearRegression()의 표준편차: 0.012719517195355547
KNeighborsRegressor()의 평균: 0.505654766772734
KNeighborsRegressor()의 표준편차: 0.01422384090269462
DecisionTreeRegressor()의 평균: 0.4766176433028049
DecisionTreeRegressor()의 표준편차: 0.015089600810019308
RandomForestRegressor()의 평균: 0.4709979979383737
RandomForestRegressor()의 표준편차: 0.013998971227547705
LGBMRegressor(verbose=-1)의 평균: 0.4614836923439278
LGBMRegressor(verbose=-1)의 표준편차: 0.013361430943391879
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max

In [19]:
# Model for the submission: fit on the full training data, then predict the test data.

from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(train_x, train_y)
prediction = model.predict(test_x)


In [20]:
# Submission table: a copy of the sample submission whose 'ECLO' column holds the predictions.

baseline_submission = sample_submission.assign(ECLO=prediction)
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.250952
1,ACCIDENT_39610,4.137993
2,ACCIDENT_39611,5.294434
3,ACCIDENT_39612,5.181475
4,ACCIDENT_39613,5.275016
...,...,...
10958,ACCIDENT_50567,5.045545
10959,ACCIDENT_50568,5.045545
10960,ACCIDENT_50569,5.026126
10961,ACCIDENT_50570,5.139086


In [21]:
# Write the submission table to 'baseline_submit.csv' without the index column.
baseline_submission.to_csv('baseline_submit.csv', index=False)