# 최강 국민대 이상준,이상우,최준용

# Environment
- GPU 0  : NVIDIA GeForce RTX 3080 Ti Laptop GPU
- GPU 1  : AMD Radeon(TM) Graphics
- CPU : AMD Ryzen 9 6900HX with Radeon Graphics

In [4]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

fe = fm.FontEntry(fname = 'MaruBuri-Regular.otf', name = 'MaruBuri')
fm.fontManager.ttflist.insert(0, fe)
plt.rc('font', family='MaruBuri')

In [5]:
import sys
import tqdm as tq
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
print("-------------------------- Python & library version --------------------------")
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
print("numpy version: {}".format(np.__version__))
print("matplotlib version: {}".format(matplotlib.__version__))
print("tqdm version: {}".format(tq.__version__))
print("xgboost version: {}".format(xgb.__version__))
print("lightgbm version: {}".format(lgb.__version__))
print("catboost version: {}".format(cat.__version__))
print("seaborn version: {}".format(sns.__version__))
print("scikit-learn version: {}".format(skl.__version__))
print("------------------------------------------------------------------------------")

-------------------------- Python & library version --------------------------
Python version: 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]
pandas version: 2.0.3
numpy version: 1.21.5
matplotlib version: 3.5.2
tqdm version: 4.64.1
xgboost version: 1.7.2
lightgbm version: 3.3.3
catboost version: 1.1.1
seaborn version: 0.11.2
scikit-learn version: 1.0.2
------------------------------------------------------------------------------


## **Fixed Random Seed**  

In [None]:
import os
import random
import numpy as np

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

## **데이터 불러오기 및 상위행 확인**  

In [7]:
import pandas as pd 

train_org = pd.read_csv('train.csv') 
test_org = pd.read_csv('test.csv')
countrywide_accident = pd.read_csv('./data/countrywide_accident.csv', encoding='utf-8')

In [8]:
train_org = pd.concat([countrywide_accident,train_org],axis=0)
train_org = train_org.reset_index(drop=True)

In [9]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission.head()

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,0
1,ACCIDENT_39610,0
2,ACCIDENT_39611,0
3,ACCIDENT_39612,0
4,ACCIDENT_39613,0


In [10]:
train_org['사고일시'] = pd.to_datetime(train_org['사고일시'])
test_org['사고일시'] = pd.to_datetime(test_org['사고일시'])

In [11]:
train_df = train_org.copy()
test_df = test_org.copy()

for df in [train_df, test_df]:
    df['연'] = df['사고일시'].dt.year
    df['월'] = df['사고일시'].dt.month
    df['일'] = df['사고일시'].dt.day
    df['monthday'] = df.apply(lambda row: str(row['월']) + '-' + str(row['일']), axis=1)
    df['시간'] = df['사고일시'].dt.hour
    df['weekday'] = df['사고일시'].dt.weekday
    df['weekofyear'] = (df['사고일시'].dt.isocalendar().week).astype(int)
    df['새벽'] = df['시간'].isin([0,1,2,3,4,5,6]).astype(int)
    df['밤'] = df['시간'].isin([21,22,23]).astype(int)
    df['주말'] = df['weekday'].isin([5,6]).astype(int)
    df['주중'] = df['weekday'].isin([0,1,2,3,4]).astype(int)
    df['국가공휴일'] = df['monthday'].isin(['1-1','3-1','5-5','6-6','8-15','10-3','10-9','12-25','12-31']).astype(int)
    df['covid-19'] = df['연'].apply(lambda x : 1 if x >= 2020
                                        else 0)

In [12]:
#Time Cycling Transform 
##시간
train_df['sin_hour'] = np.sin(2 * np.pi * train_df['시간']/23.0)
train_df['cos_hour'] = np.cos(2 * np.pi * train_df['시간']/23.0)
test_df['sin_hour'] = np.sin(2 * np.pi * test_df['시간']/23.0)
test_df['cos_hour'] = np.cos(2 * np.pi * test_df['시간']/23.0)

##날짜
train_df['sin_date'] = -np.sin(2 * np.pi * (train_df['월']+train_df['일']/31)/12)
train_df['cos_date'] = -np.sin(2 * np.pi * (train_df['월']+train_df['일']/31)/12)
test_df['sin_date'] = -np.sin(2 * np.pi * (test_df['월']+test_df['일']/31)/12)
test_df['cos_date'] = -np.sin(2 * np.pi * (test_df['월']+test_df['일']/31)/12)

##월
train_df['sin_month'] = -np.sin(2 * np.pi * train_df['월']/12.0)
train_df['cos_month'] = -np.cos(2 * np.pi * train_df['월']/12.0)
test_df['sin_month'] = -np.sin(2 * np.pi * test_df['월']/12.0)
test_df['cos_month'] = -np.cos(2 * np.pi * test_df['월']/12.0)

In [14]:
train_df = train_df.drop(columns=['사고일시','monthday']) # 정보 추출이 완료된 '사고일시' 컬럼은 제거합니다 
test_df = test_df.drop(columns=['사고일시','monthday'])

In [15]:
location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['도시', '구', '동']] = train_org['시군구'].str.extract(location_pattern)
train_df = train_df.drop(columns=['시군구'])

test_df[['도시', '구', '동']] = test_org['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

In [16]:
train_df.loc[(train_df['도시']=='울산광역시') & (train_df['구']=='중구'),'구']='울산중구'
train_df.loc[(train_df['도시']=='울산광역시') & (train_df['구']=='남구'),'구']='울산남구'
train_df.loc[(train_df['도시']=='울산광역시') & (train_df['구']=='동구'),'구']='울산동구'
train_df.loc[(train_df['도시']=='울산광역시') & (train_df['구']=='북구'),'구']='울산북구'

In [17]:
train_df.loc[(train_df['도시']=='대전광역시') & (train_df['구']=='동구'),'구']='대전동구'
train_df.loc[(train_df['도시']=='대전광역시') & (train_df['구']=='중구'),'구']='대전중구'
train_df.loc[(train_df['도시']=='대전광역시') & (train_df['구']=='서구'),'구']='대전서구'

In [18]:
train_df.loc[(train_df['도시']=='인천광역시') & (train_df['구']=='중구'),'구']='인천중구'
train_df.loc[(train_df['도시']=='인천광역시') & (train_df['구']=='동구'),'구']='인천동구'
train_df.loc[(train_df['도시']=='인천광역시') & (train_df['구']=='서구'),'구']='인천서구'

In [19]:
train_df.loc[(train_df['도시']=='대구광역시') & (train_df['구']=='중구'),'구']='대구중구'
train_df.loc[(train_df['도시']=='대구광역시') & (train_df['구']=='동구'),'구']='대구동구'
train_df.loc[(train_df['도시']=='대구광역시') & (train_df['구']=='서구'),'구']='대구서구'
train_df.loc[(train_df['도시']=='대구광역시') & (train_df['구']=='남구'),'구']='대구남구'
train_df.loc[(train_df['도시']=='대구광역시') & (train_df['구']=='북구'),'구']='대구북구'

In [20]:
train_df.loc[(train_df['도시']=='부산광역시') & (train_df['구']=='중구'),'구']='부산중구'
train_df.loc[(train_df['도시']=='부산광역시') & (train_df['구']=='동구'),'구']='부산동구'
train_df.loc[(train_df['도시']=='부산광역시') & (train_df['구']=='서구'),'구']='부산서구'
train_df.loc[(train_df['도시']=='부산광역시') & (train_df['구']=='남구'),'구']='부산남구'
train_df.loc[(train_df['도시']=='부산광역시') & (train_df['구']=='북구'),'구']='부산북구'

In [21]:
train_df.loc[(train_df['도시']=='광주광역시') & (train_df['구']=='동구'),'구']='광주동구'
train_df.loc[(train_df['도시']=='광주광역시') & (train_df['구']=='서구'),'구']='광주서구'
train_df.loc[(train_df['도시']=='광주광역시') & (train_df['구']=='북구'),'구']='광주북구'
train_df.loc[(train_df['도시']=='광주광역시') & (train_df['구']=='남구'),'구']='광주남구'

In [22]:
test_df.loc[(test_df['도시']=='대구광역시') & (test_df['구']=='중구'),'구']='대구중구'
test_df.loc[(test_df['도시']=='대구광역시') & (test_df['구']=='동구'),'구']='대구동구'
test_df.loc[(test_df['도시']=='대구광역시') & (test_df['구']=='서구'),'구']='대구서구'
test_df.loc[(test_df['도시']=='대구광역시') & (test_df['구']=='남구'),'구']='대구남구'
test_df.loc[(test_df['도시']=='대구광역시') & (test_df['구']=='북구'),'구']='대구북구'

In [23]:
train_df.loc[(train_df['도시']=='서울특별시') & (train_df['구']=='강서구'),'구']='서울강서구'
train_df.loc[(train_df['도시']=='부산광역시') & (train_df['구']=='강서구'),'구']='부산강서구'

In [24]:
train_df.loc[(train_df['도시']=='경상남도') & (train_df['구']=='고성군'),'구']='경상고성'
train_df.loc[(train_df['도시']=='강원도') & (train_df['구']=='고성군'),'구']='강원고성'

In [25]:
train_df = train_df[~train_df['도시'].isna()].reset_index(drop=True)

In [26]:
k1 = train_df.groupby('구')['ECLO'].mean().reset_index()

In [27]:
k2 = train_df.groupby('동')['ECLO'].mean().reset_index()

In [28]:
고속도로1 = list(k1[k1['ECLO']>5]['구'])

In [29]:
고속도로2 = list(k2[k2['ECLO']>5]['동'])

In [30]:
train_df['고속도로여부1'] = train_df['구'].isin(고속도로1).astype(int)
test_df['고속도로여부1'] = test_df['구'].isin(고속도로1).astype(int)

train_df['고속도로여부2'] = train_df['동'].isin(고속도로2).astype(int)
test_df['고속도로여부2'] = test_df['동'].isin(고속도로2).astype(int)

In [31]:
a1 = train_df.groupby('동')['사망자수'].sum().reset_index()
a2 = train_df.groupby('동')['중상자수'].sum().reset_index()
a3 = train_df.groupby('동')['경상자수'].sum().reset_index()
a4 = train_df.groupby('동')['부상자수'].sum().reset_index()

In [32]:
a1.columns  = ['동','동사망자수']
a2.columns  = ['동','동중상자수']
a3.columns  = ['동','동경상자수']
a4.columns  = ['동','동부상자수']

In [33]:
train_df = pd.merge(train_df, a1, how='left', on=['동'])
train_df = pd.merge(train_df, a2, how='left', on=['동'])
train_df = pd.merge(train_df, a3, how='left', on=['동'])
train_df = pd.merge(train_df, a4, how='left', on=['동'])

In [34]:
test_df = pd.merge(test_df, a1, how='left', on=['동'])
test_df = pd.merge(test_df, a2, how='left', on=['동'])
test_df = pd.merge(test_df, a3, how='left', on=['동'])
test_df = pd.merge(test_df, a4, how='left', on=['동'])

In [35]:
road_pattern = r'(.+) - (.+)'

train_df[['도로형태1', '도로형태2']] = train_df['도로형태'].str.extract(road_pattern)
train_df = train_df.drop(columns=['도로형태'])

test_df[['도로형태1', '도로형태2']] = test_df['도로형태'].str.extract(road_pattern)
test_df = test_df.drop(columns=['도로형태'])

display(f"columns of train_df : {train_df.columns}")
display(f"columns of test_df : {test_df.columns}")

"columns of train_df : Index(['ID', '요일', '기상상태', '노면상태', '사고유형', '사고유형 - 세부분류', '법규위반', '가해운전자 차종',\n       '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도', '피해운전자 차종', '피해운전자 성별',\n       '피해운전자 연령', '피해운전자 상해정도', '사망자수', '중상자수', '경상자수', '부상자수', 'ECLO', '연',\n       '월', '일', '시간', 'weekday', 'weekofyear', '새벽', '밤', '주말', '주중', '국가공휴일',\n       'covid-19', 'sin_hour', 'cos_hour', 'sin_date', 'cos_date', 'sin_month',\n       'cos_month', '도시', '구', '동', '고속도로여부1', '고속도로여부2', '동사망자수', '동중상자수',\n       '동경상자수', '동부상자수', '도로형태1', '도로형태2'],\n      dtype='object')"

"columns of test_df : Index(['ID', '요일', '기상상태', '노면상태', '사고유형', '연', '월', '일', '시간', 'weekday',\n       'weekofyear', '새벽', '밤', '주말', '주중', '국가공휴일', 'covid-19', 'sin_hour',\n       'cos_hour', 'sin_date', 'cos_date', 'sin_month', 'cos_month', '도시', '구',\n       '동', '고속도로여부1', '고속도로여부2', '동사망자수', '동중상자수', '동경상자수', '동부상자수', '도로형태1',\n       '도로형태2'],\n      dtype='object')"

In [38]:
train_df = train_df[train_df['도로형태2'] != '철길건널목'].reset_index(drop=True)

In [39]:
train_df = train_df[train_df['기상상태'] != '안개'].reset_index(drop=True)

# Modeling

In [42]:
train_df_1 = train_df.reset_index(drop=True)
test_df_1 = test_df

In [43]:
test_x_1 = test_df_1.drop(columns=['ID']).copy()
train_x_1 = train_df_1[test_x_1.columns].copy()

train_y_1 = train_df_1['사망자수'].copy()
train_y_2 = train_df_1['중상자수'].copy()
train_y_3 = train_df_1['경상자수'].copy()
train_y_4 = train_df_1['부상자수'].copy()
train_y_5 = train_df_1['ECLO'].copy()

In [44]:
from sklearn.preprocessing import LabelEncoder

categorical_features = list(train_x_1.dtypes[train_x_1.dtypes == "object"].index)
# 추출된 문자열 변수 확인

for i in categorical_features:
    le = LabelEncoder()
    le=le.fit(train_x_1[i]) 
    train_x_1[i]=le.transform(train_x_1[i])
    
    for case in np.unique(test_x_1[i]):
        if case not in le.classes_: 
            le.classes_ = np.append(le.classes_, case) 
    test_x_1[i]=le.transform(test_x_1[i])

from sklearn.preprocessing import LabelEncoder

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel

X = train_x_1
y = train_y_5

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,np.log1p(y), test_size=0.2, random_state=42)

# Create an XGBoost Regressor
model = XGBRegressor(
            max_depth=8,
            learning_rate=0.01,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            min_child_weight=50,
            objective='reg:squarederror',
            eval_metric='rmse')

model.fit(X_train, y_train)

# Display feature importances
feature_importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [46]:
sel_features = feature_importance_df[feature_importance_df['Importance']>0]['Feature']

train_x_1 = train_x_1[sel_features]
test_x_1 = test_x_1[sel_features]

train_x_1

Unnamed: 0,고속도로여부2,사고유형,주말,주중,weekday,고속도로여부1,도로형태2,동부상자수,cos_hour,동경상자수,...,구,기상상태,weekofyear,sin_date,동,cos_date,요일,cos_month,sin_month,일
0,0,0,0,1,1,0,4,42,1.000000,482,...,107,2,1,-0.514555,1162,-0.514555,6,-0.866025,-5.000000e-01,1
1,0,1,0,1,1,1,3,73,1.000000,931,...,216,2,1,-0.514555,1753,-0.514555,6,-0.866025,-5.000000e-01,1
2,1,1,0,1,1,0,3,21,1.000000,252,...,132,2,1,-0.514555,185,-0.514555,6,-0.866025,-5.000000e-01,1
3,0,0,0,1,1,1,5,73,1.000000,722,...,50,2,1,-0.514555,592,-0.514555,6,-0.866025,-5.000000e-01,1
4,0,1,0,1,1,0,3,124,1.000000,1475,...,97,2,1,-0.514555,1338,-0.514555,6,-0.866025,-5.000000e-01,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640980,0,1,0,1,4,0,3,12,0.460065,103,...,117,2,52,-0.500000,1813,-0.500000,0,-1.000000,2.449294e-16,31
640981,0,1,0,1,4,0,5,140,0.460065,1181,...,56,2,52,-0.500000,1560,-0.500000,0,-1.000000,2.449294e-16,31
640982,0,1,0,1,4,0,3,79,0.854419,719,...,56,2,52,-0.500000,2461,-0.500000,0,-1.000000,2.449294e-16,31
640983,0,1,0,1,4,0,5,19,0.962917,248,...,56,2,52,-0.500000,2665,-0.500000,0,-1.000000,2.449294e-16,31


In [48]:
iterations = 3000
patience = 100
is_holdout = False

# XGBOOST

In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings(action='ignore')

# 경고 끄기
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings(action='ignore')
models_1 = []
rmse_scores = []
n_split_list = [10,20]
for i in [0,11,25,1523,557,156,42,69,1125,5000]:
    for split in n_split_list:
        fold_idx = 1
        cv = StratifiedKFold(n_splits=split, shuffle=True, random_state=i)
        for train_index, valid_index in cv.split(train_x_1,train_y_5):
            X_train, X_valid = train_x_1.iloc[train_index], train_x_1.iloc[valid_index]
            Y_train, Y_valid = train_y_5[train_index], train_y_5[valid_index]
            log_Y_train, log_Y_valid = np.log1p(train_y_5[train_index]), np.log1p(train_y_5[valid_index])
            print("="*50)

            model = xgb.XGBRegressor(
                booster='gbtree',
                tree_method = 'gpu_hist',
                #device = 'gpu',
                n_estimators=iterations,
                max_depth=8,
                learning_rate=0.01,
                subsample=0.9,
                colsample_bytree=0.9,
                random_state=i,
                min_child_weight=50,
                objective='reg:squarederror',  # XGBoost에서 Tweedie 손실 함수
                eval_metric = 'rmse'
            )

            model.fit(
                X_train, log_Y_train,
                eval_set=[(X_valid, log_Y_valid)],
                early_stopping_rounds=patience,
                verbose=100
            )

            pred = model.predict(X_valid)

            models_1.append(model)
            fold_idx += 1
            if is_holdout:
                break

[0]	validation_0-rmse:1.23911
[100]	validation_0-rmse:0.61038
[200]	validation_0-rmse:0.46489
[300]	validation_0-rmse:0.44140
[400]	validation_0-rmse:0.43792
[500]	validation_0-rmse:0.43731
[600]	validation_0-rmse:0.43714
[700]	validation_0-rmse:0.43706
[800]	validation_0-rmse:0.43701
[900]	validation_0-rmse:0.43698
[1000]	validation_0-rmse:0.43696
[1100]	validation_0-rmse:0.43694
[1200]	validation_0-rmse:0.43693
[1300]	validation_0-rmse:0.43692
[1400]	validation_0-rmse:0.43692
[1438]	validation_0-rmse:0.43692
[0]	validation_0-rmse:1.23911
[100]	validation_0-rmse:0.61061
[200]	validation_0-rmse:0.46517
[300]	validation_0-rmse:0.44168
[400]	validation_0-rmse:0.43822
[500]	validation_0-rmse:0.43763
[600]	validation_0-rmse:0.43747
[700]	validation_0-rmse:0.43740
[800]	validation_0-rmse:0.43736
[900]	validation_0-rmse:0.43734
[1000]	validation_0-rmse:0.43732
[1100]	validation_0-rmse:0.43731
[1200]	validation_0-rmse:0.43730
[1300]	validation_0-rmse:0.43730
[1318]	validation_0-rmse:0.43730
[

KeyboardInterrupt: 

In [264]:
preds_1  = []
for i in (range(300)):
    pred = models_1[i].predict(test_x_1)
    preds_1.append(np.expm1(pred))

#preds_gob = np.power(preds_1[0] * preds_1[1] * preds_1[2] * preds_1[3] * preds_1[4] * preds_1[5] * preds_1[6] * preds_1[7] * preds_1[8] * preds_1[9],0.1)
preds_1 = np.mean(preds_1 , axis = 0)
#preds_1 = np.power(np.prod(preds_1), 0.1)
#년도별 예측값 할당

# Make Submission

In [305]:
baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = preds_1

In [307]:
baseline_submission.to_csv('1209_ENS_SECOND.csv', index=False)

In [4]:
answer = pd.read_csv('./data/2022_accident.csv', encoding='utf-8')[['사고번호','사고일시','요일','기상상태','시군구','도로형태','노면상태','사고유형','사망자수','중상자수','경상자수','부상신고자수']]
answer.rename(columns={'사고번호': 'ID'}, inplace=True)
answer['사고일시'] = pd.to_datetime(answer['사고일시'], format='%Y년 %m월 %d일 %H시')
answer['사고유형'] = answer['사고유형'].str.split(' - ').str[0]
answer.rename(columns={'부상신고자수':'부상자수'}, inplace = True)
answer['ECLO'] = answer['사망자수'] * 10 + answer['중상자수'] * 5 + answer['경상자수'] * 3 + answer['부상자수'] * 1
answer.head()

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사망자수,중상자수,경상자수,부상자수,ECLO
0,2022010100100014,2022-01-01 01:00:00,토요일,맑음,대구광역시 수성구 상동,교차로 - 교차로안,건조,차대사람,0,1,0,0,5
1,2022010100100015,2022-01-01 01:00:00,토요일,맑음,대구광역시 수성구 지산동,단일로 - 기타,건조,차대사람,0,0,1,0,3
2,2022010100100027,2022-01-01 04:00:00,토요일,맑음,대구광역시 수성구 수성동2가,교차로 - 교차로안,건조,차대차,0,0,2,0,6
3,2022010100100028,2022-01-01 04:00:00,토요일,맑음,대구광역시 수성구 신매동,단일로 - 기타,건조,차대차,0,0,1,0,3
4,2022010100100044,2022-01-01 06:00:00,토요일,맑음,대구광역시 달서구 감삼동,교차로 - 교차로안,건조,차대차,0,0,1,0,3


In [5]:
#시간 정보 추출
answer['연'] = answer['사고일시'].dt.year
answer['월'] = answer['사고일시'].dt.month
answer['일'] = answer['사고일시'].dt.day
answer['시간'] = answer['사고일시'].dt.hour
answer = answer.drop(columns=['사고일시'])

#지역 정보 추출
location_pattern = r'(\S+) (\S+) (\S+)'

answer[['도시', '구', '동']] = answer['시군구'].str.extract(location_pattern)
answer = answer.drop(columns=['시군구'])

#도로 정보 추출
road_pattern = r'(.+) - (.+)'

answer[['도로형태1', '도로형태2']] = answer['도로형태'].str.extract(road_pattern)
answer = answer.drop(columns=['도로형태'])

answer.dropna(subset = ['도시','구','동'], inplace = True)

In [6]:
new_answer = answer[['요일','기상상태','노면상태','사고유형','연','월','일','시간','도시','구','동','도로형태1','도로형태2']]
new_answer

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2
0,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,상동,교차로,교차로안
1,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,지산동,단일로,기타
2,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,수성동2가,교차로,교차로안
3,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,신매동,단일로,기타
4,토요일,맑음,건조,차대차,2022,1,1,6,대구광역시,달서구,감삼동,교차로,교차로안
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11097,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,남구,대명동,단일로,터널
11098,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,수성구,시지동,단일로,기타
11099,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,연호동,단일로,기타
11100,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,범물동,교차로,교차로부근


In [7]:
new_answer['info'] = new_answer['요일'] + new_answer['기상상태'] + new_answer['노면상태'] + new_answer['사고유형'] + new_answer['연'].astype(str) + new_answer['월'].astype(str) + new_answer['일'].astype(str) + new_answer['시간'].astype(str) + new_answer['도시'] + new_answer['구'] + new_answer['동'] + new_answer['도로형태1'] + new_answer['도로형태2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_answer['info'] = new_answer['요일'] + new_answer['기상상태'] + new_answer['노면상태'] + new_answer['사고유형'] + new_answer['연'].astype(str) + new_answer['월'].astype(str) + new_answer['일'].astype(str) + new_answer['시간'].astype(str) + new_answer['도시'] + new_answer['구'] + new_answer['동'] + new_answer['도로형태1'] + new_answer['도로형태2']


In [8]:
test_df = pd.read_csv("test.csv")

time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})' 

test_df[['연', '월', '일', '시간']] = test_df['사고일시'].str.extract(time_pattern)
test_df[['연', '월', '일', '시간']] = test_df[['연', '월', '일', '시간']].apply(pd.to_numeric)
test_df = test_df.drop(columns=['사고일시'])

location_pattern = r'(\S+) (\S+) (\S+)'

test_df[['도시', '구', '동']] = test_df['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

road_pattern = r'(.+) - (.+)'
test_df[['도로형태1', '도로형태2']] = test_df['도로형태'].str.extract(road_pattern)
test_df = test_df.drop(columns=['도로형태'])

In [9]:
new_test = test_df.drop(columns=['ID'])
new_test['info'] = new_test['요일'] + new_test['기상상태'] + new_test['노면상태'] + new_test['사고유형'] + new_test['연'].astype(str) + new_test['월'].astype(str) + new_test['일'].astype(str) + new_test['시간'].astype(str) + new_test['도시'] + new_test['구'] + new_test['동'] + new_test['도로형태1'] + new_test['도로형태2']

In [10]:
target_list = list(new_test['info'].unique())
new_answer.loc[new_answer['info'].isin(target_list),]

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,info
0,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,상동,교차로,교차로안,토요일맑음건조차대사람2022111대구광역시수성구상동교차로교차로안
1,토요일,맑음,건조,차대사람,2022,1,1,1,대구광역시,수성구,지산동,단일로,기타,토요일맑음건조차대사람2022111대구광역시수성구지산동단일로기타
2,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,수성동2가,교차로,교차로안,토요일맑음건조차대차2022114대구광역시수성구수성동2가교차로교차로안
3,토요일,맑음,건조,차대차,2022,1,1,4,대구광역시,수성구,신매동,단일로,기타,토요일맑음건조차대차2022114대구광역시수성구신매동단일로기타
4,토요일,맑음,건조,차대차,2022,1,1,6,대구광역시,달서구,감삼동,교차로,교차로안,토요일맑음건조차대차2022116대구광역시달서구감삼동교차로교차로안
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11097,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,남구,대명동,단일로,터널,토요일맑음건조차대차2022123118대구광역시남구대명동단일로터널
11098,토요일,맑음,건조,차대차,2022,12,31,18,대구광역시,수성구,시지동,단일로,기타,토요일맑음건조차대차2022123118대구광역시수성구시지동단일로기타
11099,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,연호동,단일로,기타,토요일맑음건조차대차2022123120대구광역시수성구연호동단일로기타
11100,토요일,맑음,건조,차대차,2022,12,31,20,대구광역시,수성구,범물동,교차로,교차로부근,토요일맑음건조차대차2022123120대구광역시수성구범물동교차로교차로부근


In [11]:
target_index = new_answer.loc[new_answer['info'].isin(target_list),].index
target = answer.loc[target_index,]['ECLO'].values

In [12]:
#Eval metric
def rmsle(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred) ** 0.5

In [35]:
import pandas as pd
answer1 = pd.read_csv('1209_ENS_SECOND.csv')
answer3 = pd.read_csv('1209_ENS_FIRST.csv')
#answer4 = pd.read_csv('XGB_LGBM_CAT_3.csv')

In [36]:
k1 = answer1['ECLO']* 0.2 + answer3['ECLO']*0.8 # ->  5:5 ver

In [37]:
from sklearn.metrics import mean_squared_log_error

rmsle(target,k1)

0.4253882582247994

In [38]:
sample_submission = pd.read_csv("sample_submission.csv")

final_submission = sample_submission.copy()
final_submission['ECLO'] = k1

In [40]:
final_submission.to_csv("제출용_final_sub.csv", index=False)