In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [5]:
train = pd.read_csv('train.csv') 
test = pd.read_csv('test.csv')

In [6]:
def making_val_table(df):
    '''입력받은 데이터셋의 유효성 검증을 위한 요약 테이블'''
    # dtypes 
    df_dtypes = df.dtypes
    
    # nunique
    df_nunique = df.nunique()
    
    # null_values
    df_nan = df.isna().sum()
    
    val_table = pd.concat([df_dtypes, df_nunique, df_nan], axis=1)
    val_table.columns = ['dtype', 'nunique', 'nan']
        
    return val_table.reset_index()

In [7]:
# 기상상태, 시군구 데이터의 unique 값이 각각 7vs6, 199vs 192로 다르다
train_info = making_val_table(train)
test_info = making_val_table(test)

pd.merge(left=train_info, right=test_info, on='index', 
         how='left', suffixes=('_train','_test')).set_index('index')

Unnamed: 0_level_0,dtype_train,nunique_train,nan_train,dtype_test,nunique_test,nan_test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID,object,39609,0,object,10963.0,0.0
사고일시,object,18057,0,object,5548.0,0.0
요일,object,7,0,object,7.0,0.0
기상상태,object,6,0,object,5.0,0.0
시군구,object,199,0,object,192.0,0.0
도로형태,object,11,0,object,11.0,0.0
노면상태,object,6,0,object,6.0,0.0
사고유형,object,3,0,object,3.0,0.0
사고유형 - 세부분류,object,14,0,,,
법규위반,object,11,0,,,


In [8]:
# target 변수 지정
y_train = train['ECLO']

# train, test 데이터간 컬럼 동기화 
cols = test.columns
X_train = train[cols]
X_test = test

In [9]:
def feat_eng(df):
    # datetime 변환
    df['사고일시'] = pd.to_datetime(df['사고일시'])
    
    # 월,일,시 컬럼 생성
    df['월'] = df['사고일시'].dt.month
    df['일'] = df['사고일시'].dt.day
    df['시'] = df['사고일시'].dt.hour
    
    # 불필요 컬럼 제거
    subs = ['ID','사고일시','기상상태','시군구']
    df = df.drop(subs, axis=1)
    
    # one-hot encoding 실시
    df = pd.get_dummies(df)
    
    return df

In [10]:
X_train_eng = feat_eng(X_train)
X_test_eng = feat_eng(X_test)

In [11]:
print(f'X_train 데이터 shape : {X_train_eng.shape}')
print(f'y_train 데이터 shape : {y_train.shape}')
print()
print(f'X_test 데이터 shape : {X_test_eng.shape}')

X_train 데이터 shape : (39609, 30)
y_train 데이터 shape : (39609,)

X_test 데이터 shape : (10963, 30)


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train_eng, y_train , test_size=0.2, random_state=42)

In [28]:
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor

# 'ECLO' 열에 대해 결측치를 평균 값으로 대체하고, 무한대 값을 최대값으로 대체
y_train = y_train.replace([np.inf, -np.inf], np.nan)
y_train = y_train.fillna(y_train.mean())

# 타겟 변수를 데이터 프레임에 추가
X_train_eng['ECLO'] = y_train

# 기타 열에 대해 결측치를 평균 값으로 대체하고, 무한대 값을 해당 열의 최대값으로 대체
X_train_eng = X_train_eng.replace([np.inf, -np.inf], np.nan)
X_train_eng = X_train_eng.apply(lambda x: x.fillna(x.mean()), axis=0)

# AutoGluon 학습을 위한 데이터 형태로 변환
train_data = TabularDataset(data=X_train_eng)

predictor = TabularPredictor(label='ECLO').fit(
    train_data=train_data, 
    presets='best_quality', 
    time_limit=60*60
)


No path specified. Models will be saved in: "AutogluonModels/ag-20231117_021351/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20231117_021351/"
AutoGluon Version:  0.8.2
Python Version:     3.8.15
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.5.0: Mon Apr 24 20:53:44 PDT 2023; root:xnu-8796.121.2~5/RELEASE_ARM64_T8103
Disk Space Avail:   28.26 GB / 245.11 GB (11.5%)
Train Data Rows:    39609
Train Data Columns: 30
Label Column: ECLO
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (74.0, 1.0, 4.73238, 2.8803)
	If 'regression' is not the correct problem_type, please manually specify the problem_type 

In [29]:
# 예측하기
pred_y = predictor.predict(X_test_eng)

In [30]:
sample_submission = pd.read_csv("sample_submission.csv")

baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = pred_y
baseline_submission 

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.348880
1,ACCIDENT_39610,4.108166
2,ACCIDENT_39611,5.254338
3,ACCIDENT_39612,5.115234
4,ACCIDENT_39613,5.009536
...,...,...
10958,ACCIDENT_50567,6.624053
10959,ACCIDENT_50568,5.320106
10960,ACCIDENT_50569,4.731536
10961,ACCIDENT_50570,4.807998


In [31]:
baseline_submission.to_csv('fourth_submit.csv', index=False)

In [32]:
# csv 파일을 읽어옵니다.
df = pd.read_csv('third_submit.csv')

# 빈 값을 0으로 채웁니다.
df = df.fillna(0)

# 변경된 데이터를 다시 csv 파일로 저장합니다.
df.to_csv('fourth_submit.csv', index=False)