#사전 작업

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!apt-get install -y fonts-nanum*
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

#데이터 전처리


In [61]:
#import 및 파일 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats

plt.rc('font', family='NanumGothic')

file_path = "/content/drive/MyDrive/Colab Notebooks/01 DATA/accidentInfoList.CSV"
df = pd.read_csv(file_path, encoding='cp949')



In [62]:
# 숫자 추출 함수 생성
def extract_numbers_from_age(age):
    numbers = re.findall(r'\d+', str(age))
    if numbers:
        return int(numbers[0])
    else:
        return np.nan

# 문자 데이터 제거 및 숫자만 추출
df['가해운전자 연령'] = df['가해운전자 연령'].apply(extract_numbers_from_age)
df['피해운전자 연령'] = df['피해운전자 연령'].apply(extract_numbers_from_age)
mean_age_driver = round(df['가해운전자 연령'].mean(), 1)
mean_age_victim = round(df['피해운전자 연령'].mean(), 1)

# 결측치 처리
df['가해운전자 연령'].fillna(mean_age_driver, inplace=True)
df['피해운전자 연령'].fillna(mean_age_victim, inplace=True)
df['피해운전자 차종'].fillna('차량단독사고', inplace=True)
df['피해운전자 성별'].fillna('차량단독사고', inplace=True)
df['피해운전자 상해정도'].fillna('상해없음', inplace=True)


In [63]:
# 피쳐 생성
df['ECLO'] = df['사망자수'] * 10 + df['중상자수'] * 5 + df['경상자수'] * 3 + df['부상신고자수'] * 1

# '사고일시' 열에서 시간 정보 추출
df['사고일시'] = pd.to_datetime(df['사고일시'], format='%Y년 %m월 %d일 %H시')
df['사고발생시간'] = df['사고일시'].dt.hour

# '시군구' 피처에서 '구' 정보만 남기기
df['시군구'] = df['시군구'].apply(lambda x: x.split(' ')[1])

In [64]:
# 이상치 제거
z_scores = np.abs(stats.zscore(df['ECLO']))
outliers = (z_scores > 3)
df_no_outliers = df[~outliers]


In [None]:
#스케일링
eclo_data = df_no_outliers['ECLO'].values.reshape(-1, 1)
scaler = StandardScaler()
eclo_scaled = scaler.fit_transform(eclo_data)
df_no_outliers['ECLO_scaled'] = eclo_scaled


#시각화 관련

데이터탐색에 들어갈 자료 생성

In [None]:
# 빈도표
plt.figure(figsize=(16, 12))
sns.countplot(x='피해운전자 연령', data=df, palette='viridis')
plt.ylabel('빈도')
plt.show()


In [None]:
# 커널 밀도 추정 그래프
plt.figure(figsize=(10, 6))
sns.kdeplot(data=df, x='사고발생시간', fill=True, color='skyblue')
plt.title('사고발생시간별 분포')
plt.xlabel('사고발생시간')
plt.ylabel('밀도')
plt.show()


시각화에 들어갈 자료 생성

In [None]:
# 사고내용별 ECLO
avg_eclo_by_accident = df.groupby('시군구')['ECLO'].mean().reset_index()
avg_eclo_all = df['ECLO'].mean()

# 막대 그래프
plt.figure(figsize=(12, 8))
sns.barplot(x='시군구', y='ECLO', data=avg_eclo_by_accident, palette='viridis')
plt.ylabel('평균 ECLO')
plt.axhline(avg_eclo_all, color='r', linestyle='--', label='전체 평균 ECLO')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def categorize_age(age):
    if age <= 19:
        return '0~19'
    elif age <= 45:
        return '20~45'
    elif age <= 60:
        return '46~60'
    else:
        return '61 이상'

# 연령을 범주
df['피해운전자 연령대'] = df['피해운전자 연령'].apply(categorize_age)

# 사고내용별 ECLO
avg_eclo_by_accident = df.groupby('피해운전자 연령대')['ECLO'].mean().reset_index()
avg_eclo_all = df['ECLO'].mean()

# 막대 그래프
plt.figure(figsize=(12, 8))
sns.barplot(x='피해운전자 연령대', y='ECLO', data=avg_eclo_by_accident, palette='viridis')
plt.ylabel('평균 ECLO')
plt.axhline(avg_eclo_all, color='r', linestyle='--', label='전체 평균 ECLO')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# 3가지 이상의 피처를 활용한 히트맵

heatmap_data = df.pivot_table(index='사고발생시간', columns='법규위반', values='ECLO', aggfunc='mean')

plt.figure(figsize=(14, 10))
sns.heatmap(heatmap_data, cmap='viridis', annot=True, fmt=".2f", linewidths=.5)
plt.title('사고 발생시간과 법규위반에 따른 ECLO 평균')
plt.xlabel('법규위반')
plt.ylabel('사고발생시간')
plt.show()


#모델링

In [66]:
# 피처 선택
features = ['경상자수', '중상자수', '사망자수']

# 독립변수와 종속변수 분할
X = df_no_outliers[features]
y = df_no_outliers['ECLO_scaled']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
# 모델 생성 및 훈련
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# 모델 평가
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error without outliers:", mse)
print("Mean Absolute Error without outliers:", mae)
print("R² Score without outliers:", r2)

# 중요도 확인
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

Mean Squared Error without outliers: 0.011646513531384507
Mean Absolute Error without outliers: 0.031942648591598045
R² Score without outliers: 0.9881970895183898
  Feature  Importance
0    경상자수    0.578689
1    중상자수    0.350729
2    사망자수    0.070582


#SUB

In [None]:
print(df.columns.tolist())

In [None]:
print(df.head())

In [None]:
#원-핫 인코딩 예시
categorical_features = ['요일']
encoded_df = pd.get_dummies(df[categorical_features])
df = df.drop(columns=categorical_features)
df = pd.concat([df, encoded_df], axis=1)
print(encoded_df.columns)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats

plt.rc('font', family='NanumGothic')

file_path = "/content/drive/MyDrive/Colab Notebooks/01 DATA/accidentInfoList.CSV"
df = pd.read_csv(file_path, encoding='cp949')

# 숫자 추출 함수 생성
def extract_numbers_from_age(age):
    numbers = re.findall(r'\d+', str(age))
    if numbers:
        return int(numbers[0])
    else:
        return np.nan

# 문자 데이터 제거 및 숫자만 추출
df['가해운전자 연령'] = df['가해운전자 연령'].apply(extract_numbers_from_age)
df['피해운전자 연령'] = df['피해운전자 연령'].apply(extract_numbers_from_age)
mean_age_driver = round(df['가해운전자 연령'].mean(), 1)
mean_age_victim = round(df['피해운전자 연령'].mean(), 1)

# 결측치 처리
df['가해운전자 연령'].fillna(mean_age_driver, inplace=True)
df['피해운전자 연령'].fillna(mean_age_victim, inplace=True)
df['피해운전자 차종'].fillna('차량단독사고', inplace=True)
df['피해운전자 성별'].fillna('차량단독사고', inplace=True)
df['피해운전자 상해정도'].fillna('상해없음', inplace=True)

# 피쳐 생성
df['ECLO'] = df['사망자수'] * 10 + df['중상자수'] * 5 + df['경상자수'] * 3 + df['부상신고자수'] * 1

# '사고일시' 열에서 시간 정보 추출
df['사고일시'] = pd.to_datetime(df['사고일시'], format='%Y년 %m월 %d일 %H시')
df['사고발생시간'] = df['사고일시'].dt.hour

# '시군구' 피처에서 '구' 정보만 남기기
df['시군구'] = df['시군구'].apply(lambda x: x.split(' ')[1])

# 이상치 제거
z_scores = np.abs(stats.zscore(df['ECLO']))
outliers = (z_scores > 3)
df_no_outliers = df[~outliers]

#스케일링
eclo_data = df_no_outliers['ECLO'].values.reshape(-1, 1)
scaler = StandardScaler()
eclo_scaled = scaler.fit_transform(eclo_data)
df_no_outliers['ECLO_scaled'] = eclo_scaled

# 피처 선택
features = ['경상자수', '중상자수', '사망자수']

# 독립변수와 종속변수 분할
X = df_no_outliers[features]
y = df_no_outliers['ECLO_scaled']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 생성 및 훈련
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# 모델 평가
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error without outliers:", mse)
print("Mean Absolute Error without outliers:", mae)
print("R² Score without outliers:", r2)

# 중요도 확인
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers['ECLO_scaled'] = eclo_scaled


Mean Squared Error without outliers: 0.011646513531384507
Mean Absolute Error without outliers: 0.031942648591598045
R² Score without outliers: 0.9881970895183898
  Feature  Importance
0    경상자수    0.578689
1    중상자수    0.350729
2    사망자수    0.070582
