# Automobile Accidents
- `accidentsFull.csv` 파일에는 세 단계 부상 수준(NO INJURY, INJURY, FATALITY)으로 기록된 2001년 미국의 실제 자동차 사고 42,183건에 대한 정보가 담겨 있다. 각 사고에 대해 요일, 기상 조건, 도로 종류와 같은 추가 정보도 기록되어 있다. 어떤 회사가 초기 보고서와 이 시스템의 연관된 데이터(그중 일부는 GPS-지원 보고에 의존함)에 근거하여 사고의 심각성을 신속하게 분류하는 시스템을 개발하고자 한다.
- 목적은 보고된 사고에 부상이 동반될지(MAX_SEV_IR = 1 또는 2), 부상이 없을지(MAX_SEV_IR = 0)을 예측하는 것이다. 이를 위해 MAX_SEV_IR = 1 또는 2이면 값이 "Yes"가 되고 그렇지 않으면 "No"가 되는 INJURY라는 가변수를 생성하시오.

In [37]:
!pip install dmba

[0m

In [38]:
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import dmba

import matplotlib.pylab as plt

## Load the data set

In [39]:
accident_df = dmba.load_data('accidentsFull.csv')

Determine the shape of the data frame. It has 42183 rows and 24 columns

In [40]:
accident_df.shape

(42183, 24)

Show the top rows of the dataframe

In [41]:
accident_df.head()

Unnamed: 0,HOUR_I_R,ALCHL_I,ALIGN_I,STRATUM_R,WRK_ZONE,WKDY_I_R,INT_HWY,LGTCON_I_R,MANCOL_I_R,PED_ACC_R,...,SUR_COND,TRAF_CON_R,TRAF_WAY,VEH_INVL,WEATHER_R,INJURY_CRASH,NO_INJ_I,PRPTYDMG_CRASH,FATALITIES,MAX_SEV_IR
0,0,2,2,1,0,1,0,3,0,0,...,4,0,3,1,1,1,1,0,0,1
1,1,2,1,0,0,1,1,3,2,0,...,4,0,3,2,2,0,0,1,0,0
2,1,2,1,0,0,1,0,3,2,0,...,4,1,2,2,2,0,0,1,0,0
3,1,2,1,1,0,0,0,3,2,0,...,4,1,2,2,1,0,0,1,0,0
4,1,1,1,0,0,1,0,3,2,0,...,4,0,2,3,1,0,0,1,0,0


In [42]:
accident_df.columns

Index(['HOUR_I_R', 'ALCHL_I', 'ALIGN_I', 'STRATUM_R', 'WRK_ZONE', 'WKDY_I_R',
       'INT_HWY', 'LGTCON_I_R', 'MANCOL_I_R', 'PED_ACC_R', 'RELJCT_I_R',
       'REL_RWY_R', 'PROFIL_I_R', 'SPD_LIM', 'SUR_COND', 'TRAF_CON_R',
       'TRAF_WAY', 'VEH_INVL', 'WEATHER_R', 'INJURY_CRASH', 'NO_INJ_I',
       'PRPTYDMG_CRASH', 'FATALITIES', 'MAX_SEV_IR'],
      dtype='object')

In [43]:
accident_df.describe()

Unnamed: 0,HOUR_I_R,ALCHL_I,ALIGN_I,STRATUM_R,WRK_ZONE,WKDY_I_R,INT_HWY,LGTCON_I_R,MANCOL_I_R,PED_ACC_R,...,SUR_COND,TRAF_CON_R,TRAF_WAY,VEH_INVL,WEATHER_R,INJURY_CRASH,NO_INJ_I,PRPTYDMG_CRASH,FATALITIES,MAX_SEV_IR
count,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,...,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0,42183.0
mean,0.429344,1.912832,1.131546,0.49162,0.022616,0.771614,0.150321,1.492521,1.337079,0.040514,...,1.29071,0.516322,1.477491,1.816964,1.142783,0.497736,0.778702,0.491217,0.011047,0.51983
std,0.494988,0.282084,0.338,0.499936,0.148677,0.419797,0.418952,0.789874,0.929756,0.197164,...,0.780524,0.749417,0.584851,0.684843,0.349855,0.500001,1.035169,0.499929,0.104524,0.521256
min,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,...,1.0,0.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,1.0,2.0,1.0,1.0,0.0,1.0,0.0,2.0,2.0,0.0,...,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0
max,1.0,2.0,2.0,1.0,1.0,1.0,9.0,3.0,2.0,1.0,...,9.0,2.0,3.0,23.0,2.0,1.0,31.0,1.0,1.0,2.0


In [44]:
accident_df['SPD_LIM'].describe()

count    42183.000000
mean        43.547875
std         12.948396
min          5.000000
25%         35.000000
50%         40.000000
75%         55.000000
max         75.000000
Name: SPD_LIM, dtype: float64

In [45]:
print("Number of rows\n", accident_df.count())
reduce_df = accident_df.dropna()
print("\nNumber of rows after removing rows with missing values\n", reduce_df.count())

Number of rows
 HOUR_I_R          42183
ALCHL_I           42183
ALIGN_I           42183
STRATUM_R         42183
WRK_ZONE          42183
WKDY_I_R          42183
INT_HWY           42183
LGTCON_I_R        42183
MANCOL_I_R        42183
PED_ACC_R         42183
RELJCT_I_R        42183
REL_RWY_R         42183
PROFIL_I_R        42183
SPD_LIM           42183
SUR_COND          42183
TRAF_CON_R        42183
TRAF_WAY          42183
VEH_INVL          42183
WEATHER_R         42183
INJURY_CRASH      42183
NO_INJ_I          42183
PRPTYDMG_CRASH    42183
FATALITIES        42183
MAX_SEV_IR        42183
dtype: int64

Number of rows after removing rows with missing values
 HOUR_I_R          42183
ALCHL_I           42183
ALIGN_I           42183
STRATUM_R         42183
WRK_ZONE          42183
WKDY_I_R          42183
INT_HWY           42183
LGTCON_I_R        42183
MANCOL_I_R        42183
PED_ACC_R         42183
RELJCT_I_R        42183
REL_RWY_R         42183
PROFIL_I_R        42183
SPD_LIM           42183
SU

In [46]:
accident_df.dtypes

HOUR_I_R          int64
ALCHL_I           int64
ALIGN_I           int64
STRATUM_R         int64
WRK_ZONE          int64
WKDY_I_R          int64
INT_HWY           int64
LGTCON_I_R        int64
MANCOL_I_R        int64
PED_ACC_R         int64
RELJCT_I_R        int64
REL_RWY_R         int64
PROFIL_I_R        int64
SPD_LIM           int64
SUR_COND          int64
TRAF_CON_R        int64
TRAF_WAY          int64
VEH_INVL          int64
WEATHER_R         int64
INJURY_CRASH      int64
NO_INJ_I          int64
PRPTYDMG_CRASH    int64
FATALITIES        int64
MAX_SEV_IR        int64
dtype: object

보고된 사고에 부상이 동반될지(MAX_SEV_IR = 1 OR 2), 부상이 없을지(MAX_SEV_IR = 0)를 예측하기 위해 MAX_SEV_IR = 1 또는 2이면 값이 "Yes"가 되고 그렇지 않으면 "No"가 되는 `INJURY`라는 가변수를 생성.

In [47]:
accident_df['INJURY'] = accident_df['MAX_SEV_IR'].apply(lambda x: "Yes" if x > 0 else "No")
accident_df.head()

Unnamed: 0,HOUR_I_R,ALCHL_I,ALIGN_I,STRATUM_R,WRK_ZONE,WKDY_I_R,INT_HWY,LGTCON_I_R,MANCOL_I_R,PED_ACC_R,...,TRAF_CON_R,TRAF_WAY,VEH_INVL,WEATHER_R,INJURY_CRASH,NO_INJ_I,PRPTYDMG_CRASH,FATALITIES,MAX_SEV_IR,INJURY
0,0,2,2,1,0,1,0,3,0,0,...,0,3,1,1,1,1,0,0,1,Yes
1,1,2,1,0,0,1,1,3,2,0,...,0,3,2,2,0,0,1,0,0,No
2,1,2,1,0,0,1,0,3,2,0,...,1,2,2,2,0,0,1,0,0,No
3,1,2,1,1,0,0,0,3,2,0,...,1,2,2,1,0,0,1,0,0,No
4,1,1,1,0,0,1,0,3,2,0,...,0,2,3,1,0,0,1,0,0,No


## Part a

이 데이터셋의 정보를 사용하여, 막 보고된 사고에 추가 정보가 없다면 예측은 무엇이 되어야 하는가?(INJURY = Yes 또는 No?) 그 이유는 무엇인가?

In [48]:
# create a value count table based on 'INJURY'
injury_value_cnt = accident_df['INJURY'].value_counts()
print(injury_value_cnt)

INJURY
Yes    21462
No     20721
Name: count, dtype: int64


In [49]:
from pandas.api.types import CategoricalDtype

# calculate probability of injury
inj_prob = "{:.2%}".format(injury_value_cnt["Yes"] / (injury_value_cnt["Yes"] + injury_value_cnt["No"]))
print("probability of injury: ",inj_prob)

probability of injury:  50.88%


### Ans)

**주어진 데이터 세트의 사고 중 약 51%(50.88%)가 부상으로 이어졌으므로, 사고가 발생하면 부상으로 이어질 가능성이 조금 더 높다고 예측해야 합니다.**

## Part b

이 데이터셋의 처음 12개 레코드를 선택하여 응답값(INJURY)과 2개의 예측 변수 WEATHER_R과 TRAF_CON_R만 고려하자.

In [50]:
# Convert all columns to categorical type
for col in accident_df.columns:
    accident_df[col] = accident_df[col].astype('category')
    
# Create a new subset with only the required records
new_df = accident_df.loc[0:11, ["INJURY", "WEATHER_R", "TRAF_CON_R"]]
new_df

Unnamed: 0,INJURY,WEATHER_R,TRAF_CON_R
0,Yes,1,0
1,No,2,0
2,No,2,1
3,No,1,1
4,No,1,0
5,Yes,2,0
6,No,2,0
7,Yes,1,0
8,No,2,0
9,No,2,0


### i.
12개의 레코드들에 대해 두 예측 변수의 함수로서 INJURY를 검토하는 피벗 테이블을 작성하십시오. 피벗 테이블에서 이 세 변수를 모두 행/열로 사용하시오.

In [51]:
!pip install pivottablejs

[0m

In [52]:
import pivottablejs

# Display the pivot table for the new_df
pivottablejs.pivot_ui(new_df)

### ii.
예측 변수들의 가능한 조합 6가지에 대해, 부상이 있을(INJURY = Yes) 정확한 베이즈 조건부 확률(Exact Bayes conditional probability)을 계산하시오.

In [53]:
# Calculating probabilities

# P(Injury=yes|WEATHER_R = 1, TRAF_CON_R =0)
numerator1 = 2/3 * 3/12
denominator1 = 3/12
prob1 = numerator1 / denominator1

# P(Injury=yes|WEATHER_R = 1, TRAF_CON_R =1)
numerator2 = 0 * 3/12
denominator2 = 1/12
prob2 = numerator2 / denominator2 if denominator2 != 0 else np.nan

# P(Injury=yes| WEATHER_R = 1, TRAF_CON_R =2)
numerator3 = 0 * 3/12
denominator3 = 1/12
prob3 = numerator3 / denominator3 if denominator3 != 0 else np.nan

# P(Injury=yes| WEATHER_R = 2, TRAF_CON_R =0)
numerator4 = 1/3 * 3/12
denominator4 = 6/12
prob4 = numerator4 / denominator4

# P(Injury=yes| WEATHER_R = 2, TRAF_CON_R =1)
numerator5 = 0 * 3/12
denominator5 = 1/12
prob5 = numerator5 / denominator5 if denominator5 != 0 else np.nan

# P(Injury=yes| WEATHER_R = 2, TRAF_CON_R =2)
numerator6 = 0 * 3/12
denominator6 = 0
prob6 = numerator6 / denominator6 if denominator6 != 0 else np.nan

# Creating DataFrame
options = [1, 2, 3, 4, 5, 6]
probabilities = [prob1, prob2, prob3, prob4, prob5, prob6]
prob_df = pd.DataFrame({'Option #': options, 'Probability': probabilities})

# Rounding probabilities to 3 decimal places
prob_df = prob_df.round({'Probability': 3})
prob_df


Unnamed: 0,Option #,Probability
0,1,0.667
1,2,0.0
2,3,0.0
3,4,0.167
4,5,0.0
5,6,


### iii.
이 확률값들과 컷오프 값 0.5를 사용해 이 12개의 사고를 분류하시오.

In [54]:
# Add probability results to the new_df
new_df_prob = new_df.copy()

# Display the head of the updated DataFrame
print(new_df_prob.head())

new_df_prob

  INJURY WEATHER_R TRAF_CON_R
0    Yes         1          0
1     No         2          0
2     No         2          1
3     No         1          1
4     No         1          0


Unnamed: 0,INJURY,WEATHER_R,TRAF_CON_R
0,Yes,1,0
1,No,2,0
2,No,2,1
3,No,1,1
4,No,1,0
5,Yes,2,0
6,No,2,0
7,Yes,1,0
8,No,2,0
9,No,2,0


In [55]:
# Probability of injury
prob_inj = [0.667, 0.167, 0, 0, 0.667, 0.167, 0.167, 0.667, 0.167, 0.167, 0.167, 0]

# Add the PROB_INJURY column to the new_df_prob DataFrame
new_df_prob['PROB_INJURY'] = prob_inj

# Add a column for injury prediction based on a cutoff of 0.5
new_df_prob['PREDICT_PROB'] = np.where(new_df_prob['PROB_INJURY'] > 0.5, "yes", "no")

# Display the updated DataFrame
new_df_prob


Unnamed: 0,INJURY,WEATHER_R,TRAF_CON_R,PROB_INJURY,PREDICT_PROB
0,Yes,1,0,0.667,yes
1,No,2,0,0.167,no
2,No,2,1,0.0,no
3,No,1,1,0.0,no
4,No,1,0,0.667,yes
5,Yes,2,0,0.167,no
6,No,2,0,0.167,no
7,Yes,1,0,0.667,yes
8,No,2,0,0.167,no
9,No,2,0,0.167,no


### iv.
WEATHER_R = 1과 TRAF_CON_R = 1이 주어졌을 때, 부상이 있을 나이브 베이즈 조건부 확률을 손으로 계산하시오.

In [56]:
# P(INJURY=Yes| WEATHER_R = 1, TRAF_CON_R = 1):
# 사고에서 부상의 확률
# = (INJURY=Yes인 경우 WEATHER_R = 1의 비율) 
#    * (INJURY=Yes인 경우 TRAF_CON_R = 1의 비율)
#    * (모든 경우에서 INJURY=yes의 비율)
prob = 2/3 * 0/3 * 3/12
prob

0.0

### v.
scikit-learn을 사용해 2개의 예측 변수를 갖는 12개의 레코드들에 대해 나이브 베이즈 분류기를 실행하시오. 12개 레코드 모두에 대해 확률값과 분류 결과를 구해서 모델의 출력값을 살펴보시오. 이 출력값을 정확한 베이즈 분류 결과와 비교하시오. 분류 결과가 동일한가? 관측치들의 랭킹(=순위)은 동일한가?

In [57]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import cohen_kappa_score, confusion_matrix

In [58]:
new_df = new_df.copy()
new_df

# 예측 변수(X)와 타겟 변수(y)
X = new_df[["WEATHER_R", "TRAF_CON_R"]]
y = new_df["INJURY"]


# Gaussian Naive Bayes 모델 생성 및 교차 검증
nb_model = GaussianNB()
scores = cross_val_score(nb_model, X, y, cv=3, scoring="accuracy")
print("Cross-validated Accuracy:", scores.mean())

# 모델 훈련
nb_model.fit(X, y)

# 확률 예측과 클래스 예측
pred_probs = nb_model.predict_proba(X)
pred_classes = nb_model.predict(X)

# 출력 결과 비교
print("\nProbabilities:\n", pred_probs)
print("\nPredicted Classes:\n", pred_classes)

# 혼동 행렬과 Kappa 점수 계산
conf_matrix = confusion_matrix(y, pred_classes, labels=["No", "Yes"])
kappa_score = cohen_kappa_score(y, pred_classes)
print("\nConfusion Matrix:\n", conf_matrix)
print("Kappa Score:", kappa_score)

# 새로운 데이터프레임에 예측 결과 추가
new_df["PROB_INJURY_YES"] = pred_probs[:, 1]
new_df["PREDICTED_CLASS"] = pred_classes

print("Exact Bayes conditional probability\n", new_df_prob)
print()
print("scikit_learn DataFrame:\n", new_df)

Cross-validated Accuracy: 0.5833333333333334

Probabilities:
 [[3.30536253e-05 9.99966946e-01]
 [1.48119025e-04 9.99851881e-01]
 [1.00000000e+00 0.00000000e+00]
 [1.00000000e+00 0.00000000e+00]
 [3.30536253e-05 9.99966946e-01]
 [1.48119025e-04 9.99851881e-01]
 [1.48119025e-04 9.99851881e-01]
 [3.30536253e-05 9.99966946e-01]
 [1.48119025e-04 9.99851881e-01]
 [1.48119025e-04 9.99851881e-01]
 [1.48119025e-04 9.99851881e-01]
 [1.00000000e+00 0.00000000e+00]]

Predicted Classes:
 ['Yes' 'Yes' 'No' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No']

Confusion Matrix:
 [[3 6]
 [0 3]]
Kappa Score: 0.19999999999999996
Exact Bayes conditional probability
    INJURY WEATHER_R TRAF_CON_R  PROB_INJURY PREDICT_PROB
0     Yes         1          0        0.667          yes
1      No         2          0        0.167           no
2      No         2          1        0.000           no
3      No         1          1        0.000           no
4      No         1          0        0.667          yes
5 