In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 데이터 로드 
data = pd.read_csv('C:/Users/wjdqh/injige/car_evaluation.csv')
data

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [4]:
# 컬럼 확인
data.columns

Index(['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc'], dtype='object')

In [15]:
# 필수 컬럼 확인
required_columns = {'vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc'}
if not required_columns.issubset(data.columns):
    missing_cols = required_columns - set(data.columns)
    raise ValueError(f"Error: Missing columns in dataset - {missing_cols}")

In [6]:
# 불필요한 컬럼 제거 (필수 컬럼 제외)
data = data[list(required_columns)]

X = data.drop(columns=['unacc'])  # 특성(X)에서 타겟(y) 제거
y = data['unacc']  # 타겟 변수 설정

# 변환 전 Sex 컬럼 값 분포 확인
print("column value counts before encoding:")
print(data['vhigh'].value_counts())
print(data['vhigh.1'].value_counts())
print(data['2'].value_counts())
print(data['2.1'].value_counts())
print(data['small'].value_counts())
print(data['low'].value_counts())
print(data['unacc'].value_counts())

column value counts before encoding:
vhigh
high     432
med      432
low      432
vhigh    431
Name: count, dtype: int64
vhigh.1
high     432
med      432
low      432
vhigh    431
Name: count, dtype: int64
2
3        432
4        432
5more    432
2        431
Name: count, dtype: int64
2.1
4       576
more    576
2       575
Name: count, dtype: int64
small
med      576
big      576
small    575
Name: count, dtype: int64
low
med     576
high    576
low     575
Name: count, dtype: int64
unacc
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64


In [18]:
# LabelEncoder 초기화
label_encoder = LabelEncoder()

# 전체 컬럼 인코딩

X = data.drop(columns=['unacc']).apply(label_encoder.fit_transform)  # 모든 특성 컬럼 인코딩
y = label_encoder.fit_transform(data['unacc'])  # 타겟 변수도 인코딩

# 변환된 데이터 확인
print(data_encoded.head())

# 결측값 확인
missing_values = X.isnull().sum()
print("Missing values before handling:")
print(missing_values)


   vhigh  vhigh.1  2  2.1  small  low  unacc
0      3        3  0    0      2    2      2
1      3        3  0    0      2    0      2
2      3        3  0    0      1    1      2
3      3        3  0    0      1    2      2
4      3        3  0    0      1    0      2
Missing values before handling:
vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
dtype: int64
🔍 Missing values in dataset:
Series([], dtype: int64)


In [17]:
# 데이터 분할 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (일부 모델에 필요, but 트리 기반 모델은 필요 없음)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.astype(float))
X_test = scaler.transform(X_test.astype(float))

# 모델 리스트
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel='linear'),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# 모델 학습 및 평가
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))

Logistic Regression Accuracy: 0.6850
              precision    recall  f1-score   support

           0       0.32      0.16      0.21        77
           1       0.00      0.00      0.00        15
           2       0.73      0.95      0.83       237
           3       0.50      0.06      0.11        17

    accuracy                           0.68       346
   macro avg       0.39      0.29      0.28       346
weighted avg       0.60      0.68      0.62       346

Confusion Matrix for Logistic Regression:
[[ 12   0  64   1]
 [  2   0  13   0]
 [ 13   0 224   0]
 [ 11   0   5   1]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Accuracy: 0.9682
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        77
           1       0.91      0.67      0.77        15
           2       1.00      1.00      1.00       237
           3       0.79      0.88      0.83        17

    accuracy                           0.97       346
   macro avg       0.91      0.88      0.89       346
weighted avg       0.97      0.97      0.97       346

Confusion Matrix for Random Forest:
[[ 74   1   1   1]
 [  2  10   0   3]
 [  1   0 236   0]
 [  2   0   0  15]]
Decision Tree Accuracy: 0.9711
              precision    recall  f1-score   support

           0       0.96      0.95      0.95        77
           1       0.75      0.80      0.77        15
           2       1.00      1.00      1.00       237
           3       0.83      0.88      0.86        17

    accuracy                           0.97       346
   macro avg       0.89      0.91      0.90       346
weighted a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
