In [67]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Single seed shared by the train/test split and every estimator that accepts
# `random_state` further down in this notebook.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_STATE)

In [68]:
# --------------------------------------------
# 1) Load the data
# --------------------------------------------
# The file is parsed with ';' as the field delimiter instead of pandas' default ','.
CSV_PATH = r"/content/drive/MyDrive/Col/머신러닝 4주차/winequality-white.csv"
df = pd.read_csv(CSV_PATH, delimiter=";")

In [69]:
# ============================================
# (Revision) 1-1) Added a missing-value check step
# ============================================
# Count the nulls per column once, then reuse the result for the grand total.
null_counts = df.isnull().sum()

print("\n[결측치 확인]")
print(null_counts)
print(f"\n→ 전체 결측치 개수: {null_counts.sum()}")
# ============================================


[결측치 확인]
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

→ 전체 결측치 개수: 0


In [70]:
# --------------------------------------------
# 2) Target transformation (classification)
# --------------------------------------------
def to_class(q: int) -> int:
    """Bucket a quality score into one of three ordinal classes.

    Args:
        q: Quality score to bucket.

    Returns:
        0 when ``q <= 4``, 1 when ``4 < q <= 6``, and 2 for everything else.
    """
    return 0 if q <= 4 else (1 if q <= 6 else 2)

# Target: the 3-class label derived from `quality`; features: every other column.
y = df["quality"].map(to_class)
X = df.drop(columns="quality")

class_counts = y.value_counts().sort_index()
print("\n[클래스 분포] (0=저, 1=중, 2=고)")
print(class_counts, "\n")

# Train/test split: 20% held out, stratified on y so both parts keep the
# same class proportions.
_split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = _split



[클래스 분포] (0=저, 1=중, 2=고)
quality
0     183
1    3655
2    1060
Name: count, dtype: int64 



In [71]:
# --------------------------------------------
# 3) Model definitions (every model sits behind a StandardScaler pipeline)
# --------------------------------------------
def _scaled_pipeline(clf):
    """Wrap ``clf`` in a two-step pipeline: StandardScaler ("scaler") -> ``clf`` ("clf").

    The step names are relied on later in the notebook (``named_steps["clf"]``),
    so they must stay "scaler" and "clf".
    """
    return Pipeline([("scaler", StandardScaler()), ("clf", clf)])


models = {
    # `multi_class` is deliberately not passed: it is deprecated since
    # scikit-learn 1.5 and scheduled for removal. With the lbfgs solver and a
    # 3-class target the default already fits the multinomial formulation, so
    # the fitted model is the same as with multi_class="multinomial".
    "LogisticRegression": _scaled_pipeline(
        LogisticRegression(solver="lbfgs", max_iter=200, random_state=RANDOM_STATE)
    ),
    "KNN(k=7)": _scaled_pipeline(KNeighborsClassifier(n_neighbors=7)),
    # Scaling has no effect on the tree-based models below; it is applied only
    # so that all four models share the same pipeline layout.
    "DecisionTree(max_depth=8)": _scaled_pipeline(
        DecisionTreeClassifier(max_depth=8, random_state=RANDOM_STATE)
    ),
    "RandomForest(n_estimators=300)": _scaled_pipeline(
        RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
    ),
}


In [72]:
# --------------------------------------------
# 4) Train & evaluate
# --------------------------------------------
# rows  -> one [model name, accuracy, macro F1] entry per model
# preds -> test-set predictions keyed by model name (reused by the report cell)
rows, preds = [], {}
for name, model in models.items():
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    preds[name] = y_pred

    acc = accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average="macro")
    rows.append([name, acc, f1m])

# Rank by macro F1 first and accuracy second, both descending.
res_df = (
    pd.DataFrame(rows, columns=["Model", "Accuracy", "Macro F1"])
    .sort_values(by=["Macro F1", "Accuracy"], ascending=False)
)

print("=== 분류 모델 성능 비교 ===")
print(res_df.to_string(index=False))

=== 분류 모델 성능 비교 ===
                         Model  Accuracy  Macro F1
RandomForest(n_estimators=300)  0.857143  0.629786
     DecisionTree(max_depth=8)  0.784694  0.531395
                      KNN(k=7)  0.795918  0.512615
            LogisticRegression  0.768367  0.423750


In [73]:
# --------------------------------------------
# 5) Detailed report for the best model
# --------------------------------------------
# res_df is sorted best-first, so its first row names the winning model.
best_name = res_df["Model"].iloc[0]
best_pred = preds[best_name]
print(f"\n[베스트 모델] {best_name}")

print("\n=== 상세 리포트 (classification_report) ===")
report_text = classification_report(y_test, best_pred, digits=4)
print(report_text)

print("=== 혼동행렬 ===")
# Rows are true classes and columns are predicted classes, in the order 0, 1, 2.
conf_mat = confusion_matrix(y_test, best_pred, labels=[0, 1, 2])
print(conf_mat)


[베스트 모델] RandomForest(n_estimators=300)

=== 상세 리포트 (classification_report) ===
              precision    recall  f1-score   support

           0     0.6667    0.1622    0.2609        37
           1     0.8691    0.9535    0.9093       731
           2     0.8107    0.6462    0.7192       212

    accuracy                         0.8571       980
   macro avg     0.7821    0.5873    0.6298       980
weighted avg     0.8488    0.8571    0.8437       980

=== 혼동행렬 ===
[[  6  30   1]
 [  3 697  31]
 [  0  75 137]]


In [74]:
# --------------------------------------------
# 6) Print the logistic-regression coefficients
# --------------------------------------------
# Best-effort display: any failure is reported as an [INFO] line instead of
# stopping the notebook.
try:
    logreg = models["LogisticRegression"].named_steps["clf"]
    # One row per class, one column per feature. The classifier sits behind a
    # StandardScaler, so the coefficients refer to the standardized features.
    coef_matrix = pd.DataFrame(
        data=logreg.coef_,
        index=logreg.classes_,
        columns=X.columns,
    )
    print("\n[로지스틱 회귀 계수 (클래스별)]")
    print(coef_matrix.to_string())
except Exception as exc:
    print("[INFO] 로지스틱 계수 출력 불가:", exc)




[로지스틱 회귀 계수 (클래스별)]
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide   density        pH  sulphates   alcohol
0      -0.054818          0.477433    -0.058582       -1.210162   0.163936            -0.277086             -0.087791  1.402008 -0.131747  -0.218673  0.046400
1      -0.187172         -0.066329     0.070572       -0.040279   0.104821             0.068785              0.062448  0.131010 -0.154231  -0.017729 -0.149802
2       0.241990         -0.411104    -0.011991        1.250441  -0.268757             0.208301              0.025344 -1.533018  0.285978   0.236402  0.103402


In [75]:
# --------------------------------------------
# 7) Print the random-forest feature importances
# --------------------------------------------
# Best-effort display: any failure is reported as an [INFO] line instead of
# stopping the notebook.
try:
    rf = models["RandomForest(n_estimators=300)"].named_steps["clf"]
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    importances = importances.sort_values(ascending=False)
    print("\n[랜덤포레스트 특성 중요도]")
    # Show at most the 11 highest-ranked features.
    top_importances = importances.head(11)
    print(top_importances.to_string())
except Exception as exc:
    print("[INFO] RF 중요도 출력 불가:", exc)


[랜덤포레스트 특성 중요도]
alcohol                 0.136870
density                 0.115398
free sulfur dioxide     0.092660
volatile acidity        0.090587
chlorides               0.090224
residual sugar          0.086761
total sulfur dioxide    0.085032
pH                      0.084732
sulphates               0.076071
citric acid             0.071677
fixed acidity           0.069988
