In [11]:
# ============================================
# 화이트 와인 품질 '회귀' 예측 (그래프 없음)
# 모델: Linear Regression, KNN, Decision Tree, Random Forest
# 출력: 모델별 RMSE, R²  +  Linear Regression 기울기/절편
# ============================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 회귀 알고리즘
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Single seed shared by the train/test split and every estimator that accepts
# a random_state, so a fresh "Restart & Run All" reproduces the same numbers.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any code that draws from np.random directly.
np.random.seed(RANDOM_STATE)

In [12]:
# --------------------------------------------
# 1) 데이터 로드
# --------------------------------------------
CSV_PATH = r"/content/drive/MyDrive/Col/머신러닝 4주차/winequality-white.csv"
# Fields are parsed with ';' as the separator rather than pandas' default ','.
df = pd.read_csv(CSV_PATH, sep=";")

# Count missing values per column once and reuse the result for both printouts.
missing_per_column = df.isnull().sum()
print("\n[결측치 확인]")
print(missing_per_column)
print(f"\n→ 전체 결측치 개수: {missing_per_column.sum()}")


[결측치 확인]
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

→ 전체 결측치 개수: 0


In [13]:
# --------------------------------------------
# 2) 입력/타깃 분리
# --------------------------------------------
# Target is the "quality" column; every remaining column is used as a feature.
y = df["quality"]
X = df.drop(columns="quality")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [14]:
# --------------------------------------------
# 3) 모델 정의
#    - KNN: 스케일 민감 → StandardScaler 파이프라인
# --------------------------------------------
# Plain least-squares baseline, fit on the features as given (no scaling step).
linear_model = LinearRegression()

# KNN is distance-based, so features are standardized inside the pipeline
# before the neighbor search.
knn_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsRegressor(n_neighbors=7)),
    ]
)

# Depth-limited single tree, seeded for reproducibility.
tree_model = DecisionTreeRegressor(max_depth=8, random_state=RANDOM_STATE)

# 300-tree forest; n_jobs=-1 uses all available cores.
forest_model = RandomForestRegressor(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Display name -> estimator. Insertion order is the order the models are trained in.
models = {
    "Linear Regression": linear_model,
    "KNN Regressor (k=7)": knn_pipeline,
    "Decision Tree (max_depth=8)": tree_model,
    "Random Forest (n=300)": forest_model,
}

In [15]:
# --------------------------------------------
# 4) 학습 & 평가  (RMSE = sqrt(MSE) 방식으로 계산)
# --------------------------------------------
def rmse_score(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The value is the square root of ``mean_squared_error(y_true, y_pred)``,
    converted to a built-in ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

# Fit each candidate on the training split and score it on the held-out split.
results = []      # one [name, rmse, r2] row per model
predictions = {}  # model name -> predictions on X_test
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained onto it.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    predictions[name] = y_pred
    rmse, r2 = rmse_score(y_test, y_pred), r2_score(y_test, y_pred)
    results.append([name, rmse, r2])

# Ascending sort on RMSE puts the lowest-error model on the first row.
res_df = pd.DataFrame(results, columns=["Model", "RMSE", "R²"])
res_df = res_df.sort_values("RMSE")
print("\n=== 회귀 모델 성능 비교 ===")
print(res_df.to_string(index=False))


=== 회귀 모델 성능 비교 ===
                      Model     RMSE       R²
      Random Forest (n=300) 0.587599 0.554184
        KNN Regressor (k=7) 0.689014 0.387016
Decision Tree (max_depth=8) 0.740353 0.292264
          Linear Regression 0.754337 0.265275


In [16]:
# --------------------------------------------
# 5) 선형회귀: 기울기(coef_) · 절편(intercept_) 출력
#    (특성별 기울기를 절대값 기준으로 정렬)
# --------------------------------------------
lr = models["Linear Regression"]

# Intercept
print(f"\n[Linear Regression] Intercept(절편): {lr.intercept_:.6f}")

# Coefficient table, ordered from largest to smallest absolute value.
# NOTE(review): the "Linear Regression" entry in `models` has no scaling step,
# so each coefficient is per raw feature unit; the magnitude ordering reflects
# feature units as much as influence on the prediction.
coef_df = pd.DataFrame({"Feature": X.columns, "Coefficient": lr.coef_})
by_magnitude = coef_df["Coefficient"].abs().sort_values(ascending=False).index
coef_df = coef_df.loc[by_magnitude]

print("\n[Linear Regression] Coefficients (기울기, 절대값 내림차순)")
print(coef_df.to_string(index=False))


[Linear Regression] Intercept(절편): 124.393915

[Linear Regression] Coefficients (기울기, 절대값 내림차순)
             Feature  Coefficient
             density  -124.264125
    volatile acidity    -1.914884
           sulphates     0.649073
                  pH     0.600700
             alcohol     0.229009
      residual sugar     0.071240
         citric acid    -0.061303
       fixed acidity     0.045907
           chlorides    -0.026475
 free sulfur dioxide     0.005119
total sulfur dioxide    -0.000242
