In [28]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# Read the dataset straight from the CSV file.
df = pd.read_csv("dataset/winequality-red.csv")

# The "quality" column is the regression target; every other column is a feature.
y = df["quality"].to_numpy()
X = df.drop(columns="quality").to_numpy()

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# z-score standardization: the scaler is fitted on the training rows only and the
# same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (1279, 11) Test shape: (320, 11)


In [29]:
# Prepend a column of ones to each design matrix so the first coefficient in
# `beta` plays the role of the intercept (bias) term.
X_train_bias = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_bias = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# Ordinary least-squares fit. lstsq minimizes ||X b - y||^2, i.e. it yields the
# same coefficients as the normal equation (X^T X)^-1 X^T y, but without forming
# an explicit inverse: inverting X^T X amplifies rounding error and raises
# LinAlgError when the matrix is singular (e.g. perfectly collinear features).
beta = np.linalg.lstsq(X_train_bias, y_train, rcond=None)[0]

# Predict on the held-out split.
y_pred_lr = X_test_bias @ beta

# Evaluate: root-mean-squared error and coefficient of determination.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print(f"RMSE: {rmse_lr:.4f}, R2: {r2_lr:.4f}")


RMSE: 0.6245, R2: 0.4032


In [30]:
# Single regression tree with default hyperparameters; only the seed is pinned.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the tree on the held-out split: R^2 and root-mean-squared error.
y_pred_dt = dt_model.predict(X_test)
r2_dt = r2_score(y_test, y_pred_dt)
rmse_dt = np.sqrt(
    mean_squared_error(y_test, y_pred_dt)
)

print(f"RMSE: {rmse_dt:.4f}, R2: {r2_dt:.4f}")

RMSE: 0.7826, R2: 0.0627


In [31]:
# Random forest of 100 trees, seeded so the fitted ensemble is repeatable.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the forest on the held-out split: R^2 and root-mean-squared error.
y_pred_rf = rf_model.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(
    mean_squared_error(y_test, y_pred_rf)
)

print(f"RMSE: {rmse_rf:.4f}, R2: {r2_rf:.4f}")

RMSE: 0.5483, R2: 0.5399


In [32]:
# Gradient-boosted trees: 100 boosting rounds at learning rate 0.1, fixed seed.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score the booster on the held-out split: R^2 and root-mean-squared error.
y_pred_xgb = xgb_model.predict(X_test)
r2_xgb = r2_score(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(
    mean_squared_error(y_test, y_pred_xgb)
)

print(f"RMSE: {rmse_xgb:.4f}, R2: {r2_xgb:.4f}")

RMSE: 0.5853, R2: 0.4758


In [33]:
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialization and batch shuffling repeat across runs; every other model in
# this notebook is already pinned with random_state=42.
set_random_seed(42)

# Carve 20% of the training split off as a validation set for early stopping.
X_train_dnn, X_val_dnn, y_train_dnn, y_val_dnn = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# DNN definition: two ReLU hidden layers (64, 32) and a single linear output unit.
dnn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

dnn_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Early stopping: halt once val_loss has not improved for 10 epochs and roll
# the model back to the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Training (at most 200 epochs; early stopping usually ends it sooner).
history = dnn_model.fit(
    X_train_dnn, y_train_dnn,
    validation_data=(X_val_dnn, y_val_dnn),
    epochs=200,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

# Prediction: predict() returns an (n, 1) array, flattened to match y_test.
y_pred_dnn = dnn_model.predict(X_test).flatten()

# RMSE is computed as sqrt(MSE), like the other cells. The `squared=False`
# argument of mean_squared_error is deprecated in scikit-learn 1.4 and removed
# in 1.6, so relying on it breaks on current releases.
rmse_dnn = np.sqrt(mean_squared_error(y_test, y_pred_dnn))
r2_dnn = r2_score(y_test, y_pred_dnn)

print(f"RMSE: {rmse_dnn:.4f}, R2: {r2_dnn:.4f}")

RMSE: 0.6413, R2: 0.3707


In [None]:
# Ranking by test RMSE in the recorded run (best to worst):
# Random Forest > XGBoost > Linear Regression > DNN > Decision Tree

# One (label, RMSE, R2) record per model, in the order the models were trained.
scores = [
    ("Linear Regression", rmse_lr, r2_lr),
    ("Decision Tree", rmse_dt, r2_dt),
    ("Random Forest", rmse_rf, r2_rf),
    ("XGBoost", rmse_xgb, r2_xgb),
    ("DNN", rmse_dnn, r2_dnn),
]

# Assemble the comparison table, rounding both metrics to four decimals.
results = pd.DataFrame(
    [(label, round(rmse, 4), round(r2, 4)) for label, rmse, r2 in scores],
    columns=["Model", "RMSE", "R2"],
)

print(results)

               Model    RMSE      R2
0  Linear Regression  0.6245  0.4032
1      Decision Tree  0.7826  0.0627
2      Random Forest  0.5483  0.5399
3            XGBoost  0.5853  0.4758
4                DNN  0.6413  0.3707
