<a href="https://colab.research.google.com/github/windyday0622/windyday/blob/main/m5_%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D/%20ML_%EC%8B%A4%EC%8A%B5%EA%B3%BC%EC%A0%9C1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# california_housing 데이터셋으로 아래 사항을 참조하여 주택 가격을 예측하는 회귀 모델을 개발하세요. (실습에서는 9개 정도의 회귀 모델을 모두 적용해야 함)
- 전체 회귀모델을 적용하여 최적 모델 선정
- 각 모델별 최적 하이퍼파라미터 선정 - GridSearchCV 활용(파이프라인도 사용해도 됨)
- 평가 지표 MSE 기준으로 가장 성능이 좋은 모델과 파라미터를 적용하여 평가 결과를 출력

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Load the California housing dataset; as_frame=True is requested so the
# features and target are taken from the returned bunch as pandas objects.
california = fetch_california_housing(as_frame=True)
X, y = california.data, california.target

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear regression baseline. It has no hyperparameters searched here, so it
# is fitted directly on the training split and scored once on the test split.
linear_pipeline = Pipeline(steps=[('regressor', LinearRegression())])
linear_pipeline.fit(X_train, y_train)
linear_reg_pred = linear_pipeline.predict(X_test)
linear_reg_mse = mean_squared_error(y_true=y_test, y_pred=linear_reg_pred)

# Decision tree regressor: search max_depth with 5-fold cross-validation,
# scoring each candidate by negative MSE (higher is better).
dt_pipeline = Pipeline(steps=[('regressor', DecisionTreeRegressor(random_state=42))])
dt_param_grid = {'regressor__max_depth': [3, 5, 7]}
dt_grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
dt_grid_search.fit(X_train, y_train)

# Score the best pipeline found by the search on the held-out test split.
dt_reg_best = dt_grid_search.best_estimator_
dt_reg_pred = dt_reg_best.predict(X_test)
dt_reg_mse = mean_squared_error(y_true=y_test, y_pred=dt_reg_pred)

# Random forest regressor: search the number of trees and the tree depth
# jointly (3 x 3 = 9 candidates) with 5-fold cross-validation on negative MSE.
rf_pipeline = Pipeline(steps=[('regressor', RandomForestRegressor(random_state=42))])
rf_param_grid = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__max_depth': [3, 5, 7],
}
rf_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
rf_grid_search.fit(X_train, y_train)

# Score the best pipeline found by the search on the held-out test split.
rf_reg_best = rf_grid_search.best_estimator_
rf_reg_pred = rf_reg_best.predict(X_test)
rf_reg_mse = mean_squared_error(y_true=y_test, y_pred=rf_reg_pred)

# Test-set MSE of each fitted model (lower is better).
for label, test_mse in (
    ("Linear Regression MSE:", linear_reg_mse),
    ("Decision Tree Regression MSE:", dt_reg_mse),
    ("Random Forest Regression MSE:", rf_reg_mse),
):
    print(label, test_mse)

# Hyperparameters selected by each grid search.
for label, searched in (
    ("Best Decision Tree Regressor:", dt_grid_search),
    ("Best Random Forest Regressor:", rf_grid_search),
):
    print(label, searched.best_params_)


Linear Regression MSE: 0.5558915986952444
Decision Tree Regression MSE: 0.4565857103929165
Random Forest Regression MSE: 0.3777064239717286
Best Decision Tree Regressor: {'regressor__max_depth': 7}
Best Random Forest Regressor: {'regressor__max_depth': 7, 'regressor__n_estimators': 100}


In [None]:
# 수정한 코드
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Load the California housing dataset and take features / target from it.
california = fetch_california_housing(as_frame=True)
X, y = california.data, california.target

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors as (display name, unfitted estimator, hyperparameter
# grid) triples. Grid keys carry the 'regressor__' prefix because every
# estimator is installed as the 'regressor' step of a Pipeline by the search
# loop. An empty grid leaves the estimator at its constructor settings.
models = [
    (
        'Linear Regression',
        LinearRegression(),
        {},
    ),
    (
        'Ridge Regression',
        Ridge(),
        {'regressor__alpha': [0.1, 1.0, 10.0]},
    ),
    (
        'Lasso Regression',
        Lasso(),
        {'regressor__alpha': [0.1, 1.0, 10.0]},
    ),
    (
        'ElasticNet Regression',
        ElasticNet(),
        {'regressor__alpha': [0.1, 1.0, 10.0]},
    ),
    (
        'Decision Tree Regression',
        DecisionTreeRegressor(random_state=42),
        {'regressor__max_depth': [3, 5, 7]},
    ),
    (
        'Random Forest Regression',
        RandomForestRegressor(random_state=42),
        {
            'regressor__n_estimators': [100, 300, 500],
            'regressor__max_depth': [3, 5, 7],
        },
    ),
    (
        'Gradient Boosting Regression',
        GradientBoostingRegressor(random_state=42),
        {
            'regressor__n_estimators': [100, 300, 500],
            'regressor__max_depth': [3, 5, 7],
        },
    ),
    (
        'Support Vector Regression',
        SVR(),
        {
            'regressor__C': [0.1, 1.0, 10.0],
            'regressor__epsilon': [0.01, 0.1, 1.0],
        },
    ),
    (
        'K-Nearest Neighbors Regression',
        KNeighborsRegressor(),
        {'regressor__n_neighbors': [3, 5, 7]},
    ),
]

# One record per model: its name, the selected hyperparameters, and test MSE.
results = []

# Run a 5-fold grid search for every candidate and score the winner of each
# search on the held-out test split.
for name, model, param_grid in models:
    # The scaler is a pipeline step, so it is fitted together with the
    # regressor each time the pipeline is fitted.
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('regressor', model)])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Test-set MSE of the best pipeline found for this model.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

    results.append({'model': name, 'best_params': grid_search.best_params_, 'mse': mse})

# Tabulate the per-model results, best (lowest test MSE) first.
# NOTE(review): the ranking key is the MSE measured on the test split, so the
# score reported for the winner is an optimistic estimate of its performance.
results_df = pd.DataFrame(results).sort_values(by='mse')
print(results_df)

# The first row after sorting is the best-performing model.
top_row = results_df.iloc[0]
best_model_name = top_row['model']
best_params = top_row['best_params']
best_mse = top_row['mse']

print(f"Best Model: {best_model_name}")
print(f"Best Parameters: {best_params}")
print(f"Test MSE: {best_mse:.4f}")

                            model  \
6    Gradient Boosting Regression   
7       Support Vector Regression   
5        Random Forest Regression   
8  K-Nearest Neighbors Regression   
4        Decision Tree Regression   
1                Ridge Regression   
0               Linear Regression   
3           ElasticNet Regression   
2                Lasso Regression   

                                         best_params       mse  
6  {'regressor__max_depth': 5, 'regressor__n_esti...  0.209262  
7  {'regressor__C': 10.0, 'regressor__epsilon': 0.1}  0.323697  
5  {'regressor__max_depth': 7, 'regressor__n_esti...  0.377614  
8                      {'regressor__n_neighbors': 7}  0.428334  
4                        {'regressor__max_depth': 7}  0.456586  
1                          {'regressor__alpha': 0.1}  0.555888  
0                                                 {}  0.555892  
3                          {'regressor__alpha': 0.1}  0.635857  
2                          {'regressor__alph

실습과제 예시

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Load the breast cancer dataset and take features / labels from it.
data = load_breast_cancer()
X, y = data.data, data.target

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline: standardize features -> reduce dimensionality with truncated SVD
# -> logistic regression classifier.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # n_components given here is only the starting value; the grid below
    # overrides it. random_state pins the randomized SVD solver so repeated
    # runs give the same components and the same search result.
    ('svd', TruncatedSVD(n_components=2, random_state=42)),
    ('clf', LogisticRegression(max_iter=1000))
])

# Hyperparameter grid. Keys must be '<step name>__<parameter>' using the step
# names declared in the pipeline above: the classifier step is named 'clf',
# so its regularization strength is addressed as 'clf__C'. A prefix that
# matches no step makes GridSearchCV.fit fail with an invalid-parameter error.
param_grid = {
    'svd__n_components': [2, 5, 10],
    'clf__C': [0.1, 1.0, 10.0]
}

# Tune the pipeline hyperparameters with 5-fold cross-validation, selecting
# the candidate with the highest accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the selected hyperparameter combination.
print('Best parameters found:', grid_search.best_params_)

# User-defined evaluation helper
def evaluate_model(model, X_test, y_test):
    """Print accuracy, a classification report and ROC AUC for a classifier.

    Args:
        model: Fitted object exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to evaluate on.
        y_test: True labels for ``X_test``.

    Returns:
        None. All metrics are written to standard output.
    """
    predicted_labels = model.predict(X_test)
    # ROC AUC is computed from the second column (index 1) of predict_proba.
    positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    report = classification_report(y_test, predicted_labels)
    roc_auc = roc_auc_score(y_test, positive_scores)

    # Emit the metrics in a fixed order: accuracy, report, ROC AUC.
    print(f"Test Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    print(f"Test ROC AUC: {roc_auc:.4f}")

# Evaluate on the held-out test split, passing the fitted GridSearchCV
# object itself as the model.
evaluate_model(grid_search, X_test, y_test)