제 4차 과제

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

# Load the dataset into a DataFrame.
# NOTE(review): 'your_dataset.csv' looks like a placeholder path — confirm it
# points at the real CSV before running.
df = pd.read_csv('your_dataset.csv', encoding='utf-8')

# Quick inspection: first rows, per-column dtype / non-null summary, and
# descriptive statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Separate the features from the target variable first, so that the column
# lists built below can never contain the target.
X = df.drop(columns=['PRP'])  # feature matrix
y = df['PRP']  # target variable

# Split the feature columns into numeric and categorical groups.
# These lists are later handed to the ColumnTransformer, which selects the
# named columns from X; deriving them from `df` would include a numeric 'PRP'
# column that X no longer has and make fitting fail.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Split into training and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the preprocessing and model pipeline.

# Numeric columns: fill missing values with the column median, then scale.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal 'missing',
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by an ordinary linear regression.
# The step names are referenced by the hyperparameter grid ('regressor__...').
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Hyperparameter grid for the 'regressor' step of the pipeline.
# LinearRegression's `normalize` option was deprecated in scikit-learn 1.0 and
# removed in 1.2, so a 'regressor__normalize' entry makes the search fail with
# an invalid-parameter error on current releases. Feature scaling is already
# handled by the StandardScaler in the preprocessing step, so the search is
# run over `fit_intercept` instead.
param_grid = {
    'regressor__fit_intercept': [True, False],
}

# Select the best configuration with 5-fold cross-validation, scored by R^2.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Report the best pipeline found by the grid search.
best_model = grid_search.best_estimator_
print("Best Model:", best_model)

# Run 5-fold cross-validation of that pipeline on the training split and
# print the per-fold scores followed by their mean.
cv_scores = cross_val_score(
    best_model,
    X_train,
    y_train,
    cv=5,
)
for label, value in (
    ("Cross Validation Scores:", cv_scores),
    ("Mean CV Score:", cv_scores.mean()),
):
    print(label, value)

# Predict on the held-out test split and evaluate the predictions.
y_pred = best_model.predict(X_test)

# Compute the three regression metrics in one pass, in the order MSE, MAE, R^2.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

for label, value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R^2 Score:", r2),
):
    print(label, value)

# Cross-validation on the test split.
# NOTE(review): cross_val_score refits clones of the pipeline on folds of the
# test data, so these scores do not describe the model fitted on the training
# split — confirm this is intended.
cv_scores_test = cross_val_score(
    best_model,
    X_test,
    y_test,
    cv=5,
)
for label, value in (
    ("Cross Validation Scores (Test Data):", cv_scores_test),
    ("Mean CV Score (Test Data):", cv_scores_test.mean()),
):
    print(label, value)


FileNotFoundError: [Errno 2] No such file or directory: 'cpu_performance_dataset.csv'