In [3]:
import tarfile
import urllib.request
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it when needed.

    The archive is downloaded to ``datasets/housing.tgz`` only if it is not
    already there, and it is unpacked whenever ``datasets/housing/housing.csv``
    is missing. Checking for the CSV (rather than only for the tarball) means
    an archive that was downloaded but never extracted still loads correctly.

    Returns:
        pandas.DataFrame: the parsed contents of
        ``datasets/housing/housing.csv``.

    Raises:
        urllib.error.URLError: if the archive has to be downloaded and the
            request fails.
        tarfile.TarError: if the archive cannot be read or extracted.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            # Download under a temporary name and rename afterwards, so an
            # interrupted transfer never leaves a truncated file at the path
            # the "already downloaded" check looks at.
            partial_path = tarball_path.with_name(tarball_path.name + ".part")
            urllib.request.urlretrieve(url, partial_path)
            partial_path.replace(tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network: use the "data" extraction
            # filter when this interpreter provides it, which refuses members
            # that would be written outside the destination directory.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

# Load the housing CSV into a DataFrame using the helper defined above.
housing = load_housing_data()


In [None]:

# Split data into features (X) and target variable (y)
X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"]

# Select feature columns by dtype rather than by hard-coded name.
# NOTE(review): the previous lists ('roomcnt', 'age', 'sex', 'dis', 'rm')
# look like they were written for a different dataset; ColumnTransformer
# raises at fit time when a listed column is absent from X.
numerical_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

# Preprocessing for numerical features: fill missing values with the column
# median first, so no NaN reaches the model, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features: one-hot encode; categories that were
# not seen during fit are encoded as all zeros instead of raising.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

######################################################################
# Note (250405)
# Two pipelines are defined above - a scaler and a one-hot encoder.
# A ColumnTransformer then merges the two pipelines into a single
# data preprocessor.
# Finally, another pipeline joins the preprocessor with the
# machine-learning model.
######################################################################

# Combine numerical and categorical preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features)
    ]
)

# Define model pipeline with the preprocessor as its first step.
# random_state is fixed so that re-running the notebook gives the same model.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

In [5]:
# Bare expression: the notebook renders the assembled pipeline as this cell's output.
pipe

In [None]:
# Hold out a test set so the reported error is measured on rows the model
# has not seen; scoring on the training rows would understate the error.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model on the training split only
pipe.fit(X_train, y_train)

# Make predictions on test data
y_pred = pipe.predict(X_test)

# Evaluate the model's performance using mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')