# In [None]:
# Housing Price Prediction Notebook

## Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

## Step 2: Load the Dataset
# Fetch the California housing bunch and wrap its feature matrix in a
# DataFrame; the regression target is stored next to the features in a
# 'target' column.
housing = fetch_california_housing()
housing_df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
housing_df['target'] = housing.target

## Step 3: Explore the Dataset
# Bare expressions: their values are not printed when run as a plain script.
housing_df.head()
housing_df.describe()

## Step 4: Data Preprocessing
# Nothing to do here: this walkthrough assumes the data has no missing values
# and no categorical columns that would need encoding.

## Step 5: Split the Data
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    housing_df.drop(columns='target'),
    housing_df['target'],
    test_size=0.2,
    random_state=42,
)

## Step 6: Train a Simple Model
# Baseline: ordinary least squares on the raw features.
model = LinearRegression().fit(X_train, y_train)

## Step 7: Evaluate the Model
# Score the baseline on the held-out rows.
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

## Step 8: Create a Custom Transformer
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Append household-level ratio features to the housing feature matrix.

    Column positions follow the feature order documented for
    ``sklearn.datasets.fetch_california_housing``::

        MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup,
        Latitude, Longitude

    In that layout the room, bedroom and occupancy columns are already
    per-household averages, so they must not be divided by another column
    again. Dividing columns 3..6 by each other (a layout with raw totals
    for rooms, bedrooms, population and households) would mix bedrooms,
    population and latitude into meaningless ratios.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third ``bedrooms_per_room`` column is appended.
    """

    # Positions of the columns used below, in fetch_california_housing order.
    _AVE_ROOMS_IX = 2
    _AVE_BEDRMS_IX = 3
    _AVE_OCCUP_IX = 5

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like with at least six columns in the order described in
            the class docstring. It is converted with ``np.asarray``.
        y : ignored; accepted so the method can be called like other
            transformers.

        Returns
        -------
        numpy.ndarray
            The input columns followed by ``rooms_per_household``,
            ``population_per_household`` and, when
            ``add_bedrooms_per_room`` is true, ``bedrooms_per_room``.
        """
        X = np.asarray(X)
        ave_rooms = X[:, self._AVE_ROOMS_IX]

        # Already per-household averages in this layout. They are appended
        # as-is so the output keeps the same width and column order as
        # before; note they are copies of existing columns (collinear).
        rooms_per_household = ave_rooms
        population_per_household = X[:, self._AVE_OCCUP_IX]

        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]

        # Both averages share the same per-household denominator, so their
        # quotient is bedrooms per room. No guard against a zero AveRooms.
        bedrooms_per_room = X[:, self._AVE_BEDRMS_IX] / ave_rooms
        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

## Step 9: Use a Scikit-learn Pipeline
# Chain feature engineering, standardization and the regressor so the same
# steps are applied to the training and the test data, then fit on the
# training split.
pipeline = Pipeline(steps=[
    ('custom_transformer', CustomTransformer()),
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
]).fit(X_train, y_train)

## Step 10: Evaluate the Pipeline
# Score the fitted pipeline on the held-out rows.
pipeline_predictions = pipeline.predict(X_test)
pipeline_mse = mean_squared_error(y_true=y_test, y_pred=pipeline_predictions)
print(f'Pipeline Mean Squared Error: {pipeline_mse}')
