In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 1. Load data
# Make sure you've downloaded the dataset CSV from Kaggle's dataset page
data = pd.read_csv('housing.csv')  # Adjust filename if different

# 2. Identify features
numeric_features = ['housing_median_age', 'total_rooms', 'total_bedrooms',
                    'population', 'households', 'median_income']
categorical_features = ['ocean_proximity']

# 3. Preprocessing pipelines
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features)
])
preprocessor.set_output(transform="pandas")

# 4. Apply preprocessing
X = preprocessor.fit_transform(data)
y = data['median_house_value']

# 5. Feature engineering: median house value per household
X['value_per_household'] = y / data['households']

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 7. Check processed data
print(X_train.head())
print(y_train.head())

       num__housing_median_age  num__total_rooms  num__total_bedrooms  \
14196                 0.346478          0.224718             0.214986   
8267                  1.617807          0.342065             0.596500   
17445                -1.957806         -0.338639            -0.490815   
14265                 0.584852         -0.556832            -0.404974   
2271                  1.141059         -0.116322            -0.252369   

       num__population  num__households  num__median_income  \
14196         0.772251         0.322924           -0.321654   
8267         -0.098440         0.670799           -0.030620   
17445        -0.450778        -0.427755            0.150349   
14265        -0.006602        -0.378059           -1.014947   
2271         -0.486983        -0.312669           -0.166583   

       cat__ocean_proximity_<1H OCEAN  cat__ocean_proximity_INLAND  \
14196                             0.0                          0.0   
8267                              0.0     

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# 8. Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# 9. Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse}")

TypeError: got an unexpected keyword argument 'squared'