In [81]:
import os
import tarfile
import urllib.request  # ✅ Add `.request` to use `urlretrieve`

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Function to fetch the data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

# Function to load the data
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)  # ✅ Fixed typo

# Fetch and load data
fetch_housing_data()  # This will download and extract the dataset
housing = load_housing_data()  # This will load the CSV into a DataFrame
print(housing.head())  # Show first few rows


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


  housing_tgz.extractall(path=housing_path)


In [82]:
%matplotlib inline 
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
 #plt.show()


array([[<Axes: title={'center': 'longitude'}>,
        <Axes: title={'center': 'latitude'}>,
        <Axes: title={'center': 'housing_median_age'}>],
       [<Axes: title={'center': 'total_rooms'}>,
        <Axes: title={'center': 'total_bedrooms'}>,
        <Axes: title={'center': 'population'}>],
       [<Axes: title={'center': 'households'}>,
        <Axes: title={'center': 'median_income'}>,
        <Axes: title={'center': 'median_house_value'}>]], dtype=object)

In [83]:
print(housing.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None


In [84]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

# Step 1: Create income category column
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

# Step 2: Perform stratified split
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index].copy()
    strat_test_set = housing.loc[test_index].copy()

# Step 3: Drop the income_cat column from the final sets
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)




In [85]:
# ✅ STEP 0: Make a copy of the training data
housing = strat_train_set.copy()
housing_labels = housing["median_house_value"].copy()

# ✅ STEP 1: Handling missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing_num = housing.drop(["ocean_proximity", "median_house_value"], axis=1)  # only numerical features
imputer.fit(housing_num)

# Apply imputer (not needed anymore, but okay if you want to inspect)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

# ✅ STEP 2: Handling categorical attribute (ocean_proximity)
from sklearn.preprocessing import OneHotEncoder

housing_cat = housing[["ocean_proximity"]]
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

# ✅ STEP 3: Create custom transformer (feature engineering)
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Column indices (based on housing_num DataFrame)
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # default=True
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self  # nothing to fit

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

# ✅ STEP 4: Create a numerical pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# ✅ STEP 5: Combine numerical and categorical pipelines
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num.columns)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# ✅ ✅ ✅ THE FIX: Drop the target column BEFORE passing to pipeline
housing_prepared = full_pipeline.fit_transform(housing.drop("median_house_value", axis=1))

# ✅ Optional: Print shape of final processed data
print("Shape after preprocessing:", housing_prepared.shape)


Shape after preprocessing: (16512, 16)


In [86]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)


In [87]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

print("Linear Regression RMSE (Training Set):", lin_rmse)


Linear Regression RMSE (Training Set): 68627.87390018745


In [88]:
# Prepare test set
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)

# Predict and evaluate
final_predictions = lin_reg.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print("Linear Regression RMSE (Test Set):", final_rmse)


Linear Regression RMSE (Test Set): 66913.4419132093


In [89]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

# Evaluate on training set
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)

print("Random Forest RMSE (Training Set):", forest_rmse)


Random Forest RMSE (Training Set): 18650.698705770003


In [90]:
final_predictions = forest_reg.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print("Random Forest RMSE (Test Set):", final_rmse)


Random Forest RMSE (Test Set): 48373.62545757139


In [91]:
import joblib

joblib.dump(forest_reg, "housing_forest_model.pkl")
joblib.dump(full_pipeline, "housing_preprocessing_pipeline.pkl")


['housing_preprocessing_pipeline.pkl']

In [92]:
# Load later
model = joblib.load("housing_forest_model.pkl")
pipeline = joblib.load("housing_preprocessing_pipeline.pkl")

# Predict new data
some_data = X_test.iloc[:5]
some_data_prepared = pipeline.transform(some_data)
predictions = model.predict(some_data_prepared)

print("Predictions:", predictions)
print("Actual:", list(y_test.iloc[:5]))


Predictions: [487818.71 225878.04 213079.   168040.   268311.01]
Actual: [500001.0, 162500.0, 204600.0, 159700.0, 184000.0]


In [93]:
from sklearn.model_selection import cross_val_score

lin_scores = cross_val_score(
    lin_reg, housing_prepared, housing_labels,
    scoring="neg_mean_squared_error", cv=10
)
lin_rmse_scores = np.sqrt(-lin_scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

print("Linear Regression Cross-Validation Scores:")
display_scores(lin_rmse_scores)


Linear Regression Cross-Validation Scores:
Scores: [71762.76364394 64114.99166359 67771.17124356 68635.19072082
 66846.14089488 72528.03725385 73997.08050233 68802.33629334
 66443.28836884 70139.79923956]
Mean: 69104.07998247063
Standard deviation: 2880.328209818068


In [94]:
forest_scores = cross_val_score(
    forest_reg, housing_prepared, housing_labels,
    scoring="neg_mean_squared_error", cv=10
)
forest_rmse_scores = np.sqrt(-forest_scores)

print("Random Forest Cross-Validation Scores:")
display_scores(forest_rmse_scores)


Random Forest Cross-Validation Scores:
Scores: [51559.63379638 48737.57100062 47210.51269766 51875.21247297
 47577.50470123 51863.27467888 52746.34645573 50065.1762751
 48664.66818196 54055.90894609]
Mean: 50435.58092066179
Standard deviation: 2203.3381412764606


In [95]:
import joblib

joblib.dump(forest_reg, "model.pkl")
joblib.dump(full_pipeline, "pipeline.pkl")


['pipeline.pkl']