In [1]:
# Load housing.csv into a DataFrame

# housing.csv was copied from https://github.com/ageron/handson-ml/tree/master/datasets/housing

# This dataset appeared in a 1997 paper titled Sparse Spatial Autoregressions by Pace, R. Kelley and Ronald Barry, published in the Statistics and Probability Letters journal. They built it using the 1990 California census data. It contains one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people).

import pandas as pd
raw_df = pd.read_csv("housing.csv")

In [2]:
# Show the first 10 rows of the DataFrame

raw_df.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [3]:
# Create lists with the column/feature names of categorical and numeric columns

CATEGORICAL_FEATURES = ["ocean_proximity"]
NUMERIC_FEATURES = ["housing_median_age", "total_rooms", "total_bedrooms", "households", "median_income", "median_house_value"]

In [4]:
# Show counts of nulls in each column

raw_df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
# Show unique values for each categorical column

for feature in CATEGORICAL_FEATURES:
    print(feature, raw_df[feature].unique())

ocean_proximity ['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']


In [6]:
# Replace nulls in a feature-appropriate way.

import numpy as np
from sklearn.impute import SimpleImputer

non_null_df = pd.DataFrame(SimpleImputer(missing_values=np.nan, strategy="most_frequent").fit_transform(raw_df), columns=raw_df.columns)
assert non_null_df.isnull().sum().sum() == 0
non_null_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [7]:
# Standardize the numeric features

from sklearn.preprocessing import StandardScaler

standardized_df = non_null_df.copy()
standardized_df[NUMERIC_FEATURES] = StandardScaler().fit_transform(non_null_df[NUMERIC_FEATURES])
standardized_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,0.982143,-0.804819,-0.967245,322.0,-0.977033,2.344766,2.129631,NEAR BAY
1,-122.22,37.86,-0.607019,2.045890,1.358707,2401.0,1.669961,2.332238,1.314156,NEAR BAY
2,-122.24,37.85,1.856182,-0.535746,-0.822021,496.0,-0.843637,1.782699,1.258693,NEAR BAY
3,-122.25,37.85,1.856182,-0.624215,-0.714889,558.0,-0.733781,0.932968,1.165100,NEAR BAY
4,-122.25,37.85,1.856182,-0.462404,-0.607758,565.0,-0.629157,-0.012881,1.172900,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,-0.289187,-0.444985,-0.383971,845.0,-0.443449,-1.216128,-1.115804,INLAND
20636,-121.21,39.49,-0.845393,-0.888704,-0.917250,356.0,-1.008420,-0.691593,-1.124470,INLAND
20637,-121.22,39.43,-0.924851,-0.174995,-0.119712,1007.0,-0.174042,-1.142593,-0.992746,INLAND
20638,-121.32,39.43,-0.845393,-0.355600,-0.300646,741.0,-0.393753,-1.054583,-1.058608,INLAND


In [8]:
# One-hot encode the categorical features

import pandas as pd

encoded_df = pd.get_dummies(standardized_df, columns=CATEGORICAL_FEATURES)
encoded_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,0.982143,-0.804819,-0.967245,322.0,-0.977033,2.344766,2.129631,0,0,0,1,0
1,-122.22,37.86,-0.607019,2.045890,1.358707,2401.0,1.669961,2.332238,1.314156,0,0,0,1,0
2,-122.24,37.85,1.856182,-0.535746,-0.822021,496.0,-0.843637,1.782699,1.258693,0,0,0,1,0
3,-122.25,37.85,1.856182,-0.624215,-0.714889,558.0,-0.733781,0.932968,1.165100,0,0,0,1,0
4,-122.25,37.85,1.856182,-0.462404,-0.607758,565.0,-0.629157,-0.012881,1.172900,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,-0.289187,-0.444985,-0.383971,845.0,-0.443449,-1.216128,-1.115804,0,1,0,0,0
20636,-121.21,39.49,-0.845393,-0.888704,-0.917250,356.0,-1.008420,-0.691593,-1.124470,0,1,0,0,0
20637,-121.22,39.43,-0.924851,-0.174995,-0.119712,1007.0,-0.174042,-1.142593,-0.992746,0,1,0,0,0
20638,-121.32,39.43,-0.845393,-0.355600,-0.300646,741.0,-0.393753,-1.054583,-1.058608,0,1,0,0,0


In [9]:
# Check that we preserved latitude and longitude throughout

for feature in ("latitude", "longitude"):
    assert encoded_df[feature].tolist() == raw_df[feature].tolist()
for feature in NUMERIC_FEATURES:
    assert encoded_df[feature].tolist() == standardized_df[feature].tolist()