In [1]:
# Execute the companion notebook in this kernel so that the names it defines
# are available here (presumably including load_housing_data, which is called
# in the next cell but not defined in this notebook -- verify).
%run 1.fetch_data_from_github.ipynb

In [3]:
# Load the housing data. load_housing_data is not defined in this notebook; it
# is assumed to come from the notebook executed with %run above.
df = load_housing_data()

Creating a test set is theoretically quite simple: just pick some instances randomly,
typically 20% of the dataset (or less if your dataset is very large), and set them aside.

In [18]:
import numpy as np

def split_train_test(data, test_ratio, seed=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. Rows are selected by position with ``.iloc``, so the
        original index labels are preserved in both outputs.
    test_ratio : float
        Fraction of rows to put in the test set, between 0 and 1 inclusive.
        The test-set size is ``int(len(data) * test_ratio)``, i.e. it is
        rounded toward zero.
    seed : int, optional
        When given, the shuffle is drawn from a dedicated
        ``np.random.default_rng(seed)`` generator, so repeated calls with the
        same seed return the same split and the global NumPy random state is
        left untouched. When ``None`` (the default), the global
        ``np.random.permutation`` is used, exactly as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``; together they contain every row of
        ``data`` exactly once.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]`` (this includes NaN).
    """
    # A negative ratio would turn the slices below into "all but the last k"
    # and silently produce a nonsensical split, so reject it up front.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            f"test_ratio must be between 0 and 1 inclusive, got {test_ratio!r}"
        )

    n_rows = len(data)
    if seed is None:
        # Legacy path: honours any np.random.seed(...) set by the caller.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(seed).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    # The first chunk of the shuffled positions is the test set, the rest is
    # the training set; the two slices are disjoint and cover every position.
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [19]:
# Hold out 20% of the rows as the test set. No random seed is set before this
# call, so re-running the cell produces a different split each time.
train_set, test_set = split_train_test(df, 0.2)

In [20]:
# Number of rows kept for training (the rows not selected for the test set).
len(train_set)

16512

In [21]:
# Inspect the training split; the row labels appear in shuffled order because
# the rows were picked through a random permutation of positions.
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
6306,-117.89,33.99,22.0,3272.0,618.0,1784.0,591.0,4.0324,211300.0,<1H OCEAN
14863,-117.08,32.65,28.0,2296.0,603.0,1277.0,550.0,2.3562,123800.0,NEAR OCEAN
14267,-117.11,32.69,37.0,2395.0,627.0,2489.0,599.0,1.5933,86300.0,NEAR OCEAN
6194,-117.90,34.08,32.0,2068.0,356.0,976.0,370.0,5.2120,201200.0,<1H OCEAN
20028,-119.01,36.08,31.0,1620.0,366.0,1154.0,348.0,1.8857,55500.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
9521,-123.20,39.13,26.0,1474.0,417.0,1065.0,401.0,1.3750,84400.0,<1H OCEAN
12703,-121.41,38.59,17.0,12355.0,3630.0,5692.0,3073.0,2.5245,99100.0,INLAND
3085,-118.50,35.70,18.0,3303.0,814.0,986.0,522.0,1.5957,101400.0,INLAND
7479,-118.22,33.93,39.0,1921.0,483.0,2286.0,470.0,3.0167,130000.0,<1H OCEAN


In [22]:
# Number of rows held out for testing, i.e. int(len(df) * 0.2).
len(test_set)

4128

In [23]:
# Inspect the held-out test split produced by the random permutation.
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
4224,-118.29,34.11,30.0,2774.0,570.0,1076.0,580.0,5.2960,500001.0,<1H OCEAN
19118,-122.65,38.23,52.0,1923.0,393.0,910.0,345.0,3.4500,200600.0,<1H OCEAN
7545,-118.22,33.90,30.0,1007.0,260.0,1112.0,238.0,1.7262,115600.0,<1H OCEAN
7702,-118.13,33.96,36.0,1933.0,341.0,958.0,335.0,4.4732,266000.0,<1H OCEAN
15300,-117.36,33.18,26.0,5550.0,1153.0,2372.0,1058.0,2.5509,181800.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
508,-122.30,37.84,14.0,7355.0,2408.0,3100.0,2051.0,4.0018,143800.0,NEAR BAY
10436,-117.61,33.43,33.0,1150.0,383.0,604.0,317.0,2.3545,187500.0,NEAR OCEAN
103,-122.25,37.81,52.0,2155.0,701.0,895.0,613.0,2.5795,350000.0,NEAR BAY
11270,-117.99,33.78,15.0,4273.0,993.0,2300.0,946.0,3.5313,213000.0,<1H OCEAN


This works, but it is not perfect: if you run the program again, it will generate a
different test set! Over time, you (or your Machine Learning algorithms) will get to
see the whole dataset, which is what you want to avoid.

One solution is to save the test set on the first run and then load it in subsequent
runs. Another option is to set the random number generator’s seed (e.g.,
`np.random.seed(42)`) before calling `np.random.permutation()`, so that it always
generates the same shuffled indices.

But both these solutions will break next time you fetch an updated dataset. A common
solution is to use each instance’s identifier to decide whether or not it should go
in the test set.

For example, you could compute a hash of each instance’s identifier and put that
instance in the test set if the hash is lower than or equal to 20% of the maximum
hash value.

In [12]:
from zlib import crc32
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32
def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test sets based on a hash of each row's ID.

    Each value in ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns a truthy value form the test set and all other rows
    form the training set. Because the decision depends only on the
    identifier and ``test_ratio``, a row with a given ID lands in the same
    set on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    test_ratio : float
        Ratio forwarded to ``test_set_check``.
    id_column : str
        Name of the column holding the identifiers.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``, each in the original row order.
    """
    def _goes_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(_goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

Unfortunately, the housing dataset does not have an identifier column. The simplest
solution is to use the row index as the ID:

In [15]:
# reset_index() moves the current row index into a new "index" column, which
# is then used as the identifier hashed by split_train_test_by_id.
# NOTE(review): a row-position ID is only stable if rows are never removed or
# reordered when the dataset is refreshed -- confirm this holds for the source.
# This also rebinds train_set/test_set, replacing the random split made by
# split_train_test.
df_with_id = df.reset_index() # adds an `index` column
train_set, test_set = split_train_test_by_id(df_with_id, 0.2, "index")

In [16]:
# Inspect the ID-based training split; rows are selected with a boolean mask,
# so they stay in their original order.
train_set

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
6,6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...,...
20635,20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [17]:
# Inspect the ID-based test split (rows whose hashed "index" value fell below
# the 20% threshold).
test_set

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
5,5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
12,12,-122.26,37.85,52.0,2491.0,474.0,1098.0,468.0,3.0750,213500.0,NEAR BAY
16,16,-122.27,37.85,52.0,1966.0,347.0,793.0,331.0,2.7750,152500.0,NEAR BAY
23,23,-122.27,37.84,52.0,1688.0,337.0,853.0,325.0,2.1806,99700.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...,...
20615,20615,-121.54,39.08,23.0,1076.0,216.0,724.0,197.0,2.3598,57500.0,INLAND
20617,20617,-121.53,39.06,20.0,561.0,109.0,308.0,114.0,3.3021,70800.0,INLAND
20622,20622,-121.44,39.00,20.0,755.0,147.0,457.0,157.0,2.4167,67000.0,INLAND
20626,20626,-121.43,39.18,36.0,1124.0,184.0,504.0,171.0,2.1667,93800.0,INLAND
