In [15]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hashlib
import seaborn as sns
%matplotlib inline
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives the downloaded ``housing.tgz`` and the
        extracted archive members. It is created when missing.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)

In [16]:
# Path to the housing CSV. The relative default keeps the notebook portable
# across machines and points into the directory fetch_housing_data() extracts
# to by default; set the HOUSING_CSV_PATH environment variable to use a copy
# stored somewhere else.
DATASET_PATH = os.environ.get(
    "HOUSING_CSV_PATH",
    os.path.join("datasets", "housing", "housing.csv"),
)

In [17]:

def load_lboss_data(lboss_path=DATASET_PATH):
    """Read the CSV file at ``lboss_path`` and return it as a DataFrame."""
    frame = pd.read_csv(lboss_path)
    return frame

In [18]:
# Load the housing CSV from the default DATASET_PATH and preview the first row.
lboss = load_lboss_data()
lboss.head(1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY


In [19]:
# Summarise column dtypes, non-null counts and memory usage.
lboss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [20]:
# NOTE(review): this repeats the previous cell's lboss.info() call unchanged.
lboss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [21]:
# Count how many rows fall into each ocean_proximity category.
lboss.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [43]:
#lboss.hist(bins=50,figsize=(20,15))
#plt.show()

In [23]:
def show_hash(hash_num, last_index=False):
    """Print the bytes of a hash object's digest as integers, one per line.

    Parameters
    ----------
    hash_num : hashlib hash object
        Object exposing ``digest()`` and ``digest_size``.
    last_index : bool, optional
        When true, print only the final byte of the digest; otherwise
        print every byte in order.
    """
    if not hash_num.digest_size:
        # Nothing to print for a zero-length digest.
        return
    # Compute the digest once instead of once per printed byte.
    digest = hash_num.digest()
    if last_index:
        print(digest[-1])
        return
    for byte in digest:
        print(byte)

In [24]:
def split_train_test(data, test_ratio, seed=None):
    """Randomly split ``data`` into a training set and a test set.

    Note that a later cell defines another function with the same name,
    which replaces this one once that cell has run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are shuffled and split by position.
    test_ratio : float
        Fraction of rows (between 0 and 1) assigned to the test set; the
        test size is ``int(len(data) * test_ratio)``.
    seed : int, optional
        Seed for a private random generator so the split is repeatable.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    tuple
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` lies outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        # A negative ratio would silently slice from the wrong end.
        raise ValueError("test_ratio must be between 0 and 1")
    if seed is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        shuffled_indices = np.random.RandomState(seed).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [25]:
# Hold out 20% of the rows as a test set using the random split.
train_set, test_set = split_train_test(lboss,0.2)

In [26]:
# Report the number of rows in each split.
print("train set:{} test set:{}".format(len(train_set),len(test_set)))

train set:16512 test set:4128


In [27]:
def test_set_check(identifier, test_ratio, hash_func):
    return hash_func(np.int64(identifier)).digest()[-1] < 256*test_ratio
def split_train_test(data,test_ratio,identifier_column_name,hash_func=hashlib.md5):
    ids = data[identifier_column_name]
    in_test_set = ids.apply(lambda id_:test_set_check(id_,test_ratio, hash_func))
    return data.loc[in_test_set],data.loc[~in_test_set]

In [28]:
# reset_index() exposes the row index as an "index" column, which serves as
# the identifier for the hash-based split.
lboss_with_id = lboss.reset_index()
# NOTE(review): only the first five rows are split here; presumably a small
# experiment to inspect the hashing. Confirm before relying on these sets.
train_set,test_set = split_train_test(lboss_with_id[:5],0.2,'index')

In [29]:
# Report the number of rows in each split.
print("train set:{} test set:{}".format(len(train_set),len(test_set)))

train set:1 test set:4


In [30]:
# Last MD5 digest byte for identifier 1.
show_hash(hashlib.md5(np.int64(1)),True)

203


In [31]:
# Last MD5 digest byte for identifier 2.
show_hash(hashlib.md5(np.int64(2)),True)

100


In [32]:
# Last MD5 digest byte for identifier 3.
show_hash(hashlib.md5(np.int64(3)),True)

244


In [33]:
# Last MD5 digest byte for identifier 4.
show_hash(hashlib.md5(np.int64(4)),True)

18


In [34]:
# Last MD5 digest byte for identifier 0.
show_hash(hashlib.md5(np.int64(0)),True)

116


In [35]:
# Print the last MD5 digest byte for identifiers 0 through 9.
for i in range(10):
    show_hash(hashlib.md5(np.int64(i)),True)

116
203
100
244
18
16
237
183
215
182


In [36]:
# Smallest median_income value in the data set.
lboss.median_income.min()

0.4999

In [37]:
# Largest median_income value in the data set.
lboss.median_income.max()

15.0001

In [38]:
from sklearn.model_selection import train_test_split
# Random split with 20% of the rows held out; random_state=42 fixes the
# shuffle so the same split is produced on every run.
tr_set,ts_set = train_test_split(lboss,test_size=0.2,random_state=42)

In [39]:
# Number of rows in the training split.
len(tr_set)

16512

In [40]:
# Work on a deep copy so the raw frame is left untouched.
lboss_new = lboss.copy(deep=True)
# Bucket median_income into categories 1.5 units wide, rounding up.
lboss_new["income_cat"] = np.ceil(lboss_new["median_income"] / 1.5)
# Only the last expression of a cell is displayed, so the former mid-cell
# head() call (whose result was discarded) has been removed.
lboss_new.income_cat.value_counts()

3.0     7236
2.0     6581
4.0     3639
5.0     1423
1.0      822
6.0      532
7.0      189
8.0      105
9.0       50
11.0      49
10.0      14
Name: income_cat, dtype: int64

In [53]:
# Merge every category at or above 5 into category 5. The result is assigned
# back to the column explicitly: calling where(..., inplace=True) on a column
# selection is a chained in-place update, which does not write through to the
# frame under pandas Copy-on-Write.
lboss_new["income_cat"] = lboss_new["income_cat"].where(
    lboss_new["income_cat"] < 5.0, 5.0
)
lboss_new.income_cat.value_counts()/len(lboss_new)

3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64

In [42]:
#plt.bar(sorted(list(lboss_new.income_cat.value_counts().index)),lboss_new.income_cat.value_counts().values)

In [59]:
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split on income_cat; random_state makes it repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(lboss_new,lboss_new["income_cat"]):
    print("Train_index:",train_index)
    print("Test_index:",test_index)
    print("type:",type(train_index))
    # split() yields positional row numbers, so select with iloc; loc only
    # gives the same rows while the frame keeps its default 0..n-1 index.
    strat_train_set = lboss_new.iloc[train_index]
    strat_test_set = lboss_new.iloc[test_index]

Train_index: [17606 18632 14650 ... 13908 11159 15775]
Test_index: [ 5241 10970 20351 ...  4019 12107  2398]
type: <class 'numpy.ndarray'>


In [61]:
# Share of each income category in the stratified training set.
strat_train_set.income_cat.value_counts()/len(strat_train_set)

3.0    0.350594
2.0    0.318859
4.0    0.176296
5.0    0.114402
1.0    0.039850
Name: income_cat, dtype: float64

In [62]:
# Share of each income category in the stratified test set.
strat_test_set.income_cat.value_counts()/len(strat_test_set)

3.0    0.350533
2.0    0.318798
4.0    0.176357
5.0    0.114583
1.0    0.039729
Name: income_cat, dtype: float64

In [78]:
# Remove the helper income_cat column from both splits.
# drop() returns new frames rather than mutating the selections in place, and
# errors="ignore" keeps the cell safe to re-run after the column is gone.
strat_train_set = strat_train_set.drop(["income_cat"], axis=1, errors="ignore")
strat_test_set = strat_test_set.drop(["income_cat"], axis=1, errors="ignore")

In [79]:
# Check the remaining columns and non-null counts of the training set.
strat_train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16354 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
median_house_value    16512 non-null float64
ocean_proximity       16512 non-null object
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


In [80]:
# Explore a copy so later changes cannot modify strat_train_set.
lboss_w = strat_train_set.copy()

In [84]:
#lboss_w.plot(kind="scatter",x="longitude",y="latitude",alpha=0.1);

In [105]:
# Correlate numeric columns only: ocean_proximity holds strings, and
# DataFrame.corr() in pandas 2.x and later raises on non-numeric columns
# instead of skipping them.
corr_matrix = lboss_w.select_dtypes(include=[np.number]).corr()
# Rank every feature by its linear correlation with the target.
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64

In [104]:
# The same correlations in the frame's original column order (unsorted).
corr_matrix["median_house_value"]

longitude            -0.047432
latitude             -0.142724
housing_median_age    0.114110
total_rooms           0.135097
total_bedrooms        0.047689
population           -0.026920
households            0.064506
median_income         0.687160
median_house_value    1.000000
Name: median_house_value, dtype: float64