In [None]:
# Import libraries and define the helper that downloads and loads the dataset

from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it on first use.

    The CSV is read from ``datasets/housing/housing.csv``. When that file is
    missing, the tarball ``datasets/housing.tgz`` is downloaded (only if it is
    not already on disk) and extracted into ``datasets``.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    # Key the work on the CSV rather than on the tarball: a tarball left behind
    # by an interrupted run must still be extracted.
    if not csv_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)

        if not tarball_path.is_file():
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)

        with tarfile.open(tarball_path) as housing_tarball:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters reject members that would escape the
                # destination directory (absolute paths, "..", unsafe links).
                housing_tarball.extractall(path=datasets_dir, filter="data")
            else:
                # Older Python without extraction filters.
                housing_tarball.extractall(path=datasets_dir)

    return pd.read_csv(csv_path)
    

In [None]:
# Load the data
# load_housing_data() returns the dataset as a DataFrame; info() then prints
# a per-column summary of it.
df_housing = load_housing_data()
df_housing.info()   

In [None]:
# Count how many rows carry each distinct ocean_proximity value.
df_housing['ocean_proximity'].value_counts()

In [None]:
# util
import numpy as np
from zlib import crc32


# function for splitting data into train and test sets
def shuffle_and_split_data(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a train part and a test part.

    Args:
        data: object supporting ``len()`` and positional ``.iloc`` indexing
            (e.g. a pandas DataFrame).
        test_ratio: fraction of the rows placed in the test part; the test
            size is ``int(len(data) * test_ratio)``.
        random_state: optional seed for a reproducible shuffle. When ``None``
            (the default) the shuffle draws from NumPy's global random state,
            exactly as before this parameter existed.

    Returns:
        tuple: ``(train_part, test_part)`` selected from ``data`` by position.
    """
    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # A dedicated generator keeps the seeded shuffle independent of, and
        # without side effects on, the global random state.
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# another way to split the data: decide from a hash of each row's id
def is_id_in_test_set(id, test_ratio):
    """Tell whether an identifier falls in the test set.

    The identifier is converted to a 64-bit integer and checksummed with
    CRC32; it is in the test set when the 32-bit checksum is below
    ``test_ratio * 2**32``.
    """
    checksum = crc32(np.int64(id)) & 0xffffffff
    threshold = test_ratio * 2**32
    return checksum < threshold

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into (train, test) based on a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``is_id_in_test_set``; rows
    for which it returns True form the test part, all others the train part.
    """
    test_mask = data[id_column].apply(
        lambda row_id: is_id_in_test_set(row_id, test_ratio)
    )
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# Split the data into train and test sets (20% test) in two different ways
# and show the resulting sizes.

# 1) random shuffle
train_set, test_set = shuffle_and_split_data(df_housing, 0.2)
display(f"Train Data Shuffle: {len(train_set)}")
display(f"Test Data Shuffle: {len(test_set)}")

# 2) id hash, using the "index" column produced by reset_index() as the id
df_housing_with_id = df_housing.reset_index()
train_set, test_set = split_data_with_id_hash(
    df_housing_with_id, 0.2, "index"
)
display(f"Train Data Hash: {len(train_set)}")
display(f"Test Data Hash: {len(test_set)}")

In [None]:
# Create the income category attribute: bucket median_income into five
# bins labelled 1 to 5.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_labels = [1, 2, 3, 4, 5]
df_housing["income_cat"] = pd.cut(
    df_housing["median_income"],
    bins=income_bins,
    labels=income_labels,
)

import matplotlib.pyplot as plt

# Alternative view (bar chart of the counts per category):
# df_housing["income_cat"].value_counts().sort_index().plot.bar(rot=0 ,grid=True)
df_housing["income_cat"].hist()
plt.xlabel("Income Category")
plt.ylabel("Number of districts")
plt.show()

In [None]:
# Split data into train and test sets using stratified sampling on income_cat.
# Two equivalent routes are shown: StratifiedShuffleSplit and train_test_split
# with stratify=...; the names strat_train_set / strat_test_set end up bound
# to the train_test_split result.
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
strat_splits = []
for train_index, test_index in split.split(df_housing, df_housing["income_cat"]):
    # split() yields positional row indices, so select by position (.iloc);
    # label-based .loc is only correct while the index is the default 0..n-1.
    strat_train_set_n = df_housing.iloc[train_index]
    strat_test_set_n = df_housing.iloc[test_index]
    strat_splits.append((strat_train_set_n, strat_test_set_n))

strat_train_set, strat_test_set = strat_splits[0]
display("Train Data Stratified: "+str(len(strat_train_set)))
display("Test Data Stratified: "+str(len(strat_test_set)))

strat_train_set, strat_test_set = train_test_split(df_housing, 
                                                   test_size=0.2, 
                                                   random_state=42, 
                                                   stratify=df_housing["income_cat"])
display("Train Data Split: "+str(len(strat_train_set)))
display("Test Data Split: "+str(len(strat_test_set)))

In [None]:
# Remove the income_cat attribute from both stratified sets.
# drop() returns a new frame, and errors="ignore" makes the cell safe to
# re-run: a second execution no longer raises KeyError on the missing column.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
# Scatter plot of the districts by longitude and latitude.
# Explore on a copy so the stratified training set itself stays untouched.
housing = strat_train_set.copy()
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    grid=True,
)
plt.show()

In [None]:
# Same geographic scatter, with marker size taken from population / 100 and
# marker colour taken from median_house_value ("jet" colormap).
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=housing["population"]/100,
    label="Population",
    c="median_house_value",
    cmap='jet',
    colorbar=True,
    legend=True,
    sharex=False,
    figsize=(11,7),
)
plt.show()

In [None]:
# Looking for correlations with median_house_value.
# ocean_proximity is dropped first so that only the remaining columns reach
# corr(); the correlation must be computed on `h`, not on `housing`, because
# a text column makes corr() raise on pandas >= 2.0.
h = housing.drop(columns="ocean_proximity")
corr_matrix = h.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Visual ways to check correlation: a scatter matrix of selected attributes,
# then a closer look at median_income against median_house_value.
from pandas.plotting import scatter_matrix

attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing[attributes], figsize=(12,8))

housing.plot(
    kind="scatter", x="median_income", y="median_house_value",
    alpha=0.1, grid=True,
)


In [None]:
# Correlation of derived (ratio) attributes with median_house_value.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["people_per_household"] = housing["population"]/housing["households"]

# Restrict to numeric columns: housing still contains the text column
# ocean_proximity, which makes a plain corr() raise on pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Prepare the data for machine learning algorithms: separate the predictors
# from the labels (median_house_value).
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Data cleaning: three ways to deal with missing total_bedrooms values.
#   option 1: housing.dropna(subset=["total_bedrooms"])  -> drop the affected rows
#   option 2: housing.drop("total_bedrooms", axis=1)     -> drop the whole column
#   option 3: fill the gaps with the column median (the one applied here)
# Options 1 and 2 return new frames; they are listed for reference only.
median = housing["total_bedrooms"].median() # option 3
# Assign the result back instead of fillna(..., inplace=True) on the selected
# column: the chained in-place form does not update `housing` under pandas
# copy-on-write.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

# using SimpleImputer is better way to fill missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
# The imputer is fitted on the numeric columns only.
housing_num = housing.select_dtypes(include=[np.number])
imputer.fit(housing_num)
# Show the learned statistics next to the medians computed by pandas.
display(imputer.statistics_)
display(housing_num.median().values)


X = imputer.transform(housing_num)

In [None]:
# Wrap the transformed array X back into a pandas DataFrame, reusing the
# column names and row index of housing_num.
housing_tr = pd.DataFrame(
    data=X,
    index=housing_num.index,
    columns=housing_num.columns,
)

In [None]:
# Transform the categorical ocean_proximity attribute into numbers.
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

housing_cat = housing[["ocean_proximity"]]

# ordinal encoding: show the first ten encoded rows, then the categories
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
display(housing_cat_encoded[:10], ordinal_encoder.categories_)

# one hot encoding: the result is converted with toarray() for display
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
display(housing_cat_1hot.toarray())

# pandas alternative: get_dummies() on a small hand-built frame
sample_categories = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
df_test = pd.DataFrame({"ocean_proximity": sample_categories})
pd.get_dummies(df_test)