In [None]:
# Step 1: Import the data
import os
import pandas as pd
import data_utility as du

# Paths and names for the resources that need to be downloaded and extracted.
SRC_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
# Raw string so the backslash stays literal: "\m" in a normal string is an invalid
# escape sequence that current Python versions warn about and later ones will reject.
DEST_ROOT = r"C:\machine-learning-dataset"
TAR_FILE_NAME = "housing.tgz"
TAR_FILE_PATH = os.path.join(DEST_ROOT, TAR_FILE_NAME)
EXTRACTED_DEST_PATH = os.path.join(DEST_ROOT, "housing")
CSV_FILE = os.path.join(EXTRACTED_DEST_PATH, "housing.csv")

In [None]:
# Fetch the archive from SRC_URL using the project helper.
# NOTE(review): presumably creates DEST_ROOT and saves the download at TAR_FILE_PATH — confirm in data_utility.
du.get_data_from_url(SRC_URL, DEST_ROOT, TAR_FILE_PATH)

In [None]:
# Unpack the downloaded archive.
# NOTE(review): presumably extracts into EXTRACTED_DEST_PATH — confirm in data_utility.
du.extract_tarfile(TAR_FILE_PATH, EXTRACTED_DEST_PATH)

In [None]:
# Load the extracted CSV into a DataFrame.
housing = pd.read_csv(CSV_FILE)

# Example: look at the first 5 rows.
housing.head()

In [None]:
# Example: quick description of the data (columns, dtypes, non-null counts).
housing.info()

In [None]:
# Example: count how many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics of the numerical attributes.
housing.describe()

In [None]:
# Step 2: Review the data. Check the histograms for capped values and discuss them with the teams that will use your ML output.
# If values are capped, either collect proper labels for the instances whose labels were capped, or remove those instances.

# Plot a histogram for each numerical attribute.
# %matplotlib inline renders the plots inside the Jupyter notebook.
%matplotlib inline

# Create the histograms. DataFrame.hist() relies on matplotlib being available.
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
# plt.show() is optional in Jupyter: the figure is rendered anyway, but without it
# the cell also echoes the array of Axes returned by hist().
plt.show()

In [None]:
# Step 3: Split the data into a train set and a test set. Do not look at the test set: if your brain spots
# patterns in the full data set you may end up overfitting. The recommendation is option c), the stratified split.

# Option a) generate the train/test sets with a single random run.
train_set, test_set = du.split_train_test_single_run(housing, 0.2)
len(train_set)

In [None]:
len(test_set)

In [None]:
# Option b-1) use the re-runnable method that hashes an index column.
# Generate an index column first.
housing_with_index = housing.reset_index()
housing_with_index.head()

# The trick: new data must always be appended at the end for this hash-based method to keep working.
train_set, test_set = du.split_train_test_multi_run(housing_with_index, 0.2, "index")
len(train_set)

In [None]:
len(test_set)

In [None]:
# Option b-2) alternatively, use the scikit-learn helper.
from sklearn.model_selection import train_test_split

# This does the same job as split_train_test_single_run.
# Passing random_state sets the random seed, so the split is reproducible.
# You can also pass several data sets at once, e.g. when the labels live in a separate DataFrame.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Option c) We want stratified sampling rather than purely random sampling.

# The population is divided into homogeneous subgroups called strata, and the right number of instances is
# sampled from each stratum to guarantee that the test set is representative of the entire dataset.
# We stratify on median_income because it is widely considered an important attribute for predicting median house prices.
import numpy
housing["median_income"].hist()
# Bucket median_income into 5 categories; the last bin is open-ended (6 and above).
housing["income_category"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., numpy.inf], labels=[1, 2, 3, 4, 5])

In [None]:
# Check that the new income_category column is present.
housing.head()

In [None]:
# Distribution of instances over the income categories.
housing["income_category"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# n_splits = Number of re-shuffling & splitting iterations, 
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# only 1 iteration instance
for train_index, test_index in split.split(housing, housing["income_category"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index] 


In [None]:
# confirming the sample is as expected
strat_test_set["income_category"].value_counts() / len(strat_test_set)

In [None]:
# The income_category column was only needed to drive the stratified split; remove it again.
# Reassign rather than dropping in place: in-place edits on frames sliced out of `housing`
# can trigger SettingWithCopyWarning, and errors="ignore" keeps this cell safe to re-run.
strat_train_set = strat_train_set.drop(columns="income_category", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_category", errors="ignore")

In [None]:
# Step 4. Discovery and Visualize data to gain insights
# create a copy, so we don't affect the training set when we play with data visualization
housing = strat_train_set.copy()

# 4.1 - Visualize the data

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
# the above plot looks like california but, but density isn't show well. 
# adjust by lowing the alpha to get a heatmap feel
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# the brain is very good at splotting patterns in pictures, we just have to play with different visualization parameters to make patterns stand out
# s = radius of the circle => population
# c = color => price
# cmp = predefined colour map => jet (ranges from blue = low values to red = high values)
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, s=housing["population"]/100, label="population", figsize=(10,7), c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()

In [None]:
# 4.2 - Look for correlations
# for none large datasets, easily compute the standard correlation coefficient (Pearson's r) between every pair of attributes using the correlatioon method:
corr_matrix = housing.corr()

In [None]:
# example, check correlation of median_house_value with all values
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Another way to check for correlations is to visualize them with pandas' scatter_matrix() function.
from pandas.plotting import scatter_matrix
# Only plot a handful of promising attributes against each other.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# The most promising attribute is median income; zoom in on it as an example.
# The horizontal ceiling in the plot is due to the $500,000 cap in the dataset.
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

In [None]:
# 4.3 - Experiment with attribute combinations
# We identified a few data quirks that should be cleaned up before feeding the data to an ML algorithm.
# Some attributes also have a tail-heavy distribution; we may want to transform them by computing their
# logarithm. This is done in step 7.

# Before preparing the data for ML, try out various attribute combinations. E.g. the total number of rooms
# in a district is not very useful if we don't know how many households there are.
# The total number of bedrooms by itself is not very useful either; we want to compare it to the number of rooms.
# Population per household also seems like an interesting attribute combination to look at.

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

# Look at the correlation matrix again.
# Only numeric columns are used: ocean_proximity holds text, and pandas >= 2.0 raises a ValueError
# when corr() is called on a frame that still contains non-numeric columns.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

# Notice that the computed bedrooms_per_room has a stronger correlation than total_rooms.
# Houses with a lower bedroom/room ratio tend to be more expensive.

# The number of rooms per household is also more informative than the total number of rooms in a district:
# larger houses => more expensive.

# This round of exploration does not have to be absolutely thorough. It is just to gain quick insights that
# help build a good prototype. This is an iterative process.

# QUESTION TODO: why does this matter when the new attribute is derived from two existing attributes that the
# model already uses? Are the columns in the dataset treated as independent during ML?

In [None]:
# Step 5: Prepare the data for ML algorithms
# Take this opportunity to build a library of transformation functions for reuse.

# Start again from a clean training set: take the last strat_train_set snapshot, separate the
# predictors from the label column, and keep the labels in their own Series.
# drop() is not in place here, so strat_train_set itself is left untouched.
housing = strat_train_set.drop(columns="median_house_value")
housing_labels = strat_train_set["median_house_value"].copy()

# 5.1 - Data cleaning
# Most ML algorithms cannot work with missing features, so create a few functions to take care of them.

# We saw that the total_bedrooms attribute has some missing values. The options are:
#   option 1 - get rid of the corresponding districts
#   option 2 - get rid of the whole attribute
#   option 3 - set the missing values to some value (zero, the mean, the median, etc.)

# housing.dropna(subset=["total_bedrooms"])    # option 1

# housing.drop("total_bedrooms", axis=1)       # option 2

# median = housing["total_bedrooms"].median()  # option 3
# housing["total_bedrooms"].fillna(median, inplace=True)
# If option 3 is taken, a median value is computed. Make sure to save it so it can also be used to
# replace missing values in the test set when we evaluate the system.
# Scikit-Learn's imputer can do option 3 for us instead.

In [None]:
# Option 3 using Sci-Kit
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# drop ocean_priximity first because median can only be computed on numerical attributes.
housing_num = housing.drop("ocean_proximity", axis=1)

# computer the median of each atribute, and store in statistics_ instance
imputer.fit(housing_num)
# take a look at statistics_ and see how it basically stored all the median values
imputer.statistics_


In [None]:
housing_num.median().values


In [None]:
X = imputer.transform(housing_num)
# X is a NumPy array. put it back into a pandas DataFrame
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:

# 5.2 - Handling text and categorical attributes
# Move on to the text attributes.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)


In [None]:
# Most ML algorithms prefer to work with numbers, so convert these categories from text to numbers.
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
# The list of categories is available through the categories_ instance variable: a list containing
# a 1D array of categories for each categorical attribute.
# (one categorical attribute => a single array)
ordinal_encoder.categories_

In [None]:
# A problem with this representation is that ML algorithms will assume two nearby values are more
# similar than two distant values.
# That may be fine for ordered categories such as 'bad', 'average', 'good', 'excellent',
# but not for ocean proximity: e.g. categories 0 and 4 are more similar than categories 0 and 1.
# To fix this, create one binary attribute per category:
# one attribute equal to 1 when the category is <1H OCEAN, 0 otherwise,
# one attribute equal to 1 when the category is INLAND, 0 otherwise, and so on.
# This is called one-hot encoding: one attribute will be 1 (hot) while the others are 0 (cold).
# The new attributes are sometimes called dummy attributes. Scikit-Learn provides the OneHotEncoder
# class to convert categorical values into one-hot vectors.

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
# The output is a SciPy sparse matrix instead of a NumPy array. That is very useful when there are
# categorical attributes with thousands of categories.
# (A sparse matrix is more space efficient because it only stores the location of the nonzero
# elements.) A dense NumPy array works too, but it is more expensive.

# E.g. if you really want a NumPy array, call toarray().
housing_cat_1hot.toarray()

In [None]:
# E.g. the categories can be inspected through the categories_ property.
cat_encoder.categories_

In [None]:
# If a categorical attribute has a large number of possible categories, one-hot encoding results in a
# large number of inputs, which can slow down performance.
# In that case, replace the categorical input with a useful numerical feature, e.g. replace
# ocean_proximity with the distance to the ocean.

# 5.3 - Custom Transformers
# See transform.py for the example transformer.
# The class has to be imported before use; without this the cell fails with a NameError on a fresh kernel.
# NOTE(review): assumes transform.py sits next to this notebook (like data_utility.py) and
# defines CombinedAttributesAdder — confirm the module name.
from transform import CombinedAttributesAdder

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# This example has one hyperparameter, add_bedrooms_per_room, which is set to True by default.
# (A hyperparameter is a parameter whose value controls the ML process. Here it lets us easily find out
# whether adding this attribute helps the ML algorithm.)
# More generally, hyperparameters can gate any data preparation step we are not sure about.
housing_extra_attribs = attr_adder.transform(housing.values)

# 5.4 - Feature scaling
# ML algorithms don't perform well when the input numerical attributes have very different scales.
# This is the case for the housing data: the total number of rooms ranges from 6 to 39,320 while
# median income only ranges from 0 to 15.
# Note that scaling the target values is generally not required.
# Two ways to get all attributes onto the same scale:
# a) min-max scaling (normalization) - values are shifted and rescaled so they end up ranging from 0 to 1:
#    (value - min) / (max - min)
#    Scikit-Learn has MinMaxScaler, with a feature_range hyperparameter if we want a range other than 0 to 1.
# b) standardization - subtract the mean value so standardized values always have zero mean, then divide
#    by the standard deviation so that the resulting distribution has unit variance.
#    Standardization is not bounded like min-max, which may be a problem for some ML algorithms such as neural networks.
#    Scikit-Learn has StandardScaler.


In [None]:
# 5.5 - Transformation Pipelines
# Scikit-Learn's Pipeline class helps with running a sequence of transformations.
# Put all the transformation steps into one pipeline.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The constructor takes a list of (name, estimator) pairs. All but the last one must be
# transformers (i.e. provide fit_transform).
# Each step must be an *instance*, hence StandardScaler() rather than the bare class.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Example: run the numerical attributes through the whole pipeline.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Scikit-Learn has a ColumnTransformer:
# use one transformer pipeline to transform multiple columns.
from sklearn.compose import ColumnTransformer

# Column names handled by each branch: every column of housing_num goes to the numerical pipeline,
# ocean_proximity goes to the one-hot encoder.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
# Note that num_pipeline returns a dense matrix, but OneHotEncoder returns a sparse matrix.
# The ColumnTransformer estimates the density of the final matrix and returns a sparse matrix if the
# density is lower than a given threshold. Default => sparse_threshold=0.3

In [None]:
# Step 6 - Select and Train a Model
# 6.1 - Training and evaluating on the training set

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
# Fit on the prepared feature matrix produced by full_pipeline and the matching labels.
lin_reg.fit(housing_prepared, housing_labels)

# Try the model out on a few instances from the training set.
some_Data = housing.iloc[:5]
