In [1]:
import os
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

## # Preparing the data for Machine Learning Algorithms:

In [2]:
## Read the original housing dataset from disk and preview its first rows

df = pd.read_csv(filepath_or_buffer="housing.csv")
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3


In [3]:
## Read the training set (strat_train_set.csv) stored under artifacts/datasets

datasets_dir = os.path.join('artifacts', 'datasets')

strat_train_set = pd.read_csv(
    filepath_or_buffer=os.path.join(datasets_dir, "strat_train_set.csv")
)
strat_train_set.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [4]:
## Split the training set into the features (X) and the label (Y).
## Y is selected with a list of columns, so it stays a one-column DataFrame.

X = strat_train_set.drop(columns=["median_house_value"])
Y = strat_train_set.loc[:, ["median_house_value"]]

### # Missing Vals:

In [5]:
## Count the missing (NaN) values in each feature column

X.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [6]:
## Missing values will be imputed with the median of the corresponding feature

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [7]:
### Numerical columns
## The median is only defined for numbers, so the categorical attribute is left out

X_num = X.drop(columns=["ocean_proximity"])
X_num.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964


In [8]:
## Fit the imputer on the numerical features (it learns one median per column)

imputer.fit(X_num)

SimpleImputer(strategy='median')

**Note:** We fitted the imputer on all numerical features **even though only `total_bedrooms` has missing values**, because we can't be sure that new data won't have missing values in other features.

In [9]:
## Medians learned by the imputer, one per column of X_num

imputer.statistics_

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155])

In [10]:
## Use the fitted imputer to replace the missing values of the training set with
## the median of the corresponding column. transform() returns a plain array, so
## the original column names and row index are re-attached.

X_ = pd.DataFrame(
    data=imputer.transform(X_num),
    index=X_num.index,
    columns=X_num.columns,
)
X_

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264
4,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964
...,...,...,...,...,...,...,...,...
16507,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900
16508,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139
16509,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797
16510,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964


**=> Imputation done!**

In [11]:
### Now, let's deal with the categorical attributes:

## Keep ocean_proximity as a one-column DataFrame (2D), not a Series

X_cat = X.loc[:, ["ocean_proximity"]]
X_cat.head(n=10)

Unnamed: 0,ocean_proximity
0,INLAND
1,NEAR OCEAN
2,INLAND
3,NEAR OCEAN
4,<1H OCEAN
5,NEAR BAY
6,<1H OCEAN
7,<1H OCEAN
8,<1H OCEAN
9,<1H OCEAN


In [12]:
## Number of rows in each ocean_proximity category

X_cat.value_counts()

ocean_proximity
<1H OCEAN          7277
INLAND             5262
NEAR OCEAN         2124
NEAR BAY           1847
ISLAND                2
dtype: int64

### **NOTE: Since our model won't accept string values, we need to encode the categories as numbers.**


### # Encoding:

#### # ORDINAL ENCODING:

In [13]:
## The categories are strings, so they have to be encoded as numbers first

from sklearn.preprocessing import OrdinalEncoder

o_encoder = OrdinalEncoder()
o_encoder.fit(X_cat)                        # learn the set of categories
X_cat_encoded = o_encoder.transform(X_cat)  # map each category to its code
X_cat_encoded

array([[1.],
       [4.],
       [1.],
       ...,
       [0.],
       [0.],
       [1.]])

In [14]:
## Categories learned by the ordinal encoder (position in the array = encoded value)

o_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

**NOTE:** **One problem here is that ML algorithms will assume that two nearby values are more similar than two distant values.** This may be fine in some cases (e.g., for ordered categories), but it does no good here for `ocean_proximity`, whose categories have no such order. So **Ordinal Encoding** is not suitable here.
<br><br>
So we oughta use **`One-hot Encoding`** here.

#### # One-Hot Encoding:

In [15]:
## One-hot Encoding 

from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
X_cat_1hot = cat_encoder.fit_transform(X_cat)
X_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

**=> A `SciPy sparse matrix` instead of a `NumPy array`.**
<br>A one-hot matrix is mostly zeros, and storing all of them in a NumPy array would be very wasteful, so instead a **sparse matrix** stores only the locations of the non-zero elements. Apart from that, it can be used mostly like a normal 2D array.


In [16]:
## Dense NumPy-array form of the sparse one-hot matrix

X_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [17]:
## Categories learned by the one-hot encoder (one output column per category, in this order)

cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

### # Custom Transformers:

Although Scikit-learn provides many useful transformers, we'll sometimes want to write our own for tasks such as **custom cleaning operations** or **combining specific attributes**.
<br><br>We'll want our transformer to **work seamlessly** with Scikit-learn functionalities (such as pipelines), and since Scikit-learn relies on duck typing (not inheritance), all we need to do is create a class and implement 3 methods: fit(), transform() and fit_transform().

**Note:** We can get the last method for free by simply adding **TransformerMixin** as a base class. If we add **BaseEstimator** as a base class (and avoid \*args and \*\*kwargs in our constructors), we'll get two extra methods (**get_params() and set_params()**) that will be useful for automatic hyperparameter tuning.


In [18]:
## Column order of the imputed numerical features (the positional indices used below rely on it)
X_.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

In [19]:
## An Example:

from sklearn.base import BaseEstimator, TransformerMixin

## Positional indices of the columns used below, following the column order shown
## by `X_.columns`: total_rooms, total_bedrooms, population, households
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2D array of housing attributes.

    Always appended: rooms_per_household and population_per_household.
    Appended only when `add_bedrooms_per_room` is True: bedrooms_per_room.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the bedrooms_per_room column.
    """

    def __init__(self, add_bedrooms_per_room=True): # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return X with the extra ratio columns appended on the right.

        X must be a 2D array indexable as X[:, i] whose columns at positions
        rooms_ix, bedrooms_ix, population_ix and households_ix hold the
        corresponding attributes.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        # population divided by households (bedrooms / rooms is bedrooms_per_room)
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

        else:
            return np.c_[X, rooms_per_household, population_per_household]

### Note:

Here, **add_bedrooms_per_room** acts as a hyperparameter. It will allow us to find out whether adding this attribute helps the Machine Learning algorithms or not. **More generally, we can add a hyperparameter to gate any data preparation step that we are not 100% sure about.**

In [20]:
## An example of having all other extras attributes as hyperparameters

## Positional indices of the columns used below, following the column order shown
## by `X_.columns`: total_rooms, total_bedrooms, population, households
rooms_idx, bedrooms_idx, population_idx, households_idx = 3, 4, 5, 6

class CombinedAttributesAdder2(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2D array, each one gated by its own flag.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Append total_bedrooms / total_rooms.
    add_rooms_per_household : bool, default=True
        Append total_rooms / households.
    add_population_per_household : bool, default=True
        Append population / households.

    Enabled columns are appended in the order listed above.
    """

    def __init__(self, add_bedrooms_per_room=True, add_rooms_per_household=True,
                add_population_per_household=True): # no *args or **kwargs

        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.add_rooms_per_household = add_rooms_per_household
        self.add_population_per_household = add_population_per_household

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return X with the enabled ratio columns appended on the right.

        New columns are always appended at the end, so the positional indices
        of the source columns stay valid while X grows.
        """

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_idx] / X[:, rooms_idx]
            X = np.c_[X, bedrooms_per_room]

        if self.add_rooms_per_household:
            rooms_per_household = X[:, rooms_idx] / X[:, households_idx]
            X = np.c_[X, rooms_per_household]

        if self.add_population_per_household:
            # population divided by households (bedrooms / rooms is bedrooms_per_room)
            population_per_household = X[:, population_idx] / X[:, households_idx]
            X = np.c_[X, population_per_household]

        return X

In [21]:
## Default flags: all three extra columns are appended to the raw feature array
attr_adder2 = CombinedAttributesAdder2()
X_extras2 = attr_adder2.transform(X.to_numpy())
X_extras2.shape

## All extras are added!

(16512, 12)

In [22]:
## With add_bedrooms_per_room switched off, only two extra columns are appended
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
X_extras = attr_adder.transform(X.to_numpy())
X_extras.shape

## 2 extras are added!

(16512, 11)

In [23]:
## Summary statistics of the imputed numerical features

X_.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575635,35.639314,28.653404,2622.539789,533.939438,1419.687379,497.01181,3.875884
std,2.001828,2.137963,12.574819,2138.41708,410.80626,1115.663036,375.696156,1.904931
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,296.0,784.0,279.0,2.56695
50%,-118.51,34.26,29.0,2119.0,433.0,1164.0,408.0,3.54155
75%,-118.01,37.72,37.0,3141.0,641.0,1719.0,602.0,4.745325
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


## # Transformation Pipelines:

As evident, there are many data transformation steps that need to be executed in the right order. But lucky us, Scikit-learn provides the **Pipeline class** to help with such sequences of the transformations. 

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Numerical pipeline: impute medians -> append ratio features -> standardize.
## verbose=1 makes the pipeline log each step while it is being fitted.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('attirbs_adder', CombinedAttributesAdder2()),
        ('std_scaler', StandardScaler()),
    ],
    verbose=1,
)

num_pipeline

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('attirbs_adder', CombinedAttributesAdder2()),
                ('std_scaler', StandardScaler())],
         verbose=1)

#### NOTE: All but the last estimator must be transformers (i.e., they must have a fit_transform() method). Calling the pipeline's fit() method calls fit_transform() sequentially on all the transformers, passing the output of each call as the input to the next call, until it reaches the final estimator, for which it just calls fit().<br><br>The pipeline exposes the same methods as the final estimator.

### # Handling the Num and Cat. features simultaneously in Pipeline

In [25]:
from sklearn.compose import ColumnTransformer

## Column groups: numerical columns go through num_pipeline,
## the categorical column is one-hot encoded
num_atts = X_num.columns
cat_atts = ["ocean_proximity"]

full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_atts),
        ("cat", OneHotEncoder(), cat_atts),
    ]
)

## Fit on the raw training features and transform them in one go
X_prepared = full_pipeline.fit_transform(X)
X_prepared

[Pipeline] ........... (step 1 of 3) Processing imputer, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing attirbs_adder, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing std_scaler, total=   0.0s


array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

### Note:
Note that OneHotEncoder returns a sparse matrix, while the num_pipeline returns a dense matrix. When there's such a mix of sparse and dense matrices, the **ColumnTransformer** estimates **the density of the final matrix (i.e., the ratio of non-zero cells)** and returns a sparse matrix if the density is lower than a given threshold (by default, sparse_threshold=0.3).


In [26]:
## Saving the transformation pipeline

pipelines_dir = os.path.join('artifacts', 'pipelines')
os.makedirs(pipelines_dir, exist_ok=True)

## 'wb' is the mode argument of open(), not a path component; the context manager
## guarantees the file is flushed and closed once the pipeline has been written.
## NOTE: the pickle references CombinedAttributesAdder2 by name, so that class has
## to be defined/importable wherever the pipeline is loaded again.
with open(os.path.join(pipelines_dir, 'transformation_pipeline.pkl'), 'wb') as pipeline_file:
    pickle.dump(full_pipeline, pipeline_file)

In [27]:
## Saving the relevant datasets

datasets_dir = os.path.join('artifacts', 'datasets')
os.makedirs(datasets_dir, exist_ok=True)

# raw training features (index=False: the row index is not written as a column)
X.to_csv(os.path.join(datasets_dir, 'X.csv'), index=False)

# prepared features (the array returned by full_pipeline.fit_transform), saved as text
np.savetxt(os.path.join(datasets_dir, 'X_prepared.txt'), X_prepared)

# training labels (saved as-is, without the row index)
Y.to_csv(os.path.join(datasets_dir, "Y.csv"), index=False)