# How To

In [1]:
import pandas as pd
# Load the housing CSV into a DataFrame (the path is relative to the
# notebook's working directory).
df = pd.read_csv("../data/housing.csv")

In [2]:
# Preview the first five rows to see which columns were loaded.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Min - Max scaling

$x' = \frac{x - \text{min}(x)}{\text{max}(x)-\text{min}(x)}$

In [3]:
# Min-max scaling: map median_house_value linearly onto [0, 1] so that the
# cheapest house becomes 0 and the most expensive becomes 1.
house_value = df["median_house_value"]
value_min, value_max = house_value.min(), house_value.max()
df["house_value_minmax"] = (house_value - value_min) / (value_max - value_min)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,house_value_minmax
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0.902266
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0.708247
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,0.695051
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,0.672783
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,0.674638


##  Z score

$x' = \frac{x - \bar{x}}{\sigma}$



In [4]:
# Z-score: centre median_house_value on its mean and divide by its standard
# deviation. Note that Series.std() defaults to ddof=1 (sample standard
# deviation).
house_value = df["median_house_value"]
df["house_value_z"] = (house_value - house_value.mean()) / house_value.std()
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,house_value_minmax,house_value_z
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0.902266,2.12958
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0.708247,1.314124
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,0.695051,1.258663
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,0.672783,1.165072
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,0.674638,1.172871


## Binning our data

In [5]:
# Bin the min-max scaled price into four equal-width ranges on [0, 1].
#
# pd.cut builds right-closed intervals by default, so without include_lowest
# the first bin is (0, 0.25] and the cheapest house (scaled value exactly 0.0)
# falls outside every bin and is labelled NaN. include_lowest=True closes the
# first interval on the left so that row is labelled "cheap".
bin_names = ["cheap", "medium", "high", "luxury"]
bin_edges = [0, 0.25, 0.5, 0.75, 1]
df["price_range"] = pd.cut(
    df.house_value_minmax,
    bins=bin_edges,
    labels=bin_names,
    include_lowest=True,
)
df.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,house_value_minmax,house_value_z,price_range
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0.902266,2.12958,luxury
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0.708247,1.314124,high
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,0.695051,1.258663,high
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,0.672783,1.165072,high
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,0.674638,1.172871,high
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,0.525155,0.544598,high
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY,0.585979,0.80024,high
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY,0.466804,0.299354,medium
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY,0.436495,0.171967,medium
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY,0.507423,0.470071,high


In [6]:
# qcut bins by quantiles rather than by fixed edges: passing an integer
# (here 4, i.e. quartiles) without labels returns the interval that each
# value falls into.
pd.qcut(df.house_value_minmax, q=4)

0           (0.515, 1.0]
1           (0.515, 1.0]
2           (0.515, 1.0]
3           (0.515, 1.0]
4           (0.515, 1.0]
              ...       
20635    (-0.001, 0.216]
20636    (-0.001, 0.216]
20637    (-0.001, 0.216]
20638    (-0.001, 0.216]
20639    (-0.001, 0.216]
Name: house_value_minmax, Length: 20640, dtype: category
Categories (4, interval[float64, right]): [(-0.001, 0.216] < (0.216, 0.34] < (0.34, 0.515] < (0.515, 1.0]]

In [7]:
# Repeat the quartile binning with labels, so each quartile gets a category name instead of an interval

In [8]:
# Quartile-based binning: each label in bin_names is assigned to one quarter
# of the rows, ordered from the lowest prices to the highest.
df["price_range_quartile"] = pd.qcut(df.house_value_minmax, q=4, labels=bin_names)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,house_value_minmax,house_value_z,price_range,price_range_quartile
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0.902266,2.12958,luxury,luxury
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0.708247,1.314124,high,luxury
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,0.695051,1.258663,high,luxury
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,0.672783,1.165072,high,luxury
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,0.674638,1.172871,high,luxury


In [9]:
# List the distinct quartile labels (in order of first appearance).
df["price_range_quartile"].unique()

['luxury', 'high', 'medium', 'cheap']
Categories (4, object): ['cheap' < 'medium' < 'high' < 'luxury']

## Using a machine learning library (scikit-learn) for preprocessing

## Advanced scaling

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [14]:
import sklearn

In [15]:
from sklearn import preprocessing

In [16]:
# Min-max scale median_house_value with scikit-learn. The double brackets
# select a one-column DataFrame, because the scaler expects 2-D input.
minmax_scaler = preprocessing.MinMaxScaler()
out = minmax_scaler.fit_transform(df[["median_house_value"]])

In [17]:
import numpy as np
# Exact element-wise comparison of sklearn's result with the hand-computed
# column. Exact equality on floats is fragile, so some rows come out False
# even though both columns hold the same scaled values.
np.squeeze(out) == df.house_value_minmax

0        False
1         True
2        False
3         True
4        False
         ...  
20635    False
20636    False
20637     True
20638     True
20639     True
Name: house_value_minmax, Length: 20640, dtype: bool

In [19]:
# Compare the first value of sklearn's output array `out` with the first value of the house_value_minmax column

In [20]:
# `out` is 2-D, so out[0] is a length-1 array; NumPy prints arrays with 8
# significant digits by default, while the pandas scalar prints in full.
print(out[0], df.house_value_minmax[0])

[0.90226638] 0.9022663824066705


In [21]:
# The values are not always exactly equal: sklearn and our formula round differently in floating point (the printed array is also shortened for display)

In [22]:
# Use numpy's allclose() to check whether the two results match within numerical tolerance rather than exactly

In [23]:
# np.squeeze drops the length-1 column axis of `out` so that it lines up
# with the 1-D Series before the tolerance-based comparison.
np.allclose(np.squeeze(out), df.house_value_minmax)

True

In [24]:
# Z-score scaling with scikit-learn. The results differ slightly from the
# hand-computed house_value_z column (e.g. 2.12963 vs 2.12958 for row 0):
# StandardScaler divides by the population standard deviation (ddof=0),
# whereas pandas' Series.std() defaults to the sample one (ddof=1).
preprocessing.StandardScaler().fit_transform(df[["median_house_value"]])

array([[ 2.12963148],
       [ 1.31415614],
       [ 1.25869341],
       ...,
       [-0.99274649],
       [-1.05860847],
       [-1.01787803]])