# Linear regression

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('./../datasets/housing.csv')

df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
X_train = train_set.drop('median_house_value', axis=1)
Y_train = train_set['median_house_value']

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

total_rooms_index, total_debrooms_index, population_index, households_index = 3, 4, 5, 6

class PropertyAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        self.rooms_per_household = X[:, total_rooms_index] / X[:, households_index]
        self.population_per_household = X[:, population_index] / X[:, households_index]
        return self

    def transform(self, X):

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, total_rooms_index] / X[:, total_debrooms_index]

            return np.column_stack((X, self.rooms_per_household, self.population_per_household, bedrooms_per_room)) 
        else:
            return  np.column_stack((X, self.rooms_per_household, self.population_per_household))

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('add-fields', PropertyAdder()),
    ('min-max-scaler', StandardScaler())
])

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
 ]

full_pipeline = ColumnTransformer([
    ('num-pipeline', num_pipeline, num_columns),
    ('one-hot-encoder', OneHotEncoder(), ['ocean_proximity']),
])

X_prepared = full_pipeline.fit_transform(X_train)

X_prepared[0]

array([ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
        0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
       -0.00315464,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ])

In [9]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

lr_model.fit(X_prepared, Y_train)

# Simple testing

In [10]:
island = X_train[X_train.ocean_proximity == 'ISLAND'].sample(1)
one_hour_ocean = X_train[X_train.ocean_proximity == '<1H OCEAN'].sample(1)
near_ocean = X_train[X_train.ocean_proximity == 'NEAR OCEAN'].sample(1)
near_bay = X_train[X_train.ocean_proximity == 'NEAR BAY'].sample(1)
inland = X_train[X_train.ocean_proximity == 'INLAND'].sample(1)

test_x_data = pd.concat([island, one_hour_ocean, near_ocean, near_bay, inland])

test_x_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
8314,-118.32,33.35,27.0,1675.0,521.0,744.0,331.0,2.1579,ISLAND
3824,-118.47,34.19,33.0,3879.0,943.0,2113.0,843.0,3.892,<1H OCEAN
14855,-117.07,32.65,12.0,4131.0,891.0,2272.0,840.0,3.4701,NEAR OCEAN
18890,-122.23,38.12,49.0,2715.0,435.0,1006.0,429.0,4.2308,NEAR BAY
1936,-120.98,38.67,13.0,3432.0,516.0,1286.0,470.0,5.584,INLAND


In [11]:
test_y_labels = Y_train.loc[test_x_data.index]

test_y_labels

8314     450000.0
3824     292900.0
14855    204900.0
18890    145800.0
1936     186600.0
Name: median_house_value, dtype: float64

In [12]:
test_x_prepared = full_pipeline.fit_transform(test_x_data)

test_x_prepared[0]

array([ 0.57715773, -0.81748075,  0.0145803 , -1.68076016, -0.6621983 ,
       -1.22288727, -1.16392405, -1.53980407, -0.57187082, -1.3529141 ,
       -1.35777313,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ])

In [13]:
test_predicted_labels = lr_model.predict(test_x_prepared)

test_predicted_labels

array([255799.92387025, 242794.17669346, 169416.30151771, 269939.61954663,
       249182.9038749 ])

In [14]:
test_result = pd.DataFrame({
    'prediction': test_predicted_labels,
    'actually': test_y_labels,
})

test_result

Unnamed: 0,prediction,actually
8314,255799.92387,450000.0
3824,242794.176693,292900.0
14855,169416.301518,204900.0
18890,269939.619547,145800.0
1936,249182.903875,186600.0


In [15]:
test_result['diff'] = test_result['actually'] - test_result['prediction']

test_result

Unnamed: 0,prediction,actually,diff
8314,255799.92387,450000.0,194200.07613
3824,242794.176693,292900.0,50105.823307
14855,169416.301518,204900.0,35483.698482
18890,269939.619547,145800.0,-124139.619547
1936,249182.903875,186600.0,-62582.903875


# Ranking

In [16]:
X_test = test_set.drop('median_house_value', axis=1)

y_test = test_set['median_house_value'].copy()

X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [17]:
X_test_prepared = full_pipeline.fit_transform(X_test)

In [18]:
y_test_predicted = lr_model.predict(X_test_prepared)

y_test_predicted

array([ 67976.20970906, 133671.36686774, 268256.81506608, ...,
       449249.21295068, 119686.76988633, 189589.28004073])

In [19]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_test_predicted)

print('MEAN ABSOLUTE ERROR: ', mae)

MEAN ABSOLUTE ERROR:  51021.0128823905


In [20]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test, y_test_predicted)

print('ROOT MEAN SQUARE ERROR: ', np.sqrt(rmse))

ROOT MEAN SQUARE ERROR:  70399.33260869946


# Random Forest

In [21]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()

rf_model.fit(X_prepared, Y_train)

In [24]:
y_test_predicted = rf_model.predict(X_test_prepared)

y_test_predicted

array([ 93406.  , 131210.  , 369038.25, ..., 497294.97, 127915.  ,
       160501.  ])

In [25]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test, y_test_predicted)

print("RMSE: ", np.sqrt(rmse))

RMSE:  77485.99802322284
