In [1]:
import numpy as np
import pandas as pd

housing = pd.read_csv('housing/housing.csv')
housing.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [2]:
median = housing['total_bedrooms'].median()
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(median)

In [3]:
housing['rooms_per_household'] = housing['total_rooms']/housing['households']
del housing['total_rooms']

In [4]:
housing['bedrooms_per_household'] = housing['total_bedrooms']/housing['households']
del housing['total_bedrooms']

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

col_list = list(housing)
col_list.remove('ocean_proximity')
col_list.remove('median_house_value')

housing_numeric = housing[col_list]
housing_scaled = scaler.fit_transform(housing_numeric)

housing_scaled_df = pd.DataFrame(housing_scaled, index=housing_numeric.index, columns=housing_numeric.columns)

housing = pd.concat([housing_scaled_df, housing['median_house_value'], housing['ocean_proximity']], axis=1)
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,population,households,median_income,rooms_per_household,bedrooms_per_household,median_house_value,ocean_proximity
0,-1.327835,1.052548,0.982143,-0.974429,-0.977033,2.344766,0.628559,-0.148514,452600.0,NEAR BAY
1,-1.322844,1.043185,-0.607019,0.861439,1.669961,2.332238,0.327041,-0.248542,358500.0,NEAR BAY
2,-1.332827,1.038503,1.856182,-0.820777,-0.843637,1.782699,1.15562,-0.052902,352100.0,NEAR BAY
3,-1.337818,1.038503,1.856182,-0.766028,-0.733781,0.932968,0.156966,-0.053647,341300.0,NEAR BAY
4,-1.337818,1.038503,1.856182,-0.759847,-0.629157,-0.012881,0.344711,-0.038196,342200.0,NEAR BAY


In [10]:
housing = pd.get_dummies(housing)
housing.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,population,households,median_income,rooms_per_household,bedrooms_per_household,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.974429,-0.977033,2.344766,0.628559,-0.148514,452600.0,0,0,0,1,0
1,-1.322844,1.043185,-0.607019,0.861439,1.669961,2.332238,0.327041,-0.248542,358500.0,0,0,0,1,0
2,-1.332827,1.038503,1.856182,-0.820777,-0.843637,1.782699,1.15562,-0.052902,352100.0,0,0,0,1,0
3,-1.337818,1.038503,1.856182,-0.766028,-0.733781,0.932968,0.156966,-0.053647,341300.0,0,0,0,1,0
4,-1.337818,1.038503,1.856182,-0.759847,-0.629157,-0.012881,0.344711,-0.038196,342200.0,0,0,0,1,0
5,-1.337818,1.038503,1.856182,-0.894071,-0.801787,0.087447,-0.26973,0.005233,269700.0,0,0,0,1,0
6,-1.337818,1.033821,1.856182,-0.292712,0.037823,-0.111366,-0.200918,-0.288066,299200.0,0,0,0,1,0
7,-1.337818,1.033821,1.856182,-0.237079,0.385698,-0.395137,-0.255232,-0.07529,241400.0,0,0,0,1,0
8,-1.342809,1.033821,1.061601,-0.19381,0.249687,-0.942359,-0.458703,0.032239,226700.0,0,0,0,1,0
9,-1.337818,1.033821,1.856182,0.110844,0.560944,-0.09447,-0.185283,-0.213262,261100.0,0,0,0,1,0


In [11]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=45)

print(f'# of train_set: {train_set.shape[0]:.0f}, # of test_set: {test_set.shape[0]:.0f}')

# of train_set: 16512, # of test_set: 4128


In [12]:
train_set_features = train_set.drop('median_house_value', axis=1)
train_set_target = train_set['median_house_value'].copy()

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

lin_reg = LinearRegression()
lin_reg.fit(train_set_features, train_set_target)

print('Learned Parameters:')
print(f'Coefficients: {lin_reg.coef_}')
print(f'bias: {lin_reg.intercept_}')

print(f'Train_r2_score: {lin_reg.score(train_set_features, train_set_target)}')

Learned Parameters:
Coefficients: [-55733.57687297 -57526.6912605   13020.77508401 -54655.31987359
  58449.96190348  69978.22685007   1370.16143239   3946.1530666
 -25881.50680092 -67136.53558387 146680.7337536  -30875.3244372
 -22787.36693161]
bias: 245895.4019997655
Train_r2_score: 0.6488436502504668


In [14]:
test_set_features = test_set.drop('median_house_value', axis=1)
test_set_target = test_set['median_house_value'].copy()

final_model = lin_reg
final_predictions = final_model.predict(test_set_features)

print(f'Test_r2_score: {r2_score(test_set_target, final_predictions)}')

delta = test_set_target - final_predictions
print(f'Errors (%) in housing value prediction: {np.mean(np.abs(delta)/test_set_target)}')

final_mse = mean_squared_error(test_set_target, final_predictions)
final_rmse = np.sqrt(final_mse)
print(f'RMSE: {final_rmse}')

Test_r2_score: 0.6267808944817482
Errors (%) in housing value prediction: 0.29012878677170073
RMSE: 70372.96315833824


In [15]:
print(test_set_features.iloc[2])
print(test_set_features.iloc[43])

x1 = np.array(test_set_features.iloc[2])
x2 = np.array(test_set_features.iloc[43])
prediction1 = np.dot(lin_reg.coef_, x1) + lin_reg.intercept_
prediction2 = np.dot(lin_reg.coef_, x2) + lin_reg.intercept_

print(f'2nd sample - Predicted: {prediction1}, Real value: {test_set_target.iloc[2]}')
print(f'43rd sample - Predicted: {prediction2}, Real value: {test_set_target.iloc[43]}')

longitude                     0.813436
latitude                     -0.792107
housing_median_age            0.108104
population                   -0.512592
households                   -0.498377
median_income                 0.336318
rooms_per_household          -0.106685
bedrooms_per_household       -0.206845
ocean_proximity_<1H OCEAN     1.000000
ocean_proximity_INLAND        0.000000
ocean_proximity_ISLAND        0.000000
ocean_proximity_NEAR BAY      0.000000
ocean_proximity_NEAR OCEAN    0.000000
Name: 10112, dtype: float64
longitude                    -0.728878
latitude                      1.642463
housing_median_age           -1.481058
population                   -0.132879
households                    0.029976
median_income                 0.108870
rooms_per_household           0.287740
bedrooms_per_household       -0.145373
ocean_proximity_<1H OCEAN     0.000000
ocean_proximity_INLAND        1.000000
ocean_proximity_ISLAND        0.000000
ocean_proximity_NEAR BAY      0.0000

In [16]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

model_ridge = Ridge(alpha=100)
model_ridge.fit(train_set_features, train_set_target)

print(f'Train_score: {model_ridge.score(train_set_features, train_set_target)}')

ridge_predicted = model_ridge.predict(test_set_features)
print(f'Test_score: {r2_score(test_set_target, ridge_predicted)}')

delta = test_set_target - ridge_predicted
print(f'Errors in housing value prediction: {np.mean(np.abs(delta)/test_set_target)}')

final_ridge_mse = mean_squared_error(test_set_target, ridge_predicted)
final_ridge_rmse = np.sqrt(final_ridge_mse)
print(f'final_ridge_RMSE: {final_ridge_rmse}')

import warnings
warnings.filterwarnings(action='ignore')

Train_score: 0.6479280606855489
Test_score: 0.6294152624504661
Errors in housing value prediction: 0.2879456480729806
final_ridge_RMSE: 70124.15946923142


In [17]:
from sklearn.linear_model import Lasso

model_lasso = Lasso(alpha=1000)
model_lasso.fit(train_set_features, train_set_target)

print(f'Train_score: {model_lasso.score(train_set_features, train_set_target)}')

lasso_predicted = model_lasso.predict(test_set_features)
print(f'Test_score: {r2_score(test_set_target, lasso_predicted)}')

delta = test_set_target - lasso_predicted
print(f'Errors in housing value prediction: {np.mean(np.abs(delta)/test_set_target)}')

final_lasso_mse = mean_squared_error(test_set_target, lasso_predicted)
final_lasso_rmse = np.sqrt(final_lasso_mse)
print(f'RMSE is: {final_lasso_rmse}')

Train_score: 0.6433762067096818
Test_score: 0.6314423501074209
Errors in housing value prediction: 0.2854219349746559
RMSE is: 69932.10793113596


In [18]:
from sklearn.neural_network import MLPRegressor

model_MLP = MLPRegressor(activation='relu', hidden_layer_sizes=(32,64,32), max_iter=1000, solver='lbfgs', random_state=0)
model_MLP.fit(train_set_features, train_set_target)

print(f'Train_score: {model_MLP.score(train_set_features, train_set_target)}')

MLP_predicted = model_MLP.predict(test_set_features)
print(f'Test_score: {r2_score(test_set_target, MLP_predicted)}')

delta = test_set_target - MLP_predicted
print(f'Errors in housing value prediction: {np.mean(np.abs(delta)/test_set_target)}')

final_MLP_mse = mean_squared_error(test_set_target, MLP_predicted)
final_MLP_rmse = np.sqrt(final_MLP_mse)
print(f'RMSE is: {final_MLP_rmse}')

Train_score: 0.8490564514244182
Test_score: 0.8046952379645904
Errors in housing value prediction: 0.1897572615047462
RMSE is: 50907.33010023564
