# Regression model for the California housing data set using fastai

DATE : 2020-7-7

In [69]:
# Notebook configuration: (re)load the autoreload extension, set it to mode 2
# so edited modules are picked up without restarting the kernel, and render
# matplotlib figures inline in the notebook output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Imports

In [70]:
from fastai.imports import *
from fastai.structured import *

In [71]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from IPython.display import display

# Apply seaborn's 'darkgrid' style to plots drawn after this point.
sns.set_style('darkgrid')

## Load data 

In [72]:
# Location of the California housing CSV, relative to the notebook's working directory.
file = '../data/california/housing.csv'
# Read the comma-separated file into the raw (unprocessed) frame.
df_raw = pd.read_csv(filepath_or_buffer=file, sep=',')

In [73]:
# Preview the first rows of the raw frame.
df_raw.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [74]:
def statistics(df):
    """Show a data set's statistical description, schema info, missing-value counts and shape.

    df
    ==
    The pandas DataFrame to summarise

    returns
    =======
    Nothing; every section is written to the notebook output
    """
    display(df.describe())
    # DataFrame.info() prints its own report and returns None, so it is called
    # directly: wrapping it in display() rendered a stray "None" in the output.
    df.info()

    print('======================')
    print('   MISSING VALUES')
    print('======================')
    display(df.isnull().sum())

    print('======================')
    print('   DATA SET SHAPE')
    print('======================')
    print(f'{df.shape}')

In [75]:
# Summarise the raw frame before any preprocessing.
statistics(df_raw)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


None

   MISSING VALUES


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

   DATA SET SHAPE
(20640, 10)


## Preprocess the data set

In [76]:
# # Change the median_house_value to its log value
# df_raw['median_house_value'] = np.log(df_raw['median_house_value'])

In [77]:
# NOTE(review): the return value is discarded, so this presumably modifies df_raw in place -- confirm against fastai.structured.
train_cats(df_raw) # turning non-numerical features into categorical ones

In [78]:
# Separate the target column 'median_house_value' (y) from the features (df).
# NOTE(review): `nas` presumably records the fill values used for missing data -- confirm against fastai.structured.proc_df.
df, y, nas = proc_df(df_raw, 'median_house_value')

In [79]:
statistics(df) # checking that everything is set up properly

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,536.838857,1425.476744,499.53968,3.870671,2.165843
std,2.003532,2.135952,12.585558,2181.615252,419.391878,1132.462122,382.329753,1.899822,1.420662
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,1.0
25%,-121.8,33.93,18.0,1447.75,297.0,787.0,280.0,2.5634,1.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,2.0
75%,-118.01,37.71,37.0,3148.0,643.25,1725.0,605.0,4.74325,2.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,5.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  int8   
 9   total_bedrooms_na   20640 non-null  bool   
dtypes: bool(1), float64(8), int8(1)
memory usage: 1.3 MB


None

   MISSING VALUES


longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
total_bedrooms_na     0
dtype: int64

   DATA SET SHAPE
(20640, 10)


In [80]:
def split(dataframe, size):
    """Cut a sliceable object into a leading part and the remainder.

    The cut is purely positional: no shuffling or sampling is performed.

    dataframe
    =========
    Any object supporting slice indexing (DataFrame, array, list, ...)

    size
    ====
    Number of leading items that go into the first part

    returns
    =======
    A tuple (first `size` items, everything after them)
    """
    head = dataframe[:size]
    tail = dataframe[size:]
    return head, tail

# Single source of truth for the cut point, so features and targets can never
# be split at different rows by editing only one of the two calls.
N_TRAIN = 16000

# Rows are taken in their existing order (first N_TRAIN rows -> train, rest -> test);
# nothing is shuffled here.
X_train, X_test = split(df, N_TRAIN)
y_train, y_test = split(y, N_TRAIN)

In [81]:
# Confirm the shapes of the feature and target splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16000, 10), (4640, 10), (16000,), (4640,))

## Building Models

In [82]:
def rmse(actuals, predictions):
    """Root mean squared error between observed and predicted values.

    actuals
    =======
    The observed target values, e.g. y_train, y_test or y

    predictions
    ===========
    The model's predicted values, paired element-wise with `actuals`

    returns
    =======
    The square root of mean_squared_error(actuals, predictions)
    """
    mse = mean_squared_error(actuals, predictions)
    return np.sqrt(mse)

def results(model):
    """Collect train/test evaluation metrics for a fitted model.

    Reads the notebook-level X_train, y_train, X_test and y_test.

    model
    =====
    A fitted estimator exposing predict(X) and score(X, y)

    returns
    =======
    A list holding, in this order:
      - the root mean squared error on the training split
      - the root mean squared error on the test split
      - model.score on the training split
      - model.score on the test split
      - model.oob_score_, only if the model has that attribute
    """
    metrics = [
        rmse(y_train, model.predict(X_train)),
        rmse(y_test, model.predict(X_test)),
        model.score(X_train, y_train),
        model.score(X_test, y_test),
    ]
    if hasattr(model, 'oob_score_'):
        metrics.append(model.oob_score_)
    return metrics

In [132]:
# Baseline: fit a model on the full data set (df, y).
# X_test/y_test are slices of the same df/y, so the "test" figures returned
# here are computed on rows the model was trained on, not on held-out data.
# random_state is fixed so re-running the cell rebuilds the same forest.
rf = RandomForestRegressor(random_state=42).fit(df, y)
results(rf)

[17661.381039969234,
 19579.512230352375,
 0.9756618441312341,
 0.9739766280875201]

In [131]:
# Default forest fitted on the training split only.
# random_state is fixed so re-running the cell rebuilds the same forest.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
results(rf)

[17784.97451128792, 68383.09000139999, 0.9753200178368826, 0.6825631987496453]

In [133]:
# Consider half of the features at each split and record the out-of-bag score.
# random_state is fixed so re-running the cell rebuilds the same forest.
rf = RandomForestRegressor(n_jobs=-1, max_features=0.5, oob_score=True, random_state=42)
rf.fit(X_train, y_train)
results(rf)

[17198.811152444083,
 65717.56196265838,
 0.9769200319666788,
 0.7068278456761854,
 0.8306644637399796]

In [134]:
# Same as the previous forest, but with 150 trees and at least 15 samples per leaf.
# random_state is fixed so re-running the cell rebuilds the same forest.
rf = RandomForestRegressor(n_jobs=-1, max_features=0.5, oob_score=True, min_samples_leaf=15,
                           n_estimators=150, random_state=42)
rf.fit(X_train, y_train)
results(rf)

[44158.1473548531,
 65576.84215974213,
 0.8478541762118753,
 0.7080820297062755,
 0.8039331750004474]