In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


sns.set_style('darkgrid')

In [3]:
file = '../data/california/housing.csv'
df_raw = pd.read_csv(file, sep=',')

In [4]:
df_raw.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Change the median_house_value to it's log value
df_raw['median_house_value'] = np.log(df_raw['median_house_value'])

In [6]:
df_raw.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,12.084884
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,0.569134
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,9.615739
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,11.691908
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,12.099044
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,12.486447
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,13.122365


In [7]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [8]:
df_raw["income_cat"] = pd.cut(df_raw["median_income"],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1, 2, 3, 4, 5])

In [9]:
X, y = df_raw.drop('median_house_value', axis=1), df_raw['median_house_value'].copy()
split = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

for train_index, test_index in split.split(X, df_raw["income_cat"]):
    train_set = df_raw.iloc[train_index]
    test_set = df_raw.iloc[test_index]

In [10]:
for set_ in (train_set, test_set):
    set_.drop('income_cat', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [11]:
train_set.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        162
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [12]:
for label, content in train_set.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            train_set[label+'_na'] = pd.isnull(content)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [13]:
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,total_bedrooms_na
18886,-122.24,38.10,49.0,1851.0,356.0,849.0,307.0,2.9432,11.547327,NEAR BAY,False
17941,-121.94,37.33,37.0,1822.0,329.0,845.0,348.0,4.7500,12.433607,<1H OCEAN,False
11031,-117.84,33.78,26.0,2577.0,434.0,1086.0,432.0,4.6125,12.342350,<1H OCEAN,False
5012,-118.33,34.01,44.0,1762.0,463.0,786.0,445.0,1.9231,12.146853,<1H OCEAN,False
10064,-121.03,39.37,15.0,1337.0,326.0,1172.0,306.0,2.6341,11.350407,INLAND,False
...,...,...,...,...,...,...,...,...,...,...,...
20216,-119.25,34.30,34.0,1189.0,220.0,445.0,203.0,4.8824,12.890179,NEAR OCEAN,False
14857,-117.08,32.64,43.0,1005.0,230.0,548.0,252.0,1.8672,11.889991,NEAR OCEAN,False
5425,-118.42,34.02,34.0,2243.0,444.0,973.0,413.0,4.9676,12.933863,<1H OCEAN,False
19518,-121.01,37.64,33.0,693.0,207.0,598.0,192.0,1.0217,11.305901,INLAND,False


In [14]:
train_set.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        162
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
total_bedrooms_na       0
dtype: int64

In [15]:
train_set['rooms'] = round(train_set['total_rooms'] / train_set['households'])
train_set['bedrooms'] = round(train_set['total_bedrooms'] / train_set['households'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,total_bedrooms_na,rooms,bedrooms
18886,-122.24,38.10,49.0,1851.0,356.0,849.0,307.0,2.9432,11.547327,NEAR BAY,False,6.0,1.0
17941,-121.94,37.33,37.0,1822.0,329.0,845.0,348.0,4.7500,12.433607,<1H OCEAN,False,5.0,1.0
11031,-117.84,33.78,26.0,2577.0,434.0,1086.0,432.0,4.6125,12.342350,<1H OCEAN,False,6.0,1.0
5012,-118.33,34.01,44.0,1762.0,463.0,786.0,445.0,1.9231,12.146853,<1H OCEAN,False,4.0,1.0
10064,-121.03,39.37,15.0,1337.0,326.0,1172.0,306.0,2.6341,11.350407,INLAND,False,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20216,-119.25,34.30,34.0,1189.0,220.0,445.0,203.0,4.8824,12.890179,NEAR OCEAN,False,6.0,1.0
14857,-117.08,32.64,43.0,1005.0,230.0,548.0,252.0,1.8672,11.889991,NEAR OCEAN,False,4.0,1.0
5425,-118.42,34.02,34.0,2243.0,444.0,973.0,413.0,4.9676,12.933863,<1H OCEAN,False,5.0,1.0
19518,-121.01,37.64,33.0,693.0,207.0,598.0,192.0,1.0217,11.305901,INLAND,False,4.0,1.0


In [17]:
X = train_set.drop('median_house_value', axis=1).copy()
y = train_set['median_house_value'].copy()

In [18]:
missing_values = ['total_bedrooms', 'bedrooms']
cat_col = ['ocean_proximity']

train_transform = ColumnTransformer(
    [('fill missing', SimpleImputer(strategy='median'), missing_values),
     ('one hot', OneHotEncoder(), cat_col)
    ], remainder='passthrough')
X_transformed = train_transform.fit_transform(X)

In [19]:
cats = X['ocean_proximity'].value_counts().index

In [20]:
train_transform.transformers_

[('fill missing',
  SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                missing_values=nan, strategy='median', verbose=0),
  ['total_bedrooms', 'bedrooms']),
 ('one hot',
  OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
                handle_unknown='error', sparse=True),
  ['ocean_proximity']),
 ('remainder', 'passthrough', [0, 1, 2, 3, 5, 6, 7, 9, 10])]

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(X_transformed, y, test_size=0.2)
X_train.shape, y_train.shape, X_valid.shape

((13209, 16), (13209,), (3303, 16))

### Creating a model

In [22]:
# function that calculates the Rmse of the predicted value and true value
def rmse(actuals, predictions):
    return np.sqrt(mean_squared_error(actuals, predictions))

def print_score(model):
    print(f'The RMSE of the train set {rmse(y_train, model.predict(X_train))}')
    print(f'The RMSE of the train set {rmse(y_valid, model.predict(X_valid))}')
    print(f'The R^2 of the train set {model.score(X_train, y_train)}')
    print(f'The R^2 of the oob_score sample {model.score(X_valid, y_valid)}')
    if hasattr(model, 'oob_score_'):
        print(f'The R^2 of the oob_score {model.oob_score_}')

In [23]:
#Building a single tree
rf = RandomForestRegressor(n_jobs=-1).fit(X_train, y_train)
print_score(rf)

The RMSE of the train set 0.08759944336399184
The RMSE of the train set 0.2312980120620053
The R^2 of the train set 0.9763834623654973
The R^2 of the oob_score sample 0.8325453844316688


In [24]:
rf = RandomForestRegressor(n_jobs=-1, oob_score=True).fit(X_train, y_train)
print_score(rf)

The RMSE of the train set 0.0878700980899127
The RMSE of the train set 0.23234243777002223
The R^2 of the train set 0.9762373015727444
The R^2 of the oob_score sample 0.8310296881193489
The R^2 of the oob_score 0.8266957366428117


In [25]:
gd = GradientBoostingRegressor().fit(X_train, y_train)
print_score(gd)

The RMSE of the train set 0.2552087251323388
The RMSE of the train set 0.2664995994865083
The R^2 of the train set 0.799550687932826
The R^2 of the oob_score sample 0.7776964267011768


###  Performing a GridSearchCv on the models

In [26]:
grid_rf = {
    'n_estimators' : [10, 20, 50, 70, 100, 150],
    'max_features' : [0.5, 1, 0.6, 'log2', 'sqrt'],
    'min_samples_leaf' : [1, 3, 5, 15, 20],
    'oob_score' : [True],
}

grid = GridSearchCV(rf, param_grid=grid_rf, n_jobs=-1).fit(X_train, y_train)
print_score(grid)


The RMSE of the train set 0.08633385914645685
The RMSE of the train set 0.2310789884242416
The R^2 of the train set 0.9770609280897771
The R^2 of the oob_score sample 0.8328623707614581


In [27]:
grid.best_params_

{'max_features': 0.5,
 'min_samples_leaf': 1,
 'n_estimators': 150,
 'oob_score': True}