In [101]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [180]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


sns.set_style('darkgrid')

In [103]:
file = '../data/california/housing.csv'
df_raw = pd.read_csv(file, sep=',')

In [104]:
df_raw.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [105]:
# Change the median_house_value to it's log value
df_raw['median_house_value'] = np.log(df_raw['median_house_value'])

In [106]:
df_raw.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,12.084884
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,0.569134
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,9.615739
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,11.691908
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,12.099044
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,12.486447
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,13.122365


In [107]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [108]:
df_raw["income_cat"] = pd.cut(df_raw["median_income"],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1, 2, 3, 4, 5])

In [109]:
X, y = df_raw.drop('median_house_value', axis=1), df_raw['median_house_value'].copy()
split = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

for train_index, test_index in split.split(X, df_raw["income_cat"]):
    train_set = df_raw.iloc[train_index]
    test_set = df_raw.iloc[test_index]

In [110]:
for set_ in (train_set, test_set):
    set_.drop('income_cat', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [111]:
train_set.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        169
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [112]:
for label, content in train_set.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            train_set[label+'_na'] = pd.isnull(content)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [113]:
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,total_bedrooms_na
18490,-121.61,37.03,5.0,6529.0,1010.0,3071.0,977.0,5.6754,12.606525,<1H OCEAN,False
19577,-120.75,37.69,24.0,2282.0,423.0,1167.0,398.0,3.8214,11.662207,INLAND,False
12711,-121.40,38.59,25.0,2228.0,534.0,1130.0,481.0,2.5363,11.732864,INLAND,False
4576,-118.28,34.07,24.0,3247.0,1281.0,2642.0,1182.0,2.4632,12.286269,<1H OCEAN,False
19608,-121.12,37.48,5.0,4109.0,820.0,3062.0,713.0,3.2396,11.737668,INLAND,False
...,...,...,...,...,...,...,...,...,...,...,...
7753,-118.11,33.91,19.0,3056.0,759.0,1561.0,740.0,3.1369,12.190451,<1H OCEAN,False
11414,-117.98,33.69,22.0,3957.0,520.0,1774.0,527.0,7.0907,12.766260,<1H OCEAN,False
17201,-119.78,34.45,23.0,2077.0,306.0,705.0,256.0,6.4744,13.122365,NEAR OCEAN,False
7268,-118.22,33.98,27.0,1095.0,340.0,1300.0,318.0,2.6548,11.721564,<1H OCEAN,False


In [114]:
train_set.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        169
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
total_bedrooms_na       0
dtype: int64

In [115]:
train_set['rooms'] = round(train_set['total_rooms'] / train_set['households'])
train_set['bedrooms'] = round(train_set['total_bedrooms'] / train_set['households'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [116]:
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,total_bedrooms_na,rooms,bedrooms
18490,-121.61,37.03,5.0,6529.0,1010.0,3071.0,977.0,5.6754,12.606525,<1H OCEAN,False,7.0,1.0
19577,-120.75,37.69,24.0,2282.0,423.0,1167.0,398.0,3.8214,11.662207,INLAND,False,6.0,1.0
12711,-121.40,38.59,25.0,2228.0,534.0,1130.0,481.0,2.5363,11.732864,INLAND,False,5.0,1.0
4576,-118.28,34.07,24.0,3247.0,1281.0,2642.0,1182.0,2.4632,12.286269,<1H OCEAN,False,3.0,1.0
19608,-121.12,37.48,5.0,4109.0,820.0,3062.0,713.0,3.2396,11.737668,INLAND,False,6.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7753,-118.11,33.91,19.0,3056.0,759.0,1561.0,740.0,3.1369,12.190451,<1H OCEAN,False,4.0,1.0
11414,-117.98,33.69,22.0,3957.0,520.0,1774.0,527.0,7.0907,12.766260,<1H OCEAN,False,8.0,1.0
17201,-119.78,34.45,23.0,2077.0,306.0,705.0,256.0,6.4744,13.122365,NEAR OCEAN,False,8.0,1.0
7268,-118.22,33.98,27.0,1095.0,340.0,1300.0,318.0,2.6548,11.721564,<1H OCEAN,False,3.0,1.0


In [117]:
X = train_set.drop('median_house_value', axis=1).copy()
y = train_set['median_house_value'].copy()

In [118]:
missing_values = ['total_bedrooms', 'bedrooms']
cat_col = ['ocean_proximity']

train_transform = ColumnTransformer(
    [('fill missing', SimpleImputer(strategy='median'), missing_values),
     ('one hot', OneHotEncoder(), cat_col)
    ], remainder='passthrough')
X_transformed = train_transform.fit_transform(X)

In [136]:
cats = X['ocean_proximity'].value_counts().index

In [129]:
train_transform.transformers_

[('fill missing',
  SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                missing_values=nan, strategy='median', verbose=0),
  ['total_bedrooms', 'bedrooms']),
 ('one hot',
  OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
                handle_unknown='error', sparse=True),
  ['ocean_proximity']),
 ('remainder', 'passthrough', [0, 1, 2, 3, 5, 6, 7, 9, 10])]

In [156]:
X_train, X_valid, y_train, y_valid = train_test_split(X_transformed, y, test_size=0.2)
X_train.shape, y_train.shape, X_valid.shape

((13209, 16), (13209,), (3303, 16))

### Creating a model

In [193]:
# function that calculates the Rmse of the predicted value and true value
def rmse(actuals, predictions):
    return np.sqrt(mean_squared_error(actuals, predictions))

def print_score(model):
    print(f'The RMSE of the train set {rmse(y_train, model.predict(X_train))}')
    print(f'The RMSE of the train set {rmse(y_valid, model.predict(X_valid))}')
    print(f'The R^2 of the train set {model.score(X_train, y_train)}')
    print(f'The R^2 of the train set {model.score(X_valid, y_valid)}')
    if hasattr(model, 'oob_score_'):
        print(f'The R^2 of the oob_score {model.oob_score_}')

In [194]:
#Building a single tree
rf = RandomForestRegressor(n_jobs=-1).fit(X_train, y_train)
print_score(rf)

The RMSE of the train set 0.08739676258206268
The RMSE of the train set 0.2355354790459752
The R^2 of the train set 0.9764732164785418
The R^2 of the train set 0.8223185931039505


In [195]:
rf = RandomForestRegressor(n_jobs=-1, oob_score=True).fit(X_train, y_train)
print_score(rf)

The RMSE of the train set 0.08867190888517407
The RMSE of the train set 0.23476829920767275
The R^2 of the train set 0.9757816817916953
The R^2 of the train set 0.8234741862661825
The R^2 of the oob_score 0.8254783573600946


In [196]:
gd = GradientBoostingRegressor().fit(X_train, y_train)
print_score(gd)

The RMSE of the train set 0.25376253983680297
The RMSE of the train set 0.2659566398989572
The R^2 of the train set 0.8016524272385346
The R^2 of the train set 0.7734568214732308


###  Performing a GridSearchCv on the models

In [189]:
grid_rf = {
    'n_estimators' : [10, 20, 50, 70, 100, 150],
    'max_features' : [0.5, 1, 0.6, 'log2', 'sqrt'],
    'min_samples_leaf' : [1, 3, 5, 15, 20],
    'oob_score' : [True],
}

grid = GridSearchCV(rf, param_grid=grid_rf, n_jobs=-1).fit(X_train, y_train)
print_score(grid)


The RMSE of the train set 0.08753981111656468
The RMSE of the train set 0.23498056675382
The R^2 of the train set 0.9763961375099595
The R^2 of the train set 0.823154827637701


In [197]:
# running with the full data set
rf = RandomForestRegressor(n_jobs=-1, oob_score=True).fit(X_transformed, y)
print_score(rf)

The RMSE of the train set 0.08569024134638459
The RMSE of the train set 0.08605101276441186
The R^2 of the train set 0.9773830208577868
The R^2 of the train set 0.9762840084537519
The R^2 of the oob_score 0.8319035054078627
