## Setup

If we are running in Colab, we clone the GitHub repo so that the helper functions under the `src` folder can be imported.

In [1]:
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    IS_COLAB = True
except Exception:
    IS_COLAB = False

if IS_COLAB:
    !git clone https://github.com/wpan03/quick_ds_python.git
    %cd quick_ds_python
    !pip install treeinterpreter==0.2.3
    !pip install scikit-learn==0.24.1
else:
    %load_ext autoreload
    %autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from sklearn import preprocessing, impute
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from src.eda import see_missing
from src.preprocess import get_x_y, get_preprocessor, do_transform

## Get Data

In [3]:
my_file = Path("data/housing.csv")
if my_file.is_file():
    print('The data already exists. Good to go')
else:
    # https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb
    DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
    HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
    ! curl {HOUSING_URL} | tar -xz -C data/

The data already exists. Good to go


In [4]:
# Prefer the local CSV (checked in with the repo or fetched by the previous
# step) so the notebook does not need the network on every run; fall back to
# the hosted copy only when no local file is present.
HOUSING_CSV_URL = 'https://raw.githubusercontent.com/wpan03/quick_ds_python/master/data/housing.csv'
local_csv = Path("data/housing.csv")
df_housing = pd.read_csv(local_csv if local_csv.is_file() else HOUSING_CSV_URL)

## EDA

In [5]:
# Preview the first rows to check the column names and the kind of values each holds.
df_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# (rows, columns) of the raw frame.
df_housing.shape

(20640, 10)

In [7]:
# Per-column missing-value counts and percentages, using the project helper
# from src.eda; this shows which columns will need imputation.
see_missing(df_housing)

Unnamed: 0,columns,total_missing,percent_missing
0,total_bedrooms,207,1.002907
1,longitude,0,0.0
2,latitude,0,0.0
3,housing_median_age,0,0.0
4,total_rooms,0,0.0
5,population,0,0.0
6,households,0,0.0
7,median_income,0,0.0
8,median_house_value,0,0.0
9,ocean_proximity,0,0.0


## Data Cleaning and Feature Engineering

### Train and Test Split

In [8]:
# Separate the predictors (`x`) from the `median_house_value` target (`y`)
# using the project helper from src.preprocess.
x, y = get_x_y(df_housing, label_col='median_house_value')

In [9]:
# Sanity check: features and target should have the same number of rows.
x.shape, y.shape

((20640, 9), (20640,))

In [10]:
# Fix the shuffle seed so the split, and therefore every downstream result,
# is the same on each Restart & Run All. 36 is the seed already used for the
# random forest later in this notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=36)

### Encode and Impute

In [11]:
# One-hot encoder; drop='if_binary' drops one dummy column only for features
# that have exactly two categories, and keeps all columns otherwise.
oh_enc = preprocessing.OneHotEncoder(drop='if_binary')
# Imputer that fills missing values with the column median.
med_imp = impute.SimpleImputer(strategy='median')

In [12]:
# Combine the encoder and the imputer into a single preprocessing object via
# the project helper from src.preprocess.
preprocessor = get_preprocessor(oh_enc, med_imp)

In [13]:
# Display the assembled transformer to check which step is paired with which
# column selector.
preprocessor

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(drop='if_binary'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fb6b9f96390>),
                                ('simpleimputer',
                                 SimpleImputer(strategy='median'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fb6b9f963d0>)])

In [14]:
# Fit on the training split only, so nothing learned by the encoder or imputer
# comes from the test rows. Assigning the result to `_` keeps the cell from
# echoing the returned object.
_ = preprocessor.fit(x_train)

In [15]:
# Output column names, in the order the fitted preprocessor emits them:
# one-hot columns from the first transformer, then the columns handled by the
# second transformer.
fitted_encoder = preprocessor.transformers_[0][1]
imputed_cols = preprocessor.transformers_[1][2]

# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`. Use the old method when it exists (keeps the names
# produced under the pinned version) and fall back to the new one otherwise,
# so the cell also runs in an unpinned local environment.
if hasattr(fitted_encoder, "get_feature_names"):
    encoded_cols = fitted_encoder.get_feature_names()
else:
    encoded_cols = fitted_encoder.get_feature_names_out()

col_names = list(encoded_cols) + list(imputed_cols)
col_names

['x0_<1H OCEAN',
 'x0_INLAND',
 'x0_ISLAND',
 'x0_NEAR BAY',
 'x0_NEAR OCEAN',
 'longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [16]:
# Apply the already-fitted preprocessor to both splits in one pass (train
# first, then test), labelling the resulting columns with `col_names`.
x_train_prepared, x_test_prepared = (
    do_transform(split, preprocessor, col_names)
    for split in (x_train, x_test)
)

## Fit Model

In [17]:
# Random forest with a fixed seed so that refitting on the same data gives the
# same model.
mod_rf = RandomForestRegressor(random_state=36)
# Train on the preprocessed training split; assigning the result to `_` keeps
# the cell from echoing the fitted estimator.
_ = mod_rf.fit(x_train_prepared, y_train)

## Explain Model