# Setup

In [267]:
# Common imports
import numpy as np
import pandas as pd


# Get the data

In [268]:
# Importing the data set and understanding the structure:
kchousing = pd.read_csv("kc_house_data.csv")
kchousing.head()
kchousing.shape
kchousing["zipcode"] = kchousing["zipcode"].astype("str")

In [269]:
# Converting the Zipcode to String
kchousing["zipcode"] = kchousing["zipcode"].astype("str")
kchousing.dtypes

# Split the data into train and test

In [253]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(kchousing, test_size=0.3)

## Check the missing values

In [254]:
train.isna().sum()
test.isna().sum()


price            0
bedrooms         1
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

# Data Prep

In [255]:
# Imports for Data Prep:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


## Separate the target variable (don't transform the target)

In [256]:
# Separating the target variable and input variables
train_targets = train[['price']]
test_targets = test[['price']]

train_inputs = train.drop(['price'], axis=1)
test_inputs = test.drop(['price'], axis=1)


##  Identify the numeric, binary, and categorical columns

In [257]:
# Selecting the numeric coloums
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()
# Selecting the catogerical coloums
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()
#Manually defining the binary coloums
binary_columns = ['waterfront']

In [259]:
# Excluding binary coloums from numeric coloums
for col in binary_columns:
    numeric_columns.remove(col)

# Pipeline (recommended)

If you don't want to use pipelines, feel free to use your own data prep steps.

In [260]:
#Transforming Numerical Coloums
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])


In [261]:
#Transforming Categorical Coloums
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [262]:
#Transforming Binary Coloums
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [263]:
#Combining all the Coloums
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='passthrough')

# Transform: fit_transform() for TRAIN

In [264]:
#Applying FitTransform to Train dataset
train_x = preprocessor.fit_transform(train_inputs)
train_x
train_x.toarray()

array([[ 0.66848823,  0.49638978,  1.02287204, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66848823,  2.43109507,  3.44874286, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66848823,  0.49638978,  1.01209039, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.39508927,  0.49638978, -0.47577705, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66848823,  0.1739389 ,  0.33284656, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39508927, -0.79341374, -0.37874222, ...,  0.        ,
         0.        ,  0.        ]])

# Tranform: transform() for TEST

In [265]:
#Applying Transform to Test dataset
test_x = preprocessor.transform(test_inputs)
test_x
test_x.toarray()

array([[-0.39508927, -0.47096286,  0.23581173, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39508927,  0.49638978, -0.4649954 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.73206572,  0.49638978,  0.06330536, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.39508927,  0.49638978,  0.58082447, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39508927,  0.49638978, -0.69141001, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39508927, -1.43831551, -0.63750177, ...,  0.        ,
         0.        ,  0.        ]])

In [266]:
test_x.shape

(6484, 88)