# Read and Prepare the Data

In [211]:
# Common imports
import numpy as np
import pandas as pd
# Seed NumPy's global random generator once, before any stochastic step runs.
np.random.seed(seed=1403700)

# Get the data

In [212]:
# Import the data set and take a first look at its structure.
kchousing = pd.read_csv("kc_house_data.csv")
# Only the last expression of a cell is rendered, so the shape is printed explicitly.
print(kchousing.shape)
kchousing.head()

In [213]:
# Store the zipcode as text so it is picked up as an object (categorical) column later on
zipcode_text = kchousing["zipcode"].astype("str")
kchousing["zipcode"] = zipcode_text
kchousing.dtypes

price            float64
bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot         float64
floors           float64
waterfront         int64
view               int64
condition          int64
grade            float64
sqft_above         int64
sqft_basement      int64
yr_built         float64
yr_renovated       int64
zipcode           object
lat              float64
long             float64
sqft_living15      int64
sqft_lot15       float64
dtype: object

# Split data (train/test)

In [214]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state pins the split so it no longer
# depends on how much of NumPy's global random stream earlier cell runs have consumed.
train, test = train_test_split(kchousing, test_size=0.3, random_state=1403700)

In [215]:
# Count of missing values per column in the training split
train.isnull().sum()

price            0
bedrooms         1
bathrooms        0
sqft_living      0
sqft_lot         1
floors           1
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         1
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [216]:
# Count of missing values per column in the test split
test.isnull().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      1
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            1
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       1
dtype: int64

# Data Prep

Perform your data prep here. You can use pipelines like we do in the tutorials. Otherwise, feel free to use your own data prep steps. Eventually, you should do the following at a minimum:<br>
- Separate inputs from target<br>
- Impute/remove missing values<br>
- Standardize the continuous variables<br>
- One-hot encode categorical variables<br>

In [217]:
# Imports for Data Prep:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [218]:
# Split each partition into model inputs (every column except price)
# and the target (price, kept as a one-column DataFrame).
train_inputs, train_targets = train.drop(columns=['price']), train[['price']]
test_inputs, test_targets = test.drop(columns=['price']), test[['price']]

In [219]:
# Numeric columns: every input column with a NumPy number dtype
numeric_columns = list(train_inputs.select_dtypes(include=[np.number]).columns)
# Categorical columns: every input column with object dtype
categorical_columns = list(train_inputs.select_dtypes(include='object').columns)
# Binary columns are listed by hand
binary_columns = ['waterfront']

In [220]:
# Exclude the binary columns from the numeric columns.
# Filtering instead of list.remove() keeps this cell safe to re-run: remove() raises
# ValueError on a second execution because the column is already gone.
numeric_columns = [col for col in numeric_columns if col not in binary_columns]

In [221]:
# Numeric columns: fill missing values with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [222]:
# Categorical columns: fill missing values with the label 'unknown', then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [223]:
# Binary columns: only fill missing values, using the most frequent value
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [224]:
# Route each column group through its own transformer; any column not listed
# in a group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
    ],
    remainder='passthrough',
)

In [225]:
# Fit the preprocessor on the training inputs and transform them in one step
train_x = preprocessor.fit_transform(train_inputs)
# Dense view for display only; train_x itself is left unchanged
train_x.toarray()

array([[ 0.66896858,  0.17605175,  0.27215629, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39503748, -0.4743724 , -0.599434  , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66896858,  0.50126382, -0.5013801 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.66896858,  0.50126382,  0.26126141, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39503748, -1.45000863, -1.15507282, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66896858,  0.17605175,  1.81922905, ...,  0.        ,
         0.        ,  0.        ]])

In [226]:
# Transform the test inputs with the preprocessor already fitted on the training data
test_x = preprocessor.transform(test_inputs)
# Dense view for display only; test_x itself is left unchanged
test_x.toarray()

array([[-0.39503748, -0.14916033, -0.40332619, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39503748,  0.50126382, -0.37064155, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39503748,  0.50126382, -0.30527228, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.39503748,  0.50126382,  0.07604847, ...,  0.        ,
         0.        ,  0.        ],
       [-1.45904353, -0.14916033, -0.45780058, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39503748, -0.79958448, -0.83912134, ...,  0.        ,
         0.        ,  0.        ]])

# Calculate the Baseline

In [227]:
# Baseline reference: the average price in the training targets
from sklearn.metrics import mean_squared_error
mean_value = train_targets['price'].mean()
mean_value

539605.7990614052

In [228]:
# Constant prediction: one copy of the training mean for every test row
baseline_pred = np.full(len(test_targets), mean_value)
baseline_pred

array([539605.79906141, 539605.79906141, 539605.79906141, ...,
       539605.79906141, 539605.79906141, 539605.79906141])

In [229]:
# Baseline RMSE: error of the constant mean prediction against the test targets
baseline_mse = mean_squared_error(y_true=test_targets, y_pred=baseline_pred)
baseline_rmse = np.sqrt(baseline_mse)
print('Baseline RMSE: {}'.format(baseline_rmse))

Baseline RMSE: 368823.32319202076


# Train a SGD model (with no regularization)

In [230]:
# Build an SGDRegressor with no regularization penalty
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(
    penalty=None,
    eta0=0.1,
    max_iter=100,
    tol=0.0001,
)

In [231]:
# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a warning about a column-vector y on every fit.
sgd_reg.fit(train_x, train_targets['price'])

  return f(**kwargs)


SGDRegressor(eta0=0.1, max_iter=100, penalty=None, tol=0.0001)

In [233]:
# Iterations actually run by the fit above. Per the scikit-learn docs this is the count reached
# before the stopping criterion, so a value below max_iter=100 indicates the tol rule stopped it early.
sgd_reg.n_iter_

22

### Generate the error metrics

In [234]:
# Train RMSE for the SGDRegressor model
reg_train_pred = sgd_reg.predict(train_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
train_mse = mean_squared_error(train_targets, reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 168876.14487764495


In [235]:
# Test RMSE for the SGDRegressor model
reg_test_pred = sgd_reg.predict(test_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
test_mse = mean_squared_error(test_targets, reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 170736.04000846736


# Try L1 Regularization in SGD

In [277]:
# SGDRegressor model with L1 regularization
sgd_reg_L1 = SGDRegressor(max_iter=100, penalty='l1', alpha=0.8, eta0=0.1, tol=0.0001)

# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a warning about a column-vector y on every fit.
sgd_reg_L1.fit(train_x, train_targets['price'])

  return f(**kwargs)


SGDRegressor(alpha=0.8, eta0=0.1, max_iter=100, penalty='l1', tol=0.0001)

### Generate the error metrics

In [278]:
# Train RMSE for the SGDRegressor model with L1 regularization
reg_train_pred = sgd_reg_L1.predict(train_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
train_mse = mean_squared_error(train_targets, reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 164727.63425959487


In [279]:
# Test RMSE for the SGDRegressor model with L1 regularization
reg_test_pred = sgd_reg_L1.predict(test_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
test_mse = mean_squared_error(test_targets, reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 165938.86837833907


# Try L2 Regularization in SGD

In [280]:
# SGDRegressor model with L2 regularization
sgd_reg_L2 = SGDRegressor(max_iter=100, penalty='l2', alpha=0.8, eta0=0.1, tol=0.0001)

# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a warning about a column-vector y on every fit.
sgd_reg_L2.fit(train_x, train_targets['price'])


  return f(**kwargs)


SGDRegressor(alpha=0.8, eta0=0.1, max_iter=100, tol=0.0001)

### Generate the error metrics

In [281]:
# Train RMSE for the SGDRegressor model with L2 regularization
reg_train_pred = sgd_reg_L2.predict(train_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
train_mse = mean_squared_error(train_targets, reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 233577.80090383807


In [282]:
# Test RMSE for the SGDRegressor model with L2 regularization
reg_test_pred = sgd_reg_L2.predict(test_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
test_mse = mean_squared_error(test_targets, reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 239448.38804941875


# Try ElasticNet in SGD

In [283]:
# SGDRegressor model with ElasticNet regularization (equal L1/L2 mix via l1_ratio=0.5)
sgd_reg_elastic = SGDRegressor(max_iter=100, penalty='elasticnet', l1_ratio=0.5, alpha=0.8,
                               eta0=0.1, tol=0.0001)

# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a warning about a column-vector y on every fit.
sgd_reg_elastic.fit(train_x, train_targets['price'])


  return f(**kwargs)


SGDRegressor(alpha=0.8, eta0=0.1, l1_ratio=0.5, max_iter=100,
             penalty='elasticnet', tol=0.0001)

### Generate the error metrics

In [284]:
# Train RMSE for the SGDRegressor model with ElasticNet
reg_train_pred = sgd_reg_elastic.predict(train_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
train_mse = mean_squared_error(train_targets, reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 243006.145060297


In [285]:
# Test RMSE for the SGDRegressor model with ElasticNet
reg_test_pred = sgd_reg_elastic.predict(test_x)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
test_mse = mean_squared_error(test_targets, reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 245877.5276298229


# Create Polynomial Features

Create polynomial features with degree = 2. 

In [286]:
# Introduce polynomial features of degree 2
from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-2 expansion on the training matrix only
poly_features = PolynomialFeatures(degree=2)
poly_features.fit(train_x)

# Apply the same fitted expansion to both splits
train_x_poly, test_x_poly = (poly_features.transform(split) for split in (train_x, test_x))

# Try L2 Regularization in SGD (with polynomial features)

In [287]:
# SGDRegressor model with L2 regularization on the polynomial features
# NOTE(review): the RMSE values recorded for this model are around 1e12, far above the
# baseline, which suggests the optimizer diverged on the expanded features. A smaller
# eta0 or rescaling the polynomial features may be needed - TODO confirm.
sgd_reg_L2_poly = SGDRegressor(max_iter=1000, penalty='l2', alpha=0.8, eta0=0.1, tol=0.0001)

# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a warning about a column-vector y on every fit.
sgd_reg_L2_poly.fit(train_x_poly, train_targets['price'])

  return f(**kwargs)


SGDRegressor(alpha=0.8, eta0=0.1, tol=0.0001)

### Generate the error metrics

In [288]:
# Train RMSE for the SGDRegressor model with L2 regularization and polynomial features
reg_train_pred = sgd_reg_L2_poly.predict(train_x_poly)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
train_mse = mean_squared_error(train_targets, reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 2880711530843.6885


In [289]:
# Test RMSE for the SGDRegressor model with L2 regularization and polynomial features
reg_test_pred = sgd_reg_L2_poly.predict(test_x_poly)

# Score once and derive the RMSE from the stored MSE instead of re-scoring the predictions
test_mse = mean_squared_error(test_targets, reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 3137064546180.9746
