In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn import metrics, model_selection
import statsmodels.formula.api as smf
from sklearn.pipeline import make_pipeline

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#### HDB Resale Price Predictions 

For today's lab, we will continue working with the HDB resale dataset and start to explore metrics for the accuracy of the linear models we generate. Data source: https://data.gov.sg/dataset/resale-flat-prices

In [7]:
# Load the raw HDB resale transactions from a path relative to the working directory.
hdb = pd.read_csv(
    filepath_or_buffer='data/resale-flat-prices-based-on-registration-date-from-march-2012-onwards.csv',
)

In [8]:
# Feature engineering: derive numeric predictors from the raw transaction columns.

# The raw `month` column holds a "-"-separated year/month string. Move it to
# `year-month` so that `month` can hold the integer month instead. The guard
# keeps this cell idempotent: without it, re-running the cell would also rename
# the derived integer `month` column and leave two `year-month` columns behind.
if 'year-month' not in hdb.columns:
    hdb = hdb.rename(columns={'month': 'year-month'})

# Vectorised string split instead of a Python-level lambda per row.
hdb['year'] = hdb['year-month'].str.split("-").str[0].astype(int)
hdb['month'] = hdb['year-month'].str.split("-").str[1].astype(int)

# `storey_range` is split on whitespace; tokens 0 and 2 are taken as the lower
# and upper storey (presumably the format is "<low> TO <high>" -- confirm
# against the data).
hdb['lower_storey_bound'] = hdb['storey_range'].str.split().str[0].astype(int)
hdb['upper_storey_bound'] = hdb['storey_range'].str.split().str[2].astype(int)

# Difference between the transaction year and the lease commencement date.
hdb['flat_age'] = hdb['year'] - hdb['lease_commence_date']

# One-hot encode the town. drop_first=True omits the first category so the
# remaining TOWN_* indicators are measured against it.
town_dummies = pd.get_dummies(hdb.town, drop_first=True, prefix='TOWN')
hdb_town = pd.concat([hdb, town_dummies], axis=1)

# Predictor columns used by the models below: every town indicator plus three
# numeric features.
factors = np.concatenate((town_dummies.columns, ["upper_storey_bound", "floor_area_sqm", "flat_age"]), axis=0)

#### 1. Using model_selection.train_test_split, divide the hdb_town data into a training and testing dataset with random_state = 1

In [9]:
# Random hold-out split with a fixed seed so the partition is reproducible.
# No test_size is given, so train_test_split uses its default proportions.
hdb_train, hdb_test = model_selection.train_test_split(hdb_town, random_state=1)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("{} {}".format(hdb_test.shape, hdb_train.shape))

(24158, 40) (72473, 40)


#### 2. Create and fit a Linear Regression Model using Sklearn on the training dataset using the factors provided

In [10]:
# Ordinary least squares on the training rows: predictors are the `factors`
# columns and the target is the resale price.
lm = LinearRegression()
lm.fit(X=hdb_train[factors], y=hdb_train["resale_price"])

#### 3. Calculate the MAE, MSE, and RMSE for predictions on the training data 

In [11]:
# In-sample predictions: score the same rows the model was fitted on.
train_preds = lm.predict(X=hdb_train[factors])

In [12]:
# Mean absolute error on the training data.
# scikit-learn metrics take (y_true, y_pred) in that order; MAE is symmetric,
# so the value is unchanged, but the call now matches the documented signature.
train_mae = metrics.mean_absolute_error(hdb_train["resale_price"], train_preds)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(train_mae)

41750.5048962


In [13]:
# Mean squared error on the training data.
# Arguments are passed as (y_true, y_pred) to match the scikit-learn signature;
# MSE is symmetric, so the value is unchanged.
train_mse = metrics.mean_squared_error(hdb_train["resale_price"], train_preds)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(train_mse)

2920561379.39


In [14]:
# Root mean squared error on the training data, in the same units as the target.
train_rmse = np.sqrt(train_mse)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(train_rmse)

54042.2184906


#### 4. Calculate the MAE, MSE, and RMSE for predictions on the testing data 

In [15]:
# Out-of-sample predictions on the held-out rows.
test_preds = lm.predict(X=hdb_test[factors])

In [16]:
# Mean absolute error on the held-out data.
# Arguments are passed as (y_true, y_pred) to match the scikit-learn signature;
# MAE is symmetric, so the value is unchanged.
test_mae = metrics.mean_absolute_error(hdb_test["resale_price"], test_preds)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(test_mae)

41866.6305437


In [17]:
# Mean squared error on the held-out data.
# Arguments are passed as (y_true, y_pred) to match the scikit-learn signature;
# MSE is symmetric, so the value is unchanged.
test_mse = metrics.mean_squared_error(hdb_test["resale_price"], test_preds)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(test_mse)

2921704935.12


In [18]:
# Root mean squared error on the held-out data, in the same units as the target.
test_rmse = np.sqrt(test_mse)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(test_rmse)

54052.7976623


#### 5. Using the entire dataset, i.e. hdb_town, perform 10-fold cross-validation. Compute the RMSE for each fold and the overall mean RMSE

In [19]:
# 10-fold splitter with shuffling. random_state pins the shuffle so the folds,
# and therefore every RMSE reported below, are the same on each run. The seed
# value matches the one used for train_test_split earlier in the notebook.
kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)

In [20]:
# Cross-validate the baseline linear model over the full dataset, using the
# folds produced by `kf`, and report the per-fold and mean RMSE.
rmse_list = []
for train_index, test_index in kf.split(hdb_town):
    # kf.split yields positional row indices, hence .iloc.
    hdb_kf_train = hdb_town.iloc[train_index]
    hdb_kf_test = hdb_town.iloc[test_index]
    # A fresh model per fold so no fitted state carries over between folds.
    lm_kf = LinearRegression().fit(hdb_kf_train[factors], hdb_kf_train["resale_price"])
    hdb_kf_preds = lm_kf.predict(hdb_kf_test[factors])
    # scikit-learn metrics take (y_true, y_pred) in that order.
    rmse = np.sqrt(metrics.mean_squared_error(hdb_kf_test["resale_price"], hdb_kf_preds))
    rmse_list.append(rmse)
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print("RMSE: {}".format(rmse))
print("Mean RMSE: {}".format(np.mean(rmse_list)))

RMSE: 53183.9829707
RMSE: 53940.8848178
RMSE: 53636.3255946
RMSE: 54535.2370943
RMSE: 54191.9387822
RMSE: 53958.7080015
RMSE: 54195.4464569
RMSE: 53788.7308882
RMSE: 54284.0312026
RMSE: 54855.30793
Mean RMSE: 54057.0593739


#### 6. Now split the dataset using year >= 2016 for testing and year < 2016 for training

In [21]:
# Time-based split: train on transactions before 2016, test on 2016 onwards.
# NOTE: this rebinds hdb_train / hdb_test, replacing the random split made
# earlier in the notebook.
hdb_test = hdb_town[hdb_town["year"] >= 2016]
hdb_train = hdb_town[hdb_town["year"] < 2016]
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("{} {}".format(hdb_test.shape, hdb_train.shape))

(25395, 40) (71236, 40)


#### 7. Train a linear model on this new training dataset and compute the RMSE of this model against both the training and test data

In [22]:
# Refit the linear model on the current training frame, then score both the
# training and the test rows with it.
lm = LinearRegression()
lm.fit(X=hdb_train[factors], y=hdb_train["resale_price"])
train_preds = lm.predict(X=hdb_train[factors])
test_preds = lm.predict(X=hdb_test[factors])

In [23]:
# RMSE of the refitted model on its training rows and on the held-out rows.
# Arguments are passed as (y_true, y_pred) to match the scikit-learn signature;
# MSE is symmetric, so the values are unchanged.
train_rmse = np.sqrt(metrics.mean_squared_error(hdb_train["resale_price"], train_preds))
test_rmse = np.sqrt(metrics.mean_squared_error(hdb_test["resale_price"], test_preds))

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(train_rmse)
print(test_rmse)

51990.462625
60871.419329


#### Bonus Question 1

Housing prices are often estimated per square foot (psf), with the rate depending on the location. This might be a better way to predict the price than having one coefficient for area and a separate one for location.

Create an additional set of columns with the prefix AREA\_TOWN\_, each containing the floor area if the flat is in that town and 0 otherwise. Include these columns in the model and run a 10-fold cross-validation to see if this improves the model.

Note that you should keep the main terms in a model when interaction terms are included.

Hint: Check the multiply function of dataframes

In [24]:
# Area-by-town interaction terms: scale each TOWN_* indicator row-wise by the
# flat's floor area, so a column holds the area for flats in that town and 0
# elsewhere. The added prefix yields AREA_TOWN_* column names.
area_dummies = (
    town_dummies
    .mul(hdb["floor_area_sqm"], axis=0)
    .add_prefix('AREA_')
)

In [25]:
# Append the interaction columns to the modelling frame, side by side.
hdb_area = pd.concat([hdb_town, area_dummies], axis='columns')

# Predictors for the interaction model: the main-effect town indicators, the
# area-by-town interactions, and the three numeric features.
new_factors = np.concatenate(
    (
        town_dummies.columns,
        area_dummies.columns,
        ["upper_storey_bound", "floor_area_sqm", "flat_age"],
    ),
    axis=0,
)

In [26]:
# Cross-validate the linear model that includes the area-by-town interaction
# terms, using the folds produced by `kf`, and report the per-fold and mean RMSE.
rmse_list = []
for train_index, test_index in kf.split(hdb_area):
    # kf.split yields positional row indices, hence .iloc.
    hdb_kf_train = hdb_area.iloc[train_index]
    hdb_kf_test = hdb_area.iloc[test_index]
    # A fresh model per fold so no fitted state carries over between folds.
    lm_kf = LinearRegression().fit(hdb_kf_train[new_factors], hdb_kf_train["resale_price"])
    hdb_kf_preds = lm_kf.predict(hdb_kf_test[new_factors])
    # scikit-learn metrics take (y_true, y_pred) in that order.
    rmse = np.sqrt(metrics.mean_squared_error(hdb_kf_test["resale_price"], hdb_kf_preds))
    rmse_list.append(rmse)
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print("RMSE: {}".format(rmse))
print("Mean RMSE: {}".format(np.mean(rmse_list)))

RMSE: 51238.6834765
RMSE: 49018.1418048
RMSE: 50018.2930083
RMSE: 49802.1342365
RMSE: 50280.2480557
RMSE: 50359.4525518
RMSE: 50399.2214184
RMSE: 50564.9382937
RMSE: 49596.1000333
RMSE: 49343.6538345
Mean RMSE: 50062.0866714
