In [8]:
import pandas as pd
import numpy as np
# Explicit column -> dtype mapping handed to pandas.read_csv for every CSV
# loaded in this notebook.  Grouped by type for readability.
dtype_dict = {
    # kept as strings
    'id': str,
    'date': str,
    'zipcode': str,
    # floating-point columns
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

# Full data set, read with the dtypes declared above.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [49]:
from math import log, sqrt
# Derived columns on the full data set: square roots of the two area
# columns and squares of the bedroom / floor counts.
sales['sqft_living_sqrt'] = sales['sqft_living'].map(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].map(sqrt)
sales['bedrooms_square'] = sales['bedrooms'] ** 2
sales['floors_square'] = sales['floors'] ** 2

In [50]:
from sklearn import linear_model

# Every column offered to the Lasso models in this notebook: the raw
# predictors plus the engineered square / square-root columns.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Lasso with an L1 penalty of 500 on normalized features.
# NOTE(review): the `normalize` argument no longer exists in recent
# scikit-learn releases -- confirm the installed version before re-running.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)

# Fit on the complete `sales` frame; the fitted estimator is the cell output.
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [51]:
# Positions (into all_features) of the coefficients the Lasso kept non-zero.
model_all.coef_.nonzero()

(array([ 3, 10, 12], dtype=int64),)

In [52]:
# Choosing a good L1 penalty needs held-out data, so load the pre-split
# training, validation and test files (same dtypes as the full data set).
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [53]:
def add_engineered_features(frame):
    """Add the engineered feature columns to ``frame`` in place.

    Reads the 'sqft_living', 'sqft_lot', 'bedrooms' and 'floors' columns and
    writes four new columns:

    * 'sqft_living_sqrt' and 'sqft_lot_sqrt' -- element-wise square roots
    * 'bedrooms_square' and 'floors_square'  -- element-wise squares

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the four source columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, returned for convenience.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# Apply the identical transformation to each split so the three frames
# cannot drift apart through copy-paste edits.
for split in (testing, training, validation):
    add_engineered_features(split)

In [57]:
# Residual sum of squares of model_all on the data it was fitted to.
residuals_all = model_all.predict(sales[all_features]) - sales['price']
np.sum(residuals_all ** 2)

1439386143411741.2

In [65]:
# Sweep 13 log-spaced L1 penalties from 1e1 to 1e7.  Each model is fitted on
# the training split and scored by residual sum of squares (RSS) on the
# validation split; the model with the lowest validation RSS is kept.
min_rss = None     # lowest validation RSS seen so far
min_l1 = None      # penalty that produced min_rss
best_model = None  # fitted estimator that produced min_rss
for l1_penalty in np.logspace(1, 7, num=13):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    rss = np.sum(np.square(model.predict(validation[all_features]) - validation['price']))
    print('L1 penalty {}, RSS {}'.format(l1_penalty, rss))
    # Compare against None explicitly: a truthiness test would mistake a
    # legitimate RSS of 0.0 for "nothing recorded yet".
    if min_rss is None or rss < min_rss:
        min_rss = rss
        min_l1 = l1_penalty
        best_model = model
print('lowest RSS is {} when l1 penalty is {}'.format(min_rss, min_l1))

L1 penalty 10.0, RSS 398213327300134.2
L1 penalty 31.622776601683793, RSS 399041900253348.2
L1 penalty 100.0, RSS 429791604072558.1
L1 penalty 316.22776601683796, RSS 463739831045119.6
L1 penalty 1000.0, RSS 645898733633810.4
L1 penalty 3162.2776601683795, RSS 1222506859427156.8
L1 penalty 10000.0, RSS 1222506859427156.8
L1 penalty 31622.776601683792, RSS 1222506859427156.8
L1 penalty 100000.0, RSS 1222506859427156.8
L1 penalty 316227.7660168379, RSS 1222506859427156.8
L1 penalty 1000000.0, RSS 1222506859427156.8
L1 penalty 3162277.6601683795, RSS 1222506859427156.8
L1 penalty 10000000.0, RSS 1222506859427156.8
lowest RSS is 398213327300134.2 when l1 penalty is 10.0


In [67]:
# Residual sum of squares of the selected model on the test split.
residuals_test = best_model.predict(testing[all_features]) - testing['price']
np.sum(residuals_test ** 2)

98467402552698.81

In [73]:
# Number of non-zero weights in best_model, with the intercept counted too.
sum(np.count_nonzero(part) for part in (best_model.coef_, best_model.intercept_))

15

In [83]:
# Coarse sweep used to bracket the penalty that leaves 7 non-zero weights
# (max_nonzeros = 7).  For each of 20 log-spaced penalties between 1e1 and
# 1e4, record (non-zero weight count including intercept, penalty).
coefs = []
for l1_penalty in np.logspace(1, 4, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    nnz = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    coefs.append((nnz, l1_penalty))
        

In [86]:
# Show the (non-zero count, L1 penalty) pairs collected by the sweep.
coefs

[(15, 10.0),
 (15, 14.384498882876629),
 (15, 20.691380811147901),
 (15, 29.763514416313178),
 (13, 42.813323987193932),
 (12, 61.584821106602639),
 (11, 88.586679041008225),
 (10, 127.42749857031335),
 (7, 183.29807108324357),
 (6, 263.66508987303581),
 (6, 379.26901907322497),
 (6, 545.55947811685144),
 (5, 784.75997035146065),
 (3, 1128.8378916846884),
 (3, 1623.776739188721),
 (2, 2335.7214690901214),
 (1, 3359.8182862837812),
 (1, 4832.9302385717519),
 (1, 6951.9279617756056),
 (1, 10000.0)]

In [118]:
# Upper bracket: the first penalty in sweep order (ascending) whose model has
# fewer than 7 non-zero weights.
l1_penalty_max = next(penalty for nnz, penalty in coefs if nnz < 7)

In [119]:
# Lower bracket: scanning the sweep from its last entry backwards, the first
# penalty whose model has more than 7 non-zero weights.
l1_penalty_min = next(penalty for nnz, penalty in reversed(coefs) if nnz > 7)

In [120]:
# Show the upper end of the penalty bracket.
l1_penalty_max

263.66508987303581

In [121]:
# Show the lower end of the penalty bracket.
l1_penalty_min

127.42749857031335

In [122]:
# Fine search inside [l1_penalty_min, l1_penalty_max]: among the 20 evenly
# spaced penalties, consider only models with exactly `max_nonzeros` non-zero
# weights (intercept included) and keep the one with the lowest RSS on the
# validation split.
max_nonzeros = 7

best_model = None       # fitted estimator with the lowest qualifying RSS
lowest_rss = None       # its validation RSS
best_l1_penalty = None  # the penalty it was fitted with
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    non_zeros = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    if non_zeros != max_nonzeros:
        continue
    rss = np.sum(np.square(model.predict(validation[all_features]) - validation['price']))
    # Explicit None check: do not depend on the estimator object's truthiness
    # to detect "no candidate recorded yet".
    if lowest_rss is None or rss < lowest_rss:
        best_model = model
        lowest_rss = rss
        best_l1_penalty = l1_penalty

In [123]:
# Show the penalty selected by the fine search.
best_l1_penalty

156.10909673930755

In [111]:
# Positions (into all_features) of best_model's non-zero coefficients.
np.nonzero(best_model.coef_)

(array([ 2,  3,  9, 10, 12, 15], dtype=int64),)

In [112]:
# Show the feature list so the coefficient indices can be matched to names.
all_features

['bedrooms',
 'bedrooms_square',
 'bathrooms',
 'sqft_living',
 'sqft_living_sqrt',
 'sqft_lot',
 'sqft_lot_sqrt',
 'floors',
 'floors_square',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']