In [69]:
import pandas as pd

In [70]:
# Explicit per-column dtypes for read_csv: 'id', 'zipcode' and 'date' are kept
# as strings, every other column is parsed as int or float.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Load the sales table with the dtypes declared above.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [71]:
from math import log, sqrt

In [72]:
# Derived columns: element-wise square roots of the two area columns ...
sales['sqft_living_sqrt'] = sales['sqft_living'].map(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].map(sqrt)
# ... and element-wise squares of the bedroom and floor counts.
sales['bedrooms_square'] = sales['bedrooms'].mul(sales['bedrooms'])
sales['floors_square'] = sales['floors'].mul(sales['floors'])

In [73]:
sales.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sqft_living_sqrt',
       'sqft_lot_sqrt', 'bedrooms_square', 'floors_square'],
      dtype='object')

In [74]:
from sklearn import linear_model

In [75]:
import numpy as np

In [76]:
# Columns passed to every LASSO fit in this notebook: raw columns plus the
# derived '*_square' / '*_sqrt' columns. Order matters because fitted
# coefficients are matched back to names by position.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]


In [77]:
# LASSO model with L1 penalty alpha=500 and normalize=True.
# NOTE(review): `normalize` appears to have been deprecated and then removed in
# newer scikit-learn releases — confirm the installed version still accepts it.
model_all = linear_model.Lasso(alpha=5e2, normalize=True)

In [78]:
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [79]:
predicted = model_all.predict(sales[all_features])

In [80]:
errors = predicted - sales['price']

In [81]:
# Residual sum of squares (RSS): the sum of the squared prediction errors.
rss_train = np.sum(np.square(errors))
rss_train

1439386143411746.0

### Quiz Question: Which features have been chosen by LASSO, i.e. which features were assigned nonzero weights?

In [82]:
model_all.coef_

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

In [83]:
# Pair every feature name with its fitted weight (coefficients are positional)
# and keep only the features whose weight is nonzero.
non_zeros_features = [
    (feature, weight)
    for feature, weight in zip(all_features, model_all.coef_)
    if weight != 0.0
]

In [84]:
non_zeros_features

[('sqft_living', 134.43931395541435),
 ('view', 24750.004585609502),
 ('grade', 61749.10309070815)]

In [85]:
def add_more_features(dataset):
    """Add the derived square-root and squared columns to ``dataset`` in place.

    Creates ``sqft_living_sqrt`` and ``sqft_lot_sqrt`` (element-wise square
    roots of ``sqft_living`` / ``sqft_lot``) followed by ``bedrooms_square``
    and ``floors_square`` (element-wise squares of ``bedrooms`` / ``floors``).

    The frame passed in is modified directly; nothing is returned.
    """
    sqrt_columns = (
        ('sqft_living_sqrt', 'sqft_living'),
        ('sqft_lot_sqrt', 'sqft_lot'),
    )
    square_columns = (
        ('bedrooms_square', 'bedrooms'),
        ('floors_square', 'floors'),
    )

    for target, source in sqrt_columns:
        dataset[target] = dataset[source].apply(sqrt)
    for target, source in square_columns:
        dataset[target] = dataset[source] * dataset[source]

In [86]:
# Load the three splits, each parsed with the same dtype mapping as `sales`.
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [87]:
# add_more_features assigns the derived columns directly on the frame it is
# given, so each split is updated in place and no return value is captured.
add_more_features(testing)
add_more_features(training)
add_more_features(validation)

In [88]:
validation.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sqft_living_sqrt',
       'sqft_lot_sqrt', 'bedrooms_square', 'floors_square'],
      dtype='object')

### Quiz Question: Which was the best value for the l1_penalty, i.e. which value of l1_penalty produced the lowest RSS on VALIDATION data?

In [89]:
# Coarse search over the L1 penalty: for each candidate, fit on TRAINING and
# score by residual sum of squares (RSS) on VALIDATION. The penalty with the
# lowest validation RSS is kept; ties keep the earliest (smallest) penalty.
best_l1_penalty = None
min_rss_errors = None

for l1_penalty in np.logspace(1, 7, num=13):
    cur_model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    cur_model.fit(training[all_features], training['price'])

    validation_prediction = cur_model.predict(validation[all_features])
    validation_errors = validation_prediction - validation['price']
    rss_errors = np.sum(validation_errors * validation_errors)

    # Identity test against None: `== None` would dispatch to the operand's
    # __eq__ (numpy scalars here), whereas `is None` is an exact sentinel check.
    if min_rss_errors is None or rss_errors < min_rss_errors:
        min_rss_errors = rss_errors
        best_l1_penalty = l1_penalty
    print ("l1_penalty = %12.2f, rss_errors = %015.0f" % (l1_penalty, rss_errors))

print ("Best l1_penalty = %12.2f, min rss_errors = %15.0f" % (best_l1_penalty, min_rss_errors))

l1_penalty =        10.00, rss_errors = 398213327300135
l1_penalty =        31.62, rss_errors = 399041900253347
l1_penalty =       100.00, rss_errors = 429791604072560
l1_penalty =       316.23, rss_errors = 463739831045121
l1_penalty =      1000.00, rss_errors = 645898733633801
l1_penalty =      3162.28, rss_errors = 1222506859427163
l1_penalty =     10000.00, rss_errors = 1222506859427163
l1_penalty =     31622.78, rss_errors = 1222506859427163
l1_penalty =    100000.00, rss_errors = 1222506859427163
l1_penalty =    316227.77, rss_errors = 1222506859427163
l1_penalty =   1000000.00, rss_errors = 1222506859427163
l1_penalty =   3162277.66, rss_errors = 1222506859427163
l1_penalty =  10000000.00, rss_errors = 1222506859427163
Best l1_penalty =        10.00, min rss_errors = 398213327300135


### RSS with the best L1 penalty on the test set

In [90]:
model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)

In [91]:
model.fit(training[all_features], training['price'])

Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [92]:
# Score the selected model on the TEST split: residual sum of squares (RSS)
# of predicted price against actual price.
test_prediction = model.predict(testing[all_features])
test_errors = test_prediction - testing['price']
rss_test = np.sum(np.square(test_errors))
rss_test

98467402552698.78

### 8. Quiz Question: Using the best L1 penalty, how many nonzero weights do you have? Count the number of nonzero coefficients first, and add 1 if the intercept is also nonzero. A succinct way to do this is:



In [93]:
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

15

In [94]:
len(all_features)

17

### Limit to 7 nonzero weights (intercept included)

In [95]:
max_nonzeros = 7

In [96]:
# For each candidate penalty, fit on TRAINING and record how many weights are
# nonzero (coefficients plus the intercept) as (l1_penalty, nonzeros) pairs.
penalty_and_nrs = []

for l1_penalty in np.logspace(1, 4, num=20):
    cur_model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    cur_model.fit(training[all_features], training['price'])
    nonzeros = np.count_nonzero(cur_model.coef_) + np.count_nonzero(cur_model.intercept_)
    penalty_and_nrs.append((l1_penalty, nonzeros))
    

In [97]:
penalty_and_nrs

[(10.0, 15),
 (14.38449888287663, 15),
 (20.6913808111479, 15),
 (29.76351441631318, 15),
 (42.81332398719393, 13),
 (61.58482110660264, 12),
 (88.58667904100822, 11),
 (127.42749857031335, 10),
 (183.29807108324357, 7),
 (263.6650898730358, 6),
 (379.26901907322497, 6),
 (545.5594781168514, 6),
 (784.7599703514607, 5),
 (1128.8378916846884, 3),
 (1623.776739188721, 3),
 (2335.7214690901214, 2),
 (3359.818286283781, 1),
 (4832.930238571752, 1),
 (6951.927961775606, 1),
 (10000.0, 1)]

### Quiz Question: What values did you find for l1_penalty_min and l1_penalty_max?


In [98]:
# Largest penalty that still yields MORE nonzero weights than the target.
l1_penalty_min = max(
    penalty
    for penalty, nonzero_count in penalty_and_nrs
    if nonzero_count > max_nonzeros
)
l1_penalty_min

127.42749857031335

In [99]:
# Smallest penalty that already yields FEWER nonzero weights than the target.
l1_penalty_max = min(
    penalty
    for penalty, nonzero_count in penalty_and_nrs
    if nonzero_count < max_nonzeros
)
l1_penalty_max

263.6650898730358

### Find the model that has the lowest RSS on the VALIDATION set and has sparsity equal to ‘max_nonzeros’. (Again, take account of the intercept when counting the number of nonzeros.)

 Quiz Question: What value of l1_penalty in our narrow range has the lowest RSS on the VALIDATION set and has sparsity equal to ‘max_nonzeros’?

In [100]:
# Fine search between l1_penalty_min and l1_penalty_max: among the penalties
# whose fitted model has exactly `max_nonzeros` nonzero weights (coefficients
# plus intercept), keep the one with the lowest RSS on VALIDATION. Ties keep
# the earliest (smallest) penalty.
cur_best_l1_penalty = None
cur_best_rss_errors = None

for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, num=20):
    cur_model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    cur_model.fit(training[all_features], training['price'])
    total_nonzeros = np.count_nonzero(cur_model.coef_) + np.count_nonzero(cur_model.intercept_)
    print ("L1 penalty = %.2f, total non zeros = %d" % (l1_penalty, total_nonzeros))

    # Only models with exactly the requested sparsity are candidates.
    if total_nonzeros == max_nonzeros:
        valid_prediction = cur_model.predict(validation[all_features])
        valid_errors = valid_prediction - validation['price']
        rss_errors = np.sum(valid_errors * valid_errors)
        print ("Rss errors: %15.0f" % rss_errors)
        # Identity test against None rather than `== None`.
        if cur_best_rss_errors is None or rss_errors < cur_best_rss_errors:
            cur_best_rss_errors = rss_errors
            cur_best_l1_penalty = l1_penalty

# Without this guard, an empty candidate set would surface as an opaque
# TypeError from formatting None with %f in the summary print below.
if cur_best_l1_penalty is None:
    raise ValueError(
        "no l1_penalty in [%f, %f] produced exactly %d nonzero weights"
        % (l1_penalty_min, l1_penalty_max, max_nonzeros)
    )

print ("best l1_penalty: %f, best_rss errors %f" % (cur_best_l1_penalty, cur_best_rss_errors))

L1 penalty = 127.43, total non zeros = 10
L1 penalty = 134.60, total non zeros = 10
L1 penalty = 141.77, total non zeros = 8
L1 penalty = 148.94, total non zeros = 8
L1 penalty = 156.11, total non zeros = 7
Rss errors: 440037365263317
L1 penalty = 163.28, total non zeros = 7
Rss errors: 440777489641605
L1 penalty = 170.45, total non zeros = 7
Rss errors: 441566698090140
L1 penalty = 177.62, total non zeros = 7
Rss errors: 442406413188666
L1 penalty = 184.79, total non zeros = 7
Rss errors: 443296716874315
L1 penalty = 191.96, total non zeros = 7
Rss errors: 444239780526142
L1 penalty = 199.13, total non zeros = 7
Rss errors: 445230739842614
L1 penalty = 206.30, total non zeros = 6
L1 penalty = 213.47, total non zeros = 6
L1 penalty = 220.64, total non zeros = 6
L1 penalty = 227.81, total non zeros = 6
L1 penalty = 234.98, total non zeros = 6
L1 penalty = 242.15, total non zeros = 6
L1 penalty = 249.32, total non zeros = 6
L1 penalty = 256.49, total non zeros = 6
L1 penalty = 263.67, total non zeros = 6

In [101]:
# Refit on TRAINING with the penalty selected by the narrow search. `alpha` is
# passed by keyword, matching every other Lasso construction in this notebook.
best_fit_model = linear_model.Lasso(alpha=cur_best_l1_penalty, normalize=True)
best_fit_model.fit(training[all_features], training['price'])

Lasso(alpha=156.10909673930755, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [104]:
# Features kept by the best sparse model: (name, weight) pairs for every
# coefficient that is nonzero (the intercept is not part of coef_).
non_zeros_features = [
    (feature, weight)
    for feature, weight in zip(all_features, best_fit_model.coef_)
    if weight != 0.0
]
non_zeros_features

[('bathrooms', 10610.890284398287),
 ('sqft_living', 163.3802516476289),
 ('waterfront', 506451.68711484916),
 ('view', 41960.04355485288),
 ('grade', 116253.55369970747),
 ('yr_built', -2612.234880357487)]