# W5_Feature Selection & Lasso (I)

In [1]:
import numpy as np
import pandas as pd
from math import log, sqrt
from sklearn.linear_model import Lasso

In [2]:
# Load kc_house_data.csv, parsing every column with an explicit dtype so that
# identifiers and zip codes stay strings and counts stay integers.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}
data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

# Engineered features: square roots of the two area columns and squares of the
# bedroom / floor counts.
# np.sqrt works on the whole column at once and produces the same values as
# calling math.sqrt row by row. For a negative input it yields NaN (with a
# warning) where math.sqrt would raise.
data['sqft_living_sqrt'] = np.sqrt(data['sqft_living'])
data['sqft_lot_sqrt'] = np.sqrt(data['sqft_lot'])
data['bedrooms_square'] = data['bedrooms'] * data['bedrooms']
data['floors_square'] = data['floors'] * data['floors']

In [3]:
# Fit a lasso on the full dataset with every candidate feature, then list the
# features whose learned weight is nonzero.
# NOTE(review): the `normalize=` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this call needs an older scikit-learn to run as-is.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
]
all_model = Lasso(alpha=5e2, normalize=True)
all_model.fit(data[all_features], data['price'])

print('# Selected features:')
# coef_ holds one weight per entry of all_features, in the same order.
for feature_name, weight in zip(all_features, all_model.coef_):
    if weight != 0:
        print(feature_name)

# Selected features:
sqft_living
view
grade


In [4]:
def get_RSS(X, model, y):
    """Return the residual sum of squares of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : feature matrix, passed unchanged to ``model.predict``.
    model : any object exposing a ``predict(X)`` method.
    y : observed targets, matched to the predictions by position.

    Returns
    -------
    The sum of the squared differences between the predictions and ``y``.
    """
    # Work on plain arrays: the subtraction is positional, and NaN residuals
    # propagate into the result instead of being skipped.
    residuals = np.asarray(model.predict(X)) - np.asarray(y)
    # np.sum reduces in compiled code; the built-in sum() would walk the
    # residuals one Python object at a time.
    return np.sum(residuals ** 2)

def _add_engineered_features(frame):
    """Add the sqrt / squared feature columns to ``frame`` in place."""
    frame['sqft_living_sqrt'] = np.sqrt(frame['sqft_living'])
    frame['sqft_lot_sqrt'] = np.sqrt(frame['sqft_lot'])
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']


# Load the pre-split test / train / validation files with the shared dtypes.
test = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
train = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

# Give every split the same engineered columns.
# The loop variable is deliberately NOT called `data`: that name holds the
# full dataset loaded earlier, and rebinding it here would make any re-run of
# the earlier model cell silently train on the validation split.
for split in (test, train, validation):
    _add_engineered_features(split)

best_model = None
best_penalty = None
min_RSS = float('inf')

# Sweep the L1 penalty over 13 log-spaced values from 1e1 to 1e7. Fit on the
# training split and keep the model with the lowest validation RSS.
# NOTE(review): the `normalize=` argument was removed in scikit-learn 1.2;
# this call needs an older scikit-learn to run as-is.
for penalty in np.logspace(1, 7, num=13):
    model = Lasso(alpha=penalty, normalize=True).fit(train[all_features], train['price'])
    RSS = get_RSS(validation[all_features], model, validation['price'])
    if RSS < min_RSS:
        best_penalty = penalty
        best_model = model
        min_RSS = RSS
print('best penalty:{}, min RSS:{}'.format(best_penalty, min_RSS))

best penalty:10.0, min RSS:398213327300134.06


In [5]:
# Score the model chosen on the validation split against the test split.
test_X = test[all_features]
test_y = test['price']
get_RSS(test_X, best_model, test_y)

98467402552698.875

In [6]:
# Number of nonzero parameters in the selected model: the feature weights
# plus the intercept term.
nonzero_weights = np.count_nonzero(best_model.coef_)
nonzero_intercept = np.count_nonzero(best_model.intercept_)
nonzero_weights + nonzero_intercept

15

In [7]:
# Target number of nonzero parameters (feature weights plus intercept).
max_nonzeros = 7

# Coarse sweep over 20 log-spaced penalties from 1e1 to 1e4: record how many
# nonzero parameters each penalty leaves after fitting on the training split.
nonzero_counts = []
for l1_penalty in np.logspace(1, 4, num=20):
    fitted = Lasso(alpha=l1_penalty, normalize=True)
    fitted.fit(train[all_features], train['price'])
    n_nonzero = np.count_nonzero(fitted.coef_) + np.count_nonzero(fitted.intercept_)
    nonzero_counts.append((l1_penalty, n_nonzero))

# Too many nonzero coefficients -> the penalty is too low.
penalty_too_low = [p for p, count in nonzero_counts if count > max_nonzeros]
# Too few nonzero coefficients -> the penalty is too high.
penalty_too_high = [p for p, count in nonzero_counts if count < max_nonzeros]

# The bracket to refine runs from the largest "too low" penalty to the
# smallest "too high" penalty.
l1_penalty_min = max(penalty_too_low)
l1_penalty_max = min(penalty_too_high)

print(l1_penalty_min, l1_penalty_max)

127.42749857 263.665089873


In [8]:
# Fine search inside the bracket [l1_penalty_min, l1_penalty_max]: among the
# 20 evenly spaced penalties, keep the model that has exactly `max_nonzeros`
# nonzero parameters and the lowest validation RSS.
best_penalty_7, best_model_7 = None, None
min_RSS_7 = float('inf')

for penalty in np.linspace(l1_penalty_min, l1_penalty_max, num=20):
    model = Lasso(alpha=penalty, normalize=True)
    model.fit(train[all_features], train['price'])
    nonzeros = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    RSS = get_RSS(validation[all_features], model, validation['price'])
    # Skip candidates with the wrong sparsity or no RSS improvement.
    if nonzeros != max_nonzeros or not RSS < min_RSS_7:
        continue
    best_penalty_7, best_model_7, min_RSS_7 = penalty, model, RSS

print('best penalty:{}, min RSS:{}'.format(best_penalty_7, min_RSS_7))

best penalty:156.10909673930755, min RSS:440037365263317.0


In [9]:
# List the features that the sparsity-constrained model kept (nonzero weight).
print('# Selected features:')
for feature_name, weight in zip(all_features, best_model_7.coef_):
    if weight != 0:
        print(feature_name)

# Selected features:
bathrooms
sqft_living
waterfront
view
grade
yr_built
