# W5_Feature Selection & Lasso (II)

In [1]:
import numpy as np
import pandas as pd

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [3]:
# get numpy array we need from dataframe
def get_numpy_data(dataframe, features, output):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_matrix = dataframe[features].values
    output_array = dataframe[output].values 
    return (features_matrix, output_array)

In [4]:
# compute for estimated output
def get_predictions(features_matrix, weights):
    predictions = np.dot(features_matrix, weights)
    return predictions

In [5]:
# normarlize the features
def feature_normalization(X):
    norms = np.linalg.norm(X, axis=0)
    X_normalized = X / norms
    return (X_normalized, norms)

In [6]:
# compute ro 
# ro[i] = SUM[ [feature_i]*(output - prediction + w[i]*[feature_i]) ]
def get_ro(features_matrix, weights, output_array, i):
    predicions = get_predictions(features_matrix, weights)
    term1 = features_matrix[:,i]
    term2 = output_array - predicions + weights[i] * features_matrix[:,i]
    ro = np.dot(term1, term2)
    return ro

In [7]:
# test get_ro function
features = ['sqft_living', 'bedrooms']
output = 'price'
features_matrix, output_array = get_numpy_data(data, features, output)
norm_features, norm = feature_normalization(features_matrix)

ro1 = get_ro(norm_features, np.array([1,4,1]), output_array, 1)
ro2 = get_ro(norm_features, np.array([1,4,1]), output_array, 2)

print(ro1, ro1*2)
print(ro2, ro2*2)
print(norm)

87939470.8233 175878941.647
80966698.6662 161933397.332
[  1.47013605e+02   3.34257264e+05   5.14075870e+02]


In [8]:
# implement coordinate descent 
def lasso_coordinate_descent_step(i, features_matrix, output, weights, l1_penalty):
    ro = get_ro(features_matrix, weights, output, i)
    
    if i == 0: # we don't regularize intercept
        new_weight = ro
    elif ro < -l1_penalty/2.:
        new_weight = ro + l1_penalty/2
    elif ro > l1_penalty/2.:
        new_weight = ro - l1_penalty/2
    else:
        new_weight = 0.    
    return new_weight

In [9]:
# implement cyclical coordinate descent
def cyclical_lasso_coordinate_descent(features_matrix, output, initial_weights, l1_penalty, tolerance):
    weights = initial_weights
    max_gradient_change = tolerance + 1
    while max_gradient_change > tolerance:
        gradient_change = []
        for i in range(len(weights)):
            new_w = lasso_coordinate_descent_step(i, features_matrix, output, weights, l1_penalty)
            gradient_change.append(abs(weights[i] - new_w)) 
            weights[i] = new_w
        max_gradient_change = max(gradient_change)
    
    return weights

In [10]:
# compute RSS
def get_RSS(X, weights, y):
    predictions = get_predictions(X, weights)
    return sum((predictions-y) ** 2)

In [11]:
# example 1
learned_w = cyclical_lasso_coordinate_descent(norm_features, output_array, np.array([0,0,0]), 1e7, 1.0)
get_RSS(norm_features, learned_w, output_array )

1630492458794279.8

In [12]:
learned_w

array([21624995, 63157249,        0])

In [13]:
def show_selected_feature(feature_list, coef):
    print('# Selected features:')
    for i in range(len(coef)):
        if coef[i] != 0:
            print(feature_list[i])

In [14]:
train = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']
output = 'price'
features_matrix, output_array = get_numpy_data(train, features, output)
norm_features, norms = feature_normalization(features_matrix)

# example 2
weights_1e7 = cyclical_lasso_coordinate_descent(norm_features, output_array, np.zeros(14), 1e7, 1.) 

constant_features = ['constant'] + features
show_selected_feature(constant_features, weights_1e7)

# Selected features:
constant
sqft_living
waterfront
view


In [15]:
# example 3
weights_1e8 = cyclical_lasso_coordinate_descent(norm_features, output_array, np.zeros(14), 1e8, 1.)
show_selected_feature(constant_features, weights_1e8)       

# Selected features:
constant


In [16]:
# example 4
weights_1e4 = cyclical_lasso_coordinate_descent(norm_features, output_array, np.zeros(14), 1e4, 5e5)
show_selected_feature(constant_features, weights_1e4)

# Selected features:
constant
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated


In [17]:
# get normalized weights thus we can make new predictions without normalizing new data
def get_normalized_weights(weights, norms):
    normalized_weights = weights / norms
    return normalized_weights

norn_weights_1e4 = get_normalized_weights(weights_1e4, norms)
norn_weights_1e7 = get_normalized_weights(weights_1e7, norms)
norn_weights_1e8 = get_normalized_weights(weights_1e8, norms)

# compare 3 model on test data
X_test, y_test = get_numpy_data(test, features, output)
RSS1 = get_RSS(X_test, norn_weights_1e4, y_test)
RSS2 = get_RSS(X_test, norn_weights_1e7, y_test)
RSS3 = get_RSS(X_test, norn_weights_1e8, y_test)
print(RSS1, RSS2, RSS3)

2.28459958971e+14 2.7596207592e+14 5.37166151497e+14
