In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Column -> dtype mapping handed to pd.read_csv, grouped by target type.
dtype_dict = {
    # columns parsed as text
    'id': str,
    'date': str,
    'zipcode': str,
    # columns parsed as floating point
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # columns parsed as integers
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [3]:
# Full sales table, with every column coerced to the dtype declared in dtype_dict.
df_sales = pd.read_csv(
    'kc_house_data.csv',
    dtype=dtype_dict,
)

In [4]:
from math import log, sqrt
# Engineered features: square roots of the two area columns and squares of
# the bedroom / floor counts.
# np.sqrt operates on the whole column at once instead of calling math.sqrt
# once per row through .apply.
df_sales['sqft_living_sqrt'] = np.sqrt(df_sales['sqft_living'])
df_sales['sqft_lot_sqrt'] = np.sqrt(df_sales['sqft_lot'])
df_sales['bedrooms_square'] = df_sales['bedrooms'] * df_sales['bedrooms']
df_sales['floors_square'] = df_sales['floors'] * df_sales['floors']

In [5]:
from sklearn import linear_model

# Model inputs, one per line. The order matters: fitted coefficients are later
# zipped back onto these names positionally.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Fit a LASSO model on the complete sales table using every candidate feature.
# NOTE(review): `normalize=True` matches the estimator repr saved below; newer
# scikit-learn releases may no longer accept this argument - confirm the
# installed version before re-running.
X_all = df_sales[all_features]
y_all = df_sales['price']
model_all = linear_model.Lasso(alpha=5e2, normalize=True)
model_all.fit(X_all, y_all)

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

<hr>

**Quiz Question: Which features have been chosen by LASSO, i.e. which features were assigned nonzero weights?**

In [6]:
# Pair each feature name with its fitted weight; nonzero entries are the features LASSO kept.
{feature: weight for feature, weight in zip(all_features, model_all.coef_)}

{'bedrooms': 0.0,
 'bedrooms_square': 0.0,
 'bathrooms': 0.0,
 'sqft_living': 134.43931395541438,
 'sqft_living_sqrt': 0.0,
 'sqft_lot': 0.0,
 'sqft_lot_sqrt': 0.0,
 'floors': 0.0,
 'floors_square': 0.0,
 'waterfront': 0.0,
 'view': 24750.00458560952,
 'condition': 0.0,
 'grade': 61749.10309070813,
 'sqft_above': 0.0,
 'sqft_basement': 0.0,
 'yr_built': -0.0,
 'yr_renovated': 0.0}

<hr>

In [7]:
# Load the test / train / validation splits with the same dtype mapping as the full table.
df_test, df_train, df_valid = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
]

In [8]:
# Add the same four engineered columns to every split. One loop replaces three
# copy-pasted stanzas so the splits cannot drift apart, and np.sqrt works on
# whole columns instead of calling math.sqrt row by row through .apply.
for split_frame in (df_test, df_train, df_valid):
    split_frame['sqft_living_sqrt'] = np.sqrt(split_frame['sqft_living'])
    split_frame['sqft_lot_sqrt'] = np.sqrt(split_frame['sqft_lot'])
    split_frame['bedrooms_square'] = split_frame['bedrooms'] * split_frame['bedrooms']
    split_frame['floors_square'] = split_frame['floors'] * split_frame['floors']

In [9]:
# 13 log-spaced L1 penalties from 10**1 to 10**7, i.e. half a decade apart.
penalties = np.logspace(start=1, stop=7, num=13)

In [10]:
def rss(y, yhat):
    """Return the residual sum of squares between observations and predictions.

    Parameters
    ----------
    y : array-like
        Observed target values.
    yhat : array-like
        Predicted values; must support ``y - yhat``.

    Returns
    -------
    The dot product of the residual vector with itself.
    """
    # Compute the residual once and reuse it for both operands of the dot product.
    residuals = y - yhat
    return np.dot(residuals.T, residuals)

# Sweep the coarse penalty grid: fit on TRAIN, score each model by RSS on VALIDATION.
calculated_rss = {}

for l1_penalty in penalties:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(df_train[all_features], df_train['price'])

    valid_predictions = model.predict(df_valid[all_features])
    calculated_rss[l1_penalty] = rss(df_valid['price'], valid_predictions)

# One row per penalty, single 'RSS' column.
pd.Series(calculated_rss, name='RSS').to_frame()

Unnamed: 0,RSS
10.0,398213300000000.0
31.62278,399041900000000.0
100.0,429791600000000.0
316.2278,463739800000000.0
1000.0,645898700000000.0
3162.278,1222507000000000.0
10000.0,1222507000000000.0
31622.78,1222507000000000.0
100000.0,1222507000000000.0
316227.8,1222507000000000.0


<hr>

**Quiz Question: Which was the best value for the l1_penalty, i.e. which value of l1_penalty produced the lowest RSS on VALIDATION data?**

In [11]:
# Penalty whose validation RSS is smallest (first one wins on ties).
min(calculated_rss.items(), key=lambda item: item[1])[0]

10.0

Now that you have selected an L1 penalty, compute the RSS on TEST data for the model with the best L1 penalty.

In [12]:
# Reuse the penalty selected on the validation sweep instead of retyping it by hand.
best_l1_penalty = min(calculated_rss, key=calculated_rss.get)

model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)

model.fit(df_train[all_features], df_train['price'])

predicted = model.predict(df_test[all_features])

# Store the test score under its own name: rebinding `calculated_rss` (a dict)
# to a scalar would break any re-run of the penalty-selection cell.
test_rss = rss(df_test['price'], predicted)

# Display the RSS on TEST data that the prompt asks for.
test_rss

**Quiz Question: Using the best L1 penalty, how many nonzero weights do you have? Count the number of nonzero coefficients first, and add 1 if the intercept is also nonzero.**

In [13]:
# Count nonzero weights over the coefficients and the intercept in one pass.
np.count_nonzero(np.append(model.coef_, model.intercept_))

15

<hr>

In [14]:
# Sparsity target: the narrow penalty sweep below keeps only models whose
# nonzero weight count (coefficients plus intercept) equals this value.
max_nonzeros = 7

In [15]:
# For each penalty on a 20-point log grid (10**1 .. 10**4), record how many
# weights (coefficients plus intercept) the fitted model leaves nonzero.
allowed_penalties = {}

for l1_penalty in np.logspace(1, 4, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(df_train[all_features], df_train['price'])

    nonzero_coefs = np.count_nonzero(model.coef_)
    nonzero_intercept = np.count_nonzero(model.intercept_)
    allowed_penalties[l1_penalty] = nonzero_coefs + nonzero_intercept

# One row per penalty, single '# coef' column.
pd.Series(allowed_penalties, name='# coef').to_frame()

Unnamed: 0,# coef
10.0,15
14.384499,15
20.691381,15
29.763514,15
42.813324,13
61.584821,12
88.586679,11
127.427499,10
183.298071,7
263.66509,6


<hr>

**Quiz Question: What values did you find for l1_penalty_min and l1_penalty_max?**

<hr>

In [16]:
# Bounds copied from the coarse sweep table above: the grid points on either
# side of the target sparsity (10 nonzero weights at the lower bound, 6 at the upper).
l1_penalty_min = 127.427499
l1_penalty_max = 263.665090

calculated_rss = {}

for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(df_train[all_features], df_train['price'])

    nonzero_weights = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    if nonzero_weights != max_nonzeros:
        # Only models with exactly the target sparsity are scored.
        continue

    predictions = model.predict(df_valid[all_features])
    calculated_rss[l1_penalty] = rss(df_valid['price'], predictions)

pd.Series(calculated_rss)

156.109097    4.400374e+14
163.279497    4.407775e+14
170.449896    4.415667e+14
177.620296    4.424064e+14
184.790695    4.432967e+14
191.961095    4.442398e+14
199.131494    4.452307e+14
dtype: float64

<hr>

**Quiz Question: What value of l1_penalty in our narrow range has the lowest RSS on the VALIDATION set and has sparsity equal to `max_nonzeros`?**

In [17]:
# Among the penalties that hit the target sparsity, pick the one with the lowest validation RSS.
min(calculated_rss, key=lambda penalty: calculated_rss[penalty])

156.10909710526315

**Quiz Question: What features in this model have non-zero coefficients?**

In [18]:
# Take the winning penalty straight from the narrow-range results rather than
# pasting its 17-digit value by hand.
best_l1_penalty = min(calculated_rss, key=calculated_rss.get)

model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)

model.fit(df_train[all_features], df_train['price'])

features = dict(zip(all_features, model.coef_))

# Show only the features whose weight LASSO left nonzero.
{k: v for k, v in features.items() if v != 0}

{'bathrooms': 10610.89020737359,
 'sqft_living': 163.3802516719241,
 'waterfront': 506451.68682324677,
 'view': 41960.04354862281,
 'grade': 116253.55366145167,
 'yr_built': -2612.2348776920894}