In [3]:
import pandas as pd

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('../data/kc_house_data.csv', dtype=dtype_dict)


In [4]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [5]:
from sklearn import linear_model  # using scikit-learn

all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights
print model_all.coef_

[     0.              0.              0.            134.43931396      0.
      0.              0.              0.              0.              0.
  24750.00458561      0.          61749.10309071      0.              0.
     -0.              0.        ]


In [7]:
testing = pd.read_csv('../data/wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('../data/wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('../data/wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [8]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [9]:
import numpy as np
for l1_penalty in np.logspace(1, 7, num=13):
    print l1_penalty
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    predictions = model.predict(validation[all_features])
    rss = 0
    results = {}
    for i in range(0, len(predictions)):
        error = predictions[i] - validation['price'][i]
        rss += error * error
    print rss

10.0
3.982133273e+14
31.6227766017
3.99041900253e+14
100.0
4.29791604073e+14
316.227766017
4.63739831045e+14
1000.0
6.45898733634e+14
3162.27766017
1.22250685943e+15
10000.0
1.22250685943e+15
31622.7766017
1.22250685943e+15
100000.0
1.22250685943e+15
316227.766017
1.22250685943e+15
1000000.0
1.22250685943e+15
3162277.66017
1.22250685943e+15
10000000.0
1.22250685943e+15


In [10]:
model = linear_model.Lasso(alpha=10.0, normalize=True) # set parameters
model.fit(training[all_features], training['price']) # learn weights
test = model.predict(testing[all_features])
for i in range(0, len(test)):
    error = test[i] - testing['price'][i]
    rss += error * error

print "RSS on test data " + str(rss)
k = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
print k

RSS on test data 1.32097426198e+15
15


In [11]:
max_nonzeros = 7
alist = []
for l1_penalty in np.logspace(1,4,num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features],training['price'])
    alist.append(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))
print max(alist)
print min(alist)
#print list(np.logspace(1,4,num=20))
zero_dict = dict(zip(list(np.logspace(1,4,num=20)),alist))
print zero_dict

15
1
{42.813323987193932: 13, 1128.8378916846884: 3, 6951.9279617756056: 1, 1623.776739188721: 3, 14.384498882876629: 15, 2335.7214690901214: 2, 10.0: 15, 127.42749857031335: 10, 61.584821106602639: 12, 784.75997035146065: 5, 4832.9302385717519: 1, 183.29807108324357: 7, 20.691380811147901: 15, 263.66508987303581: 6, 3359.8182862837812: 1, 88.586679041008225: 11, 29.763514416313178: 15, 379.26901907322497: 6, 545.55947811685144: 6, 10000.0: 1}


In [12]:
l1_penalty_min = 127.42749857031335
l1_penalty_max = 263.66508987303581
for l1_penalty in np.linspace(l1_penalty_min,l1_penalty_max,20):
    print "l1_penalty" + str(l1_penalty)
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    predictions = model.predict(validation[all_features])
    rss = 0
    results = {}
    print model.coef_
    k = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    print "non_zeros " + str(k)
    for i in range(0, len(predictions)):
        error = predictions[i] - validation['price'][i]
        rss += error * error
    print "RSS : "+str(rss)
    print "------------------"
    

l1_penalty127.42749857
[ -2.78660782e+03  -0.00000000e+00   1.62952813e+04   1.64911566e+02
   0.00000000e+00  -0.00000000e+00  -4.87299465e+01   0.00000000e+00
   3.90255150e+02   5.28876991e+05   4.24208113e+04   0.00000000e+00
   1.18475470e+05   0.00000000e+00   0.00000000e+00  -2.82780772e+03
   0.00000000e+00]
non_zeros 10
RSS : 4.35374677103e+14
------------------
l1_penalty134.597898113
[ -9.16867351e+02  -0.00000000e+00   1.47218522e+04   1.63692844e+02
   0.00000000e+00  -0.00000000e+00  -3.56625993e+01   0.00000000e+00
   7.24404474e+00   5.23783564e+05   4.23757492e+04   0.00000000e+00
   1.18244143e+05   0.00000000e+00   0.00000000e+00  -2.76685637e+03
   0.00000000e+00]
non_zeros 10
RSS : 4.37009229124e+14
------------------
l1_penalty141.768297655
[ -0.00000000e+00  -0.00000000e+00   1.31781404e+04   1.63173823e+02
   0.00000000e+00  -0.00000000e+00  -2.18494884e+01   0.00000000e+00
   0.00000000e+00   5.18208433e+05   4.22785669e+04   0.00000000e+00
   1.17693998e+05   