In [8]:
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from math import sqrt,log


In [4]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}


In [7]:
sales = pd.read_csv('kc_house_data.csv',dtype=dtype_dict)


# Adding new features to our model

In [11]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [15]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']


In [16]:
model_all = sk.linear_model.Lasso(normalize=True,alpha=5e2)
model_all.fit(sales[all_features],sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
w_all = list(model_all.coef_)

In [27]:
for i in range(len(all_features)):
    print all_features[i],w_all[i]

bedrooms 0.0
bedrooms_square 0.0
bathrooms 0.0
sqft_living 134.439313955
sqft_living_sqrt 0.0
sqft_lot 0.0
sqft_lot_sqrt 0.0
floors 0.0
floors_square 0.0
waterfront 0.0
view 24750.0045856
condition 0.0
grade 61749.1030907
sqft_above 0.0
sqft_basement 0.0
yr_built -0.0
yr_renovated 0.0


# NONZERO FEATURES :
## sqft_living 134.439313955
## view 24750.0045856
## grade 61749.1030907


In [28]:
test_data = pd.read_csv('wk3_kc_house_test_data.csv',dtype=dtype_dict)
train_data = pd.read_csv('wk3_kc_house_train_data.csv',dtype = dtype_dict)
validation_data = pd.read_csv('wk3_kc_house_valid_data.csv',dtype=dtype_dict)

In [29]:
test_data['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
test_data['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
test_data['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
test_data['floors_square'] = sales['floors']*sales['floors']

train_data['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
train_data['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
train_data['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
train_data['floors_square'] = sales['floors']*sales['floors']

validation_data['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
validation_data['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
validation_data['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
validation_data['floors_square'] = sales['floors']*sales['floors']

In [30]:
L1 = np.logspace(1,7,num=13)

In [36]:
for l in L1:
    model = sk.linear_model.Lasso(normalize=True,alpha=l)
    model.fit(train_data[all_features],train_data['price'])
    prediction = model.predict(validation_data[all_features])
    error = prediction - validation_data['price']
    print sqrt((error*error).sum()), l

20637259.4965 10.0
20635272.1437 31.6227766017
20793220.4335 100.0
21534619.3615 316.227766017
25414537.8403 1000.0
34964365.5659 3162.27766017
34964365.5659 10000.0
34964365.5659 31622.7766017
34964365.5659 100000.0
34964365.5659 316227.766017
34964365.5659 1000000.0
34964365.5659 3162277.66017
34964365.5659 10000000.0


# l1 = 31.6227766017 Produced minimum RSS

In [41]:
final_model = sk.linear_model.Lasso(normalize=True,alpha=31.6227766017)
final_model.fit(train_data[all_features],train_data['price'])

Lasso(alpha=31.6227766017, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [42]:
prediction = final_model.predict(test_data[all_features])
error = test_data['price'] - prediction
print sqrt((error*error).sum())

10297030.5789


In [43]:
final_model.coef_

array([ -2.76729393e+04,   1.67230240e+02,   4.10384139e+04,
         1.71521611e+02,  -0.00000000e+00,  -1.25120980e-01,
        -1.94053934e+01,   1.71158856e+04,  -1.00860209e+03,
         5.93010322e+05,   4.28665240e+04,   1.16493744e+04,
         1.22281472e+05,   2.96713405e+00,   0.00000000e+00,
        -3.56025727e+03,   2.40572982e-01])

In [44]:
final_model.intercept_

6189101.1820576582

# RSS ON THE TEST DATA WITH THE BEST MODEL IS : 10297030.5789

In [68]:
np.count_nonzero(final_model.coef_) + np.count_nonzero(final_model.intercept_)


16

# TOTAL NONZEROS COEFFICIENTS IN THE FINAL MODEL ARE 16

In [56]:
max_nonzeroes = 7
l1_penalty = np.logspace(1,4,num=20)
box1 = []
box2 = []
for l in l1_penalty:
    model = sk.linear_model.Lasso(normalize=True,alpha=l)
    model.fit(train_data[all_features],train_data['price'])
    count = np.count_nonzero(model.coef_)+np.count_nonzero(model.intercept_)
    if count > 7 :
        box1.append(l)
    elif count < 7:
        box2.append(l)
            
print max(box1)
print min(box2)

127.42749857
263.665089873


In [57]:
l1_penalty_min = max(box1)
l1_penalty_max = min(box2)

# l1_penalty_min = 127.42749857
# l1_penalty_max = 263.665089873

In [61]:
L = np.linspace(l1_penalty_min,l1_penalty_max,20)
for l1 in L :
    model = sk.linear_model.Lasso(normalize=True,alpha=l1)
    model.fit(train_data[all_features],train_data['price'])
    prediction = model.predict(validation_data[all_features])
    error = prediction - validation_data['price']
#     print sqrt((error*error).sum())
    print "alpha = ",l1," non_zeroes = ",np.count_nonzero(model.coef_)+np.count_nonzero(model.intercept_),'RSS is : ',sqrt((error*error).sum())

alpha =  127.42749857  non_zeroes =  8 RSS is :  20895204.9962
alpha =  134.597898113  non_zeroes =  8 RSS is :  20924478.7952
alpha =  141.768297655  non_zeroes =  7 RSS is :  20945458.9641
alpha =  148.938697197  non_zeroes =  7 RSS is :  20960667.4992
alpha =  156.109096739  non_zeroes =  7 RSS is :  20977073.9868
alpha =  163.279496282  non_zeroes =  7 RSS is :  20994713.4941
alpha =  170.449895824  non_zeroes =  7 RSS is :  21013497.7571
alpha =  177.620295366  non_zeroes =  7 RSS is :  21033466.0691
alpha =  184.790694908  non_zeroes =  7 RSS is :  21054617.6842
alpha =  191.961094451  non_zeroes =  7 RSS is :  21077000.5181
alpha =  199.131493993  non_zeroes =  7 RSS is :  21100493.0591
alpha =  206.301893535  non_zeroes =  6 RSS is :  21125077.5483
alpha =  213.472293077  non_zeroes =  6 RSS is :  21145044.7962
alpha =  220.64269262  non_zeroes =  6 RSS is :  21165967.6805
alpha =  227.813092162  non_zeroes =  6 RSS is :  21187843.37
alpha =  234.983491704  non_zeroes =  6 RSS 

# alpha =  141.768297655  non_zeroes =  7 RSS is :  20945458.9641

In [62]:
final_model_7 = sk.linear_model.Lasso(normalize=True,alpha=141.768297655)
final_model_7.fit(train_data[all_features],train_data['price'])


Lasso(alpha=141.768297655, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [66]:
for i in range(len(all_features)):
    print all_features[i],final_model_7.coef_[i]

bedrooms -0.0
bedrooms_square 0.0
bathrooms 13640.8669422
sqft_living 162.420730103
sqft_living_sqrt -0.0
sqft_lot -0.0
sqft_lot_sqrt -0.0
floors 0.0
floors_square -0.0
waterfront 517880.636737
view 42203.3982868
condition 0.0
grade 117754.029315
sqft_above 0.0
sqft_basement 0.0
yr_built -2716.78668132
yr_renovated 0.0
