In [247]:
import pandas as pd
import numpy as np
from math import sqrt

In [308]:
def get_numpy_data(dataframe,features,output):
    """Build a design matrix (with an intercept column) and a target array.

    The caller's ``dataframe`` is not modified: the constant column is added
    to a temporary frame instead of being written back into the input.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the feature columns and the output column.
    features : sequence of str
        Names of the feature columns (a list or a tuple both work).
    output : str
        Name of the target column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(feature_matrix, output_array)``. ``feature_matrix`` has a leading
        column of ones followed by the requested features in the given order.
    """
    columns = ['constant', *features]
    # assign() returns a new frame, so no 'constant' column leaks into the
    # DataFrame owned by the caller.
    feature_matrix = dataframe.assign(constant=1)[columns].values
    output_array = dataframe[output].values
    return (feature_matrix, output_array)

In [309]:
# Load the house sales table used for the two-feature experiments below.
sales = pd.read_csv('kc_house_data.csv')

In [374]:
def predict(feature_matrix,weights):
    """Return the linear-model predictions, i.e. ``feature_matrix`` dot ``weights``."""
    return np.dot(feature_matrix, weights)


In [311]:
def normalize_features(features):
    """Scale every column of ``features`` to unit 2-norm.

    Columns whose norm is zero (all-zero columns) are left as zeros instead
    of turning into NaN through a 0/0 division.

    Parameters
    ----------
    features : array-like
        Feature matrix; norms are taken along axis 0 (one per column).

    Returns
    -------
    tuple
        ``(normalized_features, norms)`` where ``norms`` are the actual column
        2-norms (zero for an all-zero column).
    """
    norms = np.linalg.norm(features, axis=0)
    # Divide zero-norm columns by 1 so they stay zero rather than becoming NaN.
    safe_norms = np.where(norms == 0, 1, norms)
    normalized_features = features / safe_norms
    return (normalized_features, norms)

In [377]:
# Design matrix (constant, sqft_living, bedrooms) and price target from `sales`.
sqft_bed, output_s = get_numpy_data(sales, ['sqft_living', 'bedrooms'], 'price')

In [378]:
norm_sb, sb_norms = normalize_features(sqft_bed)
print(sb_norms)
# Sanity check: recompute the column 2-norms by hand for comparison with sb_norms.
((sqft_bed ** 2).sum(axis=0)) ** 0.5

[1.47013605e+02 3.34257264e+05 5.14075870e+02]


array([1.47013605e+02, 3.34257264e+05, 5.14075870e+02])

In [379]:
# Starting weights for (constant, sqft_living, bedrooms).
initial_weights = [1, 4, 1]

In [380]:
# Predictions of the normalized two-feature model at the initial weights.
predict_1 = predict(norm_sb, initial_weights)

In [381]:
# ro[i] = SUM[ feature_i * (output - prediction + weight[i] * feature_i) ]
ro = []
for i in range(len(initial_weights)):
    feature_i = norm_sb[:, i]
    r = np.sum((output_s - predict_1 + initial_weights[i] * feature_i) * feature_i)
    ro.append(r)

In [382]:
# Display the ro value computed for each coordinate.
ro

[79400300.0145229, 87939470.82325175, 80966698.66623947]

In [383]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the updated value of weight ``i`` for one lasso coordinate step.

    ``ro_i`` is computed as
    ``SUM[ feature_i * (output - prediction + weights[i] * feature_i) ]`` and
    then soft-thresholded at ``l1_penalty / 2``. The weight at index 0 is
    treated as the intercept and is returned without thresholding.
    """
    feature_i = feature_matrix[:, i]
    residual = output - predict(feature_matrix, weights)
    ro_i = ((residual + weights[i] * feature_i) * feature_i).sum()

    # Intercept -- do not regularize.
    if i == 0:
        return ro_i

    threshold = l1_penalty / 2.0
    if ro_i < -threshold:
        return ro_i + threshold
    if ro_i > threshold:
        return ro_i - threshold
    # |ro_i| is within the threshold: the weight is driven to exactly zero.
    return 0.

In [384]:
import math
# Spot check of a single coordinate step on a tiny 2x2 example.
print(lasso_coordinate_descent_step(
    1,
    np.array([[3./math.sqrt(13), 1./math.sqrt(10)],
              [2./math.sqrt(13), 3./math.sqrt(10)]]),
    np.array([1., 1.]),
    np.array([1., 4.]),
    0.1,
))

0.4255588466910251


In [331]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Run cyclical coordinate descent for lasso until the weights settle.

    Each sweep updates every weight in turn with
    ``lasso_coordinate_descent_step``; the loop stops after the first full
    sweep in which the largest absolute change of any weight is below
    ``tolerance``.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Feature matrix passed through to the coordinate step.
    output : numpy.ndarray
        Target values.
    initial_weights : sequence of numbers
        Starting weights; copied, never modified.
    l1_penalty : float
        L1 penalty passed through to the coordinate step.
    tolerance : float
        Convergence threshold on the largest per-sweep weight change.

    Returns
    -------
    numpy.ndarray
        The fitted weights as a float array.
    """
    # Force a float array: with integer initial weights (e.g. [1, 4, 1]) a
    # plain np.array() would be integer-typed and every update assigned into
    # it would be silently truncated.
    weights = np.array(initial_weights, dtype=float)
    while True:
        changes = []
        for i in range(len(initial_weights)):
            old = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            changes.append(abs(weights[i] - old))
        if max(changes) < tolerance:
            break
    return weights

In [332]:
# Rebuild the (constant, sqft_living, bedrooms) matrix and normalize its columns.
try_mat, output = get_numpy_data(sales, ['sqft_living', 'bedrooms'], 'price')

sqft_bed_try, sqft_bed_norms = normalize_features(try_mat)

In [334]:
# Fit the two-feature lasso model starting from all-zero weights.
sqft_bed_weights = lasso_cyclical_coordinate_descent(
    sqft_bed_try, output, [0.0, 0.0, 0.0], l1_penalty=1e7, tolerance=1.0
)

In [335]:
# Display the fitted weights in the order (constant, sqft_living, bedrooms).
sqft_bed_weights

array([21624997.95951909, 63157247.20788956,        0.        ])

In [386]:
# Residual sum of squares of the two-feature model on the normalized features.
# output_s holds the `sales` prices; `output` is rebound to the training prices
# in a later cell.
((predict(sqft_bed_try, sqft_bed_weights) - output_s) ** 2.0).sum()

1630492476715386.5

In [353]:
# Train/test tables and the feature columns used for the full model.
training = pd.read_csv('kc_house_train_data.csv')
testing = pd.read_csv('kc_house_test_data.csv')
features_train = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated',
]

In [354]:
# Reuse features_train (defined in the previous cell) instead of repeating the
# column list, so the matrix columns cannot drift from the labels used later.
train, output = get_numpy_data(training, features_train, 'price')

In [355]:
# Same feature columns as the training matrix; reuse features_train rather than
# a second copy of the list.
test, test_output = get_numpy_data(testing, features_train, 'price')

In [357]:
# Normalize the training columns; norm_for_all keeps the per-column norms so
# the fitted weights can be rescaled afterwards.
norm_train, norm_for_all = normalize_features(train)

In [360]:
# Lasso fit on all training features with l1_penalty = 1e7.
weights1e7 = lasso_cyclical_coordinate_descent(
    norm_train, output, [0.0] * norm_train.shape[1], l1_penalty=1e7, tolerance=1.0
)

In [367]:
# Label each weight with its feature name for display.
pd.Series(weights1e7, index=['constant'] + features_train)

constant         2.442960e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      4.838917e+07
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       3.317511e+06
view             7.329962e+06
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [364]:
# Lasso fit on all training features with l1_penalty = 1e8.
weights1e8 = lasso_cyclical_coordinate_descent(
    norm_train, output, [0.0] * norm_train.shape[1], l1_penalty=1e8, tolerance=1.0
)

pd.Series(weights1e8, index=['intercept'] + features_train)

intercept        7.111463e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      0.000000e+00
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       0.000000e+00
view             0.000000e+00
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [365]:
# Lasso fit with l1_penalty = 1e4; note the much looser tolerance (5e5) here.
weights1e4 = lasso_cyclical_coordinate_descent(
    norm_train, output, [0.0] * norm_train.shape[1], l1_penalty=1e4, tolerance=5e5
)

pd.Series(weights1e4, index=['intercept'] + features_train)

intercept        7.856474e+07
bedrooms        -2.209740e+07
bathrooms        1.279107e+07
sqft_living      9.380809e+07
sqft_lot        -2.013173e+06
floors          -4.219185e+06
waterfront       6.482843e+06
view             7.127409e+06
condition        5.001665e+06
grade            1.432752e+07
sqft_above      -1.577096e+07
sqft_basement   -5.159591e+06
yr_built        -8.449534e+07
yr_renovated     2.824439e+06
dtype: float64

In [370]:
# The weights were fitted on normalized features; dividing by the training
# column norms gives weights that apply to the un-normalized feature matrices.
norm_1e4 = weights1e4 / norm_for_all
norm_1e7 = weights1e7 / norm_for_all
norm_1e8 = weights1e8 / norm_for_all

In [372]:
# Residual sum of squares on the (un-normalized) test matrix for each penalty.
rss1e4 = ((predict(test, norm_1e4) - test_output) ** 2).sum()
rss1e7 = ((predict(test, norm_1e7) - test_output) ** 2).sum()
rss1e8 = ((predict(test, norm_1e8) - test_output) ** 2).sum()

In [373]:
# Test RSS for l1_penalty = 1e4, 1e7 and 1e8, one value per line.
print(rss1e4, rss1e7, rss1e8, sep='\n')

228459958971393.25
275962075920366.78
537166151497322.75
