In [8]:
import graphlab
import math

In [2]:
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] This non-commercial license of GraphLab Create is assigned to kaviarasu.govindaraju@snapchat.com and will expire on February 06, 2017. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-32572 - Server binary: /Users/kaviarasu.govindaraju/anaconda/envs/dato-env/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1456787701.log
[INFO] GraphLab Server Version: 1.8.1


In [3]:
train_data,test_data = sales.random_split(.8,seed=0)

In [28]:
# Create new functional features for both training and test dataset

def create_bedrooms_squared(dataset):
    dataset['bedrooms_squared'] = dataset['bedrooms'] * dataset['bedrooms']
    
def create_bed_bath_rooms(dataset):
    dataset['bed_bath_rooms'] = dataset['bedrooms'] * dataset['bathrooms']
        
def create_log_sqft_living(dataset):
    dataset['log_sqft_living'] = dataset['sqft_living'].apply(lambda x : math.log(x))
    
def create_lat_plus_long(dataset):
    dataset['lat_plus_long'] = dataset['lat'] + dataset['long']
    
def create_new_functional_features(dataset):
    create_bedrooms_squared(dataset)
    create_bed_bath_rooms(dataset)
    create_log_sqft_living(dataset)
    create_lat_plus_long(dataset)

In [29]:
create_new_functional_features(train_data)
create_new_functional_features(test_data)

In [30]:
def get_mean_of_functional_features(dataset):
    return (dataset['bedrooms_squared'].mean(), dataset['bed_bath_rooms'].mean(),
            dataset['log_sqft_living'].mean(), dataset['lat_plus_long'].mean())

In [19]:
def train_regression_model(dataset, feature_list):
    return graphlab.linear_regression.create(dataset, target="price", features=feature_list,
                                            validation_set=None)

features_for_model1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model1 = train_regression_model(train_data, features_for_model1)

features_for_model2 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']
model2 = train_regression_model(train_data, features_for_model2)

features_for_model3 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms',
                      'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']
model3 = train_regression_model(train_data, features_for_model3)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 5
PROGRESS: Number of unpacked features : 5
PROGRESS: Number of coefficients    : 6
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.090935     | 4074878.213096     | 236378.596455 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:
PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROG

In [20]:
model1.coefficients

name,index,value,stderr
(intercept),,-56140675.7444,1649985.42028
sqft_living,,310.263325778,3.18882960408
bedrooms,,-59577.1160683,2487.27977322
bathrooms,,13811.8405419,3593.54213297
lat,,629865.789485,13120.7100323
long,,-214790.285186,13284.2851607


In [21]:
model2.coefficients

name,index,value,stderr
(intercept),,-54410676.1152,1650405.16541
sqft_living,,304.449298056,3.20217535637
bedrooms,,-116366.04323,4805.54966546
bathrooms,,-77972.3305131,7565.05991091
lat,,625433.834953,13058.3530972
long,,-203958.602959,13268.1283711
bed_bath_rooms,,26961.6249091,1956.36561555


In [22]:
model3.coefficients

name,index,value,stderr
(intercept),,-52974974.0601,1615194.94383
sqft_living,,529.196420562,7.69913498511
bedrooms,,28948.5277295,9395.72889105
bathrooms,,65661.2072305,10795.3380703
lat,,704762.148384,
long,,-137780.019962,
bed_bath_rooms,,-8478.36410508,2858.95391257
bedrooms_squared,,-6072.38466049,1494.97042777
log_sqft_living,,-1297432.52043,40451.4075436
lat_plus_long,,-83217.1979028,


In [24]:
def get_rss(dataset, model):
    evaluation = model.evaluate(dataset)
    rmse = evaluation['rmse']
    return rmse * rmse

In [26]:
print 'Training RSS for Model-1 = ' + str(get_rss(train_data, model1))
print 'Training RSS for Model-2 = ' + str(get_rss(train_data, model2))
print 'Training RSS for Model-3 = ' + str(get_rss(train_data, model3))

print 'Test RSS for Model-1 = ' + str(get_rss(test_data, model1))
print 'Test RSS for Model-2 = ' + str(get_rss(test_data, model2))
print 'Test RSS for Model-3 = ' + str(get_rss(test_data, model3))

Training RSS for Model-1 = 55874840861.9
Training RSS for Model-2 = 55314776107.7
Training RSS for Model-3 = 52075259696.0
Test RSS for Model-1 = 53574861454.9
Test RSS for Model-2 = 53054812010.8
Test RSS for Model-3 = 59548195542.7


In [31]:
get_mean_of_functional_features(test_data)

(12.446677701584301, 7.503901631591394, 7.550274679645938, -74.65333497217307)