### Mount Google Drive to load the data and save work

In [1]:
from google.colab import drive
# Mount Google Drive at /content/MyDrive/; the later cells read their files
# from /content/MyDrive/MyDrive/... underneath this mount point.
drive.mount('/content/MyDrive/')

Mounted at /content/MyDrive/


In [3]:
# Install step, left disabled.
# NOTE(review): on a fresh runtime without turicreate the import below will
# fail; re-enable the install (ideally with a pinned version) if needed.
#!pip install turicreate
import turicreate

In [4]:
# Sanity check: list the working directory to confirm the mount point exists.
!ls

MyDrive  sample_data


In [5]:
# One-off archive extraction, left disabled.
# NOTE(review): this unpacks home_data_small.sframe.zip, while the load cell
# reads home_data.sframe; confirm which dataset is intended.
#!7z x '/content/MyDrive/MyDrive/SFRAMES/home_data_small.sframe.zip'

In [6]:
# Load the house-sales SFrame from the mounted Drive folder.
sf = turicreate.SFrame('/content/MyDrive/MyDrive/SFRAMES/home_data.sframe')

In [7]:
# Peek at the first three rows to check that the columns loaded as expected.
sf.head(3)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650.0,1.0,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242.0,2.0,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000.0,1.0,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7.0,1180.0,0.0,1955.0,0.0,98178,47.51123398
0,3,7.0,2170.0,400.0,1951.0,1991.0,98125,47.72102274
0,3,6.0,770.0,0.0,1933.0,0.0,98028,47.73792661

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0


In [9]:
# (rows, columns) of the full dataset.
sf.shape

(21613, 21)

# **Regression Model**

In [10]:
# 80/20 train/test split; the fixed seed keeps the partition repeatable.
train_data, test_data = sf.random_split(0.8, seed=0)

In [11]:
# Baseline model: predict price from three raw features.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = turicreate.linear_regression.create(
    train_data,
    target='price',
    features=example_features,
    validation_set=None,  # do not carve a validation split out of train_data
)

In [12]:
# Fitted coefficients of the baseline model (one row per term, with stderr).
example_weight_summary = example_model.coefficients
print(example_weight_summary)

+-------------+-------+--------------------+--------------------+
|     name    | index |       value        |       stderr       |
+-------------+-------+--------------------+--------------------+
| (intercept) |  None | 87910.07249239809  | 7873.338143401673  |
| sqft_living |  None |   315.4034405521   | 3.455700325854743  |
|   bedrooms  |  None | -65080.21555282686 | 2717.456854420703  |
|  bathrooms  |  None | 6944.020192636717  | 3923.1149314414993 |
+-------------+-------+--------------------+--------------------+
[4 rows x 4 columns]



# **Making Predictions**

In [13]:
# Predict prices for the training rows and spot-check the first prediction
# against the expected reference value.
example_predictions = example_model.predict(train_data)
print(example_predictions[0]) # should be 271789.505878

271789.5058780322


In [14]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares (RSS) of ``model`` on ``data``.

    Args:
        model: Any object exposing ``predict(data)``.
        data: Input passed unchanged to ``model.predict``.
        outcome: Observed target values; must support element-wise
            subtraction from, and multiplication with, the predictions.

    Returns:
        The sum over all rows of ``(outcome - prediction) ** 2``.
    """
    # Residuals: observed outcome minus what the model predicts.
    errors = outcome - model.predict(data)
    # Square each residual element-wise, then add them all up.
    squared_errors = errors * errors
    return sum(squared_errors)

In [15]:
# NOTE: despite the `_train` suffix, this RSS is evaluated on test_data
# (the expected reference value below is the test-set figure).
rss_example_train = get_residual_sum_of_squares(
    example_model,
    test_data,
    test_data['price'],
)
print(rss_example_train)  # should be 2.7376153833e+14

273761538330192.28


# **Some new features**


In [16]:
from math import log  # used by the log_sqft_living feature cell

# Add the squared bedroom count to both splits so they keep the same columns.
for frame in (train_data, test_data):
    frame['bedrooms_squared'] = frame['bedrooms'].apply(lambda n: n**2)

In [17]:
# create the remaining 3 features in both TEST and TRAIN data
for frame in (train_data, test_data):
    # Interaction term: bedrooms times bathrooms.
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    # Natural log of the living area.
    frame['log_sqft_living'] = frame['sqft_living'].apply(lambda sqft: log(sqft))
    # Plain sum of latitude and longitude.
    frame['lat_plus_long'] = frame['lat'] + frame['long']

In [18]:
# Mean of each engineered feature on the TEST split, rounded to 2 decimals.
for feature_name in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'):
    column = test_data[feature_name]
    print(feature_name + ' _ ' + str(round(sum(column) / len(column), 2)))

bedrooms_squared _ 12.45
bed_bath_rooms _ 7.5
log_sqft_living _ 7.55
lat_plus_long _ -74.65


# Learning Multiple Models

In [19]:
# Nested feature sets: each model extends the previous one with extra columns.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = [*model_1_features, 'bed_bath_rooms']
model_3_features = [
    *model_2_features,
    'bedrooms_squared',
    'log_sqft_living',
    'lat_plus_long',
]

In [21]:
# Learn the three models: (don't forget to set validation_set = None)
# The generator fits them one after another, in order, on the same training data.
model_1, model_2, model_3 = (
    turicreate.linear_regression.create(
        train_data,
        target='price',
        features=feature_set,
        validation_set=None,
    )
    for feature_set in (model_1_features, model_2_features, model_3_features)
)

In [23]:
# Label the output, then display model 1's fitted coefficients
# (the bare last expression is what the cell renders).
print('model 1')
model_1.coefficients

model 1


name,index,value,stderr
(intercept),,-56140675.74114517,1649985.420232753
sqft_living,,310.26332577692176,3.188829604072599
bedrooms,,-59577.11606759663,2487.279773224208
bathrooms,,13811.8405416533,3593.5421329676874
lat,,629865.7894714857,13120.71003228041
long,,-214790.2851647169,13284.285159947443


In [24]:
# Label the output, then display model 2's fitted coefficients
# (the bare last expression is what the cell renders).
print('model 2')
model_2.coefficients

model 2


name,index,value,stderr
(intercept),,-54410676.11184308,1650405.1653694494
sqft_living,,304.44929805557945,3.202175356366349
bedrooms,,-116366.04322944838,4805.549665484017
bathrooms,,-77972.33051298767,7565.059910947668
lat,,625433.8349398111,13058.353097203788
long,,-203958.60293748276,13268.128370376078
bed_bath_rooms,,26961.624908952057,1956.365615558322


In [25]:
# Label the output, then display model 3's fitted coefficients.
# NOTE(review): the recorded stderr for lat, long and lat_plus_long is ~7.9e8.
# Presumably this is because lat_plus_long is exactly lat + long, so the three
# inputs are collinear and their individual weights are not well determined.
print('model 3')
model_3.coefficients

model 3


name,index,value,stderr
(intercept),,-52974974.06892172,1615194.9446730777
sqft_living,,529.1964205687524,7.699134985121209
bedrooms,,28948.5277463525,9395.728891147388
bathrooms,,65661.20723969756,10795.33807025712
lat,,704762.1484430789,787087632.0068617
long,,-137780.02000718215,787087632.297173
bed_bath_rooms,,-8478.364107167637,2858.9539125683445
bedrooms_squared,,-6072.384661905136,1494.9704277890871
log_sqft_living,,-563467.7842801766,17567.82308147047
lat_plus_long,,-83217.19791002195,787087631.9888161


# RSS for each model - Training Set

In [32]:
# Training-set RSS for model 1 (the bare last expression is the cell output).
print("RSS Value of Model - 1: ")
get_residual_sum_of_squares(model_1, train_data, train_data['price'])


RSS Value of Model - 1: 


971328233545430.9

In [33]:
# Training-set RSS for model 2 (the bare last expression is the cell output).
print("RSS Value of Model - 2: ")
get_residual_sum_of_squares(model_2, train_data, train_data['price'])

RSS Value of Model - 2: 


961592067857503.2

In [34]:
# Training-set RSS for model 3 (the bare last expression is the cell output).
print("RSS Value of Model - 3: ")
get_residual_sum_of_squares(model_3, train_data, train_data['price'])

RSS Value of Model - 3: 


905276314551641.4

# RSS for each model - Test Set

In [28]:
# Test-set RSS for model 1 (the bare last expression is the cell output).
print("RSS Value of Model - 1 Test: ")
get_residual_sum_of_squares(model_1, test_data, test_data['price'])

RSS Value of Model - 1 Test: 


226568089093160.53

In [29]:
# Test-set RSS for model 2 (the bare last expression is the cell output).
print("RSS Value of Model - 2 Test: ")
get_residual_sum_of_squares(model_2, test_data, test_data['price'])

RSS Value of Model - 2 Test: 


224368799993970.88

In [30]:
# Test-set RSS for model 3 (the bare last expression is the cell output).
# NOTE: in the recorded outputs model 3 has the lowest training RSS of the
# three models but the highest test RSS, i.e. the extra features fit the
# training rows better without improving held-out error.
print("RSS Value of Model - 3 Test: ")
get_residual_sum_of_squares(model_3, test_data, test_data['price'])

RSS Value of Model - 3 Test: 


251829318963158.94