In [95]:
import graphlab

In [96]:
# Load the house sales data from the local 'kc_house_data.gl/' directory into an SFrame
sales = graphlab.SFrame('kc_house_data.gl/')

In [97]:
# Randomly split sales into train/test parts; the fixed seed keeps the split reproducible.
# NOTE(review): .8 is presumably the fraction of rows sent to train_data - confirm against random_split docs.
train_data,test_data = sales.random_split(.8,seed=0)

In [98]:
# Scratch check: build an SArray from range(5); the result is only displayed, not stored
graphlab.SArray(range(5))

dtype: int
Rows: 5
[0, 1, 2, 3, 4]

In [113]:
# Create a simple regression model
def simple_linear_regression(input_feature, output, verbose=True):
    """Fit output ~ intercept + slope * input_feature by closed-form least squares.

    Parameters
    ----------
    input_feature : array-like
        Feature values X. Must support element-wise ``*``, ``.sum()`` and
        ``.mean()`` (the notebook passes graphlab.SArray objects).
    output : array-like
        Target values Y; must also support ``len()``. Its length is used as N.
    verbose : bool, optional
        When True (the default, matching the original behaviour) the
        numerator and denominator of the slope are printed. Pass False to
        silence this diagnostic output.

    Returns
    -------
    tuple
        ``(intercept, slope)`` of the fitted line.

    Raises
    ------
    ZeroDivisionError
        If ``output`` is empty, or if the slope's denominator is zero
        (e.g. every value of ``input_feature`` is the same), in which case
        the slope is undefined.
    """
    n = len(output)
    if n == 0:
        raise ZeroDivisionError(
            "simple_linear_regression needs at least one observation")

    # Compute the shared terms once; each .sum() is a full pass over the data.
    inv_n = 1 / float(n)
    sum_x = input_feature.sum()
    sum_y = output.sum()

    # numerator = (sum of X*Y) - (1/N)*((sum of X) * (sum of Y))
    numerator = (input_feature * output).sum() - (inv_n * (sum_x * sum_y))
    if verbose:
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(numerator)
    # denominator = (sum of X^2) - (1/N)*((sum of X) * (sum of X))
    denominator = (input_feature * input_feature).sum() - (inv_n * (sum_x * sum_x))
    if verbose:
        print(denominator)

    if denominator == 0:
        # Fail with an explicit message instead of an opaque
        # "float division by zero" (or a silent nan/inf for numpy inputs).
        raise ZeroDivisionError(
            "slope is undefined: input_feature has zero variance")
    slope = numerator / denominator

    # intercept = (mean of Y) - slope * (mean of X)
    intercept = output.mean() - slope * input_feature.mean()
    return (intercept, slope)

In [114]:
# Sanity check: regress a small SArray on itself (y = x), so the expected fit is intercept 0, slope 1
sa = graphlab.SArray([1,2,3,4,5])
intercept, slope = simple_linear_regression(sa, sa)

10.0
10.0


In [115]:
# Bare expression so the notebook displays the current (intercept, slope) values
intercept, slope

(0.0, 1.0)

In [116]:
# Train the model on the training split, using the 'sqft_living' column as the
# single feature and the 'price' column as the output
input_feature = train_data['sqft_living']
output = train_data['price']
squarfeet_intercept, squarfeet_slope = simple_linear_regression(input_feature, output)
# Bare last expression so the notebook displays the fitted (intercept, slope)
squarfeet_intercept, squarfeet_slope

4.16317326566e+12
14765180927.9


(-47116.076574939885, 281.9588385676974)

In [117]:
# Evaluate the fitted line at the given input feature value(s)
def get_regression_predictions(input_feature, intercept, slope):
    """Return intercept + slope * input_feature.

    input_feature may be a single number or any object supporting
    multiplication by slope and addition of intercept; the result has
    whatever type those operations produce.
    """
    return intercept + (slope * input_feature)

In [118]:
# Predict price for a sample house whose input feature value (sqft_living) is 2650,
# using the intercept/slope fitted on sqft_living
get_regression_predictions(2650, squarfeet_intercept, squarfeet_slope)

700074.8456294582

In [119]:
# Residual sum of squares of a fitted line against observed outputs
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the sum of squared differences between output and the line's predictions.

    Predictions are intercept + input_feature * slope; the residuals
    (output - prediction) are squared element-wise and summed with .sum().
    """
    residuals = output - (intercept + (input_feature * slope))
    squared_residuals = residuals * residuals
    return squared_residuals.sum()

In [120]:
# Find the RSS of the sqft_living model on the held-out test split (test_data), not the training set
rss_sqft = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], squarfeet_intercept, squarfeet_slope)

In [129]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: return the input value that predicts `output`.

    Solves y = m*x + c for x, i.e. x = (y - c) / m. The slope is converted
    with float() before dividing, so integer arguments still produce true
    (non-truncating) division.
    """
    return (output - intercept) / float(slope)

In [130]:
# Invert the sqft_living model: estimate the input feature value that corresponds to an output (price) of 800000
inverse_regression_predictions(800000, squarfeet_intercept, squarfeet_slope)

3004.3962476159445

In [123]:
# Train the model on the training split, using the 'bedrooms' column as the
# single feature and the 'price' column as the output.
# Note: this rebinds the notebook-level input_feature/output names set by the sqft_living cell.
input_feature = train_data['bedrooms']
output = train_data['price']
bedroom_intercept, bedroom_slope = simple_linear_regression(input_feature, output)
# Bare last expression so the notebook displays the fitted (intercept, slope)
bedroom_intercept, bedroom_slope

1822394156.19
14283.322538


(109473.18046928738, 127588.95217458403)

In [124]:
# RSS of the bedrooms model, evaluated on the test split (test_data)
rss_bedroom = get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], bedroom_intercept, bedroom_slope)

In [125]:
# Display both test-split RSS values together: (sqft_living model, bedrooms model)
rss_sqft, rss_bedroom

(275402936247141.47, 493364582868288.1)

In [126]:
# Small hand-made feature/output arrays used to check simple_linear_regression
test_features = graphlab.SArray([0,1,2,3,4])
test_output = graphlab.SArray([1,3,7,13,21])

In [127]:
# Fit the regression on the hand-made arrays; the returned (intercept, slope) is displayed as the cell output
simple_linear_regression(test_features, test_output)

50.0
10.0


(-1.0, 5.0)