In [114]:
import pandas as pd

import numpy as np
import sklearn

In [115]:
print(sklearn.__version__)

print(pd.__version__)

print(np.__version__)

0.21.2
0.24.2
1.16.4


In [116]:
from sklearn.model_selection import train_test_split

In [117]:
import matplotlib 

In [149]:
sales = pd.read_csv('kc_house_data.csv')

In [150]:
sales.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [158]:
# Use mask to split random
msk = np.random.rand(len(sales)) < 0.8

train_data, test_data = sales[msk], sales[~msk]

In [159]:
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')


In [160]:
#train_data, test_data = train_test_split(sales, test_size=0.2, random_state=0, shuffle=False)

In [161]:
print('Total sample in training data and test data is: %s, %s'
     % (len(train_data), len(test_data)))

Total sample in training data and test data is: 17384, 4229


### Closed-form solution to calculate the slope and intercept

In [162]:
def simple_linear_regression(input_feature, output):
    intercept, slope = 0, 0
    mean_xy = (input_feature * output).mean()
    mean_x = input_feature.mean()
    mean_y = output.mean()
    mean_xx = (input_feature * input_feature).mean()
    
    slope = (mean_xy - mean_x * mean_y) / (mean_xx - mean_x * mean_x)
    intercept = mean_y - slope * mean_x
    
    return (intercept, slope)

In [163]:
def simple_linear_regression_2(input_feature, output):
    intercept, slope = 0, 0
    mean_xy = (input_feature * output).mean()
    mean_x = input_feature.mean()
    mean_y = output.mean()
    mean_xx = (input_feature * input_feature).mean()

    tmp = input_feature - mean_x
    slope = (tmp * (output - mean_y)).sum() / (tmp * tmp).sum()
    intercept = mean_y - slope * mean_x
    
    return (intercept, slope)

In [164]:
input_feature = train_data['sqft_living']
output = train_data['price']

In [165]:
sqft_intercept, sqft_slope = simple_linear_regression(input_feature, output)
print (sqft_intercept, sqft_slope)

-47116.07907289488 281.95883963034294


In [166]:
sqft_intercept, sqft_slope = simple_linear_regression_2(input_feature, output)
print (sqft_intercept, sqft_slope)

-47116.07907289371 281.95883963034237


### Predicted output

In [167]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_output = input_feature * slope + intercept
    return predicted_output

In [168]:
sample_feature = pd.DataFrame([2650])

In [169]:
print('Predicted price for a particular house: %s' % get_regression_predictions(sample_feature, sqft_intercept, sqft_slope)[0][0])

Predicted price for a particular house: 700074.8459475136


### Calculate Residual Sum of Squares (RSS)

In [170]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    predicted_output = get_regression_predictions(input_feature, intercept, slope)
    diff = output - predicted_output
    return (diff * diff).sum()

In [171]:
sqft_rss = get_residual_sum_of_squares(input_feature, output, sqft_intercept, sqft_slope)
print (sqft_rss)

1201918354177283.2


### Inverse regression predictions

In [172]:
def inverse_regression_predictions(output, intercept, slope):
    return (output - intercept) / slope

In [173]:
housing_cost = pd.DataFrame([800000])

In [174]:
estimated_sqft_living = inverse_regression_predictions(housing_cost,
                                                       sqft_intercept,sqft_slope)
print (estimated_sqft_living)

             0
0  3004.396245


In [137]:
bedroom_feature = train_data['bedrooms']

In [138]:
(bedroom_intercept, bedroom_slope) = simple_linear_regression(bedroom_feature,
                                                             output)

In [139]:
print (bedroom_intercept, bedroom_slope)

146052.87236618437 116771.26739926598


In [140]:
bedroom_rss = get_residual_sum_of_squares(bedroom_feature, output,
                            bedroom_intercept, bedroom_slope)
print (bedroom_rss)

2109512992748517.8


In [141]:
sqft_rss < bedroom_rss

True

### Compare on test data

In [142]:
sqft_living_test = test_data['sqft_living']
price_test = test_data['price']
bedrooms_test = test_data['bedrooms']

In [143]:
(sqft_test_intercept, sqft_test_slope) = simple_linear_regression(sqft_living_test,
                                                                 price_test)
print (sqft_test_intercept, sqft_test_slope)

-46961.947796835215 281.627548678143


In [144]:
(bedrooms_test_intercept, bedrooms_test_slope) = simple_linear_regression(
    bedrooms_test, price_test)
print (bedrooms_test_intercept, bedrooms_test_slope)

62348.88438127545 142209.06161986655


In [145]:
sqft_test_rss = get_residual_sum_of_squares(sqft_living_test, price_test, 
                                            sqft_test_intercept,
                                            sqft_test_slope
                                           )
print (sqft_test_rss)

289775338497663.2


In [146]:
bedroom_test_rss = get_residual_sum_of_squares(
    bedrooms_test, price_test,
    bedrooms_test_intercept, bedrooms_test_slope
)
print (bedroom_test_rss)

524536077804989.0


In [147]:
sqft_test_rss < bedroom_test_rss

True