In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
# Explicit column dtypes for the kc_house CSVs. Identifier-like columns
# (id, zipcode, date) and floors are read as strings; the rest are numeric.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Load the train and test splits with the same dtype mapping.
train_data = pd.read_csv("kc_house_train_data.csv", dtype=dtype_dict)
test_data = pd.read_csv("kc_house_test_data.csv", dtype=dtype_dict)

In [9]:
# (rows, columns) of the training and test frames, shown side by side.
(train_data.shape, test_data.shape)

((17384, 21), (4229, 21))

In [10]:
# Preview the first five training rows.
train_data.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [41]:
def simple_linear_regression(input_feature, output, data=None):
    """Fit a one-feature least-squares line: output ~ intercept + slope * input_feature.

    Uses the closed-form solution built from the sums of x, y, x*y and x**2.

    Parameters
    ----------
    input_feature : str
        Name of the column in ``data`` used as the predictor.
    output : str
        Name of the column in ``data`` used as the target.
    data : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level
        ``train_data`` frame is used, which matches the original behavior.

    Returns
    -------
    tuple
        ``(intercept, slope)`` of the fitted line.

    Raises
    ------
    ValueError
        If ``data`` has no rows, or if the feature column is constant
        (the slope is undefined in that case).
    """
    if data is None:
        # Fall back to the notebook's global training frame.
        data = train_data

    # Cast to float so integer columns cannot overflow when squared and summed.
    x = data[input_feature].astype(float)
    y = data[output].astype(float)

    n = x.shape[0]
    if n == 0:
        raise ValueError("cannot fit a regression on an empty data set")
    if x.min() == x.max():
        raise ValueError(
            "input feature %r is constant; the slope is undefined" % (input_feature,)
        )

    sum_x = x.sum()
    sum_y = y.sum()
    sum_xy = (x * y).sum()
    sum_xx = (x ** 2).sum()

    # Closed-form least-squares estimates.
    slope = (sum_xy - sum_x * sum_y / n) / (sum_xx - sum_x ** 2 / n)
    intercept = (sum_y / n) - (slope * sum_x / n)
    return intercept, slope

In [42]:
# Fit price against sqft_living and display the resulting coefficients.
intercept, slope = simple_linear_regression(input_feature="sqft_living", output="price")
(intercept, slope)

(-47116.07907289418, 281.9588396303426)

In [48]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line at ``input_feature``.

    Parameters
    ----------
    input_feature : scalar or array-like supporting ``*`` and ``+``
        Feature value(s) to predict for.
    intercept, slope : float
        Coefficients of the line.

    Returns
    -------
    ``input_feature * slope + intercept``, with the same shape as the input.
    """
    return input_feature * slope + intercept

In [49]:
# Prediction for an input value of 2650 under the current coefficients.
get_regression_predictions(input_feature=2650, intercept=intercept, slope=slope)

700074.8459475137

In [50]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of a line against observed outputs.

    Parameters
    ----------
    input_feature : array-like
        Feature values.
    output : array-like
        Observed target values, aligned with ``input_feature``.
    intercept, slope : float
        Coefficients of the line being evaluated.

    Returns
    -------
    Sum over all observations of ``(prediction - output) ** 2``.
    """
    residuals = input_feature * slope + intercept - output
    return np.sum(residuals ** 2)

In [53]:
# RSS of the current (intercept, slope) on the training sqft_living/price columns.
input_feature = train_data["sqft_living"]
output = train_data["price"]
get_residual_sum_of_squares(
    input_feature=input_feature,
    output=output,
    intercept=intercept,
    slope=slope,
)

1201918354177283.0

In [54]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: find the input that yields a given output.

    Parameters
    ----------
    output : scalar or array-like
        Target value(s) to invert.
    intercept, slope : float
        Coefficients of the line.

    Returns
    -------
    ``(output - intercept) / slope``.
    """
    return (output - intercept) / slope

In [55]:
# Input value the current line maps to an output of 800000.
inverse_regression_predictions(output=800000, intercept=intercept, slope=slope)

3004.3962451522766

In [56]:
# Fit price against bedrooms and display the resulting coefficients.
intercept, slope = simple_linear_regression(input_feature="bedrooms", output="price")
(intercept, slope)

(109473.1776229596, 127588.95293398784)

In [57]:
# Re-fit the sqft_living model rather than pasting its coefficients from an
# earlier cell's output, so this cell stays correct if the data or fit changes.
intercept, slope = simple_linear_regression("sqft_living", "price")

# RSS of that model on the held-out test set.
input_feature = test_data["sqft_living"]
output = test_data["price"]
get_residual_sum_of_squares(input_feature, output, intercept, slope)

275402933617812.12

In [58]:
# Re-fit the bedrooms model rather than pasting its coefficients from an
# earlier cell's output, so this cell stays correct if the data or fit changes.
intercept, slope = simple_linear_regression("bedrooms", "price")

# RSS of that model on the held-out test set.
input_feature = test_data["bedrooms"]
output = test_data["price"]
get_residual_sum_of_squares(input_feature, output, intercept, slope)

493364585960300.9