In [1]:
import pandas as pd

In [2]:
# Load the training and test splits of the house data from CSV files
# (paths are relative to the notebook's working directory).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [3]:
def simple_linear_regression(input_feature, output):
    """
    Fit a simple (single-feature) linear regression using the closed-form
    least-squares solution.

    With n observations x (input_feature) and y (output):
        slope     = (sum(x*y) - sum(x)*sum(y)/n) / (sum(x*x) - sum(x)*sum(x)/n)
        intercept = sum(y)/n - slope*sum(x)/n

    :param input_feature: a column of input feature values
    :param output: a column of output values, the same length as input_feature
    :return intercept: single number value of the intercept
    :return slope: single number value of the slope
    :raises ValueError: if input_feature and output have different lengths
    """
    num_observations = len(output)
    if len(input_feature) != num_observations:
        raise ValueError(
            "input_feature and output must have the same length "
            "(got %d and %d)" % (len(input_feature), num_observations)
        )

    # Each sum is a full pass over the data, so compute it once and reuse it.
    sum_input = sum(input_feature)
    sum_output = sum(output)
    sum_input_output = sum(input_feature*output)
    sum_input_squared = sum(input_feature**2)

    # The denominator is zero when every input value is identical (or the
    # columns are empty); the slope is undefined in that case and the
    # division below fails or yields a non-finite value, depending on the
    # numeric types involved.
    numerator = sum_input_output - sum_input*sum_output/num_observations
    denominator = sum_input_squared - sum_input*sum_input/num_observations
    slope = numerator/denominator
    intercept = sum_output/num_observations - slope*sum_input/num_observations

    return intercept, slope

In [4]:
def get_regression_predictions(input_feature, intercept, slope):
    """
    Evaluate the fitted line at the given input.

    :param input_feature: a single value or a column of input feature values
    :param intercept: intercept of the regression line
    :param slope: slope of the regression line
    :return: intercept + input_feature*slope, with the same shape as input_feature
    """
    scaled_feature = input_feature*slope
    return intercept + scaled_feature

In [5]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """
    Compute the residual sum of squares (RSS): the sum of the squared
    prediction errors, where each error is the difference between the
    observed output and the line's prediction.

    :param input_feature: a column of input feature values
    :param output: a column of observed output values
    :param intercept: intercept of the regression line
    :param slope: slope of the regression line
    :return: the RSS as a single number
    """
    predictions = intercept+input_feature*slope
    residuals = output - predictions
    return sum(residuals**2)


In [6]:
def inverse_regression_predictions(output, intercept, slope):
    """
    Invert the fitted line: find the input value that the model maps to
    the given output.

    :param output: a single value or a column of output values
    :param intercept: intercept of the regression line
    :param slope: slope of the regression line (used as the divisor)
    :return: (output - intercept)/slope, the estimated input
    """
    offset_from_intercept = output-intercept
    return offset_from_intercept/slope

In [7]:
# Fit a simple linear regression of 'price' on 'sqft_living' using the training data.
# `input_feature` and `output` are kept as notebook-level names because later cells reuse them.
input_feature, output = train_data['sqft_living'], train_data['price']
squarefeet_intercept, squarefeet_slope = simple_linear_regression(
    input_feature,
    output,
)

In [8]:
# Forward prediction from the sqft_living model for a 2650 sqft house.
predicted_price = get_regression_predictions(2650, squarefeet_intercept, squarefeet_slope)
print("Predicted price for a house with 2650 sqft", predicted_price)

# RSS of the sqft_living model, evaluated on the columns held in `input_feature` and `output`.
squarefeet_rss = get_residual_sum_of_squares(input_feature, output, squarefeet_intercept, squarefeet_slope)
print("RSS of the model: ", squarefeet_rss)

# Inverse prediction: the square footage the model associates with a price of 800000.
estimated_sqft = inverse_regression_predictions(800000, squarefeet_intercept, squarefeet_slope)
print("The estimated square-feet for a house costing $800,000 is: ", estimated_sqft)

Predicted price for a house with 2650 sqft 700074.8459475137
RSS of the model:  1201918354177286.2
The estimated square-feet for a house costing $800,000 is:  3004.3962451522766


In [9]:
# Build simple linear regression model based on 'bedrooms' as the input feature, and 'price' as the outcome.
# simple_linear_regression returns (intercept, slope) in that order, so unpack in the same order;
# get_residual_sum_of_squares likewise takes intercept before slope.
bedroom_intercept, bedroom_slope = simple_linear_regression(train_data['bedrooms'], output)
print("RSS of the model: ", get_residual_sum_of_squares(train_data['bedrooms'], output, bedroom_intercept, bedroom_slope))

RSS of the model:  2143244498162069.0
