In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Housing Data

In [2]:
# Explicit column dtypes kept for reference only: the mapping is commented out,
# so pandas infers every column type when the CSV files are read below.
#dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

# Train and test splits are stored as separate CSV files.
df_train = pd.read_csv("../Data/house_data/kc_house_train_data.csv")
df_test = pd.read_csv("../Data/house_data/kc_house_test_data.csv")

# Report (rows, columns) of each split as a quick sanity check.
print(f"\nDF Train Shape: {df_train.shape}")
print(f"\nDF Test Shape: {df_test.shape}")


DF Train Shape: (17384, 21)

DF Test Shape: (4229, 21)


# Linear Regression Functions

## Function: Simple linear regression

In [4]:
def simple_linear_regression_train(input_feature, output, data=None):
    """Fit a one-feature ordinary least squares model.

    Parameters
    ----------
    input_feature : str or list of str
        Column used as the single predictor, e.g. ``['sqft_living']``.
        Only one column is supported: the selection is reshaped to a single
        column, so a multi-column selection would be flattened.
    output : str or list of str
        Column used as the target, e.g. ``['price']``.
    data : pandas.DataFrame, optional
        Frame to fit on. When omitted, the notebook-level ``df_train`` is
        used, which is the behaviour existing callers rely on.

    Returns
    -------
    tuple
        ``(intercept_, coef_)`` of the fitted scikit-learn
        ``LinearRegression``. Because the target is passed as a 2-D column,
        these come back as arrays (shape ``(1,)`` and ``(1, 1)``), not scalars.
    """
    # Imported locally, as in the rest of this notebook, so the cell does not
    # depend on the top import cell listing scikit-learn.
    from sklearn.linear_model import LinearRegression

    # Resolve the frame at call time so the default tracks the current df_train.
    frame = df_train if data is None else data

    # scikit-learn expects 2-D inputs: one row per sample, one column.
    features = frame[input_feature].values.reshape(-1, 1)
    target = frame[output].values.reshape(-1, 1)

    model = LinearRegression().fit(features, target)
    return model.intercept_, model.coef_

def simple_linear_regression_test(input_feature, output, data=None):
    """Fit a one-feature ordinary least squares model on the test split.

    Note that this trains a *separate* model on the test data; it does not
    evaluate a model that was fitted on the training data.

    Parameters
    ----------
    input_feature : str or list of str
        Column used as the single predictor, e.g. ``['sqft_living']``.
        Only one column is supported: the selection is reshaped to a single
        column, so a multi-column selection would be flattened.
    output : str or list of str
        Column used as the target, e.g. ``['price']``.
    data : pandas.DataFrame, optional
        Frame to fit on. When omitted, the notebook-level ``df_test`` is
        used, which is the behaviour existing callers rely on.

    Returns
    -------
    tuple
        ``(intercept_, coef_)`` of the fitted scikit-learn
        ``LinearRegression``. Because the target is passed as a 2-D column,
        these come back as arrays (shape ``(1,)`` and ``(1, 1)``), not scalars.
    """
    # Imported locally, as in the rest of this notebook, so the cell does not
    # depend on the top import cell listing scikit-learn.
    from sklearn.linear_model import LinearRegression

    # Resolve the frame at call time so the default tracks the current df_test.
    frame = df_test if data is None else data

    # scikit-learn expects 2-D inputs: one row per sample, one column.
    features = frame[input_feature].values.reshape(-1, 1)
    target = frame[output].values.reshape(-1, 1)

    model_test = LinearRegression().fit(features, target)
    return model_test.intercept_, model_test.coef_


# Predictor and target columns, each given as a one-element column list.
input_feature_1 = ['sqft_living']
output_1 = ['price']

# One independent fit per split: the first uses df_train, the second df_test.
intercept_train, coef_train = simple_linear_regression_train(input_feature_1, output_1)
intercept_test, coef_test = simple_linear_regression_test(input_feature_1, output_1)

# Show both fitted lines so their parameters can be compared.
print(f'\n Train data intercept: {intercept_train} and coeff {coef_train}')
print(f'\n Test data intercept: {intercept_test} and coeff {coef_test}')


 Train data intercept: [-47116.07907289] and coeff [[281.95883963]]

 Test data intercept: [-28639.89722154] and coeff [[274.93662162]]


## Function: Linear regression predictions

In [5]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``input_feature * slope + intercept``.

    Parameters
    ----------
    input_feature : number or array-like
        Predictor value(s) to evaluate the line at.
    intercept : number or array-like
        Intercept of the fitted line.
    slope : number or array-like
        Slope of the fitted line.

    Returns
    -------
    The predicted output; its type and shape follow the usual arithmetic
    (broadcasting) rules of the operands.
    """
    return input_feature * slope + intercept

# Fit one line per split; the (intercept, slope) pairs are reused by later cells.
intercept_val, slope_val = simple_linear_regression_train(input_feature_1, output_1)
intercept_test, slope_test = simple_linear_regression_test(input_feature_1, output_1)

# Evaluate both fitted lines at the same input value (2650) for comparison.
result_pred_train, result_pred_test = (
    get_regression_predictions(2650, fitted_intercept, fitted_slope)
    for fitted_intercept, fitted_slope in (
        (intercept_val, slope_val),
        (intercept_test, slope_test),
    )
)

print(f"Price prediction (train) for a house with area 2650 is {result_pred_train}")
print(f"\nPrice prediction (test) for a house with area 2650 is {result_pred_test}")

Price prediction (train) for a house with area 2650 is [[700074.84594751]]

Price prediction (test) for a house with area 2650 is [[699942.15007369]]


## Function: Residual sum of squares (RSS)

In [6]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares (RSS) of a fitted line.

    RSS is ``sum((output_i - prediction_i) ** 2)`` over all samples, where the
    predictions come from ``get_regression_predictions``. This is a *sum*, not
    a mean: the two only coincide when there is exactly one sample.

    Parameters
    ----------
    input_feature : number or array-like
        Predictor value(s) at which the line is evaluated.
    output : number or array-like
        Observed target value(s), one per predictor value.
    intercept : number or array-like
        Intercept of the fitted line.
    slope : number or array-like
        Slope of the fitted line.

    Returns
    -------
    float
        The residual sum of squares.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of observations.
    """
    # Convert up front so plain Python lists are multiplied element-wise
    # instead of being repeated by the `*` operator.
    predictions = get_regression_predictions(
        np.asarray(input_feature, dtype=float), intercept, slope
    )

    # Flatten both sides: array-valued intercept/slope make the predictions
    # 2-D, and subtracting a 1-D target from them could broadcast to an
    # (n, n) matrix and silently inflate the result.
    predicted = np.ravel(predictions)
    observed = np.ravel(np.asarray(output, dtype=float))
    if predicted.size != observed.size:
        raise ValueError(
            "Found {} predictions but {} observed outputs".format(
                predicted.size, observed.size
            )
        )

    residuals = observed - predicted
    return float(np.sum(residuals ** 2))

# Single-sample check: squared error of the train-fit line at input 2650
# against an observed output of 446500.
RSS_ans = get_residual_sum_of_squares(
    input_feature=2650,
    output=[446500],
    intercept=intercept_val,
    slope=slope_val,
)
print(RSS_ans)

64300202497.30519


## Function: Inverse prediction (estimate the input from a given output)

In [8]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: solve ``output = input * slope + intercept`` for the input.

    Parameters
    ----------
    output : number or array-like
        Target value(s) for which the matching input is wanted.
    intercept : number or array-like
        Intercept of the fitted line.
    slope : number or array-like
        Slope of the fitted line; it is used as the divisor.

    Returns
    -------
    The estimated input, ``(output - intercept) / slope``.
    """
    return (output - intercept) / slope

# Invert the train-fit line to find the input that maps to an output of 800000.
val_area_800k = inverse_regression_predictions(
    output=800000,
    intercept=intercept_val,
    slope=slope_val,
)
print(f"Area prediction for a house with price of $800,000 is {val_area_800k}")

Area prediction for a house with price of $800,000 is [[3004.39624515]]
