# W1_Simple Linear Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
# Load the training split from CSV and preview its first rows.
train_data = pd.read_csv('kc_house_train_data.csv')
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
def simple_linear_regression(X, y):
    """
    Fit a one-feature linear regression with scikit-learn's LinearRegression.

    input: X(one feature) -- either 1-D (list, NumPy array, pandas Series) or
           already 2-D with a single column; y(target)
    return: (intercept, slope) of the fitted linear regression model
    """
    if np.ndim(X) == 1:
        # LinearRegression.fit needs a 2-D feature matrix. Promote a flat
        # vector to one column here so callers do not have to reshape
        # (pandas Series no longer offers .reshape). 2-D input is passed
        # through untouched.
        X = np.asarray(X).reshape(-1, 1)
    lr = LinearRegression().fit(X, y)
    # coef_ holds one coefficient per feature; with a single feature the
    # first entry is the slope.
    return (lr.intercept_, lr.coef_[0])

In [4]:
def get_predictions(X, intercept, slope):
    """
    Evaluate the fitted line at the given feature value(s).

    input: X(feature) to predict for, the model's intercept, the model's slope
    return: predicted y, computed as intercept + slope * X
    """
    return intercept + slope * X

In [5]:
def get_rss(X, y, intercept, slope):
    """
    Measure how far the line's predictions are from the observed targets.

    input: X(feature), y(observed target), model's intercept, model's slope
    return: residual sum of squares (RSS, also called SSE) of the model on (X, y)
    """
    # Residual = predicted value minus observed value, element by element.
    residuals = get_predictions(X, intercept, slope) - y
    return sum(residuals ** 2)

In [6]:
def inverse_predictions(y, intercept, slope):
    """
    Invert the fitted line: solve y = intercept + slope * X for X.

    input: y(target value), model's intercept, model's slope
    return: the X that the model maps to the given y
    """
    offset = y - intercept
    return offset / slope

In [7]:
# 1st model
# use 'sqft_living' as the input feature
# Double-bracket selection returns a 2-D single-column DataFrame, the shape
# LinearRegression.fit expects. Series.reshape was deprecated and has since
# been removed from pandas, so it must not be used here.
sqft_living_intercept, sqft_living_slope = simple_linear_regression(train_data[['sqft_living']], train_data['price'])

  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# make a prediction of price when sqft_living is 2650,
# using the intercept and slope fitted on 'sqft_living'
get_predictions(2650, sqft_living_intercept, sqft_living_slope)

700074.84594751359

In [9]:
# RSS of 1st model, evaluated on the same training data it was fitted on
sqft_living_rss = get_rss(train_data['sqft_living'], train_data['price'], sqft_living_intercept, sqft_living_slope)
sqft_living_rss

1201918354177285.8

In [10]:
# make a prediction of sqft_living when price is 800000,
# by inverting the 1st model's fitted line
inverse_predictions(800000, sqft_living_intercept, sqft_living_slope)

3004.3962451522771

In [11]:
# 2nd model
# use 'bedrooms' as the input feature
# Double-bracket selection returns a 2-D single-column DataFrame, the shape
# LinearRegression.fit expects. Series.reshape was deprecated and has since
# been removed from pandas, so it must not be used here.
bedrooms_intercept, bedrooms_slope = simple_linear_regression(train_data[['bedrooms']], train_data['price'])

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# RSS of 2nd model, evaluated on the same training data it was fitted on
bedrooms_rss = get_rss(train_data['bedrooms'], train_data['price'], bedrooms_intercept, bedrooms_slope)
bedrooms_rss

2143244498162068.0

In [13]:
# Compare the two models' training RSS: True would mean the 'sqft_living'
# model has the larger error; False means its RSS is not larger than the
# 'bedrooms' model's.
sqft_living_rss > bedrooms_rss

False