In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

## Import Data

In [2]:
train_data = pd.read_csv('kc_house_train_data.csv')
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17384 entries, 0 to 17383
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             17384 non-null  int64  
 1   date           17384 non-null  object 
 2   price          17384 non-null  float64
 3   bedrooms       17384 non-null  int64  
 4   bathrooms      17384 non-null  float64
 5   sqft_living    17384 non-null  int64  
 6   sqft_lot       17384 non-null  int64  
 7   floors         17384 non-null  float64
 8   waterfront     17384 non-null  int64  
 9   view           17384 non-null  int64  
 10  condition      17384 non-null  int64  
 11  grade          17384 non-null  int64  
 12  sqft_above     17384 non-null  int64  
 13  sqft_basement  17384 non-null  int64  
 14  yr_built       17384 non-null  int64  
 15  yr_renovated   17384 non-null  int64  
 16  zipcode        17384 non-null  int64  
 17  lat            17384 non-null  float64
 18  long  

In [4]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [5]:
train = train_data.astype(dtype_dict)
train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17384 entries, 0 to 17383
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             17384 non-null  object 
 1   date           17384 non-null  object 
 2   price          17384 non-null  float64
 3   bedrooms       17384 non-null  float64
 4   bathrooms      17384 non-null  float64
 5   sqft_living    17384 non-null  float64
 6   sqft_lot       17384 non-null  int64  
 7   floors         17384 non-null  object 
 8   waterfront     17384 non-null  int64  
 9   view           17384 non-null  int64  
 10  condition      17384 non-null  int64  
 11  grade          17384 non-null  int64  
 12  sqft_above     17384 non-null  int64  
 13  sqft_basement  17384 non-null  int64  
 14  yr_built       17384 non-null  int64  
 15  yr_renovated   17384 non-null  int64  
 16  zipcode        17384 non-null  object 
 17  lat            17384 non-null  float64
 18  long  

- Adding 4 new variables:
1. ‘bedrooms_squared’ = ‘bedrooms’*‘bedrooms’

2. ‘bed_bath_rooms’ = ‘bedrooms’*‘bathrooms’

3. ‘log_sqft_living’ = log(‘sqft_living’)

4. ‘lat_plus_long’ =  ‘lat’ + ‘long’

In [7]:
train['bedrooms_squared'] = train['bedrooms']*train['bedrooms'] # increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms
train['bed_bath_rooms'] = train['bedrooms']*train['bathrooms'] # interaction variable
train['log_sqft_living'] = np.log(train['sqft_living']) # bringing large values closer together and spreading out small values
train['lat_plus_long'] = train['lat']+train['long']
train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,9.0,3.0,7.07327,-74.7458
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,9.0,6.75,7.851661,-74.598
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,4.0,2.0,6.646391,-74.4951
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,16.0,12.0,7.5807,-74.8722
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,9.0,6.0,7.426549,-74.4282


In [8]:
# Quiz 1: Mean value on new variables 
train.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,waterfront,view,condition,grade,sqft_above,...,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
count,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,...,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0,17384.0
mean,539366.6,3.369363,2.115048,2080.02951,15091.91,0.007651,0.236079,3.41078,7.655028,1787.844512,...,1971.152727,83.107973,47.559313,-122.213281,1985.994995,12776.380867,12.174241,7.496592,7.550349,-74.653968
std,369691.2,0.906468,0.771783,921.630888,41459.27,0.087136,0.768008,0.649792,1.169818,827.107595,...,29.328722,398.692283,0.138703,0.140906,686.512835,27175.730523,6.731892,4.227952,0.424393,0.184071
min,75000.0,0.0,0.0,290.0,520.0,0.0,0.0,1.0,1.0,290.0,...,1900.0,0.0,47.1593,-122.519,399.0,651.0,0.0,0.0,5.669881,-75.1718
25%,320000.0,3.0,1.75,1420.0,5049.5,0.0,0.0,3.0,7.0,1200.0,...,1952.0,0.0,47.46865,-122.328,1490.0,5100.0,9.0,4.5,7.258412,-74.7686
50%,450000.0,3.0,2.25,1910.0,7616.0,0.0,0.0,3.0,7.0,1560.0,...,1975.0,0.0,47.5714,-122.229,1840.0,7620.0,9.0,7.0,7.554859,-74.66825
75%,640000.0,4.0,2.5,2550.0,10665.25,0.0,0.0,4.0,8.0,2210.0,...,1997.0,0.0,47.677625,-122.125,2360.0,10065.25,16.0,10.0,7.843849,-74.5253
max,7700000.0,10.0,8.0,13540.0,1651359.0,1.0,4.0,5.0,13.0,9410.0,...,2015.0,2015.0,47.7776,-121.315,6210.0,871200.0,100.0,67.5,9.513404,-73.6038


In [10]:
# Model 1: sqft_living, bedrooms, bathrooms, lat, and long
X_model1 = train[['sqft_living','bedrooms','bathrooms','lat','long']]
y_model1 = train['price']
model_1 = LinearRegression()
model_1.fit(X_model1, y_model1)

# Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’
X_model2 = train[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms']]
y_model2 = train['price']
model_2 = LinearRegression()
model_2.fit(X_model2, y_model2)

# Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’
X_model3 = train[['sqft_living','bedrooms','bathrooms','lat','long','bed_bath_rooms','bedrooms_squared','lat_plus_long']]
y_model3 = train['price']
model_3 = LinearRegression()
model_3.fit(X_model3, y_model3)

LinearRegression()

In [12]:
print(model_1.coef_)

[ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
 -3.09374351e+05]


In [13]:
model_1.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [14]:
print(list(zip(model_1.coef_, X_model1)))

[(312.25864627320277, 'sqft_living'), (-59586.53315361202, 'bedrooms'), (15706.742082734683, 'bathrooms'), (658619.2639305172, 'lat'), (-309374.3512682333, 'long')]


In [16]:
coef_dict= {}
for feature, param in zip(X_model1, model_1.coef_):
    coef_dict[feature] = param

In [17]:
coef_dict

{'sqft_living': 312.25864627320277,
 'bedrooms': -59586.53315361202,
 'bathrooms': 15706.742082734683,
 'lat': 658619.2639305172,
 'long': -309374.3512682333}

In [22]:
def coef_dict(model,coef):
    coef_dict = {}
    for feature, param in zip(model,coef):
        coef_dict[feature]=param
    return coef_dict

In [30]:
model_1_coefficients = coef_dict(X_model1,model_1.coef_)
print('Model 1')
pd.Series(model_1_coefficients).to_frame('coefficients')

Model 1


Unnamed: 0,coefficients
sqft_living,312.258646
bedrooms,-59586.533154
bathrooms,15706.742083
lat,658619.263931
long,-309374.351268


In [31]:
model_2_coefficients = coef_dict(X_model2,model_2.coef_)
print('Model 2')
pd.Series(model_2_coefficients).to_frame('coefficients')

Model 2


Unnamed: 0,coefficients
sqft_living,306.610053
bedrooms,-113446.36807
bathrooms,-71461.308293
lat,654844.629503
long,-294298.969138
bed_bath_rooms,25579.652001


In [33]:
model_3_coefficients = coef_dict(X_model3,model_3.coef_)
print('Model 3')
pd.Series(model_3_coefficients).to_frame('coefficients')

Model 3


Unnamed: 0,coefficients
sqft_living,304.464634
bedrooms,-70933.241592
bathrooms,-104607.813258
lat,538618.642464
long,-418314.482413
bed_bath_rooms,34970.725385
bedrooms_squared,-8513.115799
lat_plus_long,120304.160051


In [45]:
# 6. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?

# 7. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?

# 10. Quiz Question: Which model (1, 2 or 3) had the lowest RSS on TRAINING data? 

In [42]:
from sklearn.metrics import mean_squared_error
model1_predictions = model_1.predict(X_model1)
model1_mse = mean_squared_error(model1_predictions, y_model1)
print('RSS for model 1 is {}'.format(model1_mse*model1_mse))

RSS for model 1 is 3.0998706476500055e+21


In [43]:
model2_predictions = model_2.predict(X_model2)
model2_mse = mean_squared_error(model2_predictions, y_model2)
print('RSS for model 2 is {}'.format(model2_mse*model2_mse))

RSS for model 2 is 3.0395688026000374e+21


In [44]:
model3_predictions = model_3.predict(X_model3)
model3_mse = mean_squared_error(model3_predictions, y_model3)
print('RSS for model 3 is {}'.format(model3_mse*model3_mse))

RSS for model 3 is 3.0288471409655546e+21


In [None]:
# 12. Quiz Question: Which model (1, 2, or 3) had the lowest RSS on TESTING data?
