unit 3 / lesson 1 / project 4
# Challenge: Model Comparison

In [1]:
import pandas as pd
import numpy as np
import math

from sklearn import neighbors
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
id               21597 non-null int64
date             21597 non-null object
price            21597 non-null float64
bedrooms         21597 non-null int64
bathrooms        21597 non-null float64
sqft_living      21597 non-null int64
sqft_lot         21597 non-null int64
floors           21597 non-null float64
waterfront       21597 non-null int64
view             21597 non-null int64
condition        21597 non-null int64
grade            21597 non-null int64
sqft_above       21597 non-null int64
sqft_basement    21597 non-null int64
yr_built         21597 non-null int64
yr_renovated     21597 non-null int64
zipcode          21597 non-null int64
lat              21597 non-null float64
long             21597 non-null float64
sqft_living15    21597 non-null int64
sqft_lot15       21597 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB


In [3]:
# Instead of looking at year built and year sold, 
# look at how old the house was at the time of sale

df['year_sold'] = df.date.str[-4:].astype(int)
df['age'] =df['year_sold'] - df['yr_built']

# instead of looking at the year it was renovated,
# look at how long it has been since it was last renovated
# in the case it has never been renovated, then the last "renovation" would
# be the same as the year it was first built
df['last_renovation'] = df['age']
for index, row in df.iterrows():
    yr_renovated = df.loc[index, 'yr_renovated']
    if (yr_renovated > 0):
        year_sold = df.loc[index, 'year_sold']
        df.loc[[index], ['last_renovation']] = year_sold - yr_renovated

# sqft_living is redundant since it is sqft_above + sqft_basement
df.drop(['id', 'date', 'sqft_living', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'year_sold'], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,age,last_renovation
0,221900.0,3,1.0,5650,1.0,0,0,3,7,1180,0,59,59
1,538000.0,3,2.25,7242,2.0,0,0,3,7,2170,400,63,23
2,180000.0,2,1.0,10000,1.0,0,0,3,6,770,0,82,82
3,604000.0,4,3.0,5000,1.0,0,0,5,7,1050,910,49,49
4,510000.0,3,2.0,8080,1.0,0,0,3,8,1680,0,28,28


In [5]:
X = df.drop('price', axis=1)
y = df['price']

scaler = MinMaxScaler(feature_range=(0, 1))

X_scaled = pd.DataFrame(scaler.fit_transform(X))

X_scaled.columns = X.columns

In [6]:
X_scaled.head()

Unnamed: 0,bedrooms,bathrooms,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,age,last_renovation
0,0.0625,0.066667,0.003108,0.0,0.0,0.0,0.5,0.4,0.089602,0.0,0.517241,0.517241
1,0.0625,0.233333,0.004072,0.4,0.0,0.0,0.5,0.4,0.199115,0.082988,0.551724,0.206897
2,0.03125,0.066667,0.005743,0.0,0.0,0.0,0.5,0.3,0.044248,0.0,0.715517,0.715517
3,0.09375,0.333333,0.002714,0.0,0.0,0.0,1.0,0.4,0.075221,0.188797,0.431034,0.431034
4,0.0625,0.2,0.004579,0.0,0.0,0.0,0.5,0.5,0.144912,0.0,0.25,0.25


## KNN Regression

In [10]:
knn_model = neighbors.KNeighborsRegressor(n_neighbors = 5)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

In [16]:
knn_model.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [17]:
knn_pred = knn_model.predict(X_test)

In [18]:
# rmse
knn_error = np.sqrt(mean_squared_error(y_test,knn_pred))
print(knn_error)

280640.45684049197


In [19]:
df.price.mean()

540296.5735055795

## Linear Regression

In [24]:
linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
klinreg_pred = linreg_model.predict(X_test)
# rmse
linreg_error = np.sqrt(mean_squared_error(y_test,klinreg_pred))
print(linreg_error)

231117.29185063485
