## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import scipy.stats as stats
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [2]:
#loading data
# NOTE(review): the output recorded for this cell is a FileNotFoundError, and later
# cells save figures under '../images/' - confirm this path relative to the
# notebook's working directory.
csv_path = 'data/kc_house_data.csv'
# The 'id' column becomes the row index, so it is not available as a regular column.
df = pd.read_csv(csv_path, index_col='id')

df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/kc_house_data.csv'

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# column data type cleanup (Zach)
# Every column is reassigned with df[col] = ...: calling .replace(..., inplace=True)
# on df.<col> is a chained in-place update, which pandas 2.x warns about and which
# never reaches df once Copy-on-Write is enabled.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

# Text label -> ordinal code for each categorical column.
ordinal_codes = {
    'waterfront': {'NO': 0, 'YES': 1},
    'view': {'NONE': 0, 'FAIR': 1, 'AVERAGE': 2, 'GOOD': 3, 'EXCELLENT': 4},
    'condition': {'Poor': 0, 'Fair': 1, 'Average': 2, 'Good': 3, 'Very Good': 4},
    'grade': {'3 Poor': 0, '4 Low': 1, '5 Fair': 2, '6 Low Average': 3, '7 Average': 4, '8 Good': 5,
              '9 Better': 6, '10 Very Good': 7, '11 Excellent': 8, '12 Luxury': 9, '13 Mansion': 10},
}
for col, codes in ordinal_codes.items():
    # pd.to_numeric guarantees a numeric dtype and raises if a label missing from
    # the mapping is left behind as text, instead of leaving an object column.
    df[col] = pd.to_numeric(df[col].replace(codes))

# '?' is turned into NaN so the column can be stored as float.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['sqft_basement'] = df['sqft_basement'].replace('?', np.nan).astype(float)

In [None]:
# corr matrix and heatmap
# Restrict to numeric columns explicitly: since pandas 2.0 DataFrame.corr() no longer
# drops non-numeric columns (such as the datetime 'date') on its own.
corrMatrix = df.select_dtypes(include='number').corr()
# Boolean-like mask of the upper triangle (diagonal included) so that each pair of
# features is drawn only once.
matrix = np.triu(corrMatrix)

# Put corr matrix in seaborn heat map
fig, ax = plt.subplots(figsize=(17, 17))
sns.heatmap(corrMatrix, annot=True, mask=matrix, ax=ax)
ax.set_title('Correlation Matrix of King County Housing data', size=30)
ax.set_xlabel('House features', size=30)
ax.set_ylabel('House features', size=30)
ax.tick_params(axis='both', labelsize=20);

In [None]:
# Replace every missing value in the frame with 0, then preview the result.
df = df.fillna(value=0)
df.head()

# Adonis's Analysis

### Insight: Features are related to the actual house and the area in which the house is located. Three models will be built for price: Inferential, Predictive, and a model based upon area.

In [None]:
# Pairwise scatter plots of the frame's columns; the figure is written to disk
# before being shown.
scatter_axes = pd.plotting.scatter_matrix(frame=df, figsize=(15, 15))
plt.savefig('../images/scatter_matrix_AM.png')
plt.show()

In [None]:
df.corr()

In [None]:
abs(df.corr()) > 0.75

### Insight: `sqft_living` is highly correlated with `bathrooms`, `grade`, `sqft_above`, and `sqft_living15` (above the 0.75 threshold checked in the previous cell). These four features will be dropped from the predictive model due to collinearity.

## MODEL CREATION (SKLEARN, predictive)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [None]:
y = df[['price']]
X = df.drop(['id', 'price', 'date', 'bathrooms', 'grade', 'sqft_above', 'sqft_living15'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=42)

In [None]:
#create dummy regressor as baseline
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
dy_pred = dummy.predict(X_train)
dy_pred_test = dummy.predict(X_test)

In [None]:
dummy.score(X_train, y_train)

In [None]:
#linear regression model with no feature adjustments
# Fit on the full (unsplit) feature matrix and report R^2 on that same data.
lr = LinearRegression().fit(X, y)
lr.score(X, y)

In [None]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_train)
y_pred_test = linreg.predict(X_test)

In [None]:
linreg.score(X_test, y_test)

In [None]:
r2_score(y_test, y_pred_test)

In [None]:
# RMSE is computed as sqrt(MSE) explicitly: the squared=False flag of
# mean_squared_error is deprecated as of scikit-learn 1.4.
RMSE_train = np.sqrt(mean_squared_error(y_train, y_pred))
RMSE_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print('RMSE for test set is: ', RMSE_test)
print("RMSE for training set is: ", RMSE_train)

## QQ plot for sklearn model

### Insight: To draw a QQ plot for the sklearn model, we first compute the residuals as `y_train - y_pred`.

In [None]:
y_pred.shape

In [None]:
#qq plot of residuals
resid1 = y_train - y_pred

fig = sm.graphics.qqplot(resid1['price'], dist=stats.norm, line='45', fit=True)

In [None]:
#hist plot of residuals
plt.hist(resid1['price'], bins='auto')
plt.show()

In [None]:
sns.distplot(resid1['price']);

In [None]:
#visualizing predicted price vs actual price
fig, ax = plt.subplots(figsize=(10, 8))
# Both inputs are flattened to 1-D and drawn on the axes created above. The former
# data=df argument was dropped: regplot only consults `data` when x/y are column
# names, and here they are arrays.
sns.regplot(x=np.ravel(y_pred_test), y=np.ravel(y_test), ax=ax)
ax.set_xlabel("Predicted Price")
ax.set_ylabel("Actual Price")
# Anchor both axes at 0.
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
# Show full numbers on the tick labels instead of offset/scientific notation.
ax.ticklabel_format(useOffset=False, style='plain')
plt.show()

In [None]:
#examining skew and kurtosis of sklearn model

from scipy.stats import kurtosis, skew

# Print the skewness first, then the kurtosis, of the training residuals.
for shape_statistic in (skew, kurtosis):
    print(shape_statistic(resid1))

In [None]:
#boxplot of various features to examine outliers
df.boxplot(column = ['bedrooms', 'bathrooms', 'condition', 'view', 'grade', 'floors']);

In [None]:
#sqft outliers
df.boxplot(column=['sqft_living', 'sqft_above', 'sqft_basement']);

In [None]:
df.boxplot(column='sqft_lot');

### Insight: The model is severely affected by outliers. We will remove the outliers in the data and re-test the model.

In [None]:
#dropping outliers from key features related to the house itself
# A row is treated as an outlier when |z-score| > 1.96 in ANY of the listed columns.
outlier_cols = ['sqft_living', 'bedrooms', 'sqft_lot', 'sqft_above', 'bathrooms', 'floors']
z_cutoff = 1.96

outlier_mask = np.zeros(len(df), dtype=bool)
for col in outlier_cols:
    # .to_numpy() keeps the comparison purely positional (plain ndarray in and out).
    outlier_mask |= np.abs(stats.zscore(df[col].to_numpy())) > z_cutoff

# Positional row numbers of all flagged rows (union over every column).
drop_idx = np.flatnonzero(outlier_mask).tolist()

# Filter with the boolean mask rather than df.drop(...): the flagged values are row
# positions, while the frame is indexed by 'id' labels. The previous code also passed
# only the last column's list (`to_drop`) instead of the combined `drop_idx`.
df2 = df.loc[~outlier_mask].copy()
df2.head()

In [None]:
#including all features for model improvement

y1 = df2[['price']]
X1 = df2.drop(['id', 'price', 'date'], axis=1)


In [None]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2,
                                                   random_state=42)

In [None]:
linreg = LinearRegression()
linreg.fit(X1_train, y1_train)
y1_pred = linreg.predict(X1_train)
y1_pred_test = linreg.predict(X1_test)

In [None]:
r2_score(y1_test, y1_pred_test)

In [None]:
#RMSE of outlier free model

# RMSE is computed as sqrt(MSE) explicitly: the squared=False flag of
# mean_squared_error is deprecated as of scikit-learn 1.4.
RMSE_train = np.sqrt(mean_squared_error(y1_train, y1_pred))
RMSE_test = np.sqrt(mean_squared_error(y1_test, y1_pred_test))
print('RMSE for test set is: ', RMSE_test)
print("RMSE for training set is: ", RMSE_train)

In [None]:
#actual price vs predicted price
fig, ax = plt.subplots(figsize=(10, 8))
# Both inputs are flattened to 1-D and drawn on the axes created above. The former
# data=df argument was dropped: regplot only consults `data` when x/y are column
# names, and it named a different frame than the one these values come from (df2).
sns.regplot(x=np.ravel(y1_pred_test), y=np.ravel(y1_test), ax=ax)
ax.set_xlabel("Predicted Price")
ax.set_ylabel("Actual Price")
# Anchor both axes at 0.
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
# Show full numbers on the tick labels instead of offset/scientific notation.
ax.ticklabel_format(useOffset=False, style='plain')
# Save before plt.show() so the written file contains the rendered figure.
plt.savefig('../images/pred_model_AM.png')
plt.show()

### Insight: Removing outliers improved the RMSE overall. Will scaling the features improve the model?

In [None]:
#standard scaler
scaler = StandardScaler()

In [None]:
scaler.fit(X1_train)

In [None]:
scaler.transform(X1_train)

In [None]:
#create X_train_scaled to store different models

regression = LinearRegression()
regression.fit(scaler.transform(X1_train), y1_train)
regression.score(scaler.transform(X1_test), y1_test)

In [None]:
X_scale_train = scaler.transform(X1_train)
X_scale_test = scaler.transform(X1_test)


In [None]:
y_scale_pred = regression.predict(X_scale_train)
y_scale_testpred = regression.predict(X_scale_test)
y_scale_pred.shape

In [None]:
print(r2_score(y1_test, y_scale_testpred))

In [None]:
RMSE_train = mean_squared_error(y1_train, y_scale_pred, squared=False)
RMSE_test = mean_squared_error(y1_test, y_scale_testpred, squared=False)
print('RMSE for test set is: ', RMSE_test)
print("RMSE for training set is: ", RMSE_train)

### Insight: The model did not improve. Attempting MinMaxScaler method

In [None]:
#minmax scaler attempt
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler()
mmscaler.fit(X_train)

In [None]:
linreg2 = LinearRegression()
linreg2.fit(mmscaler.transform(X_train), y_train)
linreg2.score(mmscaler.transform(X_test), y_test)

In [None]:
X_mmscale_train = mmscaler.transform(X_train)
X_mmscale_test = mmscaler.transform(X_test)

In [None]:
y_mmscale_pred = linreg2.predict(X_mmscale_train)
y_mmscale_testpred = linreg2.predict(X_mmscale_test)

In [None]:
r2_score(y_test, y_mmscale_testpred)

In [None]:
# RMSE is computed as sqrt(MSE) explicitly: the squared=False flag of
# mean_squared_error is deprecated as of scikit-learn 1.4.
RMSE_train = np.sqrt(mean_squared_error(y_train, y_mmscale_pred))
RMSE_test = np.sqrt(mean_squared_error(y_test, y_mmscale_testpred))
print('RMSE for test set is: ', RMSE_test)
print("RMSE for training set is: ", RMSE_train)

### Insight: Despite transformations, standard and min-max scaling, and the elimination of outliers, all of the models perform similarly, differing mainly in their RMSE. This indicates that a predictive linear regression model might not be the most effective way to predict prices with the given features. The highest-performing model is the one free of outliers, before scaling and transformation.