## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [None]:
#This project was an exercise in multiple linear regression, using the Housing 
#dataset from King County. The process follows the OSEM-I Data Science work flow.

In [None]:
#O: Obtain Data

In [None]:
##The first step is to load the numpy/pandas libraries and read in the data, then 
##take a look at the dataframe and inspect the column names

In [None]:
import numpy as np
import pandas as pd
kchouse = pd.read_csv('data/kc_house_data.csv')
kchouse.head()

In [None]:
kchouse.info()

In [None]:
#I dropped unnecessary columns

In [None]:
# Columns that are not carried forward into the analysis.
unused_columns = [
    'date', 'view', 'sqft_above', 'sqft_basement', 'yr_renovated',
    'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
kc_new = kchouse.drop(columns=unused_columns)
kc_new.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlations between the remaining columns.
corr = kc_new.corr()

# Boolean mask that hides the upper triangle (diagonal included): the matrix
# is symmetric, so those cells only repeat what the lower triangle shows.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(16, 14))
    # Draw on the axes created above instead of relying on the implicit
    # "current axes".
    ax = sns.heatmap(corr, mask=mask, square=True, annot=True,
                     cmap='coolwarm', ax=ax)

In [None]:
#The next steps, Scrub and Explore, are separate things, but they tend to go hand in 
#hand. As we Explore more representations of our data, we gain more specific 
#insight into its characteristics. 

In [None]:
#Look at the basic stats of the variables

In [None]:
kc_new.describe()

In [None]:
#some notes: # Mean price is $540,296.57
             # std $367,368.14
             # min $78,000.00
             # 25% $322,000.00
             # 50% $450,000.00
             # 75% $645,000.00
             # max $7,700,000.00

In [None]:
#I noticed there were null values in the waterfront column. The values 1 and 0 
#represent whether the property is on a waterfront or not. I'm making an educated 
#assumption that a null value means there is no waterfront, so I changed the null 
#values to a 0.

In [None]:
# Number of missing values in each column. (value_counts() on the boolean
# frame would instead count distinct row patterns, which is harder to read.)
kc_new.isna().sum()

In [None]:
# Treat a missing waterfront flag as "not on the waterfront" (0).
kc_new['waterfront'] = kc_new['waterfront'].fillna(0.0)
# Re-check the per-column missing counts; 'waterfront' should now be 0.
kc_new.isna().sum()

In [None]:
#I wanted to make sure there was one entry per ID, and drop any double-entries

In [None]:
kc_new['id'].value_counts()

In [None]:
# Keep one row per id; pandas keeps the first occurrence by default.
kc_new = kc_new.drop_duplicates(subset='id')

In [None]:
kc_new.info()

In [None]:
#I created a correlation heatmap to identify any independent variables that
#exhibit multicollinearity, as that would violate one of the assumptions
#required for linear regression

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Correlations recomputed on the cleaned frame.
corr = kc_new.corr()

# Rebuild the upper-triangle mask from the current matrix rather than reusing
# one left over from an earlier cell, so this cell also works when run alone
# or after the set of columns changes.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(12, 8))

ax = sns.heatmap(corr, mask=mask, center=0, square=True, annot=True,
                 cmap='coolwarm', ax=ax)


In [None]:
# Absolute pairwise correlations between all columns.
corr_abs = kc_new.corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal). Each unordered
# pair then appears exactly once and self-correlations are gone, so there is
# no need to de-duplicate on the correlation value, which could silently drop
# distinct pairs that happen to share the same coefficient.
upper_only = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
df = corr_abs.where(upper_only).stack().dropna().reset_index()
df.columns = ['level_0', 'level_1', 'cc']

# Combine the two variable names into a single "pairs" label and index by it.
df['pairs'] = list(zip(df.level_0, df.level_1))
df = df.set_index('pairs')[['cc']].sort_values('cc', ascending=False)

# Pairs correlated strongly enough to suggest multicollinearity.
df[df.cc > .75]

In [None]:
### big ol scatter matrix to help discern between continuous/categorical variables

In [None]:
pd.plotting.scatter_matrix(kc_new, figsize=[12,12])

In [None]:
### Check for outliers ###

In [None]:
plt.scatter(x = kc_new['bedrooms'], y = kc_new['price'])

In [None]:
plt.scatter(x = kc_new['bathrooms'], y = kc_new['price'])


In [None]:
plt.scatter(x = kc_new['sqft_living'], y = kc_new['price'])

In [None]:
plt.scatter(x = kc_new['sqft_lot'], y = kc_new['price'])


In [None]:
plt.scatter(x = kc_new['id'], y = kc_new['price'])


In [None]:
plt.scatter(x = kc_new['condition'], y = kc_new['price'])

In [None]:
plt.scatter(x = kc_new['floors'], y = kc_new['price'])


In [None]:
plt.scatter(x = kc_new['yr_built'], y = kc_new['price'])


In [None]:
plt.scatter(x = kc_new['grade'], y = kc_new['price'])


In [None]:
plt.scatter(x = kc_new['waterfront'], y = kc_new['price'])

###****** outlier elimination *******###

from scipy import stats

# Absolute z-score of every value, computed per column.
# NOTE(review): this covers every column of kc_new, including 'id' and the
# 0/1 'waterfront' flag -- confirm that is intended.
z = np.abs(stats.zscore(kc_new))
z

# Values at least this many standard deviations from their column mean are
# treated as outliers. The same threshold drives both the report and the
# filter below, so a value cannot be reported as an outlier and still kept.
threshold = 4
print(np.where(z >= threshold))

# Keep only the rows that have no outlying value in any column.
kc_new_out = kc_new[(z < threshold).all(axis = 1)]
kc_new_out

###********************************###

In [None]:
# Columns treated as continuous measurements.
continuous = ['price',  'sqft_lot', 'sqft_living']
# Columns treated as categorical/discrete.
# NOTE(review): 'id' looks like a record identifier rather than a feature --
# confirm whether it belongs in this list.
categoricals = ['waterfront', 'condition', 'grade', 'yr_built','bathrooms', 'floors','bedrooms', 'id']
# Split the frame into the two column groups.
kccat = kc_new[categoricals]
kccon = kc_new[continuous]

In [None]:
kccon.hist(figsize = [12,12]);

In [None]:
#### doing an initial model just for funsies ####
from statsmodels.formula.api import ols

outcome = 'price'
# Every column except the target is a candidate predictor. 'id' is also left
# out: it is a record identifier, so any coefficient fitted to it is noise.
predictors = kc_new.drop(columns=['price', 'id'])
pred_sum = '+'.join(predictors.columns)
# Formula of the form "price~col1+col2+...".
formula = outcome + '~' + pred_sum
model = ols(formula=formula, data=kc_new).fit()
model.summary()

In [None]:
# log and normalize features
# Natural-log transform of every continuous column; each transformed column
# is renamed with a "_log" suffix.
kccon_log = np.log(kccon).add_suffix('_log')
log_names = list(kccon_log.columns)

log_names

In [None]:
kccon_log

In [None]:
def normalize(feature):
    """Standardize *feature*: subtract its mean, then divide by its std.

    *feature* must provide ``mean()`` and ``std()`` and support arithmetic
    with their results (e.g. a pandas Series).
    """
    centered = feature - feature.mean()
    spread = feature.std()
    return centered / spread

kc_log_norm = kccon_log.apply(normalize)

In [None]:
kc_log_norm.hist(figsize = [12,12])

In [None]:
### check features with statsmodels ###

# OLS with Statsmodels #

In [None]:
from statsmodels.formula.api import ols
import scipy.stats as stats
import statsmodels.api as sm


In [None]:
### ols for continuous features

In [None]:
# Regress the standardized log price on the other standardized log features.
outcome = 'price_log'
predictors = kc_log_norm.drop(columns=[outcome])
pred_sum = '+'.join(predictors.columns)
formula = f'{outcome}~{pred_sum}'
model = ols(formula=formula, data=kc_log_norm).fit()
model.summary()

### OLS Regression Results for normalized continuous variables ####

# ------------ Categorical -------------- #

In [None]:
kccat.hist(figsize = [12,12])

In [None]:
# One-hot encode each categorical variable. drop_first=True omits the first
# level of every variable so the dummies are not perfectly collinear with the
# model intercept.
# dtype=float keeps the dummies numeric. pandas 2.x defaults to bool dummies,
# and a frame mixing bool and float columns converts to an object array, which
# statsmodels' OLS is expected to reject.
condition_ohe = pd.get_dummies(kccat['condition'], prefix='condition', drop_first=True, dtype=float)
waterfront_ohe = pd.get_dummies(kccat['waterfront'], prefix='waterfront', drop_first=True, dtype=float)
grade_ohe = pd.get_dummies(kccat['grade'], prefix='grade', drop_first=True, dtype=float)
floors_ohe = pd.get_dummies(kccat['floors'], prefix='floors', drop_first=True, dtype=float)
bed_ohe = pd.get_dummies(kccat['bedrooms'], prefix='bedrooms', drop_first=True, dtype=float)
bath_ohe = pd.get_dummies(kccat['bathrooms'], prefix='bathrooms', drop_first=True, dtype=float)

In [None]:
# Combine the encoded categoricals with yr_built. The 'id' column is left out
# on purpose: it identifies a record and is not a property of the house, so it
# should not enter the model as a feature.
ohe_concat = pd.concat([condition_ohe, waterfront_ohe, grade_ohe, floors_ohe, bed_ohe, kccat['yr_built'], bath_ohe], axis = 1)


In [None]:
preprocessed = pd.concat([kc_log_norm, ohe_concat], axis=1)

In [None]:
# Dummy columns built from float levels contain a "." (e.g. "floors_1.5"),
# which is not valid inside a formula term. regex=False makes the "." a
# literal character regardless of the installed pandas version's default.
preprocessed.columns = preprocessed.columns.str.replace('.', '_', regex=False)

In [None]:
preprocessed.info()

###MODEL###

In [None]:
# Regress the standardized log price on every other preprocessed column.
outcome = 'price_log'
predictors = preprocessed.drop(columns=[outcome])
pred_sum = '+'.join(predictors.columns)
formula = f'{outcome}~{pred_sum}'
model = ols(formula=formula, data=preprocessed).fit()
model.summary()

In [None]:
X = preprocessed.drop('price_log', axis=1)
y = preprocessed['price_log']

In [246]:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting can be chained.
linreg = LinearRegression().fit(X, y)
# In-sample predictions, used below for R^2 and the residual plot.
preds = linreg.predict(X)

In [None]:
# Show the fitted estimator's settings. Re-typing the constructor with
# normalize=False raises TypeError on scikit-learn 1.2+, where that argument
# no longer exists.
linreg.get_params()

In [None]:
linreg.coef_

In [None]:
linreg.intercept_

In [None]:
from sklearn.metrics import r2_score
r2_score(y, preds)

In [None]:
import matplotlib.pyplot as plt

residuals = y - preds

plt.scatter(preds, residuals)
plt.hlines(0, preds.min(), preds.max())



### Cross Validation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, cross_validate

# Three random 75/25 train/validation splits with a fixed seed.
splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
baseline_model = LinearRegression()

# Score the baseline model on each split, keeping the training scores too.
baseline_scores = cross_validate(
    baseline_model, X, y, cv=splitter, return_train_score=True
)

# Report the mean score across the splits.
for label, key in (("Train score:     ", "train_score"),
                   ("Validation score:", "test_score")):
    print(label, baseline_scores[key].mean())

In [None]:
fig = sm.graphics.qqplot(model.resid, dist=stats.norm, line='45', fit=True)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a test set. random_state pins the shuffle so the same split is
# produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(len(X_train), len(X_test), len(y_train), len(y_test))

In [None]:
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out = 0.05,
                       verbose=True):
    """
    Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS

    Each pass first tries to add the excluded feature with the smallest
    p-value (forward step), then tries to drop the included feature with the
    largest p-value (backward step). The loop ends on the first pass that
    neither adds nor drops a feature.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
            None (the default) starts from an empty selection
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # Work on a private copy so the caller's list is never modified.
    included = [] if initial_list is None else list(initial_list)
    while True:
        changed = False

        # forward step
        # Candidates are taken in X's column order (not from a set difference)
        # so that ties between equal p-values are broken the same way on
        # every run.
        excluded = [column for column in X.columns if column not in included]
        new_pval = pd.Series(index=excluded, dtype='float64')
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        # NaN when there are no candidates left; the comparison below is then False.
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        # NaN when nothing is included yet; the comparison below is then False.
        worst_pval = pvalues.max()
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included

In [None]:
# Run stepwise selection over every candidate feature in X and print the
# names of the features it keeps.
result = stepwise_selection(X, y, verbose = True)
print('resulting features:')
print(result)

In [None]:
# Refit OLS on only the selected features, with an explicit intercept column.
X_fin = X.loc[:, result]
X_with_intercept = sm.add_constant(X_fin)
model = sm.OLS(y, X_with_intercept).fit()
model.summary()