# Imports

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, metrics
from sklearn.feature_selection import SelectKBest, f_regression, RFECV
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pickle

In [2]:
# Load 'engineered_data.csv'; the first CSV column becomes the row index.
df = pd.read_csv('engineered_data.csv', index_col=0)

# Features

In [3]:
# Pull the response column out first: 'price' is not listed in `features`, so
# it is no longer reachable through `df` once the frame is narrowed below.
target = df['price']
# Predictor columns kept for modelling.
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
            'waterfront', 'view', 'condition', 'grade', 'sqft_above',
            'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat',
            'long', 'sqft_living15', 'sqft_lot15', 'yr_built_or_renovated',
            'years_old', 'age_feature', 'sqft_living_relative_to_nabe',
            'renovated', 'viewed', 'sq_ft_per_flr']

# NOTE(review): this rebinds `df`, so re-running this cell on its own raises a
# KeyError at df['price']; re-run the CSV load cell first.
df = df[features]

# Train test split

In [None]:
def lin_reg_model(df, target, random_state=34, test_size=0.2):
    """Fit a linear regression on a train/test split and report its RMSE.

    Parameters
    ----------
    df : array-like or DataFrame
        Feature matrix, one row per observation.
    target : array-like or Series
        Response values, row-for-row with ``df``.
    random_state : int, default 34
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    test_size : float, default 0.2
        Fraction of rows held out for testing.

    Returns
    -------
    tuple
        ``(lm, train_rmse, test_rmse)``: the fitted estimator followed by the
        root mean squared error on the training rows and on the held-out rows.
    """
    # random_state and test_size must be passed by keyword: every positional
    # argument of train_test_split is treated as another array to split.
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, random_state=random_state, test_size=test_size)

    lm = linear_model.LinearRegression().fit(X_train, y_train)

    # In-sample error.
    y_pred_train = lm.predict(X_train)
    train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))

    # Held-out error.
    y_pred_test = lm.predict(X_test)
    test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

    return lm, train_rmse, test_rmse

In [4]:
# Baseline split: hold out 20% of the rows for testing, with a fixed seed so
# the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, random_state=34, test_size=0.2)
# print("Training set - Features: ", X_train.shape, "Target: ", y_train.shape)
# print("Test set - Features: ", X_test.shape, "Target: ",y_test.shape)

# Instantiate linear regression model & fit to training data

In [5]:
# Baseline model: linear regression fitted on the training split only.
lm = linear_model.LinearRegression().fit(X_train, y_train)
# print(lm.intercept_)
# print(lm.coef_)
# print("R^2: ", lm.score(X_train, y_train))

# Predicting the training set

In [6]:
# In-sample predictions from the baseline model.
y_pred_train = lm.predict(X_train)

# Evaluate training data

In [7]:
# Training error of the baseline model. The optional MAE/MSE lines below use
# `y_pred_train`, the name the prediction cell actually defines.
# train_mae = metrics.mean_absolute_error(y_train, y_pred_train)
# train_mse = metrics.mean_squared_error(y_train, y_pred_train)
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
# print('Mean Absolute Error:', train_mae )
# print('Mean Squared Error:',  train_mse)
print('train RMSE:' , train_rmse)

train RMSE: 198481.35354156978


# Predicting test set

In [8]:
# Held-out predictions from the baseline model.
y_pred_test = lm.predict(X_test)

# Evaluate the test set

In [10]:
# Test error of the baseline model. The optional MAE/MSE lines below use
# `y_pred_test`, the name the prediction cell actually defines.
# test_mae = metrics.mean_absolute_error(y_test, y_pred_test)
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
# print('Mean Absolute Error:' + str(metrics.mean_absolute_error(y_test, y_pred_test)))
# print('Mean Squared Error:' + str(metrics.mean_squared_error(y_test, y_pred_test)))
print('test RMSE:', test_rmse)

test RMSE: 205913.274652516


# Create polynomials and interactions

In [11]:
# Expand the feature frame with every degree-2 term (squares and pairwise
# products). include_bias=False leaves out the constant column.
# NOTE(review): this cell rebinds `df`; running it twice expands the already
# expanded frame, so reload and re-select the features before re-running.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_data = poly.fit_transform(df)
# get_feature_names() is gone from newer scikit-learn releases; prefer its
# replacement and fall back only on versions that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = poly.get_feature_names_out(df.columns)
else:
    poly_columns = poly.get_feature_names(df.columns)
# Carry the original row labels over so the expanded frame still lines up
# with `target` by index, not just by position.
df = pd.DataFrame(poly_data, columns=poly_columns, index=df.index)
# df.shape

In [None]:
# Split the polynomial feature frame with the same seed and test fraction as
# the baseline split. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, random_state=34, test_size=0.2)

# Report the shape of each partition (the second line describes the test rows).
print("Training set - Features: ", X_train.shape, "Target: ", y_train.shape)
print("Test set - Features: ", X_test.shape, "Target: ", y_test.shape)

In [None]:
# Fit a linear regression on the training split of the polynomial features.
lr = LinearRegression().fit(X_train, y_train)

# Inspect the fitted parameters.
print(lr.intercept_)
print(lr.coef_)
# Report R^2 per split: scoring the whole frame would blend rows the model was
# fitted on with held-out rows and mask any overfitting.
print("Train R^2: ", lr.score(X_train, y_train))
print("Test R^2: ", lr.score(X_test, y_test))

## Predict on training set 

In [None]:
# In-sample predictions from the polynomial-feature model.
train_preds = lr.predict(X_train)

## Evaluate training data

In [None]:
# Training error of the polynomial-feature model. The optional MAE/MSE lines
# use `y_train`, since no `y_train_poly` is defined in this notebook.
# train_mae_poly = metrics.mean_absolute_error(y_train, train_preds)
# train_mse_poly = metrics.mean_squared_error(y_train, train_preds)
# NOTE(review): rebinds `train_rmse`, replacing the baseline model's value.
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, train_preds))

# print('Mean Absolute Error:', train_mae_poly )
# print('Mean Squared Error:',  train_mse_poly)
print('Root Mean Squared Error:' , train_rmse)

# Remove vars based on correlation coefficient

In [None]:
# # X_train_8, X_test_8, y_train_8, y_test_8 = train_test_split(
# #     df, target, random_state=9,test_size=0.2)

# # Create correlation matrix
# corr_matrix = X_train.corr().abs()

# # Select upper triangle of correlation matrix
# # (uses the builtin `bool`; the `np.bool` alias no longer exists in current NumPy)
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# upper

In [None]:
# sns.set(style="white")

# # Compute the correlation matrix
# # NOTE(review): X_train_8 is only created by a split that is itself commented
# # out; restore that split or use X_train before enabling this cell.
# corr = X_train_8.corr()

# # Generate a mask for the upper triangle
# # (uses the builtin `bool`; the `np.bool` alias no longer exists in current NumPy)
# mask = np.zeros_like(corr, dtype=bool)
# mask[np.triu_indices_from(mask)] = True

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(11, 9))

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(220, 10, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# # Find index of feature columns with correlation greater than 0.90
# to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

# #if you change inplace to True it will go through and drop all of those columns from the dataset
# X_train.drop(columns=to_drop, inplace=False)
# X_test.drop(columns=to_drop, inplace=False)

In [None]:
# to_drop

# VIF

In [None]:
# [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]

In [None]:
# vif = pd.DataFrame()
# vif["VIF Factor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
# vif["features"] = df.columns
# vif.round(1)

In [None]:
# df.corrwith(target).abs()

# Stat tests

In [None]:
# Filter method: score each column against the target with f_regression and
# keep the 20 best. Fitted on the training split only.
selector = SelectKBest(f_regression, k=20)

selector.fit(X_train, y_train)

In [None]:
# get_support() returns a boolean mask over the fitted columns; split the
# column names into those kept and those dropped by the selector.
selected_columns = X_train.columns[selector.get_support()]
removed_columns = X_train.columns[~selector.get_support()]
# X_train = X_train[selected_columns]
# X_test = X_test[selected_columns]

In [None]:
list(removed_columns)

In [None]:
list(selected_columns)

In [None]:
# Refit a linear regression on the SelectKBest columns only and report the
# RMSE on both partitions.
kbest_train_X = X_train[selected_columns]
kbest_test_X = X_test[selected_columns]

lm_kbest = LinearRegression()
lm_kbest.fit(kbest_train_X, y_train)

# In-sample error.
y_train_kbest = lm_kbest.predict(kbest_train_X)
trainK_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_kbest))
print('Training Root Mean Squared Error:' , trainK_rmse)

# Held-out error.
y_kbest = lm_kbest.predict(kbest_test_X)
testK_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_kbest))
print('Testing Root Mean Squared Error:' , testK_rmse)

In [None]:
list(zip(selected_columns, lm_kbest.coef_))

In [None]:
print('Original: ', test_rmse, '\n',
      "KBest:   ", testK_rmse,'\n')

# Wrapper methods

In [None]:
ols = linear_model.LinearRegression()


In [None]:
# Wrapper method: recursive feature elimination with cross-validation.

ols = linear_model.LinearRegression()

# Drop one feature per iteration (step=1) and pick the feature count with the
# best 5-fold cross-validated negative MSE; n_jobs=-1 uses all available cores.
# NOTE(review): this rebinds `selector`, which previously held the SelectKBest
# instance, and can be slow on the full polynomial feature set.
selector = RFECV(estimator=ols, step=1, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the eliminator on the training split only.
selector.fit(X_train, y_train)

In [None]:
# `support_` is the boolean mask of columns RFECV kept; split the column names
# into retained and eliminated sets.
selected_rfe = X_train.columns[selector.support_]
removed_rfe = X_train.columns[~selector.support_]

In [None]:
list(removed_rfe)

In [None]:
len(list(selected_rfe))

In [None]:
# Refit a linear regression on the columns RFECV kept and report the RMSE on
# both partitions.
rfe_train_X = X_train[selected_rfe]
rfe_test_X = X_test[selected_rfe]

lm_rfe = LinearRegression().fit(rfe_train_X, y_train)

# In-sample error.
y_rfe = lm_rfe.predict(rfe_train_X)
trainRFE_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_rfe))
print('Training Root Mean Squared Error:' , trainRFE_rmse)

# Held-out error.
y_pred_rfe = lm_rfe.predict(rfe_test_X)
testRFE_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rfe))
print('Testing Root Mean Squared Error:' , testRFE_rmse)

In [None]:
print('Original: ', test_rmse, '\n',
      "KBest:   ", testK_rmse,'\n',
      "RFE:     ", testRFE_rmse)

# Final model

In [None]:
# Final model: repeat the top-20 f_regression selection, this time fitted on
# the full frame and target rather than on the training split.
selector = SelectKBest(f_regression, k=20)

selector.fit(df, target)

In [None]:
# The selector above was fitted on `df`, so apply its mask to `df.columns`
# directly; this avoids relying on whatever X_train a split cell last left behind.
selected_columns = df.columns[selector.get_support()]
# removed_columns = df.columns[~selector.get_support()]

In [None]:
# Final estimator: linear regression trained on every row, restricted to the
# selected columns.
final_X = df[selected_columns]
lm_final = LinearRegression().fit(final_X, target)

In [None]:
lm_final.coef_ # <--- final model for entire dataset

In [None]:
# Serialize the final fitted model. The context manager closes the file even
# if pickle.dump raises part-way through.
with open("model.pickle", "wb") as pickle_out:
    pickle.dump(lm_final, pickle_out)

In [None]:
# pickle_out = open("scaler.pickle", "wb")
# pickle.dump(scaler, pickle_out)
# pickle_out.close

In [None]:
# Write the selected column names to CSV, one row per name, with pandas'
# default integer index.
pd.DataFrame(selected_columns).to_csv('selected_columns.csv')

In [None]:
# Serialize the selected column names. The context manager closes the file
# even if pickle.dump raises part-way through.
with open("selected_columns.pickle", "wb") as pickle_out:
    pickle.dump(selected_columns, pickle_out)