To further improve our model, it is time to add more data to it!

In [None]:
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from   sklearn.linear_model import LinearRegression
from   sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import plotly.express as px

warnings.filterwarnings('ignore')

In [None]:
# Load the Washington State income-by-zipcode sheet, keeping only the listed
# columns. Forward slashes keep the relative path valid on Windows, macOS and
# Linux alike (a backslash-separated path only resolves on Windows).
avg_income = pd.read_excel(
    "../data/washington_state_income_by_zipcode.xlsx",
    usecols=[
        "Zip Code",
        "Location",
        "City",
        "Population",
        "Avg. Income/H/hold",
        "National Rank",
    ],
)
# Remove five rows by index label.
# NOTE(review): presumably repeated header/separator rows in the sheet — confirm.
avg_income.drop(index=[100, 201, 302, 403, 504], inplace=True)

In [None]:
# Rename the key column to 'zipcode', the name later used as the merge key.
avg_income.rename({'Zip Code': 'zipcode'}, axis='columns', inplace=True)

In [None]:
avg_income

In [None]:
# Cast the zipcode column to integers.
# NOTE(review): presumably so its dtype matches homes.zipcode for the merge — confirm.
avg_income["zipcode"] = avg_income["zipcode"].astype("int")

In [None]:
# Bare list expression: it has no effect beyond being echoed as the cell output.
# The three names are the columns converted to integers in the following cells.
['Population', "Avg. Income/H/hold", "National Rank"]

In [None]:
# Cast the Population column to integers.
avg_income['Population'] = avg_income['Population'].astype('int')

In [None]:
# Cast the average household income column to integers.
avg_income['Avg. Income/H/hold'] =  avg_income['Avg. Income/H/hold'].astype('int')

In [None]:
import re


def _rank_to_int(raw_rank):
    """Remove every character that is neither a word character nor whitespace, then parse as int."""
    return int(re.sub(r'[^\w\s]', '', raw_rank))


# Convert the textual rank column to plain integers, element by element.
avg_income['National Rank'] = avg_income['National Rank'].apply(_rank_to_int)

In [None]:
def _rejoin_location(raw_location):
    """Keep the first 9 characters and everything from offset 11 on, joined by a comma."""
    return ','.join((raw_location[:9], raw_location[11:]))


# Replace the two characters at offsets 9-10 of every Location string with a comma.
avg_income['Location'] = avg_income['Location'].apply(_rejoin_location)

In [None]:
# Parse every Location string into a pair of floats using fixed slice offsets.
# NOTE(review): the offsets assume a fixed-width coordinate format — confirm
# against the sheet.
incomes = [
    (float(location[:8]), float(location[11:]))
    for location in avg_income["Location"]
]

# Write the tuples back using the frame's own index labels. Rows were dropped
# from avg_income after loading, so its labels are no longer 0..n-1; writing
# through range(len(...)) counters shifts every tuple after the first gap onto
# the wrong row. Chained `df[col][i] = ...` assignment can also end up writing
# to a temporary object instead of the frame.
avg_income["Location"] = pd.Series(incomes, index=avg_income.index, dtype=object)

In [None]:
type(avg_income.Location[0])

In [None]:
# Load the King County home sales sheet; the unnamed first column holds the
# saved row index. Forward slashes keep the relative path valid on Windows,
# macOS and Linux alike (a backslash-separated path only resolves on Windows).
homes = pd.read_excel("../data/king_county_home_sales.xlsx", index_col="Unnamed: 0")


In [None]:
homes.head(2)

In [None]:
len(homes.zipcode.unique())

In [None]:
homes.columns

In [None]:
# Attach the per-zipcode income columns to every sale. This is an inner join,
# so sales whose zipcode has no income row are not carried over.
homes_and_income = pd.merge(homes, avg_income, how="inner", on="zipcode")

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Columns left out of the correlation heatmap.
excluded_columns = ["id", "age", "distance_to_amazon_miles", "distance_to_needle_miles", "National Rank"]
correlation_matrix = homes_and_income.drop(columns=excluded_columns).corr()

# Take the first row of the matrix against every other column and turn it into
# a single sorted column; sorting by 'price' relies on that first row being the
# 'price' row.
price_correlations = correlation_matrix.iloc[0:1, 1:].T.sort_values(by='price', ascending=False)

sns.heatmap(
    price_correlations,
    center=0,
    cmap="coolwarm",
    annot=True,
    linewidths=.25,
    fmt=".03f",
    ax=ax,
)
ax.set_title('Features Used in Best Model Correlated with Price')
ax.set_ylabel('Model Features')
ax.set_xlabel('Correlation Values')
plt.tight_layout()


In [None]:
# One indicator column per zipcode; drop_first=True leaves out the first
# category so it serves as the baseline.
dummy_zip = pd.get_dummies(homes_and_income.zipcode, drop_first=True)

In [None]:
dummy_zip

In [None]:
# Predictor columns for the first model: house attributes, location, and the
# per-zipcode population/income figures merged in from avg_income.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living',
    'sqft_lot', 'floors', 'waterfront', 'sqft_above', 'lat', 'long',
    'sqft_basement', 'sqft_living15', 'sqft_lot15', 'view_ord', 'condition_ord', 'grade_ord',
    'age', "renovated", "distance_to_amazon_miles", 'Population', 'Avg. Income/H/hold', 'National Rank',
]
features = homes_and_income[feature_columns]

In [None]:
# Append the zipcode indicator columns, matched to the feature rows by index label.
features = features.join(dummy_zip, how='left')

In [None]:
features

In [None]:
# Design matrix and target (the target is kept as a one-column frame).
X = features
y = homes_and_income[['price']]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

# Fit an ordinary least-squares model and report R^2 on both partitions.
multi_model = LinearRegression().fit(X_train, y_train)

print(f"Training score: {multi_model.score(X_train, y_train)}")
multi_model_score = multi_model.score(X_test, y_test)
print(f"Test score: {multi_model_score}")

In [None]:
def correlated_column_names(df, target, threshold):
    """Return the names of columns whose correlation with *target* meets a threshold.

    Args:
        df: DataFrame containing *target* and the candidate columns.
        target: Name of the column every other column is correlated against.
        threshold: Minimum correlation (inclusive) a column needs in order to
            be returned. The signed value is compared, so strongly negative
            correlations are not selected by a positive threshold.

    Returns:
        A list of column names in correlation-matrix order, excluding *target*
        itself and any column whose correlation with it is exactly 1.

    Raises:
        KeyError: If *target* is not among the numeric/boolean columns of *df*.
    """
    # Restrict to numeric/boolean columns up front so the call also works on
    # pandas versions whose corr() raises when non-numeric columns are present.
    numeric = df.select_dtypes(include=["number", "bool"])
    # Build the correlation matrix once and reuse the target's column.
    target_corr = numeric.corr()[target]
    selected = target_corr[(target_corr >= threshold) & (target_corr != 1)]
    # Drop the target by name as well, in case floating-point rounding leaves
    # its self-correlation marginally different from 1.
    return [name for name in selected.index if name != target]


# Keep only the columns whose correlation with price is at least 0.1.
selected_columns = correlated_column_names(homes_and_income, 'price', .1)
features = homes_and_income[selected_columns]
features.columns

In [None]:
# Append the zipcode indicator columns to the correlation-selected features,
# matched by index label.
features = features.join(dummy_zip, how='left')

In [None]:
features

In [None]:
# Design matrix (correlation-selected features plus zipcode indicators) and
# target (kept as a one-column frame).
X = features
y = homes_and_income[['price']]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

# Refit the ordinary least-squares model and report R^2 on both partitions.
multi_model = LinearRegression().fit(X_train, y_train)

print(f"Training score: {multi_model.score(X_train, y_train)}")
multi_model_score = multi_model.score(X_test, y_test)
print(f"Test score: {multi_model_score}")

In [None]:
# Mean sale price and mean household income per zipcode. The columns are
# selected with a list (double brackets): indexing a groupby with a bare tuple
# of names is deprecated and rejected by pandas 2.x.
df = homes_and_income.groupby('zipcode')[['price', 'Avg. Income/H/hold']].mean()

In [None]:
# Zipcode-level view: mean household income (x) against mean sale price (y).
plt.scatter(x=df['Avg. Income/H/hold'], y=df['price'], color="red", alpha=0.7)
plt.title(" Average price of a home as a function of average income ")
plt.xlabel("Average Income")
plt.ylabel("Average Sales Price")
plt.tight_layout()
plt.show()

In [None]:
homes_and_income

In [None]:
homes_and_income.columns

In [None]:
# Scatter sale price against each candidate predictor, one figure per column.
for col in ['bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'sqft_above', 'sqft_basement', 'sqft_living15',
'sqft_lot15', 'view_ord', 'condition_ord', 'grade_ord',
'age', 'distance_to_starbucks_miles',
'distance_to_amazon_miles', 'distance_to_needle_miles', 'Population', 'Avg. Income/H/hold', 'National Rank']:
    sns.scatterplot(data=homes_and_income, x=col, y="price", color="red")
    plt.title(f"Average price of a home as a function of {col} ")
    plt.xlabel(f"{col}")
    # Axis label typo fixed ("Sales Pricea" -> "Sales Price").
    plt.ylabel("Sales Price")
    plt.tight_layout()
    # Render each figure before the next iteration starts a new one.
    plt.show()

# Now it's time to explore the dependencies in our new dataset

In [None]:
# Remove, in place, every sale priced above 5,000,000. The remaining rows keep
# their original index labels (no reset is performed).
over_price_cap = homes_and_income.price > 5000000
homes_and_income.drop(index=homes_and_income.index[over_price_cap], inplace=True)

In [None]:
#homes_and_income.to_excel("sale_price_and_income.xlsx")

In [None]:
# Inputs to the feature expansion: house attributes plus average household income.
interaction_inputs = [
    'bedrooms', 'bathrooms', 'sqft_living', 'floors', 'waterfront',
    'sqft_above', 'sqft_basement', 'yr_renovated', 'lat', 'sqft_living15',
    'renovated', 'view_ord', 'grade_ord', 'Avg. Income/H/hold',
]
y = homes_and_income['price']
X = homes_and_income[interaction_inputs]

# NOTE(review): with degree=1 the transformer emits only the original columns
# (no pairwise products); the per-tier loop further down uses degree=2 —
# confirm degree=1 is intended here.
linear_with_interaction = PolynomialFeatures(
    degree=1,
    interaction_only=True,
    include_bias=False,
)
X_transformed = linear_with_interaction.fit_transform(X)

# Generated feature names, wrapped in a Series so string methods can be applied.
feat_names = pd.Series(linear_with_interaction.get_feature_names())

feat_names

In [None]:
# Placeholder names produced by the transformer ("x0", "x1", ...), one per
# input column, mapped to the real column names.
columns = X.columns
xs = [f"x{position}" for position in range(len(columns))]
map_dict = dict(zip(xs, columns))

# Translate whole space-separated tokens instead of substrings: substring
# replacement of "x1" also rewrites the front of "x10".."x13", which yields
# names such as "bathrooms0" rather than the real column name.
feat_names = feat_names.apply(
    lambda name: " ".join(map_dict.get(token, token) for token in name.split(" "))
)

# Reuse X's index labels. homes_and_income had rows dropped, so a fresh 0..n-1
# index would pair rows with the wrong zipcode indicators when joining on index.
income_trans = pd.DataFrame(X_transformed, index=X.index)

income_trans.columns = feat_names

income_trans = income_trans.join(dummy_zip)

In [None]:
# Hold out 30% of the rows; the target is modelled on a log scale.
X_train, X_test, y_train, y_test = train_test_split(
    income_trans,
    np.log(y),
    test_size=.3,
    random_state=42,
)

# Fit ordinary least squares on the expanded features and report R^2.
interactions_model = LinearRegression().fit(X_train, y_train)

print(f"Training score: {interactions_model.score(X_train, y_train)}")

interactions_model_score = interactions_model.score(X_test, y_test)
print(f"Testing score: {interactions_model_score}")

In [None]:
#import joblib

### Save our model, This will generate .pkl file in your currentl dircetroy, your model is ready to use.
#joblib.dump(interactions_model,"house_price_prediction.pkl")

# Now it's time to visualize our result!

In [None]:
# Predict on the full data set; the model was fitted on log(price).
X = income_trans
y = np.log(homes_and_income['price'])
y_pred = interactions_model.predict(income_trans)

# Exponentiate both series so the axes are in price units rather than log units.
fig = px.scatter(x=np.exp(y), y=np.exp(y_pred), labels={'x': 'True Sales Price', 'y': 'Predicted Sales Price'}, title = f"Predict Home Sale Price in King County with {len(income_trans.columns)}: R^2 = {interactions_model_score:.05f}")

# Dashed y = x reference line: points on it are perfect predictions.
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=np.exp(y.min()), y0=np.exp(y.min()),
    x1=np.exp(y.max()), y1= np.exp(y.max())
)

# Show the figure before starting the Dash server: run_server() blocks until
# the server is stopped, so a show() placed after it would not run until then.
fig.show()

import dash
import json
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Input

cache = "fig.json"
# Persist the figure as JSON so the callback below can reload it from disk.
with open(cache, 'w', encoding="utf-8") as f:
    f.write(fig.to_json())

# Minimal app: an empty graph that is filled in when the button is clicked.
app = dash.Dash(prevent_initial_callbacks=True)
app.layout = html.Div([dcc.Graph(id="graph"), html.Button("Click me", id="btn")])


@app.callback(Output("graph", "figure"), [Input("btn", "n_clicks")])
def func(n_clicks):
    """Return the cached figure JSON as the graph's figure on a button click."""
    with open(cache, 'r', encoding="utf-8") as f:
        return json.load(f)


if __name__ == '__main__':
    app.run_server()



In [None]:
# colors = ['Positive' if c > 0 else 'Negative' for c in interactions_model.coef_]
#
# fig = px.bar(
#     x=income_trans.columns, y=interactions_model.coef_, color=colors,
#     color_discrete_sequence=['red', 'blue'],
#     labels=dict(x='Feature', y='Linear coefficient'),
#     title='Weight of each feature for predicting petal width'
# )
# fig.show()

# Splitting our data to see how the budget tier affects our R^2 score

In [None]:
# Two price tiers: sales at or below 1,000,000 and sales above it.
low = homes_and_income[homes_and_income['price'] <= 1000000]
high = homes_and_income[homes_and_income['price'] > 1000000]

In [None]:
# Tiers to model separately, in the order they are processed by the loop below.
home_groups = [low, high]

In [None]:
# Fit and plot one interaction model per price tier.
for tier in home_groups:
    y = tier['price']
    X = tier[['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'waterfront',
                          'sqft_above', 'sqft_basement', 'yr_renovated', 'lat', 'sqft_living15',
                          'renovated', 'view_ord', 'grade_ord', 'Avg. Income/H/hold']]

    # Original columns plus every pairwise product (no squares, no bias column).
    linear_with_interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

    X_transformed = linear_with_interaction.fit_transform(X)

    feat_names = pd.Series(linear_with_interaction.get_feature_names())

    # Placeholder names ("x0", "x1", ...) mapped to the real column names.
    columns = X.columns
    xs = [f"x{position}" for position in range(len(columns))]
    map_dict = dict(zip(xs, columns))

    # Translate whole space-separated tokens instead of substrings: substring
    # replacement of "x1" also rewrites the front of "x10".."x13".
    feat_names = feat_names.apply(
        lambda name: " ".join(map_dict.get(token, token) for token in name.split(" "))
    )

    # Reuse the tier's index labels. Each tier is a subset of homes_and_income,
    # so a fresh 0..n-1 index would pair its rows with the zipcode indicators
    # of unrelated rows when joining on index.
    income_trans = pd.DataFrame(X_transformed, index=X.index)

    income_trans.columns = feat_names

    income_trans = income_trans.join(dummy_zip)

    # Generate train/test sets
    X_train, X_test, y_train, y_test = train_test_split(
        income_trans, y, test_size=.3, random_state=42
    )

    # Init, fit, score
    interactions_model = LinearRegression()
    interactions_model.fit(X_train, y_train)

    print(f"Training score: {interactions_model.score(X_train, y_train)}")

    # Score the held-out set once and reuse the value for the plot title.
    interactions_model_score = interactions_model.score(X_test, y_test)
    print(f"Testing score: {interactions_model_score}")

    # Predict on the whole tier and plot prediction against ground truth.
    X = income_trans
    y_pred = interactions_model.predict(income_trans)

    fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'},
                     title=f"Multi-Regression Model to Predict Home Sale Price in King County: R^2 = {interactions_model_score:.05f}")
    # Dashed y = x reference line: points on it are perfect predictions.
    fig.add_shape(
        type="line", line=dict(dash='dash'),
        x0=y.min(), y0=y.min(),
        x1=y.max(), y1=y.max()
    )

    fig.show()