# Visualizing structured housing data

### Import packages

In [None]:
import math
import json
import warnings
warnings.filterwarnings(action="ignore")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression

import plotly.graph_objects as go
import plotly.express as px

Define constants.

- ``PATH``: Path to the base data folder
- ``TEMPLATE``: Which plotly template to use
- ``MAPBOX_TOKEN``: Token for [mapbox](https://docs.mapbox.com/help/getting-started/access-tokens/)

In [None]:
import os

# Plotly template applied to the plotly figures in this notebook.
TEMPLATE = "plotly_white"
# Base data folder, resolved against the current user's home directory instead
# of one specific machine's path. The trailing slash is kept because file
# names are appended with plain string concatenation.
PATH = os.path.expanduser("~") + "/.keras/datasets/wikipedia_real_estate/"
# Mapbox access token: taken from the MAPBOX_TOKEN environment variable so the
# real key does not have to be written into the notebook; falls back to the
# placeholder when the variable is not set.
MAPBOX_TOKEN = os.environ.get("MAPBOX_TOKEN", "YOUR_KEY")

In [None]:
# Register the Mapbox access token with plotly express (used by the
# scatter_mapbox figure further down in this notebook).
px.set_mapbox_access_token(MAPBOX_TOKEN)

Load data

In [None]:
# Read both CSV files from the base data folder:
# df           -> structured_visualization.csv
# df_processed -> structured_preprocessed.csv
df = pd.read_csv(f"{PATH}structured_visualization.csv")
df_processed = pd.read_csv(f"{PATH}structured_preprocessed.csv")

In [None]:
# Keep only the columns from LOTAREA through longitude (label-based slice,
# both end labels included), then display the remaining column names.
df_processed = df_processed.loc[:,"LOTAREA":"longitude"]
df_processed.columns

In [None]:
# Preview the first ten rows of the preprocessed data.
df_processed.head(10)

In [None]:
# Numeric columns of the raw frame, restricted to the label range
# LOTAREA..FINISHEDLIVINGAREA; the resulting column names are displayed.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_numeric = (
    df.select_dtypes(include=numerics)
      .loc[:, "LOTAREA":"FINISHEDLIVINGAREA"]
)
df_numeric.columns

In [None]:
# 2-D histogram of the house coordinates: longitude on the x axis, latitude on
# the y axis, 200 bins per axis. The Series are passed directly; no list copy
# is needed.
heatmap, xedges, yedges = np.histogram2d(df["longitude"], df["latitude"], bins=200)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

# No plt.clf() here: with no figure open it would create (and display) an
# extra empty figure before the real one.
plt.figure(figsize=(9, 9))
# histogram2d puts x along the first array axis, imshow expects rows = y,
# hence the transpose; origin="lower" puts the smallest latitude at the bottom.
plt.imshow(heatmap.T, extent=extent, origin="lower", interpolation="none")
plt.show()

In [None]:
# Pairwise correlations of the columns LOTAREA..FINISHEDLIVINGAREA of the
# preprocessed data, drawn as an annotated seaborn heatmap.
correlation_mat = (
    df_processed
    .loc[:, "LOTAREA":"FINISHEDLIVINGAREA"]
    .corr()
)

plt.figure(figsize=(15, 10))
sns.heatmap(correlation_mat, cmap="viridis", annot=True)
plt.show()

In [None]:
# Summary statistics of the sale price in the preprocessed data.
df_processed["SALEPRICE"].describe()

In [None]:
# Largest sale price in the raw (visualization) data.
np.max(df["SALEPRICE"])

In [None]:
# Drop every sale below 5000; all later cells work on this filtered frame.
# NOTE(review): presumably this removes nominal / non-market transfers - confirm.
df_processed = df_processed[df_processed["SALEPRICE"] >= 5000]

# Scatter of the individual sale prices against the construction year with an
# OLS trendline; the y axis is logarithmic. The title now describes this plot
# (it previously repeated the title of the averaged line chart).
fig = px.scatter(df_processed, x="YEARBLT", y="SALEPRICE", template=TEMPLATE, trendline="ols", log_y=True,
                 title="Sale price over year built (log scale) with a fitted trendline")
fig.show()

In [None]:
# Average sale price and fair-market value per construction year.
by_year_built = df_processed[["YEARBLT", "SALEPRICE", "FAIRMARKETTOTAL"]].groupby(by=["YEARBLT"])
df_year_grouped = by_year_built.mean()
df_year_grouped["count"] = by_year_built.size()
# Keep only years with more than 50 houses. The copy makes the result an
# independent frame so the column assignments below (and in the regression
# cell) do not write into a slice.
df_year_grouped = df_year_grouped[df_year_grouped["count"] > 50].copy()
df_year_grouped["year"] = df_year_grouped.index

# Reshape to long format (one row per year and series) for plotly express.
# melt replaces the former row-by-row loop: it keeps the integer year dtype
# and still yields the expected columns when no year passes the filter.
df_long = (
    df_year_grouped
    .rename(columns={"SALEPRICE": "saleprice", "FAIRMARKETTOTAL": "fairmarket"})
    .melt(id_vars="year", value_vars=["saleprice", "fairmarket"],
          var_name="type", value_name="value")
    [["type", "value", "year"]]
)

# plot
fig = px.line(df_long, x="year", y="value", color="type", template=TEMPLATE,
              title="Average saleprice against fair market estimate over year built")
fig.show()

In [None]:
# Fit a straight line to the average sale price per construction year.
# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
X = np.array(df_year_grouped["year"]).reshape(-1, 1)
Y = np.array(df_year_grouped["SALEPRICE"])

reg = LinearRegression().fit(X, Y)
df_year_grouped["fit"] = reg.predict(X)

# Reshape to long format with the two series "saleprice" and "fit".
# melt replaces the former row-by-row loop and keeps the year dtype intact.
df_long = (
    df_year_grouped
    .rename(columns={"SALEPRICE": "saleprice"})
    .melt(id_vars="year", value_vars=["saleprice", "fit"],
          var_name="type", value_name="value")
    [["type", "value", "year"]]
)

# plot
fig = px.line(df_long, x="year", y="value", color="type", template=TEMPLATE,
              title="Average saleprice over year built with a fitted regression line")
fig.show()

In [None]:
# Histogram of the construction year (YEARBLT) in the filtered preprocessed
# data, with nbins=100.
fig = px.histogram(x=df_processed["YEARBLT"], nbins=100, template=TEMPLATE)
fig.show()

In [None]:
# Histogram of the raw sale prices with labelled axes; the figure is written
# to a PNG file and then displayed.
fig = (
    px.histogram(x=df["SALEPRICE"], nbins=150, template=TEMPLATE)
    .update_xaxes(title_text="sale price")
    .update_yaxes(title_text="count")
)
fig.write_image(
    "../Visualisierungen/saleprice_histogram.png",
    width=800,
    height=500,
)
fig.show()

In [None]:
# List the columns currently present in the preprocessed frame.
df_processed.columns

In [None]:
# Derive the sale year from the last four characters of SALEDATE and insert
# it as the fourth column. The guard keeps the cell re-runnable: inserting a
# column that already exists raises a ValueError.
if "SALEYEAR" not in df_processed.columns:
    sale_year = df_processed["SALEDATE"].str[-4:].astype(int)
    df_processed.insert(3, "SALEYEAR", sale_year)
df_processed.head(10)

In [None]:
# Histogram of the sale year (SALEYEAR column added above), with nbins=50.
fig = px.histogram(x=df_processed["SALEYEAR"], nbins=50, template=TEMPLATE)
fig.show()

In [None]:
# Mean sale price per sale year, restricted to years with more than 50 sales.
by_sale_year = df_processed[["SALEPRICE", "SALEYEAR"]].groupby(by=["SALEYEAR"])
df_year_grouped = by_sale_year.mean()
df_year_grouped["count"] = by_sale_year.size()

has_enough_sales = df_year_grouped["count"] > 50
df_year_grouped = df_year_grouped[has_enough_sales]
# expose the index (the sale year) as a regular column for plotting
df_year_grouped["saleyear"] = df_year_grouped.index

# line chart of the yearly averages
fig = px.line(
    df_year_grouped,
    x="saleyear",
    y="SALEPRICE",
    template=TEMPLATE,
    title="Average saleprice over year sold",
)
fig.show()

In [None]:
# Map of all houses in the filtered preprocessed data, coloured by sale price,
# drawn on the Mapbox "light" base style. The access token is registered via
# px.set_mapbox_access_token earlier in the notebook.
fig = px.scatter_mapbox(df_processed, lat="latitude", lon="longitude", color="SALEPRICE",
                        mapbox_style="light", zoom=9, height=600, opacity=0.7, template=TEMPLATE)
# NOTE(review): assumes the ../Visualisierungen directory already exists - confirm.
fig.write_image("../Visualisierungen/sale_price_spatial.png", width=700, height=600, scale=1.5)
fig.show()

In [None]:
# CDU plotted against CONDITION with an OLS trendline, followed by the
# correlation of the two columns rounded to two decimals.
fig = px.scatter(
    df_processed,
    x="CONDITION",
    y="CDU",
    trendline="ols",
    template=TEMPLATE,
    title="CDU against condition with a fitted trendline",
)
fig.show()

cor = df_processed["CONDITION"].corr(df_processed["CDU"])
print("Correlation: " + str(round(cor, 2)))

Create scatter plot visualization

In [None]:
# Synthetic example data: 300 uniform x values in [0, 1) and
# y = 2x + 3 plus normally distributed noise scaled by 1/3.
x = np.random.rand(300)
residuals = np.random.randn(300) / 3
y = x * 2 + 3 + residuals

fig = px.scatter(x=x, y=y, trendline="ols", template=TEMPLATE,
                 labels=dict(x="x-axis", y="y-axis"))

# Restyle the trendline: it is the trace drawn in "lines" mode, so select it
# by mode and make it thick and red in a single update.
fig.update_traces(line_width=4, line_color="red", selector=dict(mode="lines"))

fig.write_image("../Visualisierungen/linear_regression.png", width=800, height=500, scale=1.3)

fig.show()