# Visualizing housing data with text-based features

### Import packages

In [None]:
import math
import json
import os
import warnings
warnings.filterwarnings(action="ignore")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

import plotly.graph_objects as go
import plotly.express as px

Define constants.

- ``PATH``: Path to the base data folder
- ``TEMPLATE``: Which plotly template to use
- ``MAPBOX_TOKEN``: Token for [mapbox](https://docs.mapbox.com/help/getting-started/access-tokens/)

In [None]:
# Plotly template applied to every figure in this notebook.
TEMPLATE = "plotly_white"

# Base data folder. Set the WIKIPEDIA_REAL_ESTATE_PATH environment variable to
# point the notebook at another location; the original machine-specific folder
# remains the default so existing setups keep working.
PATH = os.environ.get(
    "WIKIPEDIA_REAL_ESTATE_PATH",
    "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/",
)
# File names are appended to PATH by plain string concatenation further down,
# so make sure the folder always ends with a path separator.
if not PATH.endswith(("/", "\\")):
    PATH += "/"

# Mapbox access token. Prefer supplying it through the MAPBOX_TOKEN environment
# variable so the real key never has to be saved inside the notebook.
MAPBOX_TOKEN = os.environ.get("MAPBOX_TOKEN", "YOUR_KEY")

In [None]:
# Hand the Mapbox access token to plotly express before any map figure is built.
px.set_mapbox_access_token(MAPBOX_TOKEN)

Load the Wikipedia corpus.

In [None]:
# Read the selected Wikipedia articles. The encoding is stated explicitly:
# without it, open() falls back to the platform default (cp1252 on Windows),
# which can fail on or garble non-ASCII characters in UTF-8 encoded JSON.
# NOTE(review): the extension says ".ndjson", but the file is parsed as one
# single JSON document; json.load raises on genuinely newline-delimited
# input — confirm the file format.
with open(PATH+"wikipedia/wikipedia_selected.ndjson", encoding="utf-8") as fin:
    data_loaded = json.load(fin)

In [None]:
# Column labels are assigned by position to the loaded records.
allegheny_columns = ["title", "coords", "templates", "text", "wikilinks", "ex_links", "length"]

df_allegheny = pd.DataFrame(data_loaded)
df_allegheny.columns = allegheny_columns

# Split every (lat, long) pair of "coords" into two separate columns,
# walking over the pairs only once.
latitudes, longitudes = [], []
for pair_lat, pair_long in df_allegheny["coords"]:
    latitudes.append(pair_lat)
    longitudes.append(pair_long)
df_allegheny["latitude"] = latitudes
df_allegheny["longitude"] = longitudes

df_allegheny.head(10)

In [None]:
# Plot the location of every article in df_allegheny on a Mapbox map.
fig = px.scatter_mapbox(
    df_allegheny,
    lat="latitude",
    lon="longitude",
    mapbox_style="light",
    zoom=9,
    height=600,
    opacity=0.7,
    template=TEMPLATE,
)

# Save a static PNG copy, then display the figure.
fig.write_image(
    "../Visualisierungen/wikipedia_allegheny_coverage.png",
    width=700,
    height=600,
    scale=1.5,
)
fig.show()

## Visualize articles' impact on value

In [None]:
# Per-article values are stored as a CSV directly inside the base data folder.
article_values_csv = f"{PATH}wikipedia_article_values.csv"
articles_value = pd.read_csv(article_values_csv)

In [None]:
# Map every article, coloured by its "article_value" on a diverging red/blue
# scale clamped to a symmetric range of +/- 2,000,000.
fig = px.scatter_mapbox(
    articles_value,
    lat="article_lat",
    lon="article_long",
    color="article_value",
    mapbox_style="light",
    zoom=9,
    height=500,
    opacity=0.5,
    template=TEMPLATE,
    color_continuous_scale=px.colors.diverging.RdBu,
    range_color=[-2000000, 2000000],
)

# Use a fixed marker size on the scattermapbox trace.
fig.update_traces(
    marker={"size": 8},
    selector={"type": "scattermapbox"},
)

# Save a static PNG copy, then display the figure.
fig.write_image(
    "../Visualisierungen/wikipedia_article_values.png",
    width=800,
    height=700,
    scale=1.4,
)
fig.show()

In [None]:
# Box plot of the "article_value" column to inspect its distribution.
articles_value.boxplot(column=["article_value"])

### Wikipedia articles over USA

In [None]:
# Read the USA-wide article selection. The encoding is stated explicitly:
# without it, open() falls back to the platform default (cp1252 on Windows),
# which can fail on or garble non-ASCII characters in UTF-8 encoded JSON.
# NOTE(review): the extension says ".ndjson", but the file is parsed as one
# single JSON document; json.load raises on genuinely newline-delimited
# input — confirm the file format.
with open(PATH+"wikipedia/wikipedia_selected_usa.ndjson", encoding="utf-8") as fin:
    data_loaded = json.load(fin)

In [None]:
# The USA-wide records only carry a title and a coordinate pair.
usa_columns = ["title", "coords"]

df_full = pd.DataFrame(data_loaded)
df_full.columns = usa_columns

# Split every (lat, long) pair of "coords" into two separate columns,
# walking over the pairs only once.
latitudes, longitudes = [], []
for pair_lat, pair_long in df_full["coords"]:
    latitudes.append(pair_lat)
    longitudes.append(pair_long)
df_full["latitude"] = latitudes
df_full["longitude"] = longitudes

df_full.head(10)

In [None]:
# Plot every article in df_full on a Mapbox map; the article title is shown on hover.
fig = px.scatter_mapbox(
    df_full,
    lat="latitude",
    lon="longitude",
    hover_name="title",
    mapbox_style="light",
    zoom=3,
    height=600,
    opacity=0.2,
    template=TEMPLATE,
)

# Save a static PNG copy, then display the figure.
fig.write_image(
    "../Visualisierungen/wikipedia_usa_coverage.png",
    width=1500,
    height=750,
)
fig.show()

In [None]:
# Load the error table from the results folder and preview it.
errors_csv = f"{PATH}results/errors_soos_wiki.csv"
error_df = pd.read_csv(errors_csv)
error_df.head(10)

In [None]:
# Map each row of error_df, coloured by its "error" value on a diverging red/blue scale.
# NOTE(review): no range_color / color_continuous_midpoint is set here, so the
# neutral colour is not pinned to an error of zero — confirm this is intended.
fig = px.scatter_mapbox(
    error_df,
    lat="lat",
    lon="long",
    color="error",
    color_continuous_scale=px.colors.diverging.RdBu,
    mapbox_style="light",
    zoom=9,
    height=600,
    opacity=0.7,
    template=TEMPLATE,
)

# Save a static PNG copy, then display the figure.
fig.write_image(
    "../Visualisierungen/errors_soos_wiki.png",
    width=700,
    height=600,
    scale=1.5,
)
fig.show()

In [None]:
# Same point locations as the error table, drawn without any colour coding.
fig = px.scatter_mapbox(
    error_df,
    lat="lat",
    lon="long",
    mapbox_style="light",
    zoom=9,
    height=600,
    opacity=0.7,
    template=TEMPLATE,
)

# Save a static PNG copy, then display the figure.
fig.write_image(
    "../Visualisierungen/soos_quadrants.png",
    width=700,
    height=600,
    scale=1.5,
)
fig.show()

In [None]:
# Read the selected Wikipedia articles again (data_loaded was overwritten by
# the USA-wide selection in between). The encoding is stated explicitly:
# without it, open() falls back to the platform default (cp1252 on Windows),
# which can fail on or garble non-ASCII characters in UTF-8 encoded JSON.
# NOTE(review): the extension says ".ndjson", but the file is parsed as one
# single JSON document; json.load raises on genuinely newline-delimited
# input — confirm the file format.
with open(PATH+"wikipedia/wikipedia_selected.ndjson", encoding="utf-8") as fin:
    data_loaded = json.load(fin)

In [None]:
# Build a DataFrame from the loaded records (no column names are assigned here)
# and preview the first ten rows.
df = pd.DataFrame(data_loaded)
df.head(10)

In [None]:
# Length of the value stored in the first row, fourth column (positional index 3).
# NOTE(review): presumably the article text — position 3 is labelled "text"
# where this same file is loaded into df_allegheny; confirm.
len(df.iloc[0, 3])

## Choosing the right radius

In [None]:
# Collect one metrics table per result file whose name contains
# "structured_wiki_text". The four characters in front of the 4-character
# file extension serve as the dictionary key.
results = {}
result_path = PATH + "results"
for file_name in os.listdir(result_path):
    if "structured_wiki_text" not in file_name:
        continue
    radius_key = file_name[-8:-4]
    metrics_table = pd.read_csv(f"{result_path}/{file_name}")
    results[radius_key] = metrics_table
    # Label the rows with the metric they hold.
    metrics_table.index = ["MAE", "RMSE", "MAPE", "R^2"]

In [None]:
# Metrics of the "Catboost: S+T" column to put into the long-format table.
# Add "MAE", "MAPE" or "R^2" to this list to include further metrics.
metrics_to_plot = ["RMSE"]

# One [radius, value, metric] record per result table and selected metric.
results_lst = [
    [int(key), results[key].loc[metric, "Catboost: S+T"], metric]
    for key in results
    for metric in metrics_to_plot
]

# Passing the column names to the constructor also works when no result file
# was found, where assigning them afterwards would raise a length mismatch.
results_df = pd.DataFrame(data=results_lst, columns=["radius", "value", "metric"])

# The result tables were collected in directory-listing order, which is
# arbitrary. Sort by radius so a line plot connects the points from the
# smallest to the largest radius instead of zig-zagging.
results_df = results_df.sort_values(["metric", "radius"]).reset_index(drop=True)
results_df.head(10)

In [None]:
# Line chart of the metric value against the radius, one line per metric.
fig = px.line(
    results_df,
    x="radius",
    y="value",
    color="metric",
    template=TEMPLATE,
)
fig.show()