In [None]:
import os
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file
import leafmap.maplibregl as leafmap

In [None]:
# Remote CSV with the Zillow Home Value Index (ZHVI) per county.
zhvi = "https://github.com/opengeos/datasets/releases/download/us/zillow_home_value_index_by_county.csv"
# The local copy keeps the remote file's name and lives in the working directory.
zhvo_file = os.path.basename(zhvi)
# Only hit the network when no local copy exists yet.
if not os.path.exists(zhvo_file):
    download_file(zhvi, zhvo_file)

In [None]:
# Read the two FIPS columns as text so they are not parsed as integers
# (which would strip any leading zeros).
zhvi_df = pd.read_csv(zhvo_file, dtype={"StateCodeFIPS": str, "MunicipalCodeFIPS": str})
# Index each row by "geoId/<state FIPS><county FIPS>"; later cells join on this index.
# County identifiers are 2-digit state + 3-digit county codes, so pad defensively:
# a no-op when the CSV is already zero-padded, and it prevents silently
# mismatched join keys when it is not.
zhvi_df.index = (
    "geoId/"
    + zhvi_df["StateCodeFIPS"].str.zfill(2)
    + zhvi_df["MunicipalCodeFIPS"].str.zfill(3)
)
zhvi_df.head()

In [None]:
# Path to the county boundary GeoJSON. The containing directory can be overridden
# through the US_DATA_DIR environment variable so the notebook is not tied to one
# machine; the original location stays the default.
county_geojson = os.path.join(
    os.environ.get("US_DATA_DIR", "/home/zyang91/Desktop/us"), "county.geojson"
)

In [None]:
# Load the county polygons and key them by the "place" column so they can be
# index-joined with the other tables.
county_gdf = gpd.read_file(county_geojson).set_index("place")
county_gdf.head()

In [None]:
# Index-aligned left join: every ZHVI row is kept and gains the county columns
# (including "geometry") where the index matches.
df = zhvi_df.join(county_gdf, how="left")
df

In [None]:
# Wrap the joined table in a GeoDataFrame, using its "geometry" column as the
# active geometry.
zhvi_gdf = gpd.GeoDataFrame(data=df, geometry="geometry")
zhvi_gdf.head(n=5)

In [None]:
# Month (column name) of the home value index to map.
column = "2024-10-31"
# Keep only what the map needs. Take an explicit copy: the map cells pass this frame
# to `add_data`, which appears to add helper columns to its input (a later cell drops
# "category"/"color") -- TODO confirm. Writing into a plain column slice of
# `zhvi_gdf` can trigger pandas' SettingWithCopyWarning.
gdf = zhvi_gdf[["RegionName", "State", column, "geometry"]].copy()
gdf.head()

In [None]:
# Flat choropleth of the selected ZHVI month; the same title labels both the
# layer and its legend.
layer_title = "Zillow Home Median Home Value"
m = leafmap.Map(style="liberty")
m.add_data(
    gdf,
    column=column,
    cmap="Blues",
    name=layer_title,
    legend_title=layer_title,
)
m.add_layer_control()
m

In [None]:
# Same choropleth as a tilted (pitch=60) map with extruded polygons.
extrusion_opts = {"extrude": True, "scale_factor": 3}
layer_title = "Zillow Home Median Home Value"
m = leafmap.Map(pitch=60, style="liberty")
m.add_data(
    gdf,
    column=column,
    cmap="Blues",
    name=layer_title,
    legend_title=layer_title,
    **extrusion_opts,
)
m.add_layer_control()
m

In [None]:
# Load the per-county embedding vectors, keyed by "place" like the county polygons.
# The containing directory can be overridden through the US_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original location stays
# the default.
embeddings = pd.read_csv(
    os.path.join(
        os.environ.get("US_DATA_DIR", "/home/zyang91/Desktop/us"),
        "county_embeddings.csv",
    )
).set_index("place")
embeddings.head()

In [None]:
# Index-aligned left join: attach the county columns (incl. "geometry") to every
# embedding row.
df = embeddings.join(county_gdf, how="left")

In [None]:
# Promote the joined embeddings table to a GeoDataFrame on its "geometry" column.
embeddings_gdf = gpd.GeoDataFrame(data=df, geometry="geometry")
embeddings_gdf.head(n=5)

In [None]:
# Embedding dimension to visualise.
column = "feature329"
# Keep only what the map needs. Take an explicit copy: the map cells pass this frame
# to `add_data`, which appears to add helper columns to its input (a later cell drops
# "category"/"color") -- TODO confirm. Writing into a plain column slice of
# `embeddings_gdf` can trigger pandas' SettingWithCopyWarning.
gdf = embeddings_gdf[["state", column, "geometry"]].copy()
gdf.head()

In [None]:
# Flat choropleth of the chosen embedding dimension; the column name doubles as
# layer name and legend title.
m = leafmap.Map(style="liberty")
m.add_data(gdf, column=column, cmap="Blues", name=column, legend_title=column)
m.add_layer_control()
m

In [None]:
# Tilted (pitch=60) view of the same embedding dimension with extruded polygons.
extrusion_opts = {"extrude": True, "scale_factor": 0.00005}
m = leafmap.Map(pitch=60, style="liberty")
m.add_data(
    gdf,
    column=column,
    cmap="Blues",
    name=column,
    legend_title=column,
    **extrusion_opts,
)
m.add_layer_control()
m

In [None]:
# Inner join on the index: keep only the keys present in both the ZHVI table
# and the embeddings table.
data = zhvi_df.join(other=embeddings, how="inner")
data.head(n=5)

In [None]:
# Predictor columns: "feature0" through "feature329".
embedding_features = ["feature" + str(idx) for idx in range(330)]
# Regression target: the ZHVI column for this month.
label = "2024-10-31"

In [None]:
# Discard rows that have no value for the target column.
data = data[data[label].notna()]

In [None]:
# Narrow the table to the predictors plus the target, then separate the two.
data = data[[*embedding_features, label]]
x, y = data[embedding_features], data[label]
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit an ordinary least-squares model on the training split (fit() returns the
# estimator itself), then predict the held-out rows.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Pair the actual values ("y", carrying the test-set index) with the predictions
# ("y_pred"), then compute and print the evaluation metrics.
evaluation_df = y_test.to_frame(name="y").assign(y_pred=y_pred)
metrics = evaluate_model(evaluation_df)
print(metrics)

In [None]:
# Use the same 0 to 1,000,000 range on both axes of the scatter plot.
xy_lim = (0, 1_000_000)
plot_actual_vs_predicted(
    evaluation_df,
    title="Linear Regression: Actual vs Predicted",
    x_label="Actual Home Value",
    y_label="Predicted Home Value",
    xlim=xy_lim,
    ylim=xy_lim,
)

In [None]:
# Attach the columns of `gdf` (including "geometry") to the evaluation rows by index
# and add the residual, actual minus predicted, as "difference".
df = evaluation_df.join(gdf).assign(
    difference=lambda frame: frame["y"] - frame["y_pred"]
)

In [None]:
# Preview the first five rows of the residual table.
df.head(n=5)

In [None]:
# Turn the residual table into a GeoDataFrame on its "geometry" column so it
# can be mapped.
gdf = gpd.GeoDataFrame(data=df, geometry="geometry")
gdf.head(n=5)

In [None]:
# Remove columns the difference map does not need: the embedding column named by
# `column`, plus "category"/"color" (presumably left behind by an earlier `add_data`
# call -- they exist only if those map cells were run).
# errors="ignore" together with reassignment (instead of inplace=True) keeps this cell
# idempotent: re-running it, or running it after skipping the earlier map cells,
# no longer raises KeyError for columns that are already absent.
gdf = gdf.drop(columns=["category", "color", column], errors="ignore")
gdf.head()

In [None]:
# Choropleth of the residuals (actual minus predicted home value).
m = leafmap.Map(style="liberty")
m.add_data(
    gdf,
    column="difference",
    cmap="Blues",
    name="Difference",
    legend_title="Difference (Actual - Predicted)",
)
m.add_layer_control()
m