In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file

In [None]:
# Location of the Zillow Home Value Index CSV (one row per ZIP code, per the
# file name). Set the ZHVI_FILE environment variable to use a different copy;
# the original local path stays as the default so existing runs are unchanged.
zhvi_file = os.environ.get(
    "ZHVI_FILE",
    "/home/zyang91/Desktop/data/zillow_home_value_index_by_zipcode.csv",
)

In [None]:
# Load the ZHVI table with RegionName read as text, then relabel every row as
# "zip/<RegionName>" so the index can be matched against the embeddings index.
zhvi_df = pd.read_csv(zhvi_file, dtype={"RegionName": str})
zhvi_df.index = zhvi_df["RegionName"].map("zip/{}".format)
zhvi_df

In [None]:
# Location of the ZCTA embeddings CSV. Set the ZCTA_EMBEDDINGS_FILE
# environment variable to use a different copy; the original local path stays
# as the default so existing runs are unchanged.
embeddings_file = os.environ.get(
    "ZCTA_EMBEDDINGS_FILE",
    "/home/zyang91/Desktop/us/zcta_embeddings.csv",
)

In [None]:
# Read the embeddings table, using its "place" column directly as the row index.
zipcode_embeddings = pd.read_csv(embeddings_file, index_col="place")
zipcode_embeddings

In [None]:
# Inner-join the ZHVI rows with the embeddings on their row indexes, so only
# index labels present in both frames survive.
data = zhvi_df.join(zipcode_embeddings, how="inner")
# Display the joined frame as the cell output.
data

In [None]:
# Column used as the regression target.
# NOTE(review): presumably the ZHVI value for that month-end date — confirm.
label = "2024-10-31"
# Names of the 330 embedding columns: "feature0" through "feature329".
embedding_features = [f"feature{dim}" for dim in range(330)]

In [None]:
# Discard every row whose target column is missing; other columns may still
# contain missing values after this step.
data = data.dropna(axis="index", how="any", subset=[label])
data

In [None]:
# Narrow the frame to the embedding columns plus the target, then separate
# predictors (X) from the target (y).
model_columns = [*embedding_features, label]
data = data.loc[:, model_columns]
X, y = data[embedding_features], data[label]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit an ordinary linear regression on the training split (fit() returns the
# fitted estimator, so construction and training are chained).
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Put the held-out actual values ("y") next to the current predictions
# ("y_pred"), then compute and print the evaluation metrics for that pairing.
evaluation_df = pd.DataFrame(data={"y": y_test, "y_pred": y_pred})
metrics = evaluate_model(evaluation_df)
print(metrics)

In [None]:
# The same (min, max) range is passed for both axes of the plot.
xy_lim = (0, 3000000)
plot_actual_vs_predicted(
    evaluation_df,
    title="Actual vs Predicted Home Values",
    x_label="Actual Home Value",
    y_label="Predicted Home Value",
    xlim=xy_lim,
    ylim=xy_lim,
)

In [None]:
# Build the actual-vs-predicted frame under the name `evaluate_df`, which the
# following cells read, and print its evaluation metrics.
evaluate_df = pd.DataFrame(data={"y": y_test, "y_pred": y_pred})
metrics = evaluate_model(evaluate_df)
print(metrics)

In [None]:
# Preview the first rows of the actual ("y") vs predicted ("y_pred") frame.
evaluate_df.head()

In [None]:
# The same (min, max) range is passed for both axes of the plot.
xy_lim = (0, 3_000_000)
plot_actual_vs_predicted(
    evaluate_df,
    title="Actual vs Predicted ZHVI",
    xlim=xy_lim,
    ylim=xy_lim,
    x_label="Actual",
    y_label="Predicted",
)

In [None]:
# Number of neighbours considered for each prediction.
k = 5
# Fit a k-nearest-neighbours regressor on the same training split (fit()
# returns the fitted estimator) and overwrite y_pred with its test predictions.
knn_model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

In [None]:
# Plot the KNN predictions. The frame is built here from the current `y_pred`
# (assigned by the KNN cell) instead of reusing `evaluate_df`: that frame was
# created from the linear-regression predictions, so plotting it under the KNN
# title would just repeat the linear-regression figure.
knn_evaluate_df = pd.DataFrame({"y": y_test, "y_pred": y_pred})
plot_actual_vs_predicted(
    knn_evaluate_df,
    x_label="Actual",
    y_label="Predicted",
    xlim=xy_lim,  # same axis range as the earlier plots, for comparison
    ylim=xy_lim,
    title=f"Actual vs Predicted ZHVI (KNN, k={k})",
)

In [None]:
# Rebuild the actual-vs-predicted frame from the current y_pred and print the
# evaluation metrics computed on it.
evaluation_df = pd.DataFrame(data={"y": y_test, "y_pred": y_pred})
metrics = evaluate_model(evaluation_df)
print(metrics)