In [1]:
import os
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file
import leafmap.maplibregl as leafmap

In [2]:
# Remote CSV of the county-level Zillow home value index, and the local
# filename it is cached under.
zhvi = "https://github.com/opengeos/datasets/releases/download/us/zillow_home_value_index_by_county.csv"
zhvo_file = "zillow_home_value_index_by_county.csv"

# Fetch only when no local copy exists, so re-running the cell skips the download.
already_downloaded = os.path.exists(zhvo_file)
if not already_downloaded:
    download_file(zhvi, zhvo_file)

In [3]:
# Read both FIPS code columns as strings instead of letting pandas infer a
# dtype; the index built below shows zero-padded codes (e.g. geoId/06037).
fips_columns = ("StateCodeFIPS", "MunicipalCodeFIPS")
zhvi_df = pd.read_csv(zhvo_file, dtype=dict.fromkeys(fips_columns, str))

# Index rows by "geoId/<state FIPS><county FIPS>", the same key format used
# by the "place" index of the county tables joined later.
state_fips, county_fips = (zhvi_df[name] for name in fips_columns)
zhvi_df.index = "geoId/" + state_fips + county_fips
zhvi_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,StateCodeFIPS,MunicipalCodeFIPS,2000-01-31,...,2024-04-30,2024-05-31,2024-06-30,2024-07-31,2024-08-31,2024-09-30,2024-10-31,2024-11-30,2024-12-31,2025-01-31
geoId/06037,3101,0,Los Angeles County,county,CA,CA,"Los Angeles-Long Beach-Anaheim, CA",6,37,206685.940073,...,848100.109098,851249.814267,853328.259951,856958.247702,862129.887015,868712.125738,873819.306305,877921.805494,881313.863788,880547.372613
geoId/17031,139,1,Cook County,county,IL,IL,"Chicago-Naperville-Elgin, IL-IN-WI",17,31,145737.060609,...,297797.137247,299609.927472,300652.787054,301303.254592,302118.600852,302781.699599,303133.869223,303523.016515,304181.858348,305000.265124
geoId/48201,1090,2,Harris County,county,TX,TX,"Houston-The Woodlands-Sugar Land, TX",48,201,109327.540062,...,282689.212331,283279.667076,283161.721769,282792.559444,282454.169156,282239.614216,281873.680565,281423.466788,281191.55441,281000.895149
geoId/04013,2402,3,Maricopa County,county,AZ,AZ,"Phoenix-Mesa-Chandler, AZ",4,13,142829.577207,...,467600.239765,468996.512594,469164.40105,468562.421846,467451.263586,466509.197532,465782.610071,465120.188374,464544.544452,463613.340231
geoId/06073,2841,4,San Diego County,county,CA,CA,"San Diego-Chula Vista-Carlsbad, CA",6,73,212384.876155,...,918694.464981,927384.144048,931835.8125,933267.733074,932833.552309,932389.735948,931683.464702,931874.623336,932380.083872,932486.30519


In [4]:
# Directory holding the local county datasets. Set the COUNTY_DATA_DIR
# environment variable to point at your own copy; when it is unset the
# original author's location is used, so existing setups keep working.
county_data_dir = os.environ.get("COUNTY_DATA_DIR", "/home/zyang91/Desktop/us")

# Path to the county boundary GeoJSON read in the next cell.
county_geojson = os.path.join(county_data_dir, "county.geojson")

In [5]:
# Load the county boundaries and key them by the "place" column so they can
# be index-joined with the other tables.
county_gdf = gpd.read_file(county_geojson).set_index("place")
county_gdf.head()

Unnamed: 0_level_0,index,geometry
place,Unnamed: 1_level_1,Unnamed: 2_level_1
geoId/29510,0,"POLYGON ((-90.31662 38.60398, -90.32008 38.593..."
geoId/32510,1,"POLYGON ((-120.00431 39.16561, -120.00297 39.1..."
geoId/51580,2,"POLYGON ((-80.00796 37.76462, -80.0085 37.7611..."
geoId/51530,3,"POLYGON ((-79.38112 37.74236, -79.38063 37.740..."
geoId/51678,4,"POLYGON ((-79.45401 37.79249, -79.46155 37.785..."


In [6]:
# Index-aligned left join: every home-value row is kept and gains the
# columns of county_gdf ("index", "geometry") where the geoId key matches.
df = zhvi_df.join(county_gdf, how="left")
df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,StateCodeFIPS,MunicipalCodeFIPS,2000-01-31,...,2024-06-30,2024-07-31,2024-08-31,2024-09-30,2024-10-31,2024-11-30,2024-12-31,2025-01-31,index,geometry
geoId/06037,3101,0,Los Angeles County,county,CA,CA,"Los Angeles-Long Beach-Anaheim, CA",06,037,206685.940073,...,853328.259951,856958.247702,862129.887015,868712.125738,873819.306305,877921.805494,881313.863788,880547.372613,218.0,"MULTIPOLYGON (((-118.67855 33.02634, -118.6748..."
geoId/17031,139,1,Cook County,county,IL,IL,"Chicago-Naperville-Elgin, IL-IN-WI",17,031,145737.060609,...,300652.787054,301303.254592,302118.600852,302781.699599,303133.869223,303523.016515,304181.858348,305000.265124,635.0,"POLYGON ((-88.26308 42.06686, -88.26272 41.986..."
geoId/48201,1090,2,Harris County,county,TX,TX,"Houston-The Woodlands-Sugar Land, TX",48,201,109327.540062,...,283161.721769,282792.559444,282454.169156,282239.614216,281873.680565,281423.466788,281191.554410,281000.895149,2362.0,"POLYGON ((-95.96052 30.16368, -95.92658 30.068..."
geoId/04013,2402,3,Maricopa County,county,AZ,AZ,"Phoenix-Mesa-Chandler, AZ",04,013,142829.577207,...,469164.401050,468562.421846,467451.263586,466509.197532,465782.610071,465120.188374,464544.544452,463613.340231,88.0,"POLYGON ((-113.33376 33.99923, -113.33389 32.5..."
geoId/06073,2841,4,San Diego County,county,CA,CA,"San Diego-Chula Vista-Carlsbad, CA",06,073,212384.876155,...,931835.812500,933267.733074,932833.552309,932389.735948,931683.464702,931874.623336,932380.083872,932486.305190,212.0,"POLYGON ((-117.61109 33.33401, -117.57155 33.3..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
geoId/31007,846,3206,Banner County,county,NE,NE,"Scottsbluff, NE",31,007,,...,321838.022818,323894.525181,330340.241603,340048.056447,348255.690352,351834.125302,356282.247220,361271.153620,1577.0,"POLYGON ((-104.0527 41.69797, -104.0523 41.393..."
geoId/49009,1648,3207,Daggett County,county,UT,UT,,49,009,,...,308761.377915,306617.625821,306556.018166,308889.348086,313334.046432,316926.936602,320636.922888,323907.317207,2519.0,"POLYGON ((-110.00028 40.81368, -109.99332 40.8..."
geoId/31171,1432,3208,Thomas County,county,NE,NE,,31,171,,...,160305.518515,160113.204884,160637.460741,161362.378182,162603.377093,162802.298298,162976.347349,162721.311102,1586.0,"POLYGON ((-100.84611 42.08818, -100.84436 41.9..."
geoId/31117,2794,3212,McPherson County,county,NE,NE,"North Platte, NE",31,117,,...,288823.914423,291261.158616,298216.019609,308135.413909,317357.571317,322473.200324,328341.338278,331465.598015,1574.0,"POLYGON ((-101.40612 41.74321, -101.40724 41.3..."


In [7]:
# The join above returned a plain DataFrame; wrap it as a GeoDataFrame with
# "geometry" as the active geometry column.
zhvi_gdf = gpd.GeoDataFrame(
    df,
    geometry="geometry",
)
zhvi_gdf.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,StateCodeFIPS,MunicipalCodeFIPS,2000-01-31,...,2024-06-30,2024-07-31,2024-08-31,2024-09-30,2024-10-31,2024-11-30,2024-12-31,2025-01-31,index,geometry
geoId/06037,3101,0,Los Angeles County,county,CA,CA,"Los Angeles-Long Beach-Anaheim, CA",6,37,206685.940073,...,853328.259951,856958.247702,862129.887015,868712.125738,873819.306305,877921.805494,881313.863788,880547.372613,218.0,"MULTIPOLYGON (((-118.67855 33.02634, -118.6748..."
geoId/17031,139,1,Cook County,county,IL,IL,"Chicago-Naperville-Elgin, IL-IN-WI",17,31,145737.060609,...,300652.787054,301303.254592,302118.600852,302781.699599,303133.869223,303523.016515,304181.858348,305000.265124,635.0,"POLYGON ((-88.26308 42.06686, -88.26272 41.986..."
geoId/48201,1090,2,Harris County,county,TX,TX,"Houston-The Woodlands-Sugar Land, TX",48,201,109327.540062,...,283161.721769,282792.559444,282454.169156,282239.614216,281873.680565,281423.466788,281191.55441,281000.895149,2362.0,"POLYGON ((-95.96052 30.16368, -95.92658 30.068..."
geoId/04013,2402,3,Maricopa County,county,AZ,AZ,"Phoenix-Mesa-Chandler, AZ",4,13,142829.577207,...,469164.40105,468562.421846,467451.263586,466509.197532,465782.610071,465120.188374,464544.544452,463613.340231,88.0,"POLYGON ((-113.33376 33.99923, -113.33389 32.5..."
geoId/06073,2841,4,San Diego County,county,CA,CA,"San Diego-Chula Vista-Carlsbad, CA",6,73,212384.876155,...,931835.8125,933267.733074,932833.552309,932389.735948,931683.464702,931874.623336,932380.083872,932486.30519,212.0,"POLYGON ((-117.61109 33.33401, -117.57155 33.3..."


In [8]:
# Month (column name) of the home value index to visualise.
column = "2024-10-31"

# Keep only the identifying columns, the chosen month and the geometry.
gdf = zhvi_gdf[
    [
        "RegionName",
        "State",
        column,
        "geometry",
    ]
]
gdf.head()

Unnamed: 0,RegionName,State,2024-10-31,geometry
geoId/06037,Los Angeles County,CA,873819.306305,"MULTIPOLYGON (((-118.67855 33.02634, -118.6748..."
geoId/17031,Cook County,IL,303133.869223,"POLYGON ((-88.26308 42.06686, -88.26272 41.986..."
geoId/48201,Harris County,TX,281873.680565,"POLYGON ((-95.96052 30.16368, -95.92658 30.068..."
geoId/04013,Maricopa County,AZ,465782.610071,"POLYGON ((-113.33376 33.99923, -113.33389 32.5..."
geoId/06073,San Diego County,CA,931683.464702,"POLYGON ((-117.61109 33.33401, -117.57155 33.3..."


In [9]:
# Flat choropleth of the selected month's home values, one polygon per county.
layer_title = "Zillow Home Median Home Value"

m = leafmap.Map(style="liberty")
m.add_data(gdf, column=column, cmap="Blues", legend_title=layer_title, name=layer_title)
m.add_layer_control()
m

Container(children=[Row(children=[Col(children=[Map(calls=[['addControl', ('NavigationControl', {'showCompass'…

In [10]:
# Same layer as the flat map, drawn on a map pitched at 60 with extrusion
# enabled and scale_factor=3.
layer_title = "Zillow Home Median Home Value"

m = leafmap.Map(pitch=60, style="liberty")
m.add_data(
    gdf,
    column=column,
    cmap="Blues",
    scale_factor=3,
    extrude=True,
    name=layer_title,
    legend_title=layer_title,
)
m.add_layer_control()
m

Container(children=[Row(children=[Col(children=[Map(calls=[['addControl', ('NavigationControl', {'showCompass'…

In [11]:
# Directory holding the local county datasets. Set the COUNTY_DATA_DIR
# environment variable to point at your own copy; when it is unset the
# original author's location is used, so existing setups keep working.
embeddings_dir = os.environ.get("COUNTY_DATA_DIR", "/home/zyang91/Desktop/us")

# Per-county embedding table, indexed by the "place" column so it can be
# index-joined with the other tables.
embeddings = pd.read_csv(
    os.path.join(embeddings_dir, "county_embeddings.csv")
).set_index("place")
embeddings.head()

Unnamed: 0_level_0,state,county,population,latitude,longitude,feature0,feature1,feature2,feature3,feature4,...,feature320,feature321,feature322,feature323,feature324,feature325,feature326,feature327,feature328,feature329
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
geoId/01001,AL,Autauga County,58761,32.532237,-86.646439,-0.059204,1.153834,0.286746,0.324178,1.003599,...,-0.059415,-0.165617,0.816348,-0.167234,2.099072,-0.168644,1.155944,1.620912,-0.089127,4.831956
geoId/01003,AL,Baldwin County,233420,30.659218,-87.746067,-0.089298,0.710666,0.358453,1.138983,2.012795,...,-0.167371,0.049021,0.892724,-0.061048,2.791007,-0.166232,0.276117,2.892894,-0.128016,4.151655
geoId/01005,AL,Barbour County,24877,31.870253,-85.405103,-0.089458,1.756277,0.749515,0.796651,0.555815,...,-0.110057,-0.013691,0.346536,-0.106772,2.986117,-0.104402,-0.0596,1.893656,-0.147867,0.032731
geoId/01007,AL,Bibb County,22251,33.015893,-87.127148,-0.117877,0.725561,0.436291,0.30715,1.347822,...,-0.16997,-0.168179,1.79639,-0.128314,1.39575,-0.084409,3.489884,1.249768,-0.02833,4.688914
geoId/01009,AL,Blount County,59077,33.977357,-86.56644,-0.13315,0.648858,0.592053,0.382824,1.48673,...,-0.167436,0.689164,2.527887,-0.045288,0.530547,-0.156357,1.227069,1.439654,-0.045479,3.491446


In [12]:
# Index-aligned left join: every embedding row is kept and gains the columns
# of county_gdf where the "place" key matches.
df = embeddings.join(county_gdf, how="left")

In [13]:
# Wrap the joined table as a GeoDataFrame with "geometry" as the active
# geometry column.
embeddings_gdf = gpd.GeoDataFrame(
    df,
    geometry="geometry",
)
embeddings_gdf.head()

Unnamed: 0_level_0,state,county,population,latitude,longitude,feature0,feature1,feature2,feature3,feature4,...,feature322,feature323,feature324,feature325,feature326,feature327,feature328,feature329,index,geometry
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
geoId/01001,AL,Autauga County,58761,32.532237,-86.646439,-0.059204,1.153834,0.286746,0.324178,1.003599,...,0.816348,-0.167234,2.099072,-0.168644,1.155944,1.620912,-0.089127,4.831956,56,"POLYGON ((-86.91743 32.66417, -86.9212 32.6565..."
geoId/01003,AL,Baldwin County,233420,30.659218,-87.746067,-0.089298,0.710666,0.358453,1.138983,2.012795,...,0.892724,-0.061048,2.791007,-0.166232,0.276117,2.892894,-0.128016,4.151655,64,"MULTIPOLYGON (((-87.44728 30.51503, -87.44562 ..."
geoId/01005,AL,Barbour County,24877,31.870253,-85.405103,-0.089458,1.756277,0.749515,0.796651,0.555815,...,0.346536,-0.106772,2.986117,-0.104402,-0.0596,1.893656,-0.147867,0.032731,51,"POLYGON ((-85.74827 31.61803, -85.54881 31.618..."
geoId/01007,AL,Bibb County,22251,33.015893,-87.127148,-0.117877,0.725561,0.436291,0.30715,1.347822,...,1.79639,-0.128314,1.39575,-0.084409,3.489884,1.249768,-0.02833,4.688914,20,"POLYGON ((-87.42192 33.0034, -87.42075 32.8846..."
geoId/01009,AL,Blount County,59077,33.977357,-86.56644,-0.13315,0.648858,0.592053,0.382824,1.48673,...,2.527887,-0.045288,0.530547,-0.156357,1.227069,1.439654,-0.045479,3.491446,15,"POLYGON ((-86.96227 33.85816, -86.96294 33.844..."


In [22]:
# Embedding dimension (column name) to visualise.
column = "feature329"

# Keep only the state, the chosen feature and the geometry.
gdf = embeddings_gdf[
    [
        "state",
        column,
        "geometry",
    ]
]
gdf.head()

Unnamed: 0_level_0,state,feature329,geometry
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
geoId/01001,AL,4.831956,"POLYGON ((-86.91743 32.66417, -86.9212 32.6565..."
geoId/01003,AL,4.151655,"MULTIPOLYGON (((-87.44728 30.51503, -87.44562 ..."
geoId/01005,AL,0.032731,"POLYGON ((-85.74827 31.61803, -85.54881 31.618..."
geoId/01007,AL,4.688914,"POLYGON ((-87.42192 33.0034, -87.42075 32.8846..."
geoId/01009,AL,3.491446,"POLYGON ((-86.96227 33.85816, -86.96294 33.844..."


In [16]:
# Flat choropleth of the selected embedding feature; the feature name doubles
# as both the legend title and the layer name.
m = leafmap.Map(style="liberty")
m.add_data(gdf, column=column, cmap="Blues", name=column, legend_title=column)
m.add_layer_control()
m

Container(children=[Row(children=[Col(children=[Map(calls=[['addControl', ('NavigationControl', {'showCompass'…

In [23]:
# Same embedding layer on a map pitched at 60, with extrusion enabled and
# scale_factor=0.00005.
m = leafmap.Map(pitch=60, style="liberty")
m.add_data(
    gdf,
    column=column,
    cmap="Blues",
    scale_factor=0.00005,
    extrude=True,
    name=column,
    legend_title=column,
)
m.add_layer_control()
m

Container(children=[Row(children=[Col(children=[Map(calls=[['addControl', ('NavigationControl', {'showCompass'…

In [24]:
# Inner join on the shared geoId index: only counties present in both the
# home-value table and the embedding table are kept.
data = zhvi_df.join(
    embeddings,
    how="inner",
)
data.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,StateCodeFIPS,MunicipalCodeFIPS,2000-01-31,...,feature320,feature321,feature322,feature323,feature324,feature325,feature326,feature327,feature328,feature329
geoId/06037,3101,0,Los Angeles County,county,CA,CA,"Los Angeles-Long Beach-Anaheim, CA",6,37,206685.940073,...,-7e-08,-0.001461,-0.035212,1.910339,-0.147769,0.003667,-0.164598,-1.2e-05,-0.0,1.520711
geoId/17031,139,1,Cook County,county,IL,IL,"Chicago-Naperville-Elgin, IL-IN-WI",17,31,145737.060609,...,-0.0785554,-0.16023,2.425814,-0.048814,-0.038273,-0.084591,2.993729,2.485846,-4.2e-05,2.314278
geoId/48201,1090,2,Harris County,county,TX,TX,"Houston-The Woodlands-Sugar Land, TX",48,201,109327.540062,...,-0.0344177,-0.008643,-0.157201,2.186765,5.300742,6.407553,0.9754,3.072631,-6e-06,1.600721
geoId/04013,2402,3,Maricopa County,county,AZ,AZ,"Phoenix-Mesa-Chandler, AZ",4,13,142829.577207,...,-0.1682885,-0.099206,4.709348,0.263209,0.575408,4.380611,6.630384,-0.166023,-0.000919,3.876134
geoId/06073,2841,4,San Diego County,county,CA,CA,"San Diego-Chula Vista-Carlsbad, CA",6,73,212384.876155,...,-0.00021949,-0.000305,0.408058,0.252055,-0.164628,0.019952,2.901626,-2.1e-05,-2e-06,0.716236


In [25]:
# Names of the embedding columns used as model inputs: feature0 .. feature329.
embedding_features = ["feature" + str(idx) for idx in range(330)]

# Column used as the regression target.
label = "2024-10-31"

In [26]:
# Keep only rows that have a value for the target column.
has_label = data[label].notna()
data = data[has_label]

In [27]:
# Narrow the table to the model inputs plus the target.
data = data[[*embedding_features, label]]

# Feature matrix and target vector.
x = data[embedding_features]
y = data[label]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Fit a linear regression on the training rows (fit() returns the estimator
# itself), then predict the held-out rows.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [30]:
# Pair each held-out actual value ("y", carrying the geoId index of y_test)
# with the model's prediction ("y_pred").
evaluation_df = pd.DataFrame({"y": y_test}).assign(y_pred=y_pred)

# Compute the summary metrics from the y / y_pred columns and print them.
metrics = evaluate_model(evaluation_df)
print(metrics)

{'r2': 0.8269914580761699, 'r': np.float64(0.9125487145276073), 'rmse': 61093.7358171606, 'mae': 43101.94549189267, 'mape': 0.1957361870440549}


In [35]:
# Use the same 0 .. 1,000,000 range on both axes.
xy_lim = (0, 1_000_000)

plot_actual_vs_predicted(
    evaluation_df,
    title="Linear Regression: Actual vs Predicted",
    x_label="Actual Home Value",
    y_label="Predicted Home Value",
    xlim=xy_lim,
    ylim=xy_lim,
)

In [36]:
# Attach the columns of the current `gdf` to each evaluated county (index
# join on geoId), then add the signed residual: actual minus predicted.
df = evaluation_df.join(gdf).assign(
    difference=lambda frame: frame["y"] - frame["y_pred"]
)


In [38]:
# Preview the first five rows of the joined evaluation table.
df.head(n=5)

Unnamed: 0,y,y_pred,state,feature329,geometry,category,color,difference
geoId/34017,598888.295081,776849.887625,NJ,-6e-08,"POLYGON ((-74.16456 40.75596, -74.1651 40.7427...",1,#f7fbff,-177961.592544
geoId/48411,192620.109978,180320.802383,TX,0.1929096,"POLYGON ((-99.09189 30.92198, -98.72061 30.921...",1,#f7fbff,12299.307595
geoId/21137,170778.967173,259197.493542,KY,3.470986,"MULTIPOLYGON (((-84.74777 37.58648, -84.74418 ...",4,#2171b5,-88418.526369
geoId/55007,275854.538422,300917.601055,WI,2.753146,"POLYGON ((-91.54939 46.89321, -91.55343 46.502...",4,#2171b5,-25063.062633
geoId/22081,120883.872924,76727.777705,LA,1.799954,"MULTIPOLYGON (((-93.15244 32.00524, -93.15168 ...",3,#6baed6,44156.095219


In [39]:
# Wrap the evaluation table as a GeoDataFrame with "geometry" as the active
# geometry column.
gdf = gpd.GeoDataFrame(
    df,
    geometry="geometry",
)
gdf.head()

Unnamed: 0,y,y_pred,state,feature329,geometry,category,color,difference
geoId/34017,598888.295081,776849.887625,NJ,-6e-08,"POLYGON ((-74.16456 40.75596, -74.1651 40.7427...",1,#f7fbff,-177961.592544
geoId/48411,192620.109978,180320.802383,TX,0.1929096,"POLYGON ((-99.09189 30.92198, -98.72061 30.921...",1,#f7fbff,12299.307595
geoId/21137,170778.967173,259197.493542,KY,3.470986,"MULTIPOLYGON (((-84.74777 37.58648, -84.74418 ...",4,#2171b5,-88418.526369
geoId/55007,275854.538422,300917.601055,WI,2.753146,"POLYGON ((-91.54939 46.89321, -91.55343 46.502...",4,#2171b5,-25063.062633
geoId/22081,120883.872924,76727.777705,LA,1.799954,"MULTIPOLYGON (((-93.15244 32.00524, -93.15168 ...",3,#6baed6,44156.095219


In [40]:
# Remove the helper columns that are not part of the evaluation result: the
# "category"/"color" columns and the feature column currently named by `column`.
#
# errors="ignore" plus reassignment (instead of inplace=True) keeps this cell
# re-runnable: the original in-place drop raised KeyError on a second run,
# and also whenever "category"/"color" were absent from the frame.
# NOTE(review): "category"/"color" appear to be added to the earlier `gdf`
# by the map cell's add_data call - confirm.
gdf = gdf.drop(columns=["category", "color", column], errors="ignore")
gdf.head()

Unnamed: 0,y,y_pred,state,geometry,difference
geoId/34017,598888.295081,776849.887625,NJ,"POLYGON ((-74.16456 40.75596, -74.1651 40.7427...",-177961.592544
geoId/48411,192620.109978,180320.802383,TX,"POLYGON ((-99.09189 30.92198, -98.72061 30.921...",12299.307595
geoId/21137,170778.967173,259197.493542,KY,"MULTIPOLYGON (((-84.74777 37.58648, -84.74418 ...",-88418.526369
geoId/55007,275854.538422,300917.601055,WI,"POLYGON ((-91.54939 46.89321, -91.55343 46.502...",-25063.062633
geoId/22081,120883.872924,76727.777705,LA,"MULTIPOLYGON (((-93.15244 32.00524, -93.15168 ...",44156.095219


In [41]:
# Map the signed residual (actual - predicted) per county.
#
# "difference" takes both negative and positive values, so a diverging
# colormap is used instead of the sequential "Blues": low and high residuals
# get visually distinct hues rather than lighter/darker shades of one color.
# NOTE(review): how values are binned into colors depends on add_data's
# classification defaults, so zero is not guaranteed to sit at the colormap's
# midpoint - confirm if an exact zero-centred scale is required.
m = leafmap.Map(style="liberty")
m.add_data(
    gdf,
    cmap="RdBu",
    column="difference",
    legend_title="Difference (Actual - Predicted)",
    name="Difference",
)
m.add_layer_control()
m

Container(children=[Row(children=[Col(children=[Map(calls=[['addControl', ('NavigationControl', {'showCompass'…