In [1]:
import pandas as pd
import numpy as np
import altair as alt
import config
import fidap
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt

# set up fidap connection
# NOTE: this rebinds the name `fidap` from the imported module to the client
# object returned by fidap_client(); every later `fidap.sql(...)` call in this
# notebook goes through that client. The API key is read from the local
# `config` module rather than being written out in the notebook.
fidap = fidap.fidap_client(api_key = config.api_key)

### America's Housing Crisis  
  
Everyone knows that the US is in the midst of a housing crisis where affordable housing is an oxymoronic phrase in some of the more desirable cities. An [NYT article](https://www.nytimes.com/2021/08/10/opinion/housing-crisis-eviction.html) from yesterday (August 10, 2021) shows why.   
  
[Research](https://www.apartmentlist.com/research/national-rent-data) has also pointed towards rising rental prices across the country.    
  
We can make use of Redfin data to identify which parts of the US have seen a revival in the housing market. We cannot use Zillow because its data ends at the end of March 2021.    

#### Transaction Volumes  
  
The easiest indicator is to look at the number of homes sold since January 2020.  

In [2]:
# Nationwide weekly sales: county-level totals summed per period_end, for
# 1-week periods that begin after 31 Dec 2019.
homes_sold_query = """
SELECT period_end, SUM(CAST(total_homes_sold AS FLOAT64)) AS total_homes_sold
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND CAST(period_begin AS DATE) > "2019-12-31"
AND region_type = 'county'
GROUP BY period_end
"""
weekly_homes_sold = fidap.sql(homes_sold_query)

# convert period_end to datetimes before handing the frame to Altair
weekly_homes_sold['period_end'] = pd.to_datetime(weekly_homes_sold['period_end'])

# line chart with a point marker on every weekly observation
homes_sold_date_axis = alt.X('period_end', title = "Date")
homes_sold_count_axis = alt.Y('total_homes_sold', title = "No. of Homes Sold")
(
    alt.Chart(weekly_homes_sold)
    .mark_line(point = True)
    .encode(x = homes_sold_date_axis, y = homes_sold_count_axis)
    .properties(title = "All Homes Sold")
)

What we can conclude is that across the country, transaction volumes have risen, and are higher than 2020.  
  
Which are some of the counties that have seen the most activity? 

In [3]:
# Rank counties within each week by homes sold and keep only the three
# highest-volume counties per period_end (weekly_rank < 4).
top_counties_query = """
WITH sales_sorted AS (
SELECT period_end, CAST(total_homes_sold AS FLOAT64) AS total_homes_sold, region_name,
ROW_NUMBER() OVER(PARTITION BY period_end ORDER BY CAST(total_homes_sold AS FLOAT64) DESC) AS weekly_rank
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND CAST(period_begin AS DATE) > "2019-12-31")

SELECT *
FROM sales_sorted
WHERE weekly_rank < 4
"""
redfin_sales_sorted_top = fidap.sql(top_counties_query)

# convert period_end to datetimes before plotting
redfin_sales_sorted_top['period_end'] = pd.to_datetime(redfin_sales_sorted_top['period_end'])

# one line per county; a county only has rows for the weeks in which it was in
# the top three, and the y encoding imputes missing values with None
top_counties_encodings = dict(
    x = alt.X('period_end', title = "Date"),
    y = alt.Y('total_homes_sold', title = "No. of Homes Sold", impute = alt.ImputeParams(value = None)),
    color = alt.Color('region_name', title = "County"),
)
alt.Chart(redfin_sales_sorted_top).mark_line().encode(**top_counties_encodings)

The counties with the highest transaction volumes are the big urban counties corresponding to Chicago, Houston, Los Angeles, and Phoenix. This did not change despite the pandemic. Cities remain desirable for buyers.  
  
At the same time, we also want to identify the counties which saw the biggest jump in sales, expressed as a percentage. We can compare July 2020 with July 2021 since housing sales generally peak in the summer months. By the way, this comparison is much easier to write in SQL (using CTEs) than in Python.   

In [11]:
# Year-over-year change in July home sales, per county.
# The query sums the weekly sales whose period_end falls in July 2020 and in
# July 2021, joins the two totals per county, and keeps only counties with more
# than 10 sales in July 2020 so the percentage is never computed against a
# zero or tiny base.
biggest_jumps = fidap.sql("""
WITH july_2020 AS (
SELECT SUM(CAST(total_homes_sold AS FLOAT64)) AS ths_jul20, region_name,
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND period_end LIKE '2020-07%'
GROUP BY region_name
),

july_2021 AS (
SELECT SUM(CAST(total_homes_sold AS FLOAT64)) AS ths_jul21, region_name,
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND period_end LIKE '2021-07%'
GROUP BY region_name
)

SELECT ju.region_name, ths_jul20, ths_jul21, ROUND(100*(ths_jul21-ths_jul20)/ths_jul20,2) AS homes_sold_delta_pct
FROM july_2021 AS ju 
INNER JOIN july_2020 AS ja
ON ju.region_name = ja.region_name
WHERE ths_jul20 > 10
""")

# Readable month labels (they end up in the chart legend), then order counties
# from the largest percentage jump to the smallest with a fresh 0..n-1 index.
biggest_jumps = (
    biggest_jumps
    .rename(columns = {
        'ths_jul20':'July 2020',
        'ths_jul21':'July 2021'
    })
    .sort_values('homes_sold_delta_pct', ascending = False)
    .reset_index(drop = True)
)

# The ten largest jumps. The earlier `.loc[1:10, :]` was label-based on the
# fresh index, so it silently skipped the top-ranked county (label 0) and
# returned ranks 2-11 instead.
biggest_jumps_top10 = biggest_jumps.head(10)

# long format: one row per (county, month) so the two months can be drawn as
# side-by-side bars
biggest_jumps_long = pd.melt(biggest_jumps_top10, id_vars = ['region_name', 'homes_sold_delta_pct'], value_vars = ['July 2020', 'July 2021'])

# plotting: one small panel per county, one bar per month

alt.Chart(biggest_jumps_long).mark_bar().encode(
    x = alt.X('variable', axis = alt.Axis(labels = False), title = None),
    y = alt.Y('value', title = "No. of Homes Sold"),
    color = alt.Color('variable', title = "Month"),
    column = alt.Column('region_name', title = "", header = alt.Header(labelAngle = 15, labelPadding = -30, labelOrient = 'top'))
)

In [23]:
# Per-county sales totals for period_end dates in 2020 and in 2021, joined to the
# public county boundary table (matched on "<lsad_name>, <state>") to pick up geo_id.
# Counties are ranked by volume in each year; rank_change is the 2020 rank minus
# the 2021 rank, so a positive value means the county moved up.
# Only counties with more than 50 sales in 2020 are kept.
ytd_volumes_query = """
WITH ths_2020 AS (
SELECT SUM(CAST(total_homes_sold AS FLOAT64)) AS ths_20, region_name
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND period_end LIKE '2020%'
GROUP BY region_name 
),

ths_2021 AS (
SELECT SUM(CAST(total_homes_sold AS FLOAT64)) AS ths_21, region_name,
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND period_end LIKE '2021%'
GROUP BY region_name
),

county_geom AS (
SELECT CONCAT(c.lsad_name, ", ", s.state) AS full_county_name, c.geo_id 
FROM bigquery-public-data.geo_us_boundaries.counties AS c
INNER JOIN bigquery-public-data.geo_us_boundaries.states AS s
ON s.state_fips_code = c.state_fips_code
)

SELECT ju.region_name AS county, cg.geo_id AS geoid,  ths_20, ths_21, (100*(ths_21/ths_20)) AS homes_sold_pct,
ROW_NUMBER() OVER (ORDER BY ths_20 DESC) AS rank_2020,
ROW_NUMBER() OVER (ORDER BY ths_21 DESC) AS rank_2021,
ROW_NUMBER() OVER (ORDER BY ths_20 DESC) - ROW_NUMBER() OVER (ORDER BY ths_21 DESC) AS rank_change
FROM ths_2021 AS ju 
INNER JOIN ths_2020 AS ja
ON ju.region_name = ja.region_name
INNER JOIN county_geom AS cg
ON ju.region_name = cg.full_county_name
WHERE ths_20 > 50 
"""
ytd_transaction_vols = fidap.sql(ytd_volumes_query)

In [26]:
# Histogram of 2021 sales as a percentage of 2020 sales, in 25-point-wide bins;
# bar height is the number of counties falling in each bin.
homes_sold_pct_binned = alt.Chart(ytd_transaction_vols).transform_bin(
    'homes_sold_bin',
    'homes_sold_pct',
    bin = alt.Bin(step = 25),
)
homes_sold_pct_binned.mark_bar().encode(
    x = alt.X('homes_sold_bin:Q',title = "Homes Sold in 2021 as % of 2020"),
    y = 'count()',
)

What we can see is that, as of the end of July, transaction volumes for 2021 across most counties are up to 75% of transaction volumes for the entire year of 2020. This is generally a little higher than expected (seven of twelve months would be roughly 58% at a constant pace). At the same time, we see that there is an extremely severe right skew, which suggests that demand has spiked in certain counties.  

In [24]:
# order counties by how many volume-rank positions they gained from 2020 to 2021,
# biggest climbers first, and show the first ten
ytd_transaction_vols = ytd_transaction_vols.sort_values(by = 'rank_change', ascending = False)
ytd_transaction_vols.iloc[:10]

Unnamed: 0,county,geoid,ths_20,ths_21,homes_sold_pct,rank_2020,rank_2021,rank_change
140,"Marion County, FL",12083,141,5594,3967.375887,1079,141,938
519,"Camden County, GA",13039,62,782,1261.290323,1252,520,732
575,"Bradley County, TN",47011,64,659,1029.6875,1246,576,670
581,"Coryell County, TX",48099,75,641,854.666667,1208,582,626
571,"Bibb County, GA",13021,101,669,662.376238,1147,572,575
590,"Bulloch County, GA",13031,128,616,481.25,1097,591,506
597,"Bay County, MI",26017,131,596,454.961832,1090,598,492
451,"Houston County, GA",13153,260,1045,401.923077,936,452,484
495,"Flagler County, FL",12035,225,894,397.333333,974,496,478
805,"Rabun County, GA",13241,61,300,491.803279,1256,806,450


Each county is first ranked by transaction volume in each year. Counties are then sorted in descending order by the number of places they have moved up. We see that the counties which have seen the biggest increases in transaction volumes are largely in the South. 

In [25]:
# re-order by 2021 volume expressed as a percentage of 2020 volume, highest first
ytd_transaction_vols = ytd_transaction_vols.sort_values(by = 'homes_sold_pct', ascending = False)

# counties whose 2021 volume already matches or exceeds their 2020 total
surpassed_2020 = ytd_transaction_vols['homes_sold_pct'] >= 100
top_transaction_counties = ytd_transaction_vols[surpassed_2020]
top_transaction_counties.iloc[:10]

Unnamed: 0,county,geoid,ths_20,ths_21,homes_sold_pct,rank_2020,rank_2021,rank_change
140,"Marion County, FL",12083,141,5594,3967.375887,1079,141,938
519,"Camden County, GA",13039,62,782,1261.290323,1252,520,732
575,"Bradley County, TN",47011,64,659,1029.6875,1246,576,670
581,"Coryell County, TX",48099,75,641,854.666667,1208,582,626
571,"Bibb County, GA",13021,101,669,662.376238,1147,572,575
805,"Rabun County, GA",13241,61,300,491.803279,1256,806,450
590,"Bulloch County, GA",13031,128,616,481.25,1097,591,506
597,"Bay County, MI",26017,131,596,454.961832,1090,598,492
828,"Jefferson County, TN",47089,70,284,405.714286,1223,829,394
451,"Houston County, GA",13153,260,1045,401.923077,936,452,484


If we rank counties by increments in transaction volumes in terms of percentage change, again we see that Southern counties take the top spots.

In [20]:
# Pull the state abbreviation (the text after the first ", ") out of the
# "<county>, <ST>" label. `.assign()` returns a new frame, so nothing is written
# into a filtered slice of another DataFrame; the previous `.loc[:, 'state'] = ...`
# assignment is what triggered pandas' SettingWithCopyWarning.
top_transaction_counties = top_transaction_counties.assign(
    state = top_transaction_counties['county'].str.split(", ", n = 1, expand = True)[1]
)

# number of these counties per state, states with the most counties first
top_transaction_counties_states = (
    top_transaction_counties
    .groupby('state')['county']
    .count()
    .reset_index()
    .sort_values('county', ascending = False)
)
top_transaction_counties_states.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,state,county
5,GA,22
4,FL,8
15,TX,7
13,OK,5
16,UT,4


Really, if we look at counties in the United States that have already surpassed 2020's transaction volumes by end-July 2021, they are most commonly found in Southern states like GA, TX, TN, OK, and FL.

#### Transaction Prices

We can view changes in property prices across the entire US first. 

In [27]:
# For each week, the 25th / 50th / 75th percentile (approximate) of the county-level
# median sale price per square foot, for 1-week periods beginning after 1 Jan 2020.
price_quartiles_query = """
SELECT period_end, APPROX_QUANTILES(CAST(median_sale_ppsf AS FLOAT64), 100)[OFFSET(50)] AS Median, APPROX_QUANTILES(CAST(median_sale_ppsf AS FLOAT64),100)[OFFSET(75)] AS ThirdQuartile, APPROX_QUANTILES(CAST(median_sale_ppsf AS FLOAT64),100)[OFFSET(25)] AS FirstQuartile
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND CAST(period_begin AS DATE) > '2020-01-01'
GROUP BY period_end 
"""
weekly_price_psf = fidap.sql(price_quartiles_query).reset_index()
weekly_price_psf['period_end'] = pd.to_datetime(weekly_price_psf['period_end'])

# long format: one row per (week, statistic) so each statistic becomes its own line
weekly_price_psf = weekly_price_psf.melt(
    id_vars = 'period_end',
    value_vars = ['Median', 'FirstQuartile', 'ThirdQuartile'],
)

weekly_price_encodings = dict(
    x = alt.X('period_end', title = "Date"),
    y = alt.Y('value', title = 'Sale Price ($/psf)'),
    color = 'variable',
)
alt.Chart(weekly_price_psf).mark_line(point = True).encode(**weekly_price_encodings)

What we can see is that prices have risen across the board. The rate of sales price increase per square foot is generally the same at different price levels. However, prices in the higher range seem to have increased at a faster rate. I suppose we can look at states where prices have risen the most.  
  
Due to data querying and downloading restrictions, we have to smooth out the data a little by increasing the duration to 12 weeks and only looking at transactions starting March 2020, when Covid-19 was just making its presence felt in the US.   

In [11]:
# Same quartile summary as the national chart, but grouped by state as well.
# The state is taken as the last two characters of region_name, the rows use the
# 12-week duration, and only periods beginning after 1 Mar 2020 are included.
state_price_quartiles_query = """
SELECT RIGHT(region_name, 2) AS state, period_end, APPROX_QUANTILES(CAST(median_sale_ppsf AS FLOAT64), 100)[OFFSET(50)] AS Median, APPROX_QUANTILES(CAST(median_sale_ppsf AS FLOAT64),100)[OFFSET(75)] AS ThirdQuartile, APPROX_QUANTILES(CAST(median_sale_ppsf AS FLOAT64),100)[OFFSET(25)] AS FirstQuartile
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '12 weeks'
AND region_type = 'county'
AND region_name NOT LIKE 'All Redfin Metros'
AND CAST(period_begin AS DATE) > '2020-03-01'
GROUP BY period_end, RIGHT(region_name, 2)
"""
state_weekly_price_psf = fidap.sql(state_price_quartiles_query).reset_index()
state_weekly_price_psf['period_end'] = pd.to_datetime(state_weekly_price_psf['period_end'])

# long format: one row per (week, state, statistic)
state_weekly_price_psf = state_weekly_price_psf.melt(
    id_vars = ['period_end', 'state'],
    value_vars = ['Median', 'FirstQuartile', 'ThirdQuartile'],
)

# plotting: lift Altair's row-count limit on embedded data, then draw a grid of
# small 100x100 panels, one per state, five panels per row
alt.data_transformers.disable_max_rows()
state_price_encodings = dict(
    x = alt.X('period_end', title = "Date"),
    y = alt.Y('value', title = 'Sale Price ($/psf)'),
    color = 'variable',
    facet = alt.Facet('state:O', columns = 5),
)
(
    alt.Chart(state_weekly_price_psf)
    .mark_line()
    .encode(**state_price_encodings)
    .properties(width = 100, height = 100)
)

Unfortunately, we do not have information on MT, ND, and SD. But what we have paints a rather interesting picture. There are states where housing prices have remained flat and others that have consistently risen such as HI, CA, WA, NJ, MA, and AZ. Interestingly, prices seem to have flatlined or even decreased in NE. In UT, CA, and HI, we see a much larger increase in prices at the higher price ranges than at the lower ranges.  

In [28]:
# Per-county average of the weekly median sale price per sq ft for period_end dates
# in January 2020 and in July 2021, with the percentage change between the two.
# Counties are ranked by price in each month; delta_rank is the Jan 2020 rank minus
# the Jul 2021 rank, so a positive value means the county moved up the price table.
price_jumps_query = """
WITH jan_2020 AS (
SELECT AVG(CAST(median_sale_ppsf AS FLOAT64)) AS jan20, region_name,
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND period_end LIKE '2020-01%'
GROUP BY region_name
),

july_2021 AS (
SELECT AVG(CAST(median_sale_ppsf AS FLOAT64)) AS jul21, region_name,
FROM fidap-301014.redfin.weekly_housing_market
WHERE duration = '1 weeks'
AND region_type = 'county'
AND period_end LIKE '2021-07%'
GROUP BY region_name
)

SELECT ju.region_name, jan20, jul21, ROUND(100*(jul21-jan20)/jan20,2) AS median_delta_pct,
ROW_NUMBER() OVER (ORDER BY jan20 DESC) AS jan20_rank,
ROW_NUMBER() OVER (ORDER BY jul21 DESC) AS jul21_rank,
(ROW_NUMBER() OVER (ORDER BY jan20 DESC) - ROW_NUMBER() OVER (ORDER BY jul21 DESC)) AS delta_rank 
FROM july_2021 AS ju 
INNER JOIN jan_2020 AS ja
ON ju.region_name = ja.region_name
WHERE jan20 IS NOT NULL
ORDER BY delta_rank DESC
"""
price_jumps = fidap.sql(price_jumps_query)

In [29]:
# display-friendly column labels for the price comparison table
ppsf_column_labels = {
    'region_name': 'County',
    'jan20': 'Jan 2020',
    'jul21': 'Jul 2021',
    'jan20_rank': 'Jan 2020 PPSF Rank',
    'jul21_rank': 'Jul 2021 PPSF Rank',
    'delta_rank': 'PPSF Rank Change',
    'median_delta_pct': 'PPSF Change Pct'
}

# relabel, then order by the number of price-rank places gained (largest first)
price_jumps = (
    price_jumps
    .rename(columns = ppsf_column_labels)
    .sort_values(by = 'PPSF Rank Change', ascending = False)
)

price_jumps.iloc[:10]

Unnamed: 0,County,Jan 2020,Jul 2021,PPSF Change Pct,Jan 2020 PPSF Rank,Jul 2021 PPSF Rank,PPSF Rank Change
0,"Greene County, IA",44.186047,396.986923,798.44,1394,43,1351
1,"Montgomery County, IA",37.775516,337.059209,792.27,1408,69,1339
2,"Coleman County, TX",39.741484,263.110925,562.06,1404,135,1269
3,"Owyhee County, ID",68.649886,253.625074,269.45,1256,151,1105
4,"Young County, TX",60.670139,185.671608,206.03,1318,343,975
5,"Hardy County, WV",53.261042,169.39424,218.05,1359,442,917
6,"Manistee County, MI",32.407407,162.156609,400.37,1416,506,910
7,"Fleming County, KY",64.519928,177.001825,174.34,1296,390,906
8,"Eastland County, TX",67.196504,176.498175,162.66,1276,393,883
9,"Lincoln County, WA",97.423606,508.394329,421.84,910,29,881


In [30]:
# re-order by percentage change in price per sq ft, largest increase first
price_jumps = price_jumps.sort_values(by = 'PPSF Change Pct', ascending = False)
price_jumps.iloc[:10]

Unnamed: 0,County,Jan 2020,Jul 2021,PPSF Change Pct,Jan 2020 PPSF Rank,Jul 2021 PPSF Rank,PPSF Rank Change
182,"Wabasha County, MN",179.850163,7647.848763,4152.34,180,1,179
122,"Gallatin County, KY",5.045389,104.550181,1972.19,1439,1182,257
406,"Jackson County, AR",6.870706,72.183642,950.6,1438,1374,64
111,"Jackson County, IN",10.26393,106.969121,942.18,1437,1162,275
0,"Greene County, IA",44.186047,396.986923,798.44,1394,43,1351
1,"Montgomery County, IA",37.775516,337.059209,792.27,1408,69,1339
242,"Crawford County, IN",11.431108,88.206661,671.64,1434,1304,130
29,"Angelina County, TX",19.843049,137.60764,593.48,1428,786,642
62,"Gibson County, TN",18.181818,120.941667,565.18,1430,999,431
2,"Coleman County, TX",39.741484,263.110925,562.06,1404,135,1269


Interestingly, we see that counties in the South are seeing a huge spike in prices. Some of these places are starting at a very low base. 

#### Relationship between Transaction Prices and Transaction Volumes  
  
Interestingly enough, there is no clear relationship between the two. Prices have not necessarily spiked in places where transaction volumes have shot up.   

In [34]:
# renaming cols
ytd_transaction_vols_renamed = ytd_transaction_vols.rename(columns = {
    'rank_change': 'Transaction Rank Change',
    'county': 'County',
    'homes_sold_pct': 'Transaction Pct'
})

# inner join
rank_change_comp = pd.merge(price_jumps[['County', 'PPSF Rank Change', 'PPSF Change Pct']], ytd_transaction_vols_renamed[['County', 'Transaction Rank Change', 'Transaction Pct']])

In [16]:
# plotting
alt.Chart(rank_change_comp).mark_point().encode(
    x = 'Transaction Pct',
    y = 'PPSF Change Pct'
)

In [17]:
# Pairwise Pearson correlations between the four change metrics.
# rank_change_comp also carries the string 'County' column; pandas >= 2.0 raises on
# non-numeric columns in DataFrame.corr(), so restrict to numeric columns first.
# select_dtypes is used instead of corr(numeric_only=True) because it also works on
# pandas versions that predate that keyword.
rank_change_comp.select_dtypes(include = 'number').corr()

Unnamed: 0,PPSF Rank Change,PPSF Change Pct,Transaction Rank Change,Transaction Pct
PPSF Rank Change,1.0,0.296447,-0.003809,-0.022869
PPSF Change Pct,0.296447,1.0,-0.013349,-0.004335
Transaction Rank Change,-0.003809,-0.013349,1.0,0.749835
Transaction Pct,-0.022869,-0.004335,0.749835,1.0


Okay, so there is no correlation between the change in median sale price per square foot and the change in transaction volumes. 

#### Affordable Urban Housing  
  
An even more interesting point to note here is that most of the counties highlighted above are not even major metropolitan areas! So does that imply that beneath the noise of rising housing prices in America's biggest cities, prices and transaction volumes are rising in other parts of the country?  
  
Let us look at the top 50 Metropolitan Statistical Areas and the counties that fall within these zones.  

In [31]:
# Counties lying inside the 50 most populous metropolitan statistical areas.
# The CTE keeps CBSAs flagged msa_indicator = '1', drops names ending in the listed
# two-letter codes, and takes the 50 largest by ACS 2018 5-year total population.
# A county is attached to an MSA when the MSA geometry contains the county geometry.
top_cbsa_query = """
WITH top_cbsa AS (
SELECT c2.name, c1.total_pop, c2.cbsa_geom
FROM bigquery-public-data.census_bureau_acs.cbsa_2018_5yr AS c1
INNER JOIN bigquery-public-data.geo_us_boundaries.cbsa AS c2
ON c1.geo_id = c2.geo_id
WHERE c2.msa_indicator = '1'
AND RIGHT(c2.name, 2) NOT IN ('GM', 'MP', 'HI', 'VI', 'AK', 'PR')
ORDER BY c1.total_pop DESC 
LIMIT 50)

SELECT c.lsad_name AS county_name, s.state, CONCAT(c.lsad_name, ", ", s.state) AS full_county_name, 
c.geo_id, tc.name AS msa_name, tc.total_pop, c.county_geom 
FROM bigquery-public-data.geo_us_boundaries.counties AS c, top_cbsa AS tc
INNER JOIN bigquery-public-data.geo_us_boundaries.states AS s
ON s.state_fips_code = c.state_fips_code
WHERE c.state_fips_code NOT IN ('02', '15', '66', '72', '69', '78', '60')
AND ST_CONTAINS(tc.cbsa_geom, c.county_geom);
"""
top_cbsa = fidap.sql(top_cbsa_query)

# convert to gdf: parse the WKT text in county_geom into geometries, then wrap the
# frame as a GeoDataFrame in EPSG:4326
county_shapes = gpd.GeoSeries.from_wkt(top_cbsa['county_geom'])
top_cbsa['county_geom'] = county_shapes
top_cbsa_counties_gdf = gpd.GeoDataFrame(
    top_cbsa,
    geometry = 'county_geom',
    crs = 'epsg:4326',
)

So let's take the top 100 counties which have seen the biggest jump in transaction volumes and median sale price per square foot.

In [32]:
# the 100 counties that gained the most places in the price-per-sq-ft ranking
top_100_price_rank_change = (
    price_jumps
    .sort_values(by = 'PPSF Rank Change', ascending = False)
    .head(100)
    .reset_index(drop = True)
    .loc[:, ['County', 'PPSF Rank Change']]
)

# keep only those that are also counties of the top-50 metro areas
top_cbsa_counties_price_gdf = top_cbsa_counties_gdf.merge(
    top_100_price_rank_change,
    left_on = 'full_county_name',
    right_on = 'County',
)
top_cbsa_counties_price_gdf

Unnamed: 0,county_name,state,full_county_name,geo_id,msa_name,total_pop,county_geom,County,PPSF Rank Change
0,Bates County,MO,"Bates County, MO",29013,"Kansas City, MO-KS",2106632,"POLYGON ((-94.61271 38.31312, -94.61271 38.312...","Bates County, MO",376
1,Morgan County,GA,"Morgan County, GA",13211,"Atlanta-Sandy Springs-Alpharetta, GA",5779463,"POLYGON ((-83.59019 33.71141, -83.59212 33.708...","Morgan County, GA",527


Of the top 100 counties which have seen the biggest jump in median sale price, only 3 counties were found in America's top 50 metropolitan areas. All three are in the South. 

In [35]:
# the 100 counties that gained the most places in the transaction-volume ranking
top_100_txn_rank_change = (
    ytd_transaction_vols_renamed
    .sort_values(by = 'Transaction Rank Change', ascending = False)
    .head(100)
    .reset_index(drop = True)
    .loc[:, ['County', 'Transaction Rank Change']]
)

# keep only those that are also counties of the top-50 metro areas
top_cbsa_counties_txn_gdf = top_cbsa_counties_gdf.merge(
    top_100_txn_rank_change,
    left_on = 'full_county_name',
    right_on = 'County',
)
top_cbsa_counties_txn_gdf

Unnamed: 0,county_name,state,full_county_name,geo_id,msa_name,total_pop,county_geom,County,Transaction Rank Change
0,Nassau County,FL,"Nassau County, FL",12089,"Jacksonville, FL",1475386,"POLYGON ((-81.46540 30.71126, -81.46862 30.712...","Nassau County, FL",360
1,Lafayette County,MO,"Lafayette County, MO",29107,"Kansas City, MO-KS",2106632,"POLYGON ((-94.11032 39.00456, -94.11039 39.002...","Lafayette County, MO",58
2,Medina County,TX,"Medina County, TX",48325,"San Antonio-New Braunfels, TX",2426204,"POLYGON ((-99.20038 29.09081, -99.20005 29.090...","Medina County, TX",48
3,Suffolk County,MA,"Suffolk County, MA",25025,"Boston-Cambridge-Newton, MA-NH",4811732,"POLYGON ((-71.03318 42.41421, -71.03323 42.414...","Suffolk County, MA",47
4,San Francisco County,CA,"San Francisco County, CA",6075,"San Francisco-Oakland-Berkeley, CA",4673221,"MULTIPOLYGON (((-123.17382 37.77573, -123.1737...","San Francisco County, CA",54
5,Pike County,GA,"Pike County, GA",13231,"Atlanta-Sandy Springs-Alpharetta, GA",5779463,"POLYGON ((-84.51836 33.10682, -84.51836 33.106...","Pike County, GA",44
6,Morgan County,GA,"Morgan County, GA",13211,"Atlanta-Sandy Springs-Alpharetta, GA",5779463,"POLYGON ((-83.59019 33.71141, -83.59212 33.708...","Morgan County, GA",163
7,Butts County,GA,"Butts County, GA",13035,"Atlanta-Sandy Springs-Alpharetta, GA",5779463,"POLYGON ((-83.96375 33.37851, -83.96354 33.378...","Butts County, GA",83
8,Lamar County,GA,"Lamar County, GA",13171,"Atlanta-Sandy Springs-Alpharetta, GA",5779463,"POLYGON ((-84.24837 33.07856, -84.24838 33.075...","Lamar County, GA",68
9,Chambers County,TX,"Chambers County, TX",48071,"Houston-The Woodlands-Sugar Land, TX",6779104,"POLYGON ((-94.93081 29.67376, -94.93096 29.673...","Chambers County, TX",43


And we see the same thing with transaction volumes. Of the top 100 counties with the highest change in transaction volumes, only 15 were found in America's biggest metropolitan areas. New York City's 5 boroughs made the list together with Jersey City, and so did Atlanta, and San Francisco.  
  
To expand a little on this, let us look at CBSA price movements next. 

In [36]:
# Weekly price per sq ft for the 10 most populous metropolitan statistical areas:
# counties are assigned to an MSA when the MSA geometry contains the county geometry,
# and the county-level weekly medians are then averaged per MSA and period_end
# (1-week periods beginning after 1 Jan 2020).
cbsa_price_query = """
WITH top_cbsa AS (
SELECT c2.name, c2.cbsa_geom
FROM bigquery-public-data.census_bureau_acs.cbsa_2018_5yr AS c1
INNER JOIN bigquery-public-data.geo_us_boundaries.cbsa AS c2
ON c1.geo_id = c2.geo_id
WHERE c2.msa_indicator = '1'
AND RIGHT(c2.name, 2) NOT IN ('GM', 'MP', 'HI', 'VI', 'AK', 'PR')
ORDER BY c1.total_pop DESC 
LIMIT 10),

cbsa_counties AS (
SELECT c.lsad_name AS county_name, s.state, CONCAT(c.lsad_name, ", ", s.state) AS full_county_name, tc.name AS msa_name, c.county_geom 
FROM bigquery-public-data.geo_us_boundaries.counties AS c, top_cbsa AS tc
INNER JOIN bigquery-public-data.geo_us_boundaries.states AS s
ON s.state_fips_code = c.state_fips_code
WHERE c.state_fips_code NOT IN ('02', '15', '66', '72', '69', '78', '60')
AND ST_CONTAINS(tc.cbsa_geom, c.county_geom))

SELECT cc.msa_name, period_end, 
AVG(CAST(median_sale_ppsf AS FLOAT64)) AS ppsf
FROM fidap-301014.redfin.weekly_housing_market AS w
INNER JOIN cbsa_counties AS cc
ON cc.full_county_name = w.region_name
WHERE duration = '1 weeks'
AND region_type = 'county'
AND CAST(period_begin AS DATE) > '2020-01-01'
GROUP BY period_end, cc.msa_name;
"""
cbsa_price = fidap.sql(cbsa_price_query)

# convert period_end to datetimes for the chart in the next cell
cbsa_price['period_end'] = pd.to_datetime(cbsa_price['period_end'])

If we look at the chart below, what we can observe is an increase in median sale price per square foot, yes, but expressed as a percentage, it is not the craziest either. But we must acknowledge that they started from a much higher base. 

In [37]:
# one line per metro area: average county-level median sale price per sq ft over time
msa_price_encodings = dict(
    x = alt.X('period_end', title = "Date"),
    y = alt.Y('ppsf', title = "Sale Price ($/ft2)"),
    color = 'msa_name',
)
alt.Chart(cbsa_price).mark_line().encode(**msa_price_encodings)