In [1]:
import pandas as pd
import numpy as np
import altair as alt
import config
import fidap

# set up fidap connection
# The API key is read from the local `config` module rather than being hardcoded here.
# NOTE(review): this rebinds the name `fidap` from the imported module to the object
# returned by `fidap_client`; later cells call `fidap.sql(...)` on that object. After
# this line the module is no longer reachable as `fidap`, so re-running this cell in
# the same kernel presumably fails unless the client also exposes `fidap_client`
# -- TODO confirm, and consider a distinct name for the client.
fidap = fidap.fidap_client(api_key = config.api_key)

### America's Housing Crisis  
  
Everyone knows that the US is in the midst of a housing crisis where affordable housing is an oxymoronic phrase in some of the more desirable cities. An [NYT article](https://www.nytimes.com/2021/08/10/opinion/housing-crisis-eviction.html) from yesterday (August 10, 2021) shows why.   
  
[Research](https://www.apartmentlist.com/research/national-rent-data) has also pointed towards rising rental prices across the country.    
  
We can make use of data from Redfin, Zillow, and Apartment List to identify which parts of the US have seen a revival in the housing market.   

In [3]:
# Pull weekly, county-level Redfin housing-market rows whose period begins after 2019-12-31.
redfin_sales = fidap.sql("""
SELECT region_name, period_begin, period_end, total_homes_sold, total_active_listings, total_new_listings, median_active_list_ppsf, median_new_listing_ppsf, inventory, age_of_inventory, median_days_on_market, median_sale_ppsf, median_pending_sqft, percent_active_listings_with_price_drops, price_drop_percent_of_old_list_price, percent_homes_sold_above_list, percent_homes_sold_with_price_drops,
FROM fidap-301014.redfin.weekly_housing_market
WHERE CAST(period_begin AS DATE) > "2019-12-31"
AND duration = "1 weeks"
AND region_type = "county"
""")

# Parse both period boundaries into datetime64 so they can be compared and plotted as dates.
redfin_sales['period_begin'] = pd.to_datetime(redfin_sales['period_begin'])
redfin_sales['period_end'] = pd.to_datetime(redfin_sales['period_end'])

# Calendar month and year of each week, taken from the END of the period
# (a week that straddles two months is attributed to the later one).
redfin_sales['period_month'] = redfin_sales['period_end'].dt.month
redfin_sales['period_year'] = redfin_sales['period_end'].dt.year

In [19]:
# Preview the first five rows to sanity-check the query result and the derived columns.
redfin_sales.head(5)

Unnamed: 0,region_name,period_begin,period_end,total_homes_sold,total_active_listings,total_new_listings,median_active_list_ppsf,median_new_listing_ppsf,inventory,age_of_inventory,median_days_on_market,median_sale_ppsf,median_pending_sqft,percent_active_listings_with_price_drops,price_drop_percent_of_old_list_price,percent_homes_sold_above_list,percent_homes_sold_with_price_drops,period_month,period_year
0,"Calhoun County, AL",2020-01-06,2020-01-12,11.0,436.0,22.0,73.781388,84.894664,417.0,96.0,62.0,50.898204,1680.5,0.03211,0.07116,0.181818,0.181818,1,2020
1,"Etowah County, AL",2020-01-06,2020-01-12,13.0,383.0,21.0,75.478931,63.753582,367.0,107.0,76.0,66.666667,1650.0,0.041775,0.05159,0.076923,0.307692,1,2020
2,"Limestone County, AL",2020-01-06,2020-01-12,31.0,449.0,40.0,112.673213,113.056253,408.0,75.0,62.0,99.939394,1954.0,0.033408,0.033255,0.290323,0.16129,1,2020
3,"Talladega County, AL",2020-01-06,2020-01-12,14.0,287.0,16.0,87.889688,80.791223,274.0,100.0,100.5,89.505612,1759.0,0.031359,0.032676,0.214286,0.428571,1,2020
4,"Pima County, AZ",2020-01-06,2020-01-12,242.0,3894.0,416.0,147.095179,149.460422,3591.0,59.0,51.0,143.128821,1569.0,0.047252,0.033224,0.157025,0.268595,1,2020


#### Transaction Volumes  
  
The easiest indicator is to look at transaction volumes.  

In [27]:
# Weekly homes sold: total across all counties for each week (keyed by period end date).
# The aggregation is named by string ('sum') instead of passing Python's builtin `sum`:
# pandas currently maps the builtin to its own groupby sum, but emits a FutureWarning
# (pandas >= 2.1) because a future version will call the builtin directly.
weekly_homes_sold = (
    redfin_sales
    .groupby(['period_end'])
    .agg(homes_sold = ('total_homes_sold', 'sum'))
    .reset_index()
)

# plotting it
alt.Chart(weekly_homes_sold).mark_line(point = True).encode(
    x = alt.X('period_end', title = "Date"),
    y = alt.Y('homes_sold', title = "No. of Homes Sold")
)

In [31]:
# Monthly homes sold: total across all counties per (month, year), so the years can be
# overlaid on a shared month-of-year axis. Weeks are assigned to a month by their end date.
# The aggregation is named by string ('sum') instead of passing Python's builtin `sum`:
# pandas currently maps the builtin to its own groupby sum, but emits a FutureWarning
# (pandas >= 2.1) because a future version will call the builtin directly.
monthly_homes_sold = (
    redfin_sales
    .groupby(['period_month', 'period_year'])
    .agg(homes_sold = ('total_homes_sold', 'sum'))
    .reset_index()
)

# plotting it: one line per year (year treated as a nominal category for colouring)
alt.Chart(monthly_homes_sold).mark_line(point = True).encode(
    x = alt.X('period_month', title = "Month of Year"),
    y = alt.Y('homes_sold', title = "No. of Homes Sold"),
    color = 'period_year:N'
)

We can conclude that, across the country, transaction volumes have risen and are higher than in 2020.  
  
Which counties have seen the most activity? 

In [72]:
# Rank counties within each week by homes sold (dense ranking, 1 = most sales).
# `assign` returns a new frame, so `redfin_sales` itself is left untouched.
redfin_sales_sorted = redfin_sales.assign(
    weekly_rank = redfin_sales.groupby('period_end')['total_homes_sold'].rank(method = 'dense', ascending = False)
)

# keep only the rows ranked 1-3 in their week (ties share a rank, so a week may have more than 3 rows)
redfin_sales_sorted_top = redfin_sales_sorted.loc[redfin_sales_sorted['weekly_rank'] < 4]

# one line per county; weeks in which a county is not in the top 3 are imputed as
# null so the line breaks instead of bridging the gap
alt.Chart(redfin_sales_sorted_top).mark_line().encode(
    color = alt.Color('region_name', title = "County"),
    x = alt.X('period_end', title = "Date"),
    y = alt.Y(
        'total_homes_sold',
        title = "No. of Homes Sold",
        impute = alt.ImputeParams(value = None),
    ),
)

The counties with the highest transaction volumes are the big urban counties corresponding to Chicago, Houston, Los Angeles, and Phoenix. This did not change despite the pandemic. Cities remain desirable for buyers. 

In [146]:
# Compare homes sold in the weeks ending 2020-01-12 and 2021-07-25, county by county.
in_snapshot_weeks = (
    (redfin_sales['period_end'] == np.datetime64('2020-01-12'))
    | (redfin_sales['period_end'] == np.datetime64('2021-07-25'))
)
biggest_jumps = redfin_sales.loc[in_snapshot_weeks, ['region_name', 'period_end', 'total_homes_sold']].copy()

# the dates become plain strings so they can serve as column labels after the pivot
biggest_jumps['period_end'] = biggest_jumps['period_end'].astype(str)

# pivot: one row per county, one column per snapshot week
biggest_jumps = biggest_jumps.pivot(index = 'region_name', columns = 'period_end', values = 'total_homes_sold')

# a county missing from one of the two weeks is treated as having 0 sales (filled, not dropped)
biggest_jumps = biggest_jumps.fillna(0).reset_index()

# percentage change from the first snapshot week to the second, rounded to 2 decimals
biggest_jumps['difference_pct'] = (
    100 * ((biggest_jumps["2021-07-25"] - biggest_jumps["2020-01-12"]) / biggest_jumps["2020-01-12"])
).round(2)
biggest_jumps = biggest_jumps.sort_values('difference_pct', ascending = False)

# only keep counties with more than 10 transactions in the first week, so that tiny
# baselines (including the zero-filled ones) do not dominate the ranking
biggest_jumps = biggest_jumps[biggest_jumps['2020-01-12'] > 10].reset_index(drop = True)

# the ten counties with the largest percentage increase ...
biggest_jumps_counties = biggest_jumps['region_name'].head(10).tolist()

# ... reshaped to long form (one row per county per snapshot week) for plotting
biggest_jump_redfin = biggest_jumps[biggest_jumps['region_name'].isin(biggest_jumps_counties)].melt(
    id_vars = ['region_name', 'difference_pct'],
    value_vars = ["2021-07-25", "2020-01-12"],
)

In [200]:
# Small multiples: one panel per county, with a bar for each of the two snapshot weeks.
# The x-axis labels are hidden because the colour legend already identifies the week.
alt.Chart(biggest_jump_redfin).mark_bar().encode(
    column = alt.Column(
        'region_name',
        title = "",
        header = alt.Header(labelAngle = 15, labelPadding = -30, labelOrient = 'top'),
    ),
    x = alt.X('period_end', axis = alt.Axis(labels = False), title = None),
    y = alt.Y('value', title = "No. of Homes Sold"),
    color = alt.Color('period_end'),
)

In [155]:
# Slope view of the same ten counties: one line per county joining its homes-sold
# count in the two snapshot weeks.
# `biggest_jump_redfin` is the melted (long-form) frame, whose columns are
# region_name, difference_pct, period_end and value -- it has no `total_homes_sold`
# column, so the count must be read from `value` (as the bar chart on this frame does).
alt.Chart(biggest_jump_redfin).mark_line(opacity = 0.5).encode(
    x = alt.X('period_end', title = 'Date'),
    y = alt.Y('value', title = "No. of Homes Sold"),
    color = alt.Color('region_name', title = "County")
)

#### Transaction Prices

In [56]:
def q1(x):
    """Return the first quartile (25th percentile) of ``x``.

    Parameters
    ----------
    x : array-like of numbers

    Returns
    -------
    The 25th percentile of ``x`` as computed by ``np.percentile``
    (NaN values are not skipped).
    """
    # np.percentile takes q on a 0-100 scale; q = 0.25 would be the 0.25th percentile.
    return np.percentile(x, q = 25)
def q3(x):
    """Return the third quartile (75th percentile) of ``x``.

    Parameters
    ----------
    x : array-like of numbers

    Returns
    -------
    The 75th percentile of ``x`` as computed by ``np.percentile``
    (NaN values are not skipped).
    """
    # np.percentile takes q on a 0-100 scale; q = 0.75 would be the 0.75th percentile.
    return np.percentile(x, q = 75)

# Weekly distribution of county-level median sale price per square foot:
# the median and the 25% / 75% quantiles across counties (NaNs ignored),
# reshaped to long form with one row per (week, statistic) for plotting.
weekly_price_psf = (
    redfin_sales
    .groupby('period_end')
    .agg(
        Median = pd.NamedAgg(column = 'median_sale_ppsf', aggfunc = 'median'),
        FirstQuantile = pd.NamedAgg(
            column = 'median_sale_ppsf',
            aggfunc = lambda prices: np.nanquantile(prices, 0.25),
        ),
        ThirdQuantile = pd.NamedAgg(
            column = 'median_sale_ppsf',
            aggfunc = lambda prices: np.nanquantile(prices, 0.75),
        ),
    )
    .reset_index()
    .melt(id_vars = 'period_end', value_vars = ['Median', 'FirstQuantile', 'ThirdQuantile'])
)

# one line per statistic over time
alt.Chart(weekly_price_psf).mark_line(point = True).encode(
    color = 'variable',
    x = alt.X('period_end', title = "Date"),
    y = alt.Y('value', title = 'Sale Price ($/psf)'),
)

Prices have risen across the board. The rate of increase in sale price per square foot is broadly similar across price levels, although prices in the upper range seem to have risen somewhat faster. 

In [21]:
# Per county and calendar month: total homes sold, mean active listings, and mean of
# the weekly median active-list price per square foot. The result keeps
# (region_name, period_month, period_year) as its index.
# 'sum' is given by name instead of passing Python's builtin `sum`: pandas currently
# maps the builtin to its own groupby sum, but emits a FutureWarning (pandas >= 2.1)
# because a future version will call the builtin directly.
aggregated_sales = redfin_sales.groupby(['region_name', 'period_month', 'period_year']).agg(
    total_homes_sold = ('total_homes_sold', 'sum'),
    active_listings = ('total_active_listings', 'mean'),
    price_psf = ('median_active_list_ppsf', 'mean')
)

In [16]:
# list the dtype of every column in the query result
redfin_sales.dtypes

region_name                                         object
period_begin                                datetime64[ns]
period_end                                  datetime64[ns]
total_homes_sold                                   float64
total_active_listings                              float64
total_new_listings                                 float64
median_active_list_ppsf                            float64
median_new_listing_ppsf                            float64
inventory                                          float64
age_of_inventory                                   float64
median_days_on_market                              float64
median_sale_ppsf                                   float64
median_pending_sqft                                float64
percent_active_listings_with_price_drops           float64
price_drop_percent_of_old_list_price               float64
percent_homes_sold_above_list                      float64
percent_homes_sold_with_price_drops                float