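First, use QGIS to intersect (or spatially join) the census tract FIPS codes with the listings point data for each metro area of interest, so that every listing point carries the `STATEFP`, `COUNTYFP`, and `TRACTCE` attributes of the tract it falls in. Then run this notebook.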

In [1]:
import pandas as pd, json
from geopandas import GeoDataFrame

In [2]:
# one row per metro area:
#   (short name used in printed messages and output file names,
#    tract shapefile path under census_shapefiles/ without the .shp extension,
#    listings point shapefile path under listings/ without the .shp extension,
#    COUNTYFP values of the counties to keep for that metro area)
metro_areas = [
    ('ca', 'ca_tracts/cb_2014_06_tract_500k', 'ca_points/ca_points', #sf bay area
     ['001', '013', '041', '055', '075', '081', '085', '087', '095', '097']),
    ('wa', 'wa_tracts/cb_2014_53_tract_500k', 'wa_points/wa_points', #seattle
     ['033']),
    ('ny', 'ny_tracts/cb_2014_36_tract_500k', 'ny_points/ny_points', #new york city
     ['005', '047', '061', '081', '085']),
]

# unzip the table into the four parallel lists that the processing loop zips back together
names, tract_shapes, point_shapes, metro_counties = [list(column) for column in zip(*metro_areas)]

In [3]:
# get fips code of state + county + tract
def get_tract_fips(row):
    """
    Concatenate a row's STATEFP, COUNTYFP and TRACTCE values into one string.

    row: any object indexable by those three keys (e.g. a dataframe row)
    returns: the three values formatted back-to-back, in that order
    """
    parts = [row[field] for field in ('STATEFP', 'COUNTYFP', 'TRACTCE')]
    return '{}{}{}'.format(*parts)

# for each tract in the shapefile, pull its attribute value from the passed in data set
def get_attribute(tract_fips, data):
    """
    Look up one tract's value in a data set indexed by tract fips code.

    tract_fips: the index label to look up
    data: an object exposing .index and .loc (e.g. a pandas Series of per-tract stats)
    returns: data.loc[tract_fips] if the label is in data.index, otherwise None
    """
    has_value = tract_fips in data.index
    return data.loc[tract_fips] if has_value else None

In [4]:
# For each metro area: load its tract polygons and listing points, keep only the listings in
# the metro's counties, compute per-tract summary stats and the change in median rent/sqft
# between the first and second half of the listings (ordered by date), bin both measures into
# quantiles, save the tracts as geojson, and print the value range of each median rent/sqft bin.
# print(...) is called with a single argument throughout so the cell runs under Python 2 and 3.
for name, tract_shape, point_shape, counties in zip(names, tract_shapes, point_shapes, metro_counties):
    # load the tract shapefiles from the census bureau
    tracts = GeoDataFrame.from_file('census_shapefiles/{}.shp'.format(tract_shape))

    # load the filtered data sets
    points = GeoDataFrame.from_file('listings/{}.shp'.format(point_shape))
    print('{}: {:,} points'.format(name, len(points)))

    # retain points only in the desired counties
    # .copy() gives an independent frame, so the column assignments below do not write to a
    # slice of the loaded data (which triggers pandas' SettingWithCopyWarning)
    points = points[points['COUNTYFP'].isin(counties)].copy()
    print(points['COUNTYFP'].value_counts())

    # get fips code of state + county + tract
    grouping = 'tract_fips'
    points[grouping] = points.apply(get_tract_fips, axis=1)
    tracts[grouping] = tracts.apply(get_tract_fips, axis=1)

    # calculate summary stats per tract, then add them as new columns to the tract shapefile
    # (group the listings once and reuse the grouping for every stat)
    listings_by_tract = points.groupby(grouping)
    tracts['median_rent'] = tracts[grouping].apply(get_attribute, data=listings_by_tract['rent'].median())
    tracts['median_sqft'] = tracts[grouping].apply(get_attribute, data=listings_by_tract['sqft'].median())
    tracts['median_rent_sqft'] = tracts[grouping].apply(get_attribute, data=listings_by_tract['rent_sqft'].median())
    tracts['mean_bedrooms'] = tracts[grouping].apply(get_attribute, data=listings_by_tract['bedrooms'].mean())

    # create a new categorical variable 'half' if row is in first or second half of data set by date
    points['date'] = pd.to_datetime(points['date'], format='%Y-%m-%d')
    points = points.sort_values(by='date')
    # floor division keeps the split point an integer on both Python 2 and 3; the rows are
    # already in date order, so label them by position: first half '1', remainder '2'
    half_size = len(points) // 2
    points['half'] = ['1'] * half_size + ['2'] * (len(points) - half_size)

    # calculate ratio of second half to first half med rent/sqft per tract
    halves = points.groupby([grouping, 'half'])['rent_sqft'].median().unstack()
    change = halves['2'] / halves['1']
    tracts['change_median_rent_sqft'] = tracts[grouping].apply(get_attribute, data=change)

    # create categorical variables for median rent/sqft and change in median rent/sqft over time
    num_bins = 5
    bin_labels = list(range(num_bins))
    tracts['median_rent_sqft_bin'] = pd.qcut(x=tracts['median_rent_sqft'], q=num_bins, labels=bin_labels)
    tracts['change_median_rent_sqft_bin'] = pd.qcut(x=tracts['change_median_rent_sqft'], q=num_bins, labels=bin_labels)

    # dump the tract shapefile to geojson string
    geojson = tracts.to_json()

    # save the geojson to file (text mode, because geojson is a string rather than bytes)
    filename = 'choropleth-maps/{}_tracts.geojson'.format(name)
    with open(filename, 'w') as output_file:
        output_file.write(geojson)
    # the geojson holds one feature per row of tracts, so there is no need to parse it back just to count
    print('{:,} tracts saved to {}'.format(len(tracts), filename))

    # get the quantiles' value ranges, skipping tracts that have no bin (no median rent/sqft)
    rent_sqft_bins = tracts['median_rent_sqft_bin']
    for cat in rent_sqft_bins.dropna().sort_values().unique():
        in_bin = tracts.loc[rent_sqft_bins == cat, 'median_rent_sqft']
        print('  {} bin {}: ${:.2f}-{:.2f}'.format(name.upper(), int(cat), in_bin.min(), in_bin.max()))
    print('')

ca: 106,588 points
085    46923
001    17386
013    12514
081     8694
075     8151
095     3944
097     2769
041     2442
087     1225
055      650
Name: COUNTYFP, dtype: int64
8,043 tracts saved to choropleth-maps/ca_tracts.geojson
  CA bin 0: $0.34-1.66
  CA bin 1: $1.66-2.01
  CA bin 2: $2.01-2.36
  CA bin 3: $2.36-2.81
  CA bin 4: $2.81-5.88

wa: 127,492 points
033    75077
Name: COUNTYFP, dtype: int64
1,454 tracts saved to choropleth-maps/wa_tracts.geojson
  WA bin 0: $0.64-1.13
  WA bin 1: $1.14-1.32
  WA bin 2: $1.32-1.50
  WA bin 3: $1.51-1.83
  WA bin 4: $1.83-2.83

ny: 68,029 points
061    29257
047    18156
081     5101
005      855
085      399
Name: COUNTYFP, dtype: int64
4,906 tracts saved to choropleth-maps/ny_tracts.geojson
  NY bin 0: $0.46-1.79
  NY bin 1: $1.79-2.18
  NY bin 2: $2.18-2.69
  NY bin 3: $2.69-3.46
  NY bin 4: $3.46-6.82

