In [1]:
#!pip install packagename
# importing modules
import geopandas as gpd
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib
import matplotlib.pyplot as plt
import os
from os import chdir as cd
import time
import fiona



In [2]:
# # RUN ONCE TO CREATE THE COMPILED FILE
# # import the tract shapefiles and concatenate them into a single GeoDataFrame
# # we do it once and then import the compiled file for analysis, since compiling it every time takes longer to run
# # ========================================

# from pathlib import Path
# # define the file location
# folder = Path(r"D:\Work\Box Sync\Trends_all states\Census Tract HUs\\")
# # reading the zip file
# shapefiles = folder.glob(r"Shapefiles\tl_2020_*_tract.zip")
# gdf_CTs = pd.concat([gpd.read_file(shp) for shp in shapefiles]).pipe(gpd.GeoDataFrame)
# gdf_CTs = gdf_CTs.set_crs("EPSG:4269")
# gdf_CTs.to_file(folder / 'compiled_CTs.shp')

In [None]:
# Load the compiled census-tract layer (compiled_CTs.shp, written once by the commented-out cell above).
US_CTs = gpd.read_file(r'D:\Work\Box Sync\Trends_all states\Census Tract HUs\compiled_CTs.shp')
# Reproject the geometries to EPSG:9311. This does not affect CT_area_sqmi below,
# which is derived from the ALAND attribute rather than from the geometry.
US_CTs = US_CTs.to_crs('EPSG:9311')
# converting census tract land area to square miles; the TIGER/Line ALAND unit is square meters
# (divide by 1,000,000 for square kilometers, multiply by 0.386102 for square miles)
US_CTs['CT_area_sqmi'] = US_CTs['ALAND'] * 0.386102 / 1000000
# Attribute-only view of the tracts: identifiers, names and land area (geometry column left out).
census_tracts = US_CTs[['STATEFP', 'COUNTYFP', 'TRACTCE', 'GEOID', 'NAME', 'NAMELSAD', 'CT_area_sqmi']]

# Housing unit counts from ACS 2020 (table B25001)
# downloaded from https://data.census.gov/advanced -- files can be found by searching for the table number, i.e. B25001
df_HousingUnits = pd.read_csv(r'D:\Work\Box Sync\Trends_all states\Census Tract HUs\ACSDT5Y2020.B25001-Data.csv')
df_HUs = (
    df_HousingUnits
    # skip the first data row (presumably the descriptive-label row of the ACS export -- confirm)
    # and keep only the first three columns
    .iloc[1:, :3]
    # renumber the rows; the old row labels are kept as an 'index' column
    .reset_index()
    .assign(
        # drop the first 9 characters of GEO_ID (presumably the '1400000US' prefix -- confirm)
        # so the value lines up with the GEOID used in the tract shapefile
        GEOID=lambda acs: acs['GEO_ID'].str[9:],
        # total housing units arrive as str; convert to float
        B25001_001E=lambda acs: acs['B25001_001E'].astype(float),
    )
    # friendlier column name for the housing unit total
    .rename(columns={'B25001_001E': 'HousingUnits'})
)
housing_units = df_HUs[['HousingUnits', 'GEOID']]

In [3]:
# TRACT TO PLACE CONVERSION FILES FROM GEOCORR2022
# Area-, population- and housing-unit-weighted variants exist; the first two are kept for reference only.
# df_tract_2_place_aw = pd.read_csv(r'D:\Work\Box Sync\Trends_all states\Census Tract HUs\Tract_2_Place_areaweighted.csv', header=1, encoding='latin-1')
# df_tract_2_place_pw = pd.read_csv(r'D:\Work\Box Sync\Trends_all states\Census Tract HUs\Tract_2_Place_populationweighted.csv', header=1, encoding='latin-1')
# header=1: column names are taken from the second line of the file.
df_tract_2_place_huw = pd.read_csv(
    r'D:\Work\Box Sync\Trends_all states\Census Tract HUs\Tract_2_Place_HUweighted.csv',
    encoding='latin-1',
    header=1,
)
# The housing-unit-weighted file is the one used for the analysis.
# This is an alias, not a copy: later column additions show up under both names.
df_tract_2_place = df_tract_2_place_huw
# df_tract_2_place.columns

In [4]:
# RENAMING AND FORMATTING VARIABLES
# Build zero-padded string identifiers that match the GEOID format of the tract data.
# Each code is cast to str and left-padded with '0' to a fixed width.
for code_col, width in (('State code', 2), ('County code', 5), ('Place code', 5)):
    df_tract_2_place[code_col] = df_tract_2_place[code_col].astype(str).str.rjust(width, '0')

# Place identifier = 2-character state code + 5-character place code.
# Both columns are already str at this point, so plain vectorized concatenation is used
# instead of a row-by-row ''.join over the whole frame.
df_tract_2_place['GEOID_place'] = df_tract_2_place['State code'] + df_tract_2_place['Place code']

# Split the tract number at the decimal point: the part before it is left-padded to 4 characters,
# the part after it is right-padded to 2 characters (e.g. '201.0' -> '0201' + '00').
df_tract_2_place[['Tract_a', 'Tract_b']] = df_tract_2_place['Tract'].astype(str).str.split('.', expand=True)
df_tract_2_place['Tract_a'] = df_tract_2_place['Tract_a'].str.rjust(4, '0')
df_tract_2_place['Tract_b'] = df_tract_2_place['Tract_b'].str.ljust(2, '0')

# Tract identifier = 5-character county code + 6-character tract code.
# NOTE(review): this presumes 'County code' already includes the state FIPS prefix -- confirm against the GeoCorr file.
# astype(str) keeps missing tract parts as text, exactly as the former row-wise join did.
df_tract_2_place['GEOID'] = (
    df_tract_2_place['County code']
    + df_tract_2_place['Tract_a'].astype(str)
    + df_tract_2_place['Tract_b'].astype(str)
)

# Only the columns needed downstream.
tract_2_place = df_tract_2_place[['GEOID_place', 'Place code', 'Place name',
                                  'Total housing units (2020 Census)',
                                  'tract-to-place allocation factor', 'GEOID']]


In [5]:
# (rows, columns) of the three inputs, shown as a quick size check before merging
tuple(frame.shape for frame in (census_tracts, housing_units, tract_2_place))

((85528, 7), (85395, 2), (143340, 6))

In [6]:
# Tract-level housing density
# Join the tract land area (derived from the Census ALAND attribute) with the tract housing unit counts.
# how='outer' keeps the rows of both frames; indicator=True adds a `_merge` column recording
# whether a row came from the left frame, the right frame, or both.
df_HU_density = census_tracts.merge(
    housing_units,
    on='GEOID',
    how='outer',
    indicator=True,
)
# Housing density in each census tract as HUs / square mile
# kept in square miles so the values can be compared with this report:
# https://bjs.ojp.gov/library/publications/classification-urban-suburban-and-rural-areas-national-crime-victimization
# NOTE(review): a tract with CT_area_sqmi == 0 would yield inf or NaN here -- confirm whether that needs handling.
df_HU_density['HU_density'] = df_HU_density['HousingUnits'] / df_HU_density['CT_area_sqmi']

# Tract-to-place rows with the tract housing unit count attached (same outer join / indicator settings).
df_CTs = tract_2_place.merge(
    housing_units,
    on='GEOID',
    how='outer',
    indicator=True,
)
# # unmatched rows
# df_HU_density[df_HU_density['_merge'] == 'left_only']

In [7]:
# Attach tract land area and density to every tract-to-place row (default inner join on the tract GEOID).
tract_density = df_HU_density[['GEOID', 'CT_area_sqmi', 'HU_density']]
df_HU_density_merged = df_CTs.merge(tract_density, on='GEOID')

# Keep only the rows whose tract was found in both the conversion file and the housing unit table;
# .copy() so the new column below is written to an independent frame.
matched = df_HU_density_merged['_merge'] == 'both'
df = df_HU_density_merged.loc[matched].copy()

# Numerator of the housing-unit-weighted density: tract density times the housing units on the row.
df['densityxHU'] = df['HU_density'] * df['Total housing units (2020 Census)']
# df['HUs'] = df['tract-to-place allocation factor'] * df['HousingUnits'] # just a check
# number of distinct places, and the size of the working frame
(df['GEOID_place'].nunique(), df.shape)

(31882, (143340, 12))

In [8]:
# Collapse the tract-to-place rows to one row per place.
place_aggregations = {
    'Place name': 'first',
    'Total housing units (2020 Census)': 'sum',
    # 'HUs': 'sum', # this is calculated from ACS data using the conversion factors found from GeoCORR2022
    'densityxHU': 'sum',
    # NOTE(review): HousingUnits is the whole-tract count repeated on every row of that tract,
    # so this sum counts a tract's full total once for each place it is linked to.
    'HousingUnits': 'sum',
}
df_weighted_density = df.groupby('GEOID_place').agg(place_aggregations).reset_index()

# Housing-unit-weighted density per square mile: sum(density x HU) / sum(HU) within each place.
place_total_units = df_weighted_density['Total housing units (2020 Census)']
df_weighted_density['weighted_HU_density_sqmi'] = df_weighted_density['densityxHU'] / place_total_units
# df_weighted_density['weighted_HU_density'] = df_weighted_density['densityxHU'] / df_weighted_density['HUs']
# the ten places with the lowest weighted density
df_weighted_density.sort_values(['weighted_HU_density_sqmi']).head(10)

Unnamed: 0,GEOID_place,Place name,Total housing units (2020 Census),densityxHU,HousingUnits,weighted_HU_density_sqmi
19181,3671620,"Stony Brook University CDP, NY",115.0,0.0,0.0,0.0
4668,1223005,"Florida Gulf Coast University CDP, FL",308.0,0.0,0.0,0.0
856,264380,"Prudhoe Bay CDP, AK",1.0,0.0,0.0,0.0
24630,4271210,"Slippery Rock University CDP, PA",436.0,0.0,0.0,0.0
23922,4240666,"Kutztown University CDP, PA",8.0,0.0,0.0,0.0
737,236990,"Kaktovik city, AK",91.0,0.946646,890.0,0.010403
616,204500,"Atqasuk city, AK",79.0,0.821813,890.0,0.010403
823,256320,"Nuiqsut city, AK",157.0,1.633224,890.0,0.010403
844,261700,"Point Lay CDP, AK",90.0,0.936243,890.0,0.010403
843,261630,"Point Hope city, AK",255.0,2.652689,890.0,0.010403


In [9]:
# df_weighted_density.isna().sum(), df.isna().sum(), 

In [10]:
# Saved for further use in urban–suburban–periurban–rural classification
# Raw string literal: the Windows path contains backslashes that must not be parsed as escape sequences.
df_weighted_density.to_csv(r'D:\Work\Box Sync\Trends_all states\Output from Analysis\weighted_housingUnits_for_places.csv')