In [1]:
# import libraries
from census import Census
import requests
import pandas as pd
import time

# --- USER CONFIG ---
CENSUS_API_KEY = "827d2bde6c4a712da4432fb6f1e392a040ee9c6b"   
c = Census(CENSUS_API_KEY)

In [2]:
# setup
# Representative Chinatown addresses - core locations for better geocoding
chinatowns = {
    "Boston": "88 Beach Street, Boston, MA 02111",
    "New York": "70 Bayard Street, New York, NY 10013",
    "Philadelphia": "1001 Race St, Philadelphia, PA 19107",
    "Washington DC": "701 H Street NW, Washington, DC 20001",
    "Cleveland": "2136 Rockwell Ave, Cleveland, OH 44114",
    "Chicago": "2206 S Wentworth Ave, Chicago, IL 60616",
    "Seattle": "668 S King St, Seattle, WA 98104",
    "Portland": "133 NW 4th Avenue, Portland, OR 97209",
    "Oakland": "388 9th St, Oakland, CA 94607",
    "San Francisco": "839 Stockton Street, San Francisco, CA 94108",
    "Fresno": "1001 F St, Fresno, CA 93706",
    "Los Angeles": "727 N Broadway, Los Angeles, CA 90012",
}

# Cities that need 2010 tract substitution (tract boundaries changed)
CROSSWALK_CITIES = ["Boston", "New York", "Washington DC", "Portland"]

# --- LOAD CROSSWALK FILE ---
# Assumes pipe-delimited file
crosswalk = pd.read_csv("crosswalk.txt", sep="|", dtype=str)
crosswalk = crosswalk[["GEOID_TRACT_20", "GEOID_TRACT_10"]].drop_duplicates()

In [4]:
# def functions
def get_2010_geoid_from_2020(geoid_20):
    """Return the 2010 GEOID that corresponds to the given 2020 GEOID."""
    match = crosswalk.loc[crosswalk["GEOID_TRACT_20"] == geoid_20, "GEOID_TRACT_10"]
    return match.iloc[0] if not match.empty else None

# --- FUNCTIONS ---
def geocode_to_tract(address, max_retries=3):
    """Use Census Geocoder to get tract GEOID for an address (2020 boundaries)."""
    url = "https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress"
    
    for attempt in range(max_retries):
        try:
            params = {
                "address": address,
                "benchmark": "Public_AR_Current",
                "vintage": "Current_Current",
                "format": "json"
            }
            r = requests.get(url, params=params, timeout=15)
            r.raise_for_status()
            data = r.json()
            
            matches = data.get("result", {}).get("addressMatches", [])
            if not matches:
                print(f"    No matches found for: {address}")
                return None, None, None
                
            tract = matches[0]["geographies"]["Census Tracts"][0]
            geoid = tract["GEOID"]
            state = tract["STATE"]
            county = tract["COUNTY"]
            
            return geoid, state, county
            
        except (IndexError, KeyError) as e:
            print(f"    Parse error (attempt {attempt+1}/{max_retries}): {e}")
        except requests.exceptions.RequestException as e:
            print(f"    Request error (attempt {attempt+1}/{max_retries}): {e}")
        
        if attempt < max_retries - 1:
            time.sleep(2)
    
    return None, None, None


def fetch_acs5_population(year, state_fips, county_fips, tract):
    """Fetch total and Asian population from ACS5 for a single tract/year."""
    try:
        # Only need total population and Asian alone
        variables = ("B01003_001E", "B02001_005E")
        
        data = c.acs5.state_county_tract(
            variables, state_fips, county_fips, tract, year=year
        )
        
        if data and len(data) > 0:
            total_pop = int(data[0].get("B01003_001E", 0))
            asian_pop = int(data[0].get("B02001_005E", 0))
            
            return {
                "year": year,
                "source": "acs5",
                "total_pop": total_pop,
                "asian_pop": asian_pop,
            }
        else:
            print(f"    No ACS5 data returned for {year}")
            return None
            
    except Exception as e:
        print(f"    Error fetching {year}: {e}")
        return None

In [11]:
# --- MAIN EXECUTION ---
results = []

# Full year range from 2010 to 2023
years = list(range(2010, 2025))  # 2010-2023

print("="*60)
print("CHINATOWN DEMOGRAPHICS - ACS5 DATA COLLECTION (2010-2023)")
print("="*60)

for city, addr in chinatowns.items():
    print(f"\n📍 Processing {city}...")
    
    # Get 2020 tract boundaries
    geoid_2020, state_fips, county_fips = geocode_to_tract(addr)
    
    if not geoid_2020:
        print(f"  ❌ Could not find tract for {city}")
        continue

    tract_2020 = geoid_2020[-6:]
    print(f"  ✓ 2020 GEOID: {geoid_2020}")
    print(f"    State: {state_fips}, County: {county_fips}, Tract: {tract_2020}")
    
    # If city needs crosswalk, get 2010 tract
    geoid_2010 = None
    tract_2010 = None
    if city in CROSSWALK_CITIES:
        geoid_2010 = get_2010_geoid_from_2020(geoid_2020)
        if geoid_2010:
            tract_2010 = geoid_2010[-6:]
            print(f"  ✓ 2010 GEOID: {geoid_2010} (via crosswalk)")
        else:
            print(f"  ⚠ Warning: No 2010 tract found in crosswalk for {geoid_2020}")
            print(f"    Will attempt to use 2020 tract for all years")

    # Process each year
    for y in years:
        # Determine which tract to use based on year and city
        if city in CROSSWALK_CITIES and y <= 2019 and geoid_2010:
            # Use 2010 tract boundaries for 2010-2019
            tract_for_year = tract_2010
            geoid_for_year = geoid_2010
            period_label = "2010-2019"
        else:
            # Use 2020 tract boundaries for 2020-2023 (or all years for non-crosswalk cities)
            tract_for_year = tract_2020
            geoid_for_year = geoid_2020
            period_label = "2020-2023" if y >= 2020 else "default"
        
        # Only print tract info on first use or when switching periods
        if y == 2010 or (city in CROSSWALK_CITIES and y == 2020):
            print(f"    → Using tract {tract_for_year} for {period_label}")
        
        pop = fetch_acs5_population(y, state_fips, county_fips, tract_for_year)
        if pop:
            pop["city"] = city
            pop["tract_geoid"] = geoid_for_year
            results.append(pop)
            print(f"    ✓ {y}: Total Pop={pop['total_pop']:,}, Asian Pop={pop['asian_pop']:,}")
        
        time.sleep(0.3)  # Rate limiting

print("\n" + "="*60)
print("COLLECTED DATA")
print("="*60)

df = pd.DataFrame(results)
if not df.empty:
    # Display full results
    print(df[['city', 'year', 'total_pop', 'asian_pop', 'tract_geoid']].to_string(index=False))
    
    print("\n" + "="*60)
    print("DATA SUMMARY")
    print("="*60)
    print(f"Total records collected: {len(df)}")
    print(f"Cities with data: {df['city'].nunique()}")
    print(f"Year range: {df['year'].min()} - {df['year'].max()}")
    
    # Records per city
    print("\nRecords per city:")
    print(df.groupby('city').size().sort_values(ascending=False))
    
    # Save to CSV
    df.to_csv('chinatown_demographics.csv', index=False)
    print("\n✅ Data saved to chinatown_demographics.csv")
else:
    print("❌ No data collected.")

CHINATOWN DEMOGRAPHICS - ACS5 DATA COLLECTION (2010-2023)

📍 Processing Boston...
  ✓ 2020 GEOID: 25025070202
    State: 25, County: 025, Tract: 070202
  ✓ 2010 GEOID: 25025070101 (via crosswalk)
    → Using tract 070101 for 2010-2019
    ✓ 2010: Total Pop=4,694, Asian Pop=1,516
    ✓ 2011: Total Pop=5,174, Asian Pop=1,593
    ✓ 2012: Total Pop=5,494, Asian Pop=1,182
    ✓ 2013: Total Pop=5,597, Asian Pop=1,510
    ✓ 2014: Total Pop=5,499, Asian Pop=1,210
    ✓ 2015: Total Pop=5,668, Asian Pop=1,301
    ✓ 2016: Total Pop=6,212, Asian Pop=1,901
    ✓ 2017: Total Pop=6,496, Asian Pop=1,842
    ✓ 2018: Total Pop=7,075, Asian Pop=2,447
    ✓ 2019: Total Pop=7,409, Asian Pop=2,364
    → Using tract 070202 for 2020-2023
    ✓ 2020: Total Pop=3,727, Asian Pop=2,638
    ✓ 2021: Total Pop=3,626, Asian Pop=2,531
    ✓ 2022: Total Pop=4,233, Asian Pop=2,413
    ✓ 2023: Total Pop=4,136, Asian Pop=2,419
    Error fetching 2024: Geography is not available in 2024. Available years include (2023, 2022

In [10]:
df_demo = pd.read_csv('chinatown_demographics.csv')
df_housing = pd.read_csv('chinatown_housing_economics.csv')

df_merged = pd.merge(df_demo, df_housing, 
                     on=['city', 'year', 'tract_geoid'], 
                     how='left',
                     suffixes=('_demo', '_housing'))

df_merged.to_csv('merged.csv', index=False)