Data Collection through Census API

In [1]:
pip install census us pandas requests

Collecting census
  Downloading census-0.8.24-py3-none-any.whl (11 kB)
Collecting us
  Downloading us-3.2.0-py3-none-any.whl (13 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jellyfish
  Downloading jellyfish-1.2.0-cp311-cp311-macosx_11_0_arm64.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.2/325.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.23.2
  Downloading numpy-2.3.3-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0

In [36]:
from census import Census
import requests
import pandas as pd
import time

# --- USER CONFIG ---
CENSUS_API_KEY = "827d2bde6c4a712da4432fb6f1e392a040ee9c6b"   
c = Census(CENSUS_API_KEY)

In [42]:
# Representative Chinatown addresses - core locations for better geocoding
chinatowns = {
    "Boston": "88 Beach Street, Boston, MA 02111",
    "New York": "70 Bayard Street, New York, NY 10013",
    "Philadelphia": "1001 Race St, Philadelphia, PA 19107",
    "Washington DC": "701 H Street NW, Washington, DC 20001",
    "Cleveland": "2136 Rockwell Ave, Cleveland, OH 44114",
    "Chicago": "2206 S Wentworth Ave, Chicago, IL 60616",
    "Seattle": "668 S King St, Seattle, WA 98104",
    "Portland": "133 NW 4th Avenue, Portland, OR 97209",
    "Oakland": "388 9th St, Oakland, CA 94607",
    "San Francisco": "839 Stockton Street, San Francisco, CA 94108",
    "Fresno": "1001 F St, Fresno, CA 93706",
    "Los Angeles": "727 N Broadway, Los Angeles, CA 90012",
}

# Cities that need 2010 tract substitution
CROSSWALK_CITIES = ["Boston", "New York", "Washington DC", "Portland"]

# --- LOAD CROSSWALK FILE ---
# Assumes tab- or pipe-delimited file
crosswalk = pd.read_csv("crosswalk.txt", sep="|", dtype=str)
crosswalk = crosswalk[["GEOID_TRACT_20", "GEOID_TRACT_10"]].drop_duplicates()

def get_2010_geoid_from_2020(geoid_20):
    """Return the 2010 GEOID that corresponds to the given 2020 GEOID."""
    match = crosswalk.loc[crosswalk["GEOID_TRACT_20"] == geoid_20, "GEOID_TRACT_10"]
    return match.iloc[0] if not match.empty else None

# --- FUNCTIONS ---
def geocode_to_tract(address, max_retries=3):
    """Use Census Geocoder to get tract GEOID for an address."""
    url = "https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress"
    
    for attempt in range(max_retries):
        try:
            params = {
                "address": address,
                "benchmark": "Public_AR_Current",
                "vintage": "Current_Current",
                "format": "json"
            }
            r = requests.get(url, params=params, timeout=15)
            r.raise_for_status()
            data = r.json()
            
            matches = data.get("result", {}).get("addressMatches", [])
            if not matches:
                print(f"    No matches found for: {address}")
                return None, None, None
                
            tract = matches[0]["geographies"]["Census Tracts"][0]
            geoid = tract["GEOID"]
            state = tract["STATE"]
            county = tract["COUNTY"]
            
            return geoid, state, county
            
        except (IndexError, KeyError) as e:
            print(f"    Parse error (attempt {attempt+1}/{max_retries}): {e}")
        except requests.exceptions.RequestException as e:
            print(f"    Request error (attempt {attempt+1}/{max_retries}): {e}")
        
        if attempt < max_retries - 1:
            time.sleep(2)
    
    return None, None, None


def fetch_acs5_population(year, state_fips, county_fips, tract):
    """Fetch total, Asian, and Chinese population from ACS5 for a single tract/year."""
    try:
        variables = ("B01003_001E", "B02001_005E", "B02015_006E")
        try:
            data = c.acs5.state_county_tract(
                variables, state_fips, county_fips, tract, year=year
            )
            if data and len(data) > 0:
                total_pop = int(data[0].get("B01003_001E", 0))
                asian_pop = int(data[0].get("B02001_005E", 0))
                chinese_pop = int(data[0].get("B02015_006E", 0))
                return {
                    "year": year,
                    "source": "acs5",
                    "total_pop": total_pop,
                    "asian_pop": asian_pop,
                    "chinese_pop": chinese_pop,
                }
            else:
                print(f"    No ACS5 data returned for {year}")
                return None
        except Exception as e:
            if "B02015_006E" in str(e) or "unknown variable" in str(e).lower():
                print(f"    Note: Chinese detail not available for {year}, using Asian only")
                data = c.acs5.state_county_tract(
                    ("B01003_001E", "B02001_005E"), state_fips, county_fips, tract, year=year
                )
                if data and len(data) > 0:
                    total_pop = int(data[0].get("B01003_001E", 0))
                    asian_pop = int(data[0].get("B02001_005E", 0))
                    return {
                        "year": year,
                        "source": "acs5",
                        "total_pop": total_pop,
                        "asian_pop": asian_pop,
                        "chinese_pop": None,
                    }
                else:
                    print(f"    No ACS5 data returned for {year}")
                    return None
            else:
                raise
    except Exception as e:
        print(f"    Error fetching {year}: {e}")
        return None

In [43]:
# --- MAIN EXECUTION ---
results = []

# ACS5 years only
years = [2010, 2015, 2020, 2023]

print("="*60)
print("CHINATOWN DEMOGRAPHICS - ACS5 DATA COLLECTION")
print("="*60)

for city, addr in chinatowns.items():
    print(f"\n📍 Processing {city}...")
    geoid, state_fips, county_fips = geocode_to_tract(addr)
    
    if not geoid:
        print(f"  ❌ Could not find tract for {city}")
        continue

    # Extract tract code (last 6 digits)
    tract = geoid[-6:]
    print(f"  ✓ GEOID: {geoid}")
    print(f"    State: {state_fips}, County: {county_fips}, Tract: {tract}")

    for y in years:
        tract_for_year = tract  # default to current tract

        # If city is in crosswalk list and year is 2010 or 2015 → use 2010 tract
        if city in CROSSWALK_CITIES and y in [2010, 2015]:
            old_geoid = get_2010_geoid_from_2020(geoid)
            if old_geoid:
                tract_for_year = old_geoid[-6:]
                print(f"    ↳ Using 2010 tract {old_geoid} for {y}")
            else:
                print(f"    ⚠ No 2010 tract found in crosswalk for {geoid}")

        pop = fetch_acs5_population(y, state_fips, county_fips, tract_for_year)
        if pop:
            pop["city"] = city
            pop["tract_geoid"] = geoid
            results.append(pop)
            chinese_str = f", Chinese={pop['chinese_pop']}" if pop['chinese_pop'] is not None else ""
            print(f"    ✓ {y}: Pop={pop['total_pop']:,}, Asian={pop['asian_pop']:,}{chinese_str}")
        time.sleep(0.3)  # Rate limiting

print("\n" + "="*60)
print("COLLECTED DATA")
print("="*60)

df = pd.DataFrame(results)
if not df.empty:
    # Calculate percentages
    df['asian_pct'] = (df['asian_pop'] / df['total_pop'] * 100).round(1)
    df['chinese_pct'] = df.apply(
        lambda row: round((row['chinese_pop'] / row['total_pop'] * 100), 1) if pd.notnull(row['chinese_pop']) and row['total_pop'] > 0 else None,
        axis=1
    )
    
    # Display full results
    display_cols = ['city', 'year', 'total_pop', 'asian_pop', 'chinese_pop', 'asian_pct', 'chinese_pct', 'tract_geoid']
    print(df[display_cols].to_string(index=False))
    
    print("\n" + "="*60)
    print("SUMMARY BY CITY (Average 2010-2023)")
    print("="*60)
    summary = df.groupby('city').agg({
        'total_pop': 'mean',
        'asian_pop': 'mean',
        'asian_pct': 'mean'
    }).round(0)
    summary.columns = ['Avg Total Pop', 'Avg Asian Pop', 'Avg Asian %']
    print(summary.sort_values('Avg Asian %', ascending=False).to_string())
    
    print("\n" + "="*60)
    print("TRENDS (2010 vs 2023)")
    print("="*60)
    
    # Get 2010 and 2023 data
    df_2010 = df[df['year'] == 2010].set_index('city')
    df_2023 = df[df['year'] == 2023].set_index('city')
    
    if not df_2010.empty and not df_2023.empty:
        trends = pd.DataFrame({
            '2010 Asian %': df_2010['asian_pct'],
            '2023 Asian %': df_2023['asian_pct'],
            'Change': df_2023['asian_pct'] - df_2010['asian_pct']
        }).round(1)
        print(trends.sort_values('Change', ascending=False).to_string())
    
    # Save to CSV
    df.to_csv('chinatown_demographics.csv', index=False)
    print("\n✅ Data saved to chinatown_demographics.csv")
    print(f"Total records collected: {len(df)}")
else:
    print("❌ No data collected.")

CHINATOWN DEMOGRAPHICS - ACS5 DATA COLLECTION

📍 Processing Boston...
    Request error (attempt 1/3): HTTPSConnectionPool(host='geocoding.geo.census.gov', port=443): Max retries exceeded with url: /geocoder/geographies/onelineaddress?address=88+Beach+Street%2C+Boston%2C+MA+02111&benchmark=Public_AR_Current&vintage=Current_Current&format=json (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:992)')))
  ✓ GEOID: 25025070202
    State: 25, County: 025, Tract: 070202
    ↳ Using 2010 tract 25025070101 for 2010
    Note: Chinese detail not available for 2010, using Asian only
    ✓ 2010: Pop=4,694, Asian=1,516
    ↳ Using 2010 tract 25025070101 for 2015
    ✓ 2015: Pop=5,668, Asian=1,301, Chinese=0
    ✓ 2020: Pop=3,727, Asian=2,638, Chinese=0
    ✓ 2023: Pop=4,136, Asian=2,419, Chinese=0

📍 Processing New York...
  ✓ GEOID: 36061002902
    State: 36, County: 061, Tract: 002902
    ↳ Using 2010 tract 36061002900 for 2010
    Note: Chinese detail not availabl