In [1]:
import os

# Census API key, read from the environment so the secret is never stored in
# the notebook (or in version control). Set it before starting Jupyter, e.g.
#     export CENSUS_API_KEY="<your key>"
# `key` is None when the variable is unset.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as exposed and rotated.
key = os.environ.get("CENSUS_API_KEY")

# One-time environment setup (run in a terminal or a separate cell):
#   %pip install censusdata

In [2]:
import censusdata
import pandas as pd

# Search for variables related to rent
# This helps you find the right variable codes
#censusdata.search('acs5', 2021, 'label', 'median gross rent')

# Pull median gross rent for all tracts in Kings County, NY
# State code: 36 (New York), County code: 047 (Kings/Brooklyn)
# data = censusdata.download(
#     'acs5', 2021,
#     censusdata.censusgeo([('state', '36'), ('county', '047'), ('tract', '*')]),
#     ['B25064_001E']  # Median gross rent variable
# )



In [3]:
import censusdata
import pandas as pd

# Pull median gross rent for every Kings County (Brooklyn) tract, using the
# variable code directly instead of running a variable search first.
kings_tracts = censusdata.censusgeo(
    [('state', '36'), ('county', '047'), ('tract', '*')]
)
rent_variables = ['B25064_001E']  # Median gross rent
data = censusdata.download('acs5', 2021, kings_tracts, rent_variables)

print(data.head())

                                                    B25064_001E
Census Tract 1, Kings County, New York: Summary...         2210
Census Tract 2, Kings County, New York: Summary...         1652
Census Tract 3.01, Kings County, New York: Summ...         2794
Census Tract 5.01, Kings County, New York: Summ...         2408
Census Tract 5.02, Kings County, New York: Summ...         2319


In [4]:
import censusdata
import pandas as pd

# County FIPS code (within New York State, '36') -> borough name.
nyc_counties = {
    '005': 'Bronx',
    '047': 'Brooklyn',
    '061': 'Manhattan',
    '081': 'Queens',
    '085': 'Staten Island'
}

# ACS variable code -> readable column name.
# Columns are renamed by code rather than by assigning a positional list to
# `data.columns`, so editing or reordering this mapping can never silently
# attach the wrong label to a column.
acs_variables = {
    'B25064_001E': 'median_rent',           # Median gross rent
    'B19013_001E': 'median_income',         # Median household income
    'B01003_001E': 'total_pop',             # Total population
    'B03002_003E': 'white_non_hisp',        # White alone, not Hispanic/Latino
    'B15003_022E': 'bachelors',             # Bachelor's degree
    'B15003_023E': 'masters',               # Master's degree
    'B15003_024E': 'professional',          # Professional degree
    'B15003_025E': 'doctorate',             # Doctorate degree
    'B25002_001E': 'total_housing_units',   # Total housing units
    'B25002_003E': 'vacant_housing_units',  # Vacant housing units
}

all_data = []

# One download per borough: every tract in that county, 2021 ACS 5-year data.
for county_code, borough_name in nyc_counties.items():
    print(f"Pulling data for {borough_name}...")

    borough_data = censusdata.download(
        'acs5', 2021,
        censusdata.censusgeo([('state', '36'), ('county', county_code), ('tract', '*')]),
        list(acs_variables),
    )

    borough_data['borough'] = borough_name
    all_data.append(borough_data)

# Combine all boroughs and switch to readable column names.
data = pd.concat(all_data).rename(columns=acs_variables)

# Fail loudly if the download did not return every requested variable.
assert set(acs_variables.values()) <= set(data.columns), "missing expected ACS columns"

# NOTE: the frame still holds the raw missing-value sentinel (-666666666 shows
# up in median_rent / median_income); filter it out before any analysis.

Pulling data for Bronx...
Pulling data for Brooklyn...
Pulling data for Manhattan...
Pulling data for Queens...
Pulling data for Staten Island...


In [5]:
# Build the full tract GEOID: state + county + tract codes concatenated.
# Each index entry is a censusgeo object whose `.geo` holds the (level, code)
# pairs in the order requested by the download: state, county, tract.
# Taking only `geo[2][1]` would keep just the tract code, which repeats across
# counties, so it cannot identify a tract in a frame that mixes all five
# boroughs.
data['geoid'] = data.index.map(
    lambda tract: ''.join(code for _level, code in tract.geo)
)

print(data.head())
print(f"\nShape: {data.shape}")
print(f"\nRecords per borough:\n{data['borough'].value_counts()}")

                                                    median_rent  \
Census Tract 1, Bronx County, New York: Summary...   -666666666   
Census Tract 2, Bronx County, New York: Summary...         1740   
Census Tract 4, Bronx County, New York: Summary...         1534   
Census Tract 16, Bronx County, New York: Summar...         1065   
Census Tract 19.01, Bronx County, New York: Sum...         1425   

                                                    median_income  total_pop  \
Census Tract 1, Bronx County, New York: Summary...     -666666666       6661   
Census Tract 2, Bronx County, New York: Summary...          70867       4453   
Census Tract 4, Bronx County, New York: Summary...          98090       6000   
Census Tract 16, Bronx County, New York: Summar...          40033       6038   
Census Tract 19.01, Bronx County, New York: Sum...          55924       2168   

                                                    white_non_hisp  bachelors  \
Census Tract 1, Bronx County, New Y

In [6]:
# List the distinct values that appear among the negative median rents
negative_rent = data.loc[data['median_rent'] < 0, 'median_rent']
print(negative_rent.unique())



[-666666666]


In [7]:
# Keep only tracts reporting a positive median rent.
# -666666666 is assumed to be the missing-data code, so any non-positive
# value is treated as missing and dropped.
has_valid_rent = data['median_rent'] > 0
data_clean = data.loc[has_valid_rent].copy()

print(f"Original shape: {data.shape}")
print(f"After filtering: {data_clean.shape}")
print(f"Rows removed: {data.shape[0] - data_clean.shape[0]}")

Original shape: (2327, 12)
After filtering: (2186, 12)
Rows removed: 141


In [8]:
# Keep a tract only when every key variable holds a positive value
key_columns = ['median_rent', 'median_income', 'total_pop']
all_positive = data[key_columns].gt(0).all(axis=1)
data_clean = data[all_positive].copy()

In [9]:
import censusdata
import pandas as pd

# County FIPS code (within New York State, '36') -> borough name.
nyc_counties = {
    '005': 'Bronx',
    '047': 'Brooklyn',
    '061': 'Manhattan',
    '081': 'Queens',
    '085': 'Staten Island'
}

# ACS 5-year ('acs5') release years to pull.
# NOTE(review): each acs5 release pools five survey years, so adjacent picks
# still share at least one year and 2021/2023 share three — confirm this
# overlap is acceptable for trend comparisons.
# NOTE(review): tract boundaries are redrawn after each decennial census, so
# the same geoid may not describe the same area in every year — confirm before
# treating the output as a tract-level panel.
years = [2009, 2013, 2017, 2021, 2023]

# ACS variable code -> readable column name.
# Columns are renamed by code rather than by assigning a positional list to
# `data.columns`, so editing or reordering this mapping can never silently
# attach the wrong label to a column.
acs_variables = {
    'B25064_001E': 'median_rent',           # Median gross rent
    'B19013_001E': 'median_income',         # Median household income
    'B01003_001E': 'total_pop',             # Total population
    'B03002_003E': 'white_non_hisp',        # White alone, not Hispanic/Latino
    'B25002_001E': 'total_housing_units',   # Total housing units
    'B25002_003E': 'vacant_housing_units',  # Vacant housing units
}

all_data = []

# One download per (year, borough): every tract in that county.
for year in years:
    print(f"\n=== Pulling {year} ===")

    for county_code, borough_name in nyc_counties.items():
        print(f"  {borough_name}...")

        tract_data = censusdata.download(
            'acs5', year,
            censusdata.censusgeo([('state', '36'), ('county', county_code), ('tract', '*')]),
            list(acs_variables),
        )

        tract_data['borough'] = borough_name
        tract_data['year'] = year
        all_data.append(tract_data)

# Combine everything and switch to readable column names.
data = pd.concat(all_data).rename(columns=acs_variables)

# Fail loudly if the download did not return every requested variable.
assert set(acs_variables.values()) <= set(data.columns), "missing expected ACS columns"

# Build the full tract GEOID: state + county + tract codes concatenated.
# Each index entry is a censusgeo object whose `.geo` holds the (level, code)
# pairs in the order requested above: state, county, tract. The tract code
# alone repeats across counties, and because the CSV below is written without
# the index, this column is the only tract identifier that gets saved.
data['geoid'] = data.index.map(
    lambda tract: ''.join(code for _level, code in tract.geo)
)

print(f"\nOriginal shape: {data.shape}")

# Filter out invalid/missing data: missing estimates arrive as a negative
# sentinel, so require a positive value in every key variable.
data_clean = data[
    (data['median_rent'] > 0) &
    (data['median_income'] > 0) &
    (data['total_pop'] > 0) &
    (data['total_housing_units'] > 0)
].copy()

print(f"After cleaning: {data_clean.shape}")
print(f"Rows removed: {data.shape[0] - data_clean.shape[0]}")
print(f"\nRecords per year:\n{data_clean['year'].value_counts().sort_index()}")
print(f"\nRecords per borough:\n{data_clean['borough'].value_counts()}")

# Save to CSV (path is relative to the kernel's working directory; the
# censusgeo index is intentionally dropped because geoid/borough/year carry
# the identifying information).
data_clean.to_csv('nyc_acs_data_2009_2023.csv', index=False)
print("\nData saved to nyc_acs_data_2009_2023.csv")


=== Pulling 2009 ===
  Bronx...
  Brooklyn...
  Manhattan...
  Queens...
  Staten Island...

=== Pulling 2013 ===
  Bronx...
  Brooklyn...
  Manhattan...
  Queens...
  Staten Island...

=== Pulling 2017 ===
  Bronx...
  Brooklyn...
  Manhattan...
  Queens...
  Staten Island...

=== Pulling 2021 ===
  Bronx...
  Brooklyn...
  Manhattan...
  Queens...
  Staten Island...

=== Pulling 2023 ===
  Bronx...
  Brooklyn...
  Manhattan...
  Queens...
  Staten Island...

Original shape: (11205, 9)
After cleaning: (10622, 9)
Rows removed: 583

Records per year:
year
2009    2105
2013    2097
2017    2084
2021    2168
2023    2168
Name: count, dtype: int64

Records per borough:
borough
Brooklyn         3776
Queens           3211
Bronx            1660
Manhattan        1431
Staten Island     544
Name: count, dtype: int64

Data saved to nyc_acs_data_2009_2023.csv


In [10]:
import os
# Show the kernel's current working directory, i.e. the folder against which
# relative paths such as 'nyc_acs_data_2009_2023.csv' are resolved.
os.getcwd()

'/Users/michaelcarter/nycGentrification'