In [None]:
import pandas as pd
from sodapy import Socrata

# Anonymous client for NYC Open Data: passing None as the app token works for
# public data; a registered token only buys higher rate limits.
client = Socrata("data.cityofnewyork.us", None)

# Socrata identifier of the DOB Job Application Filings dataset,
# which is the main permit dataset used in this notebook.
dataset_id = "ic3t-wcy2"

# Fetch just ten rows so the available fields can be inspected cheaply.
sample = client.get(dataset_id, limit=10)
sample_df = pd.DataFrame.from_records(sample)

# Each print emits a heading line followed by the object on the next line.
print("Columns available:", sample_df.columns.tolist(), sep="\n")
print("\nSample data:", sample_df.head(), sep="\n")

In [None]:
import pandas as pd
from sodapy import Socrata

# DOB Job Application Filings on NYC Open Data, queried without an app token.
dataset_id = "ic3t-wcy2"
client = Socrata("data.cityofnewyork.us", None)

print("Pulling permit data... this may take a few minutes")

# Server-side filter only on job type, job status and a non-null census tract;
# date handling is deliberately left to pandas in a later step.
permits = client.get(
    dataset_id,
    limit=500000,
    where=" AND ".join([
        "job_type IN ('A1', 'A2', 'A3', 'NB')",
        "job_status IN ('R', 'D', 'X', 'C')",
        "gis_census_tract IS NOT NULL",
    ]),
)

# One row per returned record.
permits_df = pd.DataFrame.from_records(permits)

print("Records pulled:", len(permits_df))
print("\nColumns:", permits_df.columns.tolist())


In [None]:
# Parse the raw filing date (source column has a double underscore) into a new
# datetime column; values that cannot be parsed become NaT.
permits_df['pre_filing_date'] = pd.to_datetime(permits_df['pre__filing_date'], errors='coerce')

# Keep filings from 2009 through 2023 using a half-open window
# [2009-01-01, 2024-01-01). Comparing with `<= '2023-12-31'` would cut off at
# midnight and drop any Dec 31 value that carries a time of day. NaT rows fail
# both comparisons and are dropped. `.copy()` makes the result an independent
# frame so later column assignments do not act on a slice.
permits_df = permits_df[
    (permits_df['pre_filing_date'] >= '2009-01-01') &
    (permits_df['pre_filing_date'] < '2024-01-01')
].copy()

print(f"After date filtering: {len(permits_df)}")
print(f"\nSample data:")
print(permits_df[['gis_census_tract', 'borough', 'job_type', 'pre_filing_date']].head(10))

# Save raw data (every column, including both the original and parsed date)
permits_df.to_csv('nyc_permits_raw.csv', index=False)
print("\nRaw permit data saved to nyc_permits_raw.csv")

In [None]:
import os

# Socrata credentials are read from environment variables so that no secret is
# stored in (or committed with) the notebook. Each name is None when its
# variable is unset; public data can still be queried without a token, only at
# lower rate limits.
# NOTE(review): the literal key, secret and app token that used to live in this
# cell must be treated as exposed and rotated.
APIkey = os.environ.get("SOCRATA_API_KEY_ID")
APIsecret = os.environ.get("SOCRATA_API_KEY_SECRET")
app_token = os.environ.get("SOCRATA_APP_TOKEN")

In [None]:
import pandas as pd
from sodapy import Socrata

import os

# Read the app token from the environment instead of embedding it in the
# notebook. None is acceptable for public data (lower rate limits apply).
app_token = os.environ.get("SOCRATA_APP_TOKEN")

client = Socrata("data.cityofnewyork.us", app_token=app_token)
dataset_id = "ic3t-wcy2"

print("Pulling permit data... this may take several minutes")

# Cap on the number of rows requested in the single call below.
max_rows = 1000000

permits = client.get(
    dataset_id,
    where="job_type IN ('A1', 'A2', 'A3', 'NB') AND job_status IN ('R', 'D', 'X', 'C') AND gis_census_tract IS NOT NULL",
    limit=max_rows
)

permits_df = pd.DataFrame.from_records(permits)

print(f"Records pulled: {len(permits_df)}")
# Hitting the cap exactly means more matching rows may exist on the server;
# say so instead of silently analysing a truncated pull.
if len(permits_df) >= max_rows:
    print(f"WARNING: row cap of {max_rows} reached; the pull is probably truncated")

# Convert date column to datetime and filter to 2009-2023
# Note: double underscore in column name
permits_df['pre__filing_date'] = pd.to_datetime(permits_df['pre__filing_date'], errors='coerce')
# Half-open window [2009-01-01, 2024-01-01): unlike `<= '2023-12-31'`, this
# keeps Dec 31 values that carry a time of day. NaT rows are dropped.
permits_df = permits_df[
    (permits_df['pre__filing_date'] >= '2009-01-01') &
    (permits_df['pre__filing_date'] < '2024-01-01')
].copy()

print(f"After date filtering: {len(permits_df)}")
print(f"\nDate range: {permits_df['pre__filing_date'].min()} to {permits_df['pre__filing_date'].max()}")
print(f"\nRecords by borough:")
print(permits_df['borough'].value_counts())
print(f"\nRecords by job type:")
print(permits_df['job_type'].value_counts())

# Save raw data
permits_df.to_csv('nyc_permits_raw.csv', index=False)
print("\nRaw permit data saved to nyc_permits_raw.csv")

In [None]:
import pandas as pd
from sodapy import Socrata
import time

import os

# Read the app token from the environment instead of embedding it in the
# notebook. None is acceptable for public data (lower rate limits apply).
app_token = os.environ.get("SOCRATA_APP_TOKEN")
client = Socrata("data.cityofnewyork.us", app_token=app_token, timeout=60)  # Increased timeout
dataset_id = "ic3t-wcy2"

print("Pulling permit data in batches...")

# Pagination parameters
limit = 50000  # Records per batch
offset = 0
all_permits = []
max_retries = 3
# Set to True only when the server signals the end of the data (an empty or
# short page). Stays False if the loop stops because a batch kept failing.
pull_complete = False

while True:
    print(f"Pulling batch starting at offset {offset}...")

    # Retry each page up to max_retries times; batch stays None if all fail.
    batch = None
    for attempt in range(1, max_retries + 1):
        try:
            batch = client.get(
                dataset_id,
                where="job_type IN ('A1', 'A2', 'A3', 'NB') AND job_status IN ('R', 'D', 'X', 'C') AND gis_census_tract IS NOT NULL",
                # Offset paging needs a stable sort, otherwise pages can
                # overlap or skip rows between requests.
                order=":id",
                limit=limit,
                offset=offset
            )
            break  # Success, exit retry loop
        except Exception as e:
            # Any error is retried, not only timeouts, so report what it was.
            if attempt < max_retries:
                print(f"  Request failed ({e}), retrying... (attempt {attempt}/{max_retries})")
                time.sleep(5)  # Wait 5 seconds before retry
            else:
                print(f"  Failed after {max_retries} attempts. Error: {e}")

    if batch is None:  # Final failure: keep what we have, but mark it partial
        break

    if not batch:  # No more records
        pull_complete = True
        break

    all_permits.extend(batch)
    offset += limit

    print(f"  Got {len(batch)} records. Total so far: {len(all_permits)}")

    # Stop if we got fewer records than the limit
    if len(batch) < limit:
        pull_complete = True
        break

    # Small delay between requests to be nice to the API
    time.sleep(1)

print(f"\nTotal records pulled: {len(all_permits)}")
if not pull_complete:
    print(f"WARNING: pull stopped early at offset {offset}; all_permits is incomplete")



In [None]:
import pandas as pd

# Convert the batched API records to a DataFrame.
# (The original first line fused this statement with an import, which is a
# syntax error, and the frame was then overwritten by re-reading an older CSV.)
permits_df = pd.DataFrame.from_records(all_permits)

# Convert date and filter to 2009-2023 BEFORE aggregating or saving.
# Unparseable dates become NaT and are dropped by the comparisons. The window
# is half-open, [2009-01-01, 2024-01-01), so Dec 31 values with a time of day
# are kept.
permits_df['pre__filing_date'] = pd.to_datetime(permits_df['pre__filing_date'], errors='coerce')
permits_df = permits_df[
    (permits_df['pre__filing_date'] >= '2009-01-01') &
    (permits_df['pre__filing_date'] < '2024-01-01')
].copy()

print(f"After date filtering: {len(permits_df)}")
print(f"\nDate range: {permits_df['pre__filing_date'].min()} to {permits_df['pre__filing_date'].max()}")
print(f"\nRecords by borough:")
print(permits_df['borough'].value_counts())
print(f"\nRecords by job type:")
print(permits_df['job_type'].value_counts())

# Save raw data
permits_df.to_csv('nyc_permits_raw.csv', index=False)
print("\nRaw permit data saved to nyc_permits_raw.csv")

# Extract year (no NaT remain after the filter above)
permits_df['year'] = permits_df['pre__filing_date'].dt.year

# Aggregate by census tract, borough and year in a single groupby.
# 0/1 indicator columns are summed to count each job-type category:
#   A1 -> major alterations, A2/A3 -> minor alterations, NB -> new buildings.
# total_permits counts non-null job numbers ('job__').
permits_agg = (
    permits_df
    .assign(
        is_major=permits_df['job_type'].eq('A1').astype(int),
        is_minor=permits_df['job_type'].isin(['A2', 'A3']).astype(int),
        is_new_building=permits_df['job_type'].eq('NB').astype(int),
    )
    .groupby(['gis_census_tract', 'borough', 'year'])
    .agg(
        total_permits=('job__', 'count'),
        major_alterations=('is_major', 'sum'),
        minor_alterations=('is_minor', 'sum'),
        new_buildings=('is_new_building', 'sum'),
    )
    .reset_index()
    # Rename tract column to match ACS data
    .rename(columns={'gis_census_tract': 'geoid'})
)

print(f"Aggregated permit data shape: {permits_agg.shape}")
print(f"\nSample aggregated data:")
print(permits_agg.head(10))

# Save aggregated data
permits_agg.to_csv('nyc_permits_aggregated.csv', index=False)
print("\nAggregated permit data saved to nyc_permits_aggregated.csv")

# Show summary stats
print(f"\nYears covered: {permits_agg['year'].min()} to {permits_agg['year'].max()}")
print(f"\nPermits per year:")
print(permits_agg.groupby('year')['total_permits'].sum().sort_index())

In [None]:
import pandas as pd

# Load the permit data. Tract identifiers are read as text so that type
# inference cannot turn them into floats ('137.0') or strip leading zeros.
permits_df = pd.read_csv(
    'nyc_permits_raw.csv',
    dtype={'gis_census_tract': str},
    low_memory=False,
)

# Convert date to datetime (unparseable values become NaT)
permits_df['pre__filing_date'] = pd.to_datetime(permits_df['pre__filing_date'], errors='coerce')

# Extract year (float at this point because NaT yields NaN)
permits_df['year'] = permits_df['pre__filing_date'].dt.year

# Make sure geoid is clean (string, no nulls); missing values become 'nan'
permits_df['gis_census_tract'] = permits_df['gis_census_tract'].astype(str).str.strip()

# Filter out any rows without valid year or tract
permits_df = permits_df[
    (permits_df['year'].notna()) &
    (permits_df['gis_census_tract'] != '') &
    (permits_df['gis_census_tract'] != 'nan')
].copy()

# With the NaN years gone the column can be an integer, so group keys and the
# saved CSV show 2009 rather than 2009.0.
permits_df['year'] = permits_df['year'].astype(int)

print(f"After cleaning: {len(permits_df)} records")

# Aggregate by census tract and year: one column of counts per job type
permits_summary = permits_df.groupby(['gis_census_tract', 'borough', 'year', 'job_type']).size().unstack(fill_value=0)

# Guarantee a column for each expected job type even when the data contains no
# rows of that type; otherwise the total and the renames below would fail.
# Existing columns keep their order, missing ones are appended filled with 0.
permits_summary = permits_summary.reindex(
    columns=permits_summary.columns.union(['A1', 'A2', 'A3', 'NB'], sort=False),
    fill_value=0,
)

# Flatten and rename
permits_summary = permits_summary.reset_index()
permits_summary.columns.name = None

# Calculate totals and specific counts
permits_agg = permits_summary.copy()
permits_agg['total_permits'] = permits_agg[['A1', 'A2', 'A3', 'NB']].sum(axis=1)
permits_agg = permits_agg.rename(columns={
    'A1': 'major_alterations',
    'A2': 'minor_alterations_a2',
    'A3': 'minor_alterations_a3',
    'NB': 'new_buildings',
    'gis_census_tract': 'geoid'
})

# Combine A2 and A3 into one minor alterations column
permits_agg['minor_alterations'] = permits_agg['minor_alterations_a2'] + permits_agg['minor_alterations_a3']
permits_agg = permits_agg.drop(columns=['minor_alterations_a2', 'minor_alterations_a3'])

print(f"\nAggregated permit data shape: {permits_agg.shape}")
print(f"\nSample aggregated data:")
print(permits_agg.head(10))

# Save aggregated data
permits_agg.to_csv('nyc_permits_aggregated.csv', index=False)
print("\nAggregated permit data saved to nyc_permits_aggregated.csv")

# Show summary stats
print(f"\nYears covered: {permits_agg['year'].min()} to {permits_agg['year'].max()}")
print(f"\nTotal permits per year:")
yearly_totals = permits_agg.groupby('year')['total_permits'].sum().sort_index()
print(yearly_totals)

print(f"\nPermits by borough (all years):")
borough_totals = permits_agg.groupby('borough')['total_permits'].sum().sort_values(ascending=False)
print(borough_totals)