In [2]:
import requests
import pandas as pd
import json
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError
import geopandas as gpd
from io import BytesIO
import os
from pathlib import Path
from dotenv import load_dotenv
import zipfile


load_dotenv()
# Base URL for the ArcGIS services
base_url = "https://services.arcgis.com/OjftlhRHkAABcyiF/arcgis/rest/services"

datasets = [
    # Current Parcel and Property Data
    'parcel_2023_class_code',
    'Parcel_dims_2024',
    'structures_2023',
    'Parcel',  # Main parcel dataset
    'Parcel_Civic',  # Civic property information
    
    # Sales and Transaction Data
    '2023_St_Joseph_County_Sales_20240301',
    'Parcel_Trans_History_gdb',
    
    # Tax Related
    'tax_district_dashboard',
    'county_tax_districts',
    'SJC_TIF_Districts_Summary',
    
    # Land Use and Zoning
    'Zoning',
    'Zoning_Boundaries',
    
    # Additional Context
    'parcel_boundaries',
    'South_Bend',
    '2024_SB_Desired_Properties',
     'SB_City_Landmark_Data',
      'smallstructures_gdb',
       'structures_2023',
       'Parcel_dims_2024',
       'MISH_Parcel'# City boundary/information
]
# Function to get field information for a dataset
def get_fields(dataset_name):
    url = f"{base_url}/{dataset_name}/FeatureServer/0"
    params = {'f': 'json'}
    
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        if 'fields' in data:
            return [field['name'] for field in data['fields']]
    return None

# Get and print fields for each dataset
for dataset in datasets:
    print(f"\nFields for {dataset}:")
    fields = get_fields(dataset)
    if fields:
        print(json.dumps(fields, indent=2))
    else:
        print("Could not fetch fields")


Fields for parcel_2023_class_code:
[
  "FID",
  "PARCELID",
  "PARCELSTAT",
  "NAME_1",
  "REALLANDVA",
  "REALIMPROV",
  "CLASSCODE",
  "PARCPRC",
  "PROVALWBLK",
  "PROPTYPE",
  "YEAR_BUILT",
  "Shape__Area",
  "Shape__Length"
]

Fields for Parcel_dims_2024:
[
  "FID",
  "TextStri_1",
  "Angle_1",
  "ORIG_FID"
]

Fields for structures_2023:
[
  "FID",
  "OBJECTID",
  "PARCELID",
  "SHAPE_Leng",
  "SHAPE_Area",
  "Shape__Area",
  "Shape__Length"
]

Fields for Parcel:
[
  "OBJECTID",
  "PARCELID",
  "PARCELSTAT",
  "NAME1",
  "ACREAGE",
  "PROP_ADDR",
  "PROP_CITY",
  "PROP_STATE",
  "PROP_ZIP",
  "PARCPRC",
  "PROVALWBLK",
  "PAYTAXURL",
  "TAXINFOURL",
  "DATEUPDATE",
  "GISWEBLK",
  "SHAPE__Area",
  "SHAPE__Length"
]

Fields for Parcel_Civic:
[
  "FID",
  "PARCELID",
  "TAXTYPE",
  "TAXUNITUID",
  "PARCELSTAT",
  "LEGALDESCR",
  "PAYYEAR",
  "NAME_1",
  "MAILINGADD",
  "MAILINGA_1",
  "MAILINGCIT",
  "MAILINGSTA",
  "MAILINGZIP",
  "TIFAREAUID",
  "TIFAREASSE",
  "REALLANDVA",
  "R

In [5]:
folder_name = "raw_arcgis_data"
connection_string = os.getenv('AZURE_CONNECTION_STRING')
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_name = "southbend-project"
# Create container if it doesn't exist
try:
    container_client = blob_service_client.create_container(container_name)
    print(f"Created container: {container_name}")
except ResourceExistsError:
    container_client = blob_service_client.get_container_client(container_name)
    print(f"Container {container_name} already exists")

def get_feature_data(dataset_name, layer_id=0):
    """
    Get all features from a service using pagination
    """
    url = f"{base_url}/{dataset_name}/FeatureServer/{layer_id}/query"
    
    # First, get the count of all features
    count_params = {
        'f': 'json',
        'where': '1=1',
        'returnCountOnly': 'true'
    }
    
    try:
        count_response = requests.get(url, params=count_params)
        count_response.raise_for_status()
        total_records = count_response.json().get('count', 0)
        
        print(f"Total records in {dataset_name}: {total_records}")
        
        # Now fetch the actual data in chunks
        all_features = []
        offset = 0
        chunk_size = 2000  # ArcGIS typically limits to 2000 records per request
        
        while offset < total_records:
            params = {
                'f': 'json',
                'where': '1=1',
                'outFields': '*',
                'returnGeometry': 'false',
                'resultOffset': offset,
                'resultRecordCount': chunk_size
            }
            
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
            
            if 'features' in data:
                features = [feature['attributes'] for feature in data['features']]
                all_features.extend(features)
                
                print(f"Fetched records {offset} to {offset + len(features)} for {dataset_name}")
                
                if len(features) < chunk_size:
                    break
                    
                offset += chunk_size
            else:
                print(f"No features found in response for {dataset_name}")
                break
                
        return pd.DataFrame(all_features)
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {dataset_name}: {e}")
        return None

def save_to_azure(df, dataset_name):
    """
    Save DataFrame to Azure Blob Storage
    """
    if df is None or df.empty:
        print(f"No data to save for {dataset_name}")
        return
        
    # Create CSV in memory
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)
    
    # Generate blob name with timestamp
    timestamp = datetime.now().strftime("%Y%m%d")
    blob_name = f"{folder_name}/{dataset_name}_{timestamp}.csv"
    
    # Upload to Azure
    try:
        blob_client = container_client.get_blob_client(blob_name)
        blob_client.upload_blob(csv_buffer, overwrite=True)
        print(f"Successfully uploaded {dataset_name} to Azure")
        
        # Save data dictionary separately
        data_dict = {
            'columns': list(df.columns),
            'dtypes': df.dtypes.astype(str).to_dict(),
            'record_count': len(df),
            'timestamp': timestamp
        }
        
        dict_blob_name = f"{folder_name}/{dataset_name}_{timestamp}_dictionary.json"
        dict_blob_client = container_client.get_blob_client(dict_blob_name)
        dict_blob_client.upload_blob(json.dumps(data_dict, indent=2), overwrite=True)
        
    except Exception as e:
        print(f"Error saving {dataset_name} to Azure: {e}")

# Process each dataset
for dataset in datasets:
    print(f"\nProcessing {dataset}...")
    try:
        # Fetch data
        df = get_feature_data(dataset)
        
        if df is not None:
            print(f"Retrieved {len(df)} records for {dataset}")
            
            # Save to Azure
            save_to_azure(df, dataset)
        else:
            print(f"No data retrieved for {dataset}")
            
    except Exception as e:
        print(f"Error processing {dataset}: {e}")
        continue

print("\nData pipeline complete!")

Created container: southbend-project

Processing parcel_2023_class_code...
Total records in parcel_2023_class_code: 119980
Fetched records 0 to 2000 for parcel_2023_class_code
Fetched records 2000 to 4000 for parcel_2023_class_code
Fetched records 4000 to 6000 for parcel_2023_class_code
Fetched records 6000 to 8000 for parcel_2023_class_code
Fetched records 8000 to 10000 for parcel_2023_class_code
Fetched records 10000 to 12000 for parcel_2023_class_code
Fetched records 12000 to 14000 for parcel_2023_class_code
Fetched records 14000 to 16000 for parcel_2023_class_code
Fetched records 16000 to 18000 for parcel_2023_class_code
Fetched records 18000 to 20000 for parcel_2023_class_code
Fetched records 20000 to 22000 for parcel_2023_class_code


KeyboardInterrupt: 

In [11]:
# Fetch the main parcel dataset with tax info
parcel_civic_df = get_feature_data('Parcel_Civic')

# Basic data cleaning
df = parcel_civic_df.copy()

# Restrict to South Bend properties
df = df[df['PROP_CITY'].str.upper().str.contains('SOUTH BEND', na=False)]

# Convert values to numeric, handling any non-numeric values
df['REALLANDVA'] = pd.to_numeric(df['REALLANDVA'], errors='coerce')
df['REALIMPROV'] = pd.to_numeric(df['REALIMPROV'], errors='coerce')

# Fill NaN values with 0
df['REALLANDVA'] = df['REALLANDVA'].fillna(0)
df['REALIMPROV'] = df['REALIMPROV'].fillna(0)

# Calculate total value
df['TOTAL_VALUE'] = df['REALLANDVA'] + df['REALIMPROV']

# Current system analysis with $5 millage rate
millage_rate = 0.005  # $5 per $1000 of value

# Calculate current tax
df['CURRENT_TAX'] = df['TOTAL_VALUE'] * millage_rate
current_revenue = df['CURRENT_TAX'].sum()

print(f"Total number of properties: {len(df):,}")
print(f"Current annual revenue with ${millage_rate*1000}/1000 millage rate: ${current_revenue:,.2f}")
print(f"Total land value: ${df['REALLANDVA'].sum():,.2f}")
print(f"Total improvement value: ${df['REALIMPROV'].sum():,.2f}")

# Now calculate split-rate tax (4:1 ratio)
# We need to solve for x where:
# x*4*total_land_value + x*total_improvement_value = current_revenue
# where x is the base rate for improvements

total_land_value = df['REALLANDVA'].sum()
total_improvement_value = df['REALIMPROV'].sum()

# Solve for base rate
base_rate = current_revenue / (4 * total_land_value + total_improvement_value)
land_rate = base_rate * 4

# Calculate new taxes
df['NEW_LAND_TAX'] = df['REALLANDVA'] * land_rate
df['NEW_IMPROVEMENT_TAX'] = df['REALIMPROV'] * base_rate
df['NEW_TOTAL_TAX'] = df['NEW_LAND_TAX'] + df['NEW_IMPROVEMENT_TAX']

# Calculate change in tax burden
df['TAX_CHANGE'] = df['NEW_TOTAL_TAX'] - df['CURRENT_TAX']
df['TAX_CHANGE_PCT'] = (df['TAX_CHANGE'] / df['CURRENT_TAX']) * 100

print(f"\nSplit Rate Analysis:")
print(f"Land tax rate: ${land_rate*1000:.2f}/1000")
print(f"Improvement tax rate: ${base_rate*1000:.2f}/1000")
print(f"New total revenue: ${df['NEW_TOTAL_TAX'].sum():,.2f}")

print("\nImpact Analysis:")
print(f"Properties with tax increase: {(df['TAX_CHANGE'] > 0).sum():,}")
print(f"Properties with tax decrease: {(df['TAX_CHANGE'] < 0).sum():,}")
print(f"Median tax change: ${df['TAX_CHANGE'].median():,.2f}")
print(f"Mean tax change: ${df['TAX_CHANGE'].mean():,.2f}")

# Display distribution of changes
print("\nDistribution of Tax Changes:")
print(df['TAX_CHANGE_PCT'].describe())

# Show some example properties
print("\nExample Properties (sorted by largest tax changes):")
example_cols = ['PROP_ADDR', 'REALLANDVA', 'REALIMPROV', 'CURRENT_TAX', 'NEW_TOTAL_TAX', 'TAX_CHANGE', 'TAX_CHANGE_PCT']
print(df[example_cols].nlargest(5, 'TAX_CHANGE_PCT'))

Total records in Parcel_Civic: 119679
Fetched records 0 to 2000 for Parcel_Civic
Fetched records 2000 to 4000 for Parcel_Civic
Fetched records 4000 to 6000 for Parcel_Civic
Fetched records 6000 to 8000 for Parcel_Civic
Fetched records 8000 to 10000 for Parcel_Civic
Fetched records 10000 to 12000 for Parcel_Civic
Fetched records 12000 to 14000 for Parcel_Civic
Fetched records 14000 to 16000 for Parcel_Civic
Fetched records 16000 to 18000 for Parcel_Civic
Fetched records 18000 to 20000 for Parcel_Civic
Fetched records 20000 to 22000 for Parcel_Civic
Fetched records 22000 to 24000 for Parcel_Civic
Fetched records 24000 to 26000 for Parcel_Civic
Fetched records 26000 to 28000 for Parcel_Civic
Fetched records 28000 to 30000 for Parcel_Civic
Fetched records 30000 to 32000 for Parcel_Civic
Fetched records 32000 to 34000 for Parcel_Civic
Fetched records 34000 to 36000 for Parcel_Civic
Fetched records 36000 to 38000 for Parcel_Civic
Fetched records 38000 to 40000 for Parcel_Civic
Fetched record

In [12]:
# For each column, show top 10 most common values and their counts
columns_to_analyze = ['CLASSCODE', 'TOWNSHIP', 'TAXDIST', 'Neighborho', 'PROPTYPE', 
                     'TAXTYPE', 'TIFAREAUID', 'LEGALDESCR']

for col in columns_to_analyze:
    print(f"\nTop 10 values for {col}:")
    value_counts = df[col].value_counts().head(10)
    print(value_counts)
    print(f"Total unique values: {df[col].nunique()}")
    print("-" * 50)

# Let's also look at some basic statistics about these groups
print("\nMedian tax changes by various groupings:")

for col in ['CLASSCODE', 'TOWNSHIP', 'TAXDIST', 'PROPTYPE']:
    print(f"\nMedian tax change by {col}:")
    median_changes = df.groupby(col)['TAX_CHANGE'].agg([
        'count',
        'median',
        lambda x: (x > 0).mean() * 100  # Percentage with increase
    ]).round(2)
    median_changes.columns = ['Count', 'Median Change ($)', '% With Increase']
    print(median_changes.sort_values('Count', ascending=False).head(10))


Top 10 values for CLASSCODE:
CLASSCODE
510    48695
500     3999
640     1040
550     1033
551      839
511      753
520      645
429      428
599      401
456      334
Name: count, dtype: int64
Total unique values: 120
--------------------------------------------------

Top 10 values for TOWNSHIP:
TOWNSHIP
Portage    39041
Clay        9197
Centre      6455
German      3650
Warren      3463
Greene      1125
Penn         764
             129
Madison       89
Union         35
Name: count, dtype: int64
Total unique values: 11
--------------------------------------------------

Top 10 values for TAXDIST:
TAXDIST
SB Portage    37414
Clay           7813
SB Centre      3857
Warren         3455
Centre         2598
German         1956
SB German      1694
Portage        1627
Greene         1125
SB Clay         853
Name: count, dtype: int64
Total unique values: 21
--------------------------------------------------

Top 10 values for Neighborho:
Neighborho
7126296    1824
7126222    1055
7126282 

In [13]:
# Create a summary DataFrame grouped by PROPTYPE
proptype_analysis = df.groupby('PROPTYPE').agg({
    'TAX_CHANGE_PCT': 'mean',  # Average percentage change
    'TAX_CHANGE': 'median',    # Median dollar change
    'PARCELID': 'count'        # Count of properties
}).round(2)

# Add percentage that increase
proptype_increases = df.groupby('PROPTYPE').agg({
    'TAX_CHANGE': lambda x: (x > 0).mean() * 100  # Percentage with increase
}).round(2)

proptype_analysis['Percent_Increased'] = proptype_increases['TAX_CHANGE']

# Rename columns for clarity
proptype_analysis.columns = [
    'Avg_Pct_Change',
    'Median_Dollar_Change',
    'Property_Count',
    'Pct_Properties_Increased'
]

# Sort by count of properties (descending)
proptype_analysis = proptype_analysis.sort_values('Property_Count', ascending=False)

# Print results
print("Analysis by Property Type:\n")
print("Note: All monetary values in dollars, percentages shown as %\n")
print(proptype_analysis.to_string())

# Print some summary statistics
print("\nOverall Summary:")
print(f"Total properties analyzed: {proptype_analysis['Property_Count'].sum():,}")
print(f"Overall median dollar change: ${df['TAX_CHANGE'].median():,.2f}")
print(f"Overall average percent change: {df['TAX_CHANGE_PCT'].mean():.2f}%")
print(f"Overall percent of properties with increase: {(df['TAX_CHANGE'] > 0).mean()*100:.2f}%")

Analysis by Property Type:

Note: All monetary values in dollars, percentages shown as %

                                                                                            Avg_Pct_Change  Median_Dollar_Change  Property_Count  Pct_Properties_Increased
PROPTYPE                                                                                                                                                                  
1 Family Dwell - Platted Lot                                                                         -1.40                -40.55           48695                     40.03
Vacant - Platted Lot                                                                                170.91                 31.89            3999                     97.55
Exempt, Municipality                                                                                 30.96                  0.00            1040                      0.10
Condominium Unit - Platted Lot                         

In [14]:
import numpy as np
from datetime import datetime

# Add per-area calculations
# Convert Shape__Area to acres (it's typically in square feet)
df['ACRES'] = df['ACREAGE'].fillna(0)

# Calculate price per acre (handling zero acres to avoid division by zero)
df['LAND_PRICE_PER_ACRE'] = np.where(
    df['ACRES'] > 0,
    df['REALLANDVA'] / df['ACRES'],
    np.nan
)

# Calculate land percentage of total value
df['LAND_PCT_OF_TOTAL'] = np.where(
    df['TOTAL_VALUE'] > 0,
    (df['REALLANDVA'] / df['TOTAL_VALUE']) * 100,
    np.nan
)

# Create final dataframe with all metrics
processed_df = df[[
    'PARCELID', 
    'PROP_ADDR',
    'PROPTYPE',
    'ACRES',
    'REALLANDVA',
    'REALIMPROV',
    'TOTAL_VALUE',
    'LAND_PRICE_PER_ACRE',
    'LAND_PCT_OF_TOTAL',
    'CURRENT_TAX',
    'NEW_LAND_TAX',
    'NEW_IMPROVEMENT_TAX',
    'NEW_TOTAL_TAX',
    'TAX_CHANGE',
    'TAX_CHANGE_PCT'
]].copy()

# Save to Azure in a new folder
processed_folder = "processed_tax_analysis"
timestamp = datetime.now().strftime("%Y%m%d")
blob_name = f"{processed_folder}/tax_analysis_metrics_{timestamp}.csv"

# Create CSV in memory
csv_buffer = BytesIO()
processed_df.to_csv(csv_buffer, index=False)
csv_buffer.seek(0)

# Upload to Azure
try:
    blob_client = container_client.get_blob_client(blob_name)
    blob_client.upload_blob(csv_buffer, overwrite=True)
    print(f"Successfully uploaded processed data to Azure as {blob_name}")
    
    # Save data dictionary
    data_dict = {
        'columns': list(processed_df.columns),
        'dtypes': processed_df.dtypes.astype(str).to_dict(),
        'record_count': len(processed_df),
        'timestamp': timestamp,
        'metrics_description': {
            'LAND_PRICE_PER_ACRE': 'Land value divided by acreage',
            'LAND_PCT_OF_TOTAL': 'Land value as percentage of total property value',
            'TAX_CHANGE': 'Absolute change in tax under new system',
            'TAX_CHANGE_PCT': 'Percentage change in tax under new system'
        }
    }
    
    dict_blob_name = f"{processed_folder}/tax_analysis_metrics_{timestamp}_dictionary.json"
    dict_blob_client = container_client.get_blob_client(dict_blob_name)
    dict_blob_client.upload_blob(json.dumps(data_dict, indent=2), overwrite=True)
    
    # Print summary statistics
    print("\nSummary Statistics:")
    print(f"Mean land price per acre: ${processed_df['LAND_PRICE_PER_ACRE'].mean():,.2f}")
    print(f"Median land price per acre: ${processed_df['LAND_PRICE_PER_ACRE'].median():,.2f}")
    print(f"Mean land % of total value: {processed_df['LAND_PCT_OF_TOTAL'].mean():.1f}%")
    print(f"Records processed: {len(processed_df):,}")
    
except Exception as e:
    print(f"Error saving to Azure: {e}")

Successfully uploaded processed data to Azure as processed_tax_analysis/tax_analysis_metrics_20250225.csv

Summary Statistics:
Mean land price per acre: $100,181.14
Median land price per acre: $61,923.08
Mean land % of total value: 23.1%
Records processed: 63,949


In [6]:
# First get the boundaries data
boundaries_df = get_feature_data('parcel_boundaries')
print("\nBoundaries Data Stats:")
print(f"Number of records: {len(boundaries_df) if boundaries_df is not None else 0}")
print(f"Number of unique PARCELIDs: {boundaries_df['PARCELID'].nunique() if boundaries_df is not None else 0}")

# Print info about our main dataframe
print("\nMain Analysis Data Stats (before merge):")
print(f"Number of records: {len(df)}")
print(f"Number of unique PARCELIDs: {df['PARCELID'].nunique()}")

# Merge with boundaries
merged_df = df.merge(
    boundaries_df[['PARCELID', 'Shape__Area', 'Shape__Length']],
    on='PARCELID',
    how='left',
    suffixes=('', '_boundary')
)

# Print merge results
print("\nMerged Data Stats:")
print(f"Number of records after merge: {len(merged_df)}")
print(f"Number of records with shape data: {merged_df['Shape__Area'].notna().sum()}")
print(f"Percentage of records with shape data: {(merged_df['Shape__Area'].notna().sum() / len(merged_df)) * 100:.2f}%")

# Check for any records that didn't get shape data
print("\nSample of records without shape data (if any):")
missing_shape = merged_df[merged_df['Shape__Area'].isna()][['PARCELID', 'PROP_ADDR', 'PROPTYPE']].head()
print(missing_shape)

# Check distribution of data to ensure it looks reasonable
print("\nShape Area Statistics:")
print(merged_df['Shape__Area'].describe())

Total records in parcel_boundaries: 119651
Fetched records 0 to 2000 for parcel_boundaries
Fetched records 2000 to 4000 for parcel_boundaries
Fetched records 4000 to 6000 for parcel_boundaries
Fetched records 6000 to 8000 for parcel_boundaries
Fetched records 8000 to 10000 for parcel_boundaries
Fetched records 10000 to 12000 for parcel_boundaries
Fetched records 12000 to 14000 for parcel_boundaries
Fetched records 14000 to 16000 for parcel_boundaries
Fetched records 16000 to 18000 for parcel_boundaries
Fetched records 18000 to 20000 for parcel_boundaries
Fetched records 20000 to 22000 for parcel_boundaries
Fetched records 22000 to 24000 for parcel_boundaries
Fetched records 24000 to 26000 for parcel_boundaries
Fetched records 26000 to 28000 for parcel_boundaries
Fetched records 28000 to 30000 for parcel_boundaries
Fetched records 30000 to 32000 for parcel_boundaries
Fetched records 32000 to 34000 for parcel_boundaries
Fetched records 34000 to 36000 for parcel_boundaries
Fetched records

NameError: name 'df' is not defined

In [3]:
from shapely.geometry import Polygon, Point, LineString

def get_feature_data_with_geometry(dataset_name, layer_id=0):
    """Get all features from a service including geometry with pagination"""
    url = f"{base_url}/{dataset_name}/FeatureServer/{layer_id}/query"
    
    # First, get the count of all features
    count_params = {
        'f': 'json',
        'where': '1=1',
        'returnCountOnly': 'true'
    }
    
    try:
        count_response = requests.get(url, params=count_params)
        count_response.raise_for_status()
        total_records = count_response.json().get('count', 0)
        
        print(f"Total records in {dataset_name}: {total_records}")
        
        # Now fetch the actual data in chunks
        all_features = []
        offset = 0
        chunk_size = 2000  # ArcGIS typically limits to 2000 records per request
        
        while offset < total_records:
            params = {
                'f': 'json',
                'where': '1=1',
                'outFields': '*',
                'returnGeometry': 'true',
                'geometryPrecision': 6,
                'resultOffset': offset,
                'resultRecordCount': chunk_size
            }
            
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
            
            if 'features' in data:
                # Convert ESRI features to GeoDataFrame format
                for feature in data['features']:
                    # Extract attributes
                    attributes = feature['attributes']
                    
                    # Convert ESRI rings to Shapely polygon
                    if feature['geometry'] and 'rings' in feature['geometry']:
                        # Take the first ring (exterior ring)
                        ring = feature['geometry']['rings'][0]
                        # Create Shapely polygon
                        polygon = Polygon(ring)
                        
                        # Combine attributes and geometry
                        attributes['geometry'] = polygon
                        all_features.append(attributes)
                
                print(f"Fetched records {offset} to {offset + len(data['features'])} of {total_records}")
                
                if len(data['features']) < chunk_size:
                    break
                    
                offset += chunk_size
            else:
                print(f"No features found in response for {dataset_name}")
                break
        
        if all_features:
            # Create GeoDataFrame
            gdf = gpd.GeoDataFrame(all_features, crs='EPSG:3857')  # Web Mercator
            
            # Convert to WGS84
            gdf = gdf.to_crs('EPSG:4326')
            
            return gdf
        
        return None
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {dataset_name}: {e}")
        return None

print("Fetching parcel boundaries with geometries...")
boundary_gdf = get_feature_data_with_geometry('parcel_boundaries')


Fetching parcel boundaries with geometries...
Total records in parcel_boundaries: 119651
Fetched records 0 to 2000 of 119651
Fetched records 2000 to 4000 of 119651
Fetched records 4000 to 6000 of 119651
Fetched records 6000 to 8000 of 119651
Fetched records 8000 to 10000 of 119651
Fetched records 10000 to 12000 of 119651
Fetched records 12000 to 14000 of 119651
Fetched records 14000 to 16000 of 119651
Fetched records 16000 to 18000 of 119651
Fetched records 18000 to 20000 of 119651
Fetched records 20000 to 22000 of 119651
Fetched records 22000 to 24000 of 119651
Fetched records 24000 to 26000 of 119651
Fetched records 26000 to 28000 of 119651
Fetched records 28000 to 30000 of 119651
Fetched records 30000 to 32000 of 119651
Fetched records 32000 to 34000 of 119651
Fetched records 34000 to 36000 of 119651
Fetched records 36000 to 38000 of 119651
Fetched records 38000 to 40000 of 119651
Fetched records 40000 to 42000 of 119651
Fetched records 42000 to 44000 of 119651
Fetched records 44000

NameError: name 'df' is not defined

In [7]:
boundary_gdf_two = get_feature_data_with_geometry('parcel_civic')

#print column names  of boundary_gdf_two
print(boundary_gdf_two.columns)

Total records in parcel_civic: 119651
Fetched records 0 to 2000 of 119651
Fetched records 2000 to 4000 of 119651
Fetched records 4000 to 6000 of 119651
Fetched records 6000 to 8000 of 119651
Fetched records 8000 to 10000 of 119651
Fetched records 10000 to 12000 of 119651
Fetched records 12000 to 14000 of 119651
Fetched records 14000 to 16000 of 119651
Fetched records 16000 to 18000 of 119651
Fetched records 18000 to 20000 of 119651
Fetched records 20000 to 22000 of 119651
Fetched records 22000 to 24000 of 119651
Fetched records 24000 to 26000 of 119651
Fetched records 26000 to 28000 of 119651
Fetched records 28000 to 30000 of 119651
Fetched records 30000 to 32000 of 119651
Fetched records 32000 to 34000 of 119651
Fetched records 34000 to 36000 of 119651
Fetched records 36000 to 38000 of 119651
Fetched records 38000 to 40000 of 119651
Fetched records 40000 to 42000 of 119651
Fetched records 42000 to 44000 of 119651
Fetched records 44000 to 46000 of 119651
Fetched records 46000 to 48000 

In [8]:
boundary_gdf_two = boundary_gdf_two[boundary_gdf_two['PROP_CITY'].str.upper().str.contains('SOUTH BEND', na=False)]
#print the number of records in boundary_gdf_two
print(len(boundary_gdf_two))

63940


In [None]:

if boundary_gdf is not None:
    print(f"\nSuccessfully created GeoDataFrame with {len(boundary_gdf)} parcels")
    
    # Merge with our tax analysis data
    merged_gdf = boundary_gdf.merge(
        df,
        on='PARCELID',
        how='inner'
    )
    
    print(f"\nMerged data has {len(merged_gdf)} parcels")
    print("Sample of merge matches:")
    print(merged_gdf[['PARCELID', 'TAX_CHANGE_PCT']].head())
    
    # Save as GeoJSON
    processed_folder = "processed_tax_analysis"
    timestamp = datetime.now().strftime("%Y%m%d")
    blob_name = f"{processed_folder}/tax_analysis_with_polygons_{timestamp}.geojson"
    
    try:
        # Convert to GeoJSON string
        geojson_str = merged_gdf.to_json()
        
        # Upload to Azure
        blob_client = container_client.get_blob_client(blob_name)
        blob_client.upload_blob(geojson_str, overwrite=True)
        print(f"\nSuccessfully uploaded GeoJSON to Azure as {blob_name}")
        
        # Print some statistics about the saved data
        print(f"\nFinal Statistics:")
        print(f"Total parcels: {len(merged_gdf):,}")
        print(f"Parcels with tax increase: {(merged_gdf['TAX_CHANGE_PCT'] > 0).sum():,}")
        print(f"Parcels with tax decrease: {(merged_gdf['TAX_CHANGE_PCT'] < 0).sum():,}")
        
    except Exception as e:
        print(f"Error saving to Azure: {e}")
else:
    print("Failed to fetch boundary data")

In [17]:
from census import Census
from us import states
import pandas as pd

# Replace 'YOUR_API_KEY' with your actual Census API key
c = Census('YOUR_API_KEY')

def get_south_bend_data():
    # Get block group data for St. Joseph County
    data = c.acs5.state_county_blockgroup(
        fields=['NAME',
                'B19013_001E',  # Median income
                'B01003_001E',  # Total population
                'B03002_003E',  # White alone
                'B03002_004E',  # Black alone
                'B03002_012E'],  # Hispanic/Latino
        state_fips=states.IN.fips,  # Indiana
        county_fips='141',  # St. Joseph County
        blockgroup='*',  # All block groups
        year=2022  # Most recent ACS 5-year estimates
    )

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Rename columns
    df = df.rename(columns={
        'B19013_001E': 'median_income',
        'B01003_001E': 'total_pop',
        'B03002_003E': 'white_pop',
        'B03002_004E': 'black_pop',
        'B03002_012E': 'hispanic_pop'
    })

    # Extract geographic information from NAME field
    # NAME field format is typically:
    # "Block Group X, Census Tract YYYY.ZZ, St. Joseph County, Indiana"
    df['tract'] = df['NAME'].str.extract(r'Tract (\d+\.?\d*)')
    df['block_group'] = df['NAME'].str.extract(r'Block Group (\d+)')

    # Save to CSV
    df.to_csv('south_bend_demographics.csv', index=False)
    print("Created file: south_bend_demographics.csv")

    return df

# Run the function
south_bend_data = get_south_bend_data()
print(south_bend_data.head())

Created file: south_bend_demographics.csv
                                                NAME  median_income  \
0  Block Group 1; Census Tract 1; St. Joseph Coun...        18857.0   
1  Block Group 2; Census Tract 1; St. Joseph Coun...        30260.0   
2  Block Group 3; Census Tract 1; St. Joseph Coun...        54145.0   
3  Block Group 1; Census Tract 2; St. Joseph Coun...        64583.0   
4  Block Group 2; Census Tract 2; St. Joseph Coun...        40721.0   

   total_pop  white_pop  black_pop  hispanic_pop state county tract  \
0      768.0      260.0      453.0           2.0    18    141     1   
1      711.0      284.0      393.0          14.0    18    141     1   
2      667.0      429.0      201.0          21.0    18    141     1   
3      895.0      421.0      279.0         129.0    18    141     2   
4     1075.0      384.0      559.0         124.0    18    141     2   

  block group block_group  
0           1           1  
1           2           2  
2           3       

In [18]:
import geopandas as gpd
import requests
import json
from shapely.geometry import shape
import pandas as pd
from urllib.parse import urljoin
import os

def fetch_census_blocks():
    """
    Fetch Census Block data from ArcGIS FeatureServer and convert to GeoDataFrame
    """
    # Base URL for the feature service
    base_url = "https://services.arcgis.com/OjftlhRHkAABcyiF/ArcGIS/rest/services/2020_Census_Blocks/FeatureServer/0"
    
    # Query parameters
    params = {
        'where': '1=1',
        'outFields': '*',
        'geometryPrecision': 6,
        'f': 'geojson',
        'returnGeometry': 'true'
    }
    
    try:
        print("Fetching data from ArcGIS server...")
        # Method 1: Direct request
        response = requests.get(urljoin(base_url, "query"), params=params)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Convert response to GeoJSON
        geojson_data = response.json()
        
        # Create GeoDataFrame
        print("Converting data to GeoDataFrame...")
        gdf = gpd.GeoDataFrame.from_features(geojson_data['features'])
        
        # Set CRS to Web Mercator (EPSG:3857) as per the service
        gdf.set_crs(epsg=3857, inplace=True)
        
        # Transform to WGS 84 (EPSG:4326)
        print("Transforming coordinate system to WGS 84...")
        gdf = gdf.to_crs(epsg=4326)
        
        return gdf
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        return None

def save_and_analyze_data(gdf, output_file="census_blocks_2020.shp"):
    """
    Save the GeoDataFrame to a shapefile and perform basic analysis
    """
    if gdf is None:
        print("No data to save or analyze")
        return
    
    try:
        # Save to shapefile
        print(f"Saving data to {output_file}...")
        gdf.to_file(output_file)
        
        # Basic data analysis
        print("\nData Summary:")
        print(f"Number of census blocks: {len(gdf)}")
        print("\nColumns in the dataset:")
        print(gdf.columns.tolist())
        
        print("\nGeometric properties:")
        print(f"Total area covered: {gdf.geometry.area.sum():.2f} square degrees")
        print(f"Coordinate Reference System: {gdf.crs}")
        
        # Display first few rows
        print("\nFirst few rows of the data:")
        print(gdf.head())
        
        # Basic statistics for numeric columns
        print("\nNumeric columns statistics:")
        numeric_cols = gdf.select_dtypes(include=['float64', 'int64']).columns
        if len(numeric_cols) > 0:
            print(gdf[numeric_cols].describe())
            
    except Exception as e:
        print(f"Error saving or analyzing data: {e}")

def main():
    """
    Main function to execute the data fetching and processing
    """
    try:
        # Fetch the data
        census_blocks = fetch_census_blocks()
        
        if census_blocks is not None:
            # Save and analyze the data
            save_and_analyze_data(census_blocks)
            
            print("\nCreated/Modified files during execution:")
            base_name = "census_blocks_2020"
            extensions = ['.shp', '.dbf', '.prj', '.shx']
            for ext in extensions:
                filename = base_name + ext
                if os.path.exists(filename):
                    print(filename)
        
    except Exception as e:
        print(f"Error in main execution: {e}")

if __name__ == "__main__":
    # Check if required packages are installed
    required_packages = ['geopandas', 'requests', 'shapely', 'pandas']
    
    try:
        import pkg_resources
        pkg_resources.require(required_packages)
    except pkg_resources.DistributionNotFound:
        print("Please install required packages using:")
        print("pip install geopandas requests shapely pandas")
        exit(1)
    
    main()

Fetching data from ArcGIS server...


  import pkg_resources


Error fetching data: 400 Client Error: Bad Request for url: https://services.arcgis.com/OjftlhRHkAABcyiF/ArcGIS/rest/services/2020_Census_Blocks/FeatureServer/query?where=1%3D1&outFields=%2A&geometryPrecision=6&f=geojson&returnGeometry=true
