In [14]:
import sys
import pandas as pd
sys.path.append('..')  # Add parent directory to path
from cloud_utils import get_feature_data, get_feature_data_with_geometry
from lvt_utils import model_split_rate_tax
from census_utils import get_census_data, get_census_blockgroups_shapefile, get_census_data_with_boundaries


# Base URL for the ArcGIS services
base_url = "https://services.arcgis.com/OjftlhRHkAABcyiF/arcgis/rest/services"

In [15]:
# Fetch the main parcel dataset with tax info
parcel_civic_df = get_feature_data('Parcel_Civic', base_url)


Total records in Parcel_Civic: 119569
Fetched records 0 to 2000 for Parcel_Civic
Fetched records 2000 to 4000 for Parcel_Civic
Fetched records 4000 to 6000 for Parcel_Civic
Fetched records 6000 to 8000 for Parcel_Civic
Fetched records 8000 to 10000 for Parcel_Civic
Fetched records 10000 to 12000 for Parcel_Civic
Fetched records 12000 to 14000 for Parcel_Civic
Fetched records 14000 to 16000 for Parcel_Civic
Fetched records 16000 to 18000 for Parcel_Civic
Fetched records 18000 to 20000 for Parcel_Civic
Fetched records 20000 to 22000 for Parcel_Civic
Fetched records 22000 to 24000 for Parcel_Civic
Fetched records 24000 to 26000 for Parcel_Civic
Fetched records 26000 to 28000 for Parcel_Civic
Fetched records 28000 to 30000 for Parcel_Civic
Fetched records 30000 to 32000 for Parcel_Civic
Fetched records 32000 to 34000 for Parcel_Civic
Fetched records 34000 to 36000 for Parcel_Civic
Fetched records 36000 to 38000 for Parcel_Civic
Fetched records 38000 to 40000 for Parcel_Civic
Fetched record

In [16]:
# Basic data cleaning and analysis
df = parcel_civic_df.copy()[parcel_civic_df['PROP_CITY'].str.upper().str.contains('SOUTH BEND', na=False)]
df['REALLANDVA'], df['REALIMPROV'] = pd.to_numeric(df['REALLANDVA'], errors='coerce').fillna(0), pd.to_numeric(df['REALIMPROV'], errors='coerce').fillna(0)
df['TOTAL_VALUE'] = df['REALLANDVA'] + df['REALIMPROV']
millage_rate = 0.0033  # $5 per $1000 of value
df['CURRENT_TAX'] = df['TOTAL_VALUE'] * millage_rate
current_revenue = df['CURRENT_TAX'].sum()

print(f"Total number of properties: {len(df):,}")
print(f"Current annual revenue with ${millage_rate*1000}/1000 millage rate: ${current_revenue:,.2f}")
print(f"Total land value: ${df['REALLANDVA'].sum():,.2f}")
print(f"Total improvement value: ${df['REALIMPROV'].sum():,.2f}")


Total number of properties: 63,879
Current annual revenue with $3.3/1000 millage rate: $41,762,853.10
Total land value: $1,991,126,800.00
Total improvement value: $10,664,283,230.00


In [17]:
# Calculate split-rate tax using model_split_rate_tax function
land_millage, improvement_millage, new_revenue, df = model_split_rate_tax(
    df=df,
    land_value_col='REALLANDVA',
    improvement_value_col='REALIMPROV',
    current_revenue=current_revenue,
    land_improvement_ratio=4  # 4:1 ratio as specified
)

# Calculate tax changes manually since they're not being added by the function
df['NEW_TAX'] = (df['REALLANDVA'] * land_millage) + (df['REALIMPROV'] * improvement_millage)
df['TAX_CHANGE'] = df['NEW_TAX'] - df['CURRENT_TAX']
df['TAX_CHANGE_PCT'] = (df['TAX_CHANGE'] / df['CURRENT_TAX']) * 100

# Show some example properties
print("\nExample Properties (sorted by largest tax changes):")
example_cols = ['PROP_ADDR', 'REALLANDVA', 'REALIMPROV', 'CURRENT_TAX', 'NEW_TAX', 'TAX_CHANGE', 'TAX_CHANGE_PCT']
print(df[example_cols].nlargest(5, 'TAX_CHANGE_PCT'))

Split-rate tax model (Land:Improvement = 4:1)
Land millage rate: 8.9674
Improvement millage rate: 2.2418
Total tax revenue: $41,762,853.10
Target revenue: $41,762,853.10
Revenue difference: $0.00 (0.0000%)

Example Properties (sorted by largest tax changes):
           PROP_ADDR  REALLANDVA  REALIMPROV  CURRENT_TAX       NEW_TAX  \
30     1419 ELMER ST        6100           0        20.13  54701.008068   
99    1601 OBRIEN ST        6100           0        20.13  54701.008068   
152  1647 FREMONT ST        6100           0        20.13  54701.008068   
392  1401 N OLIVE ST        6100           0        20.13  54701.008068   
394  1403 N OLIVE ST        6100           0        20.13  54701.008068   

       TAX_CHANGE  TAX_CHANGE_PCT  
30   54680.878068   271638.738541  
99   54680.878068   271638.738541  
152  54680.878068   271638.738541  
392  54680.878068   271638.738541  
394  54680.878068   271638.738541  


In [18]:
# For each column, show top 10 most common values and their counts
columns_to_analyze = ['CLASSCODE', 'TOWNSHIP', 'TAXDIST', 'Neighborho', 'PROPTYPE', 
                     'TAXTYPE', 'TIFAREAUID', 'LEGALDESCR']

for col in columns_to_analyze:
    print(f"\nTop 10 values for {col}:")
    value_counts = df[col].value_counts().head(10)
    print(value_counts)
    print(f"Total unique values: {df[col].nunique()}")
    print("-" * 50)

# Let's also look at some basic statistics about these groups
print("\nMedian tax changes by various groupings:")

for col in ['CLASSCODE', 'TOWNSHIP', 'TAXDIST', 'PROPTYPE']:
    print(f"\nMedian tax change by {col}:")
    median_changes = df.groupby(col)['TAX_CHANGE'].agg([
        'count',
        'median',
        lambda x: (x > 0).mean() * 100  # Percentage with increase
    ]).round(2)
    median_changes.columns = ['Count', 'Median Change ($)', '% With Increase']
    print(median_changes.sort_values('Count', ascending=False).head(10))


Top 10 values for CLASSCODE:
CLASSCODE
510    48709
500     4011
640     1041
550     1018
551      840
511      755
520      647
429      426
599      398
456      335
Name: count, dtype: int64
Total unique values: 120
--------------------------------------------------

Top 10 values for TOWNSHIP:
TOWNSHIP
Portage    39043
Clay        9192
Centre      6457
German      3652
Warren      3473
Greene      1125
Penn         763
Madison       89
              49
Union         35
Name: count, dtype: int64
Total unique values: 11
--------------------------------------------------

Top 10 values for TAXDIST:
TAXDIST
SB Portage    37415
Clay           7812
SB Centre      3858
Warren         3464
Centre         2599
German         1958
SB German      1694
Portage        1628
Greene         1125
SB Clay         852
Name: count, dtype: int64
Total unique values: 21
--------------------------------------------------

Top 10 values for Neighborho:
Neighborho
7126296    1823
7126222    1055
7126282 

In [19]:
# Create a summary DataFrame grouped by PROPTYPE
proptype_analysis = df.groupby('PROPTYPE').agg({
    'TAX_CHANGE_PCT': 'mean',  # Average percentage change
    'TAX_CHANGE': 'median',    # Median dollar change
    'PARCELID': 'count'        # Count of properties
}).round(2)

# Add percentage that increase
proptype_increases = df.groupby('PROPTYPE').agg({
    'TAX_CHANGE': lambda x: (x > 0).mean() * 100  # Percentage with increase
}).round(2)

proptype_analysis['Percent_Increased'] = proptype_increases['TAX_CHANGE']

# Rename columns for clarity
proptype_analysis.columns = [
    'Avg_Pct_Change',
    'Median_Dollar_Change',
    'Property_Count',
    'Pct_Properties_Increased'
]

# Sort by count of properties (descending)
proptype_analysis = proptype_analysis.sort_values('Property_Count', ascending=False)

# Print results
print("Analysis by Property Type:\n")
print("Note: All monetary values in dollars, percentages shown as %\n")
print(proptype_analysis.to_string())

# Print some summary statistics
print("\nOverall Summary:")
print(f"Total properties analyzed: {proptype_analysis['Property_Count'].sum():,}")
print(f"Overall median dollar change: ${df['TAX_CHANGE'].median():,.2f}")
print(f"Overall average percent change: {df['TAX_CHANGE_PCT'].mean():.2f}%")
print(f"Overall percent of properties with increase: {(df['TAX_CHANGE'] > 0).mean()*100:.2f}%")

Analysis by Property Type:

Note: All monetary values in dollars, percentages shown as %

                                                                                            Avg_Pct_Change  Median_Dollar_Change  Property_Count  Pct_Properties_Increased
PROPTYPE                                                                                                                                                                  
1 Family Dwell - Platted Lot                                                                      98398.05             501824.16           48709                     99.93
Vacant - Platted Lot                                                                             270022.37              34063.50            4011                     97.88
Exempt, Municipality                                                                             130543.62                  0.00            1041                      0.10
Condominium Unit - Platted Lot                         

In [20]:
import numpy as np
from datetime import datetime

# Add per-area calculations
# Convert Shape__Area to acres (it's typically in square feet)
df['ACRES'] = df['ACREAGE'].fillna(0)

# Calculate price per acre (handling zero acres to avoid division by zero)
df['LAND_PRICE_PER_ACRE'] = np.where(
    df['ACRES'] > 0,
    df['REALLANDVA'] / df['ACRES'],
    np.nan
)

# Calculate land percentage of total value
df['LAND_PCT_OF_TOTAL'] = np.where(
    df['TOTAL_VALUE'] > 0,
    (df['REALLANDVA'] / df['TOTAL_VALUE']) * 100,
    np.nan
)


In [21]:
boundary_gdf = get_feature_data_with_geometry('parcel_boundaries', base_url=base_url)


Total records in parcel_boundaries: 119668
Fetched records 0 to 2000 of 119668
Fetched records 2000 to 4000 of 119668
Fetched records 4000 to 6000 of 119668
Fetched records 6000 to 8000 of 119668
Fetched records 8000 to 10000 of 119668
Fetched records 10000 to 12000 of 119668
Fetched records 12000 to 14000 of 119668
Fetched records 14000 to 16000 of 119668
Fetched records 16000 to 18000 of 119668
Fetched records 18000 to 20000 of 119668
Fetched records 20000 to 22000 of 119668
Fetched records 22000 to 24000 of 119668
Fetched records 24000 to 26000 of 119668
Fetched records 26000 to 28000 of 119668
Fetched records 28000 to 30000 of 119668
Fetched records 30000 to 32000 of 119668
Fetched records 32000 to 34000 of 119668
Fetched records 34000 to 36000 of 119668
Fetched records 36000 to 38000 of 119668
Fetched records 38000 to 40000 of 119668
Fetched records 40000 to 42000 of 119668
Fetched records 42000 to 44000 of 119668
Fetched records 44000 to 46000 of 119668
Fetched records 46000 to 4

In [22]:

print(len(boundary_gdf))

# Merge with our tax analysis data
merged_gdf = boundary_gdf.merge(
    df,
    on='PARCELID',
    how='inner'
)

print(f"\nMerged data has {len(merged_gdf)} parcels")

119668

Merged data has 64655 parcels


In [None]:

# Get census data for St. Joseph County (FIPS code: 18141)
census_data, census_boundaries = get_census_data_with_boundaries(
    fips_code='18141',  # Indiana (18) + St. Joseph County (141)
    year=2022,
    api_key='YOUR_API_KEY'  # Replace with your actual Census API key
)

print(f"Number of census blocks: {len(census_boundaries)}")