---
title: "SDIS Summary Analysis"
author: "Tina Lasisi"
date: today
format:
  html:
    code-fold: false
    toc: true
execute:
  echo: true
  warning: false
---


## Overview

This analysis examines State DNA Index System (SDIS) data that includes information reported separately for each state's DNA database. The data captures key dimensions including:

- Total size of each state's DNA database
- Whether states collect DNA from arrestees (not just convicted offenders)
- Whether states allow familial DNA searching
- References to relevant state statutes (from Murphy & Tong appendix)

This information provides insight into the variation in DNA database policies, practices, and legal frameworks across U.S. states.

## Data Loading


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Locate the raw SDIS CSV relative to the directory above this notebook
base_dir = Path("..")
data_file = base_dir.joinpath("output", "sdis", "sdis_raw.csv")

# Read the CSV into a DataFrame
sdis_data = pd.read_csv(data_file)

# Report the number of rows and the column names that were read
n_rows = sdis_data.shape[0]
column_names = sdis_data.columns.tolist()
print(f"Loaded SDIS data: {n_rows} rows")
print(f"Columns: {column_names}")

# Preview the first rows of the table
display(sdis_data.head())

# Show the dtype pandas assigned to each column
print("\nData types:")
display(sdis_data.dtypes)

## Data Preprocessing

This section enforces consistency between each state's arrestee collection policy and its reported arrestee count. For states that do not collect arrestee DNA (`arrestee_collection` = 'no'), `n_arrestees` is set to zero so that the reported count cannot contradict the policy.


In [None]:
# Work on a copy; sdis_data is only rebound to it at the end of this cell
sdis_data_processed = sdis_data.copy()

# Rows whose policy column says arrestee DNA is not collected
no_collection_mask = sdis_data_processed['arrestee_collection'] == 'no'
states_with_no_collection = sdis_data_processed[no_collection_mask]

# Of those rows, the ones that still carry a non-missing, non-zero arrestee count
reported_arrestees = states_with_no_collection['n_arrestees']
states_to_adjust = states_with_no_collection[
    reported_arrestees.notna() & (reported_arrestees != 0)
]

# List the contradictory rows before they are overwritten
if len(states_to_adjust):
    print("States with arrestee_collection='no' but non-zero n_arrestees values:")
    for name, count in zip(states_to_adjust['state'], states_to_adjust['n_arrestees']):
        print(f"  • {name}: n_arrestees = {count:,.0f}")

# Force the arrestee count to 0 on every 'no' row (missing counts become 0 as well)
sdis_data_processed.loc[no_collection_mask, 'n_arrestees'] = 0

n_no_collection = len(states_with_no_collection)
print(f"\nAdjusted N_arrestees to 0 for {n_no_collection} states that do not collect arrestee DNA")

# Later cells read sdis_data, so point it at the adjusted frame
sdis_data = sdis_data_processed

## Data Availability Overview

This section provides an overview of states represented in the dataset and the completeness of data fields across states.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Alphabetically sorted list of the distinct values in the 'state' column
states_in_data = sorted(sdis_data['state'].unique())
print(f"Number of states with data: {len(states_in_data)}")

# Reference list of the 50 U.S. state names used for the completeness check
all_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Names from the reference list that never occur in the data (reference order kept)
present_states = set(states_in_data)
missing_states = [name for name in all_states if name not in present_states]
if not missing_states:
    print("\nAll 50 states are represented in the dataset")
else:
    print(f"\nMissing states: {', '.join(missing_states)}")

# Assess data completeness for each state: one row per state, one column per
# non-identifier field, each cell holding the number of non-null entries.
# GroupBy.count() tallies non-null values for every column in a single pass,
# so the frame is grouped once instead of once per column.
data_availability = (
    sdis_data.groupby('state')
    .count()
    .reindex(states_in_data)  # same row order as the sorted state list
    .rename_axis(None)        # plain, unnamed index
)

# Generate visualization of data completeness
if len(data_availability.columns) > 0:
    # Focus on key numeric and policy fields
    key_fields = ['n_total', 'n_arrestees', 'n_offenders', 'n_forensic',
                  'arrestee_collection', 'fam_search']

    # Keep only the key fields that actually exist in the data
    available_key_fields = [f for f in key_fields if f in data_availability.columns]

    if available_key_fields:
        plt.figure(figsize=(10, 14))

        # 1 where a state has at least one non-null value for the field, else 0
        availability_subset = data_availability[available_key_fields]
        availability_binary = (availability_subset > 0).astype(int)

        # Two-colour heatmap: light grey = no data, blue = data present
        sns.heatmap(availability_binary,
                    cmap=['#f0f0f0', '#2E86AB'],
                    cbar=False,  # Remove colorbar for binary data
                    linewidths=0.5,
                    linecolor='gray',
                    square=True,
                    vmin=0, vmax=1)

        plt.title('Data Availability by State', fontsize=14, pad=20)
        plt.xlabel('Data Fields', fontsize=12)
        plt.ylabel('States', fontsize=12)
        plt.xticks(rotation=45, ha='right')

        # Put a check mark in the centre of every cell that has data;
        # np.argwhere yields (row, column) positions in row-major order
        for row_pos, col_pos in np.argwhere(availability_binary.to_numpy() == 1):
            plt.text(col_pos + 0.5, row_pos + 0.5, '✓', ha='center', va='center',
                     fontsize=8, color='white', fontweight='bold')

        plt.tight_layout()
        plt.show()

        print(f"\nTotal states shown in heatmap: {len(availability_binary)}")

# For each key field, report how many states have at least one non-null value
print("\nData field coverage across states:")

# Key fields to report (this list also includes the statute reference column)
key_fields = ['n_total', 'n_arrestees', 'n_offenders', 'n_forensic',
              'arrestee_collection', 'fam_search', 'collection_statute']

n_states = len(states_in_data)
for col in key_fields:
    # Skip fields that are absent from the availability table
    if col not in data_availability.columns:
        continue
    states_with_data = (data_availability[col] > 0).sum()
    coverage_pct = states_with_data/n_states*100
    print(f"{col}: {states_with_data} states ({coverage_pct:.1f}%)")

## Total Profile Calculations Verification

This section examines states that report `n_total` alongside component counts to determine whether the reported total represents:

1. The sum of all profile types, including forensic (`n_arrestees + n_offenders + n_forensic`)
2. The sum of arrestee and offender profiles only (`n_arrestees + n_offenders`)


In [None]:
# This check needs a column named 'n_total'. Work on a copy, and if the frame
# only has 'n_total_reported' (the name a later cell renames it to), expose
# that column under the original name.
sdis_data_temp = sdis_data.copy()
temp_columns = sdis_data_temp.columns
if 'n_total' not in temp_columns and 'n_total_reported' in temp_columns:
    sdis_data_temp['n_total'] = sdis_data_temp['n_total_reported']

# Keep only the rows that report a total
states_with_totals = sdis_data_temp.loc[sdis_data_temp['n_total'].notna()].copy()

# Column groups used by the two candidate sums
all_parts = ['n_arrestees', 'n_offenders', 'n_forensic']
combined_parts = ['n_arrestees', 'n_offenders']

# Candidate sums; missing components are skipped (treated as 0) by sum()
# Note: 'sum_forensic' holds arrestees + offenders, i.e. the sum WITHOUT forensic
states_with_totals['sum_all'] = states_with_totals[all_parts].sum(axis=1, skipna=True)
states_with_totals['sum_forensic'] = states_with_totals[combined_parts].sum(axis=1, skipna=True)

# Largest absolute difference still counted as a match
tolerance = 10

# Total ~ arrestees + offenders + forensic; forced to False when any of the three is missing
all_parts_present = states_with_totals[all_parts].notna().all(axis=1)
close_to_sum_all = (
    states_with_totals['n_total'] - states_with_totals['sum_all']
).abs() <= tolerance
states_with_totals['matches_combined_forensic'] = np.where(
    all_parts_present, close_to_sum_all, False
)

# Total ~ arrestees + offenders; forced to False when either is missing.
# Also requires that the state either does not collect arrestee DNA or
# reports a positive arrestee count.
combined_parts_present = states_with_totals[combined_parts].notna().all(axis=1)
close_to_sum_combined = (
    states_with_totals['n_total'] - states_with_totals['sum_forensic']
).abs() <= tolerance
arrestee_count_usable = (
    (states_with_totals['arrestee_collection'] == 'no')
    | (states_with_totals['n_arrestees'] > 0)
)
states_with_totals['matches_combined'] = np.where(
    combined_parts_present, close_to_sum_combined & arrestee_count_usable, False
)

# Columns carried into the report
total_verification = states_with_totals[
    ['state', 'n_total'] + all_parts
    + ['arrestee_collection', 'sum_all', 'sum_forensic',
       'matches_combined_forensic', 'matches_combined']
].copy()

# Keep rows where at least one component count is present
has_components = total_verification[total_verification[all_parts].notna().any(axis=1)]

print(f"States with n_total and component data: {len(has_components)}")
print("\nTotal calculation patterns:")

# Use the two match columns directly as boolean masks instead of comparing
# them with True/False
matches_all_mask = has_components['matches_combined_forensic'].astype(bool)
matches_combined_mask = has_components['matches_combined'].astype(bool)

# Categorize states
includes_all = has_components.loc[matches_all_mask, 'state'].tolist()
forensic_only = has_components.loc[matches_combined_mask & ~matches_all_mask, 'state'].tolist()
# Keep the unmatched rows themselves so their figures can be printed without
# filtering the frame again for every state name
unmatched_rows = has_components.loc[~matches_all_mask & ~matches_combined_mask]
neither = unmatched_rows['state'].tolist()

if includes_all:
    print("\nn_total includes combined profiles with forensic (arrestees + offenders + forensic):")
    for state in includes_all:
        print(f"  • {state}")

if forensic_only:
    print("\nn_total includes combined profiles only (arrestees + offenders):")
    for state in forensic_only:
        print(f"  • {state}")

if neither:
    print("\nn_total does not match calculated sums:")
    # One line per unmatched row, showing the reported total next to both candidate sums
    for state, n_total, sum_all, sum_forensic in zip(
        unmatched_rows['state'],
        unmatched_rows['n_total'],
        unmatched_rows['sum_all'],
        unmatched_rows['sum_forensic'],
    ):
        print(f"  • {state}: n_total={n_total:,.0f}, "
              f"Sum_all={sum_all:,.0f}, "
              f"Sum_forensic={sum_forensic:,.0f}")


def _format_count(x):
    """Render a count with thousands separators, or an empty string when it is missing."""
    return '{:,.0f}'.format(x) if pd.notna(x) else ''


# Display detailed breakdown for verification
print("\nDetailed breakdown:")
display(has_components[['state', 'n_total', 'n_arrestees', 'n_offenders', 'n_forensic',
                       'arrestee_collection', 'matches_combined_forensic', 'matches_combined']].style.format({
    'n_total': '{:,.0f}',
    'n_arrestees': _format_count,
    'n_offenders': _format_count,
    'n_forensic': _format_count
}))

## Analysis of Database Totals and Data Quality Issues

This section examines states where the reported `n_total` values reveal potential data quality issues or reporting inconsistencies.


In [None]:
# Work on a copy and rename the reported total so an estimated total can sit beside it
sdis_enhanced = sdis_data.copy().rename(columns={'n_total': 'n_total_reported'})

# Largest absolute difference still counted as a match
tolerance = 10


def _is_positive(column):
    """Return a boolean Series that is True where `column` is non-null and greater than 0."""
    values = sdis_enhanced[column]
    return values.notna() & (values > 0)


def _total_is_close_to(candidate):
    """Return a boolean Series that is True where the reported total is within `tolerance` of `candidate`."""
    return (sdis_enhanced['n_total_reported'] - candidate).abs() <= tolerance


# Building blocks for the three match flags.
# A component only counts as data when it is > 0 (0 is treated as "no data").
has_total = sdis_enhanced['n_total_reported'].notna()
offenders_ok = _is_positive('n_offenders')
arrestees_ok = _is_positive('n_arrestees')
forensic_ok = _is_positive('n_forensic')

offenders = sdis_enhanced['n_offenders']
arrestees = sdis_enhanced['n_arrestees']
forensic = sdis_enhanced['n_forensic']

# Reported total ~ offenders
sdis_enhanced['total_equals_offenders'] = np.where(
    has_total & offenders_ok,
    _total_is_close_to(offenders),
    False
)

# Reported total ~ offenders + arrestees
sdis_enhanced['total_equals_off_arr'] = np.where(
    has_total & offenders_ok & arrestees_ok,
    _total_is_close_to(offenders + arrestees),
    False
)

# Reported total ~ offenders + arrestees + forensic
sdis_enhanced['total_equals_all'] = np.where(
    has_total & offenders_ok & arrestees_ok & forensic_ok,
    _total_is_close_to(offenders + arrestees + forensic),
    False
)

# Label each row with the first pattern that matches, in priority order:
# All components > Offenders + Arrestees > Offenders only; otherwise 'Unknown'
sdis_enhanced['total_method'] = np.select(
    [
        sdis_enhanced['total_equals_all'].to_numpy(),
        sdis_enhanced['total_equals_off_arr'].to_numpy(),
        sdis_enhanced['total_equals_offenders'].to_numpy(),
    ],
    ['All components', 'Offenders + Arrestees', 'Offenders only'],
    default='Unknown',
)

# Build n_total_estimated together with a comment column recording which rule
# produced each value. Rows that match no rule keep NaN and an empty comment.
sdis_enhanced['n_total_estimated'] = np.nan
sdis_enhanced['n_total_estimated_comment'] = ''

# Reusable pieces of the masks below (a count of 0 is treated like a missing count)
has_total = sdis_enhanced['n_total_reported'].notna()
has_offenders = sdis_enhanced['n_offenders'].notna() & (sdis_enhanced['n_offenders'] > 0)
has_arrestees = sdis_enhanced['n_arrestees'].notna() & (sdis_enhanced['n_arrestees'] > 0)
has_forensic = sdis_enhanced['n_forensic'].notna() & (sdis_enhanced['n_forensic'] > 0)
no_offenders = sdis_enhanced['n_offenders'].isna() | (sdis_enhanced['n_offenders'] == 0)
no_arrestees = sdis_enhanced['n_arrestees'].isna() | (sdis_enhanced['n_arrestees'] == 0)
no_forensic = sdis_enhanced['n_forensic'].isna() | (sdis_enhanced['n_forensic'] == 0)

# --- Rules for rows whose reported total matches a component sum -------------
# A row can satisfy more than one match flag when the extra components are
# within the tolerance. The rules are therefore applied from lowest to highest
# priority, so the last assignment wins in the same order used for
# total_method: All components > Offenders + Arrestees > Offenders only.

# Reported total matches offenders only: use the reported total
mask_off_only = sdis_enhanced['total_equals_offenders']
sdis_enhanced.loc[mask_off_only, 'n_total_estimated'] = sdis_enhanced.loc[mask_off_only, 'n_total_reported']
sdis_enhanced.loc[mask_off_only, 'n_total_estimated_comment'] = 'Used reported total (matches offenders only)'

# Reported total matches offenders + arrestees: use the reported total
mask_off_arr = sdis_enhanced['total_equals_off_arr']
sdis_enhanced.loc[mask_off_arr, 'n_total_estimated'] = sdis_enhanced.loc[mask_off_arr, 'n_total_reported']
sdis_enhanced.loc[mask_off_arr, 'n_total_estimated_comment'] = 'Used reported total (matches offenders + arrestees)'

# Reported total matches offenders + arrestees + forensic: remove the forensic part
mask_all = sdis_enhanced['total_equals_all']
sdis_enhanced.loc[mask_all, 'n_total_estimated'] = (
    sdis_enhanced.loc[mask_all, 'n_total_reported'] - sdis_enhanced.loc[mask_all, 'n_forensic']
)
sdis_enhanced.loc[mask_all, 'n_total_estimated_comment'] = 'Subtracted forensic from reported total'

# --- Remaining rows that report a total ---------------------------------------
# Each mask below only selects rows that are still unestimated at that point.

# Total reported with no component breakdown at all
mask_total_only = (
    has_total
    & sdis_enhanced['n_total_estimated'].isna()
    & no_arrestees
    & no_offenders
    & no_forensic
)
sdis_enhanced.loc[mask_total_only, 'n_total_estimated'] = sdis_enhanced.loc[mask_total_only, 'n_total_reported']
sdis_enhanced.loc[mask_total_only, 'n_total_estimated_comment'] = 'Total only reported (no breakdown available)'

# Total and forensic reported, but no offender/arrestee breakdown
# (the original notes give California as an example)
mask_total_forensic_only = (
    has_total
    & sdis_enhanced['n_total_estimated'].isna()
    & no_arrestees
    & no_offenders
    & has_forensic
)
sdis_enhanced.loc[mask_total_forensic_only, 'n_total_estimated'] = sdis_enhanced.loc[mask_total_forensic_only, 'n_total_reported']
sdis_enhanced.loc[mask_total_forensic_only, 'n_total_estimated_comment'] = 'Total only reported (forensic reported separately)'

# Total reported with some components, but matching none of the patterns:
# fall back to the reported total and flag the discrepancy
mask_has_total_unclear = has_total & sdis_enhanced['n_total_estimated'].isna()
sdis_enhanced.loc[mask_has_total_unclear, 'n_total_estimated'] = sdis_enhanced.loc[mask_has_total_unclear, 'n_total_reported']
sdis_enhanced.loc[mask_has_total_unclear, 'n_total_estimated_comment'] = 'Total with discrepancy (calculation unclear)'

# --- Rows without a reported total ---------------------------------------------

# Offenders and arrestees both > 0: estimate the total as their sum
mask_no_total = ~has_total & has_offenders & has_arrestees
sdis_enhanced.loc[mask_no_total, 'n_total_estimated'] = (
    sdis_enhanced.loc[mask_no_total, 'n_offenders'] + sdis_enhanced.loc[mask_no_total, 'n_arrestees']
)
sdis_enhanced.loc[mask_no_total, 'n_total_estimated_comment'] = 'Calculated from offenders + arrestees (no total reported)'

# Only offenders > 0 (arrestees missing or zero): use the offender count
mask_no_total_off_only = ~has_total & has_offenders & no_arrestees
sdis_enhanced.loc[mask_no_total_off_only, 'n_total_estimated'] = sdis_enhanced.loc[mask_no_total_off_only, 'n_offenders']
sdis_enhanced.loc[mask_no_total_off_only, 'n_total_estimated_comment'] = 'Used offenders count (no total reported, no arrestee data)'

# Per-state availability flags for the heatmap.
# A component count of 0 is treated the same as a missing count.
availability_matrix = pd.DataFrame({
    'State': sdis_enhanced['state'],
    'Arrestees': (sdis_enhanced['n_arrestees'].notna() & (sdis_enhanced['n_arrestees'] > 0)),
    'Offenders': (sdis_enhanced['n_offenders'].notna() & (sdis_enhanced['n_offenders'] > 0)),
    'Forensic': (sdis_enhanced['n_forensic'].notna() & (sdis_enhanced['n_forensic'] > 0)),
    'Total Reported': sdis_enhanced['n_total_reported'].notna(),
    'Total Method': sdis_enhanced['total_method']
})

# Heading for the textual summary that is printed after the figure
print("n_total_estimated Calculation Summary:")
print("=" * 50)

# Single tall figure: one row per state
fig, ax = plt.subplots(figsize=(10, 16))

# Four binary availability columns (1 = data present, 0 = absent), indexed by state
availability_binary = availability_matrix.set_index('State')[['Arrestees', 'Offenders', 'Forensic', 'Total Reported']].astype(int)

# Integer code for each total-calculation method; the code doubles as an index
# into method_colors and method_labels below
method_mapping = {
    'Unknown': 0,
    'Offenders only': 1,
    'Offenders + Arrestees': 2,
    'All components': 3
}
# Series.map performs a plain dictionary lookup and returns the integer codes
# directly; Series.replace on an object column relies on dtype downcasting
# that pandas has deprecated.
availability_binary['Total Method'] = availability_matrix.set_index('State')['Total Method'].map(method_mapping)

from matplotlib.colors import ListedColormap
import matplotlib.patches as patches

# Draw the four binary columns. vmin/vmax pin the two colours to 0 and 1 so
# they stay correct even if every cell holds the same value.
sns.heatmap(availability_binary.iloc[:, :4],  # First 4 columns (binary data)
            cmap=['#f0f0f0', '#2E86AB'],
            cbar=False,
            linewidths=0.5,
            linecolor='gray',
            square=True,
            vmin=0, vmax=1,
            ax=ax)

# Draw the Total Method column by hand as a fifth column of coloured squares
x_pos = 4
method_colors = ['#f0f0f0', '#FF6B6B', '#4ECDC4', '#45B7D1']
method_codes = availability_binary['Total Method'].astype(int).tolist()
for i, method_val in enumerate(method_codes):
    rect = patches.Rectangle((x_pos, i), 1, 1,
                             linewidth=0.5,
                             edgecolor='gray',
                             facecolor=method_colors[method_val])
    ax.add_patch(rect)

# One tick label per column, centred on its cell
column_labels = list(availability_binary.columns)
ax.set_xticks(np.arange(len(column_labels)) + 0.5)
ax.set_xticklabels(column_labels, rotation=45, ha='right')
# The heatmap was drawn from four columns only; make the x-range cover all
# five so the hand-drawn Total Method column lies fully inside the axes.
ax.set_xlim(0, len(column_labels))

# Check mark in every binary cell that has data
binary_values = availability_binary.iloc[:, :4].to_numpy()
for i, j in np.argwhere(binary_values == 1):
    ax.text(j + 0.5, i + 0.5, '✓', ha='center', va='center',
            fontsize=8, color='white', fontweight='bold')

# Short method label in every Total Method cell (black on the light 'Unknown' colour)
method_labels = ['?', 'O', 'O+A', 'All']
for i, method_val in enumerate(method_codes):
    ax.text(x_pos + 0.5, i + 0.5, method_labels[method_val], ha='center', va='center',
            fontsize=8, color='white' if method_val > 0 else 'black', fontweight='bold')

ax.set_title('Data Field Availability and Total Calculation Method by State', fontsize=14, pad=20)
ax.set_xlabel('')
ax.set_ylabel('')

# Legend entries explaining the Total Method colours and labels
legend_elements = [
    patches.Patch(facecolor='#f0f0f0', edgecolor='black', label='Unknown (?)'),
    patches.Patch(facecolor='#FF6B6B', edgecolor='black', label='Offenders only (O)'),
    patches.Patch(facecolor='#4ECDC4', edgecolor='black', label='Offenders + Arrestees (O+A)'),
    patches.Patch(facecolor='#45B7D1', edgecolor='black', label='All components (All)')
]

# Place legend below the plot
ax.legend(handles=legend_elements,
          loc='upper center',
          bbox_to_anchor=(0.5, -0.02),
          ncol=2,
          title='Total Method',
          frameon=True)

# Adjust layout
plt.tight_layout()
plt.show()

# Map each estimation comment to the list of states that received it
estimation_groups = sdis_enhanced.groupby('n_total_estimated_comment')['state'].apply(list)


def _print_group(comment, template):
    """Print `template` filled with the count and names of the states tagged `comment`, if any."""
    if comment not in estimation_groups:
        return
    names = estimation_groups[comment]
    print(template.format(n=len(names), names=', '.join(names)))


print("\n1. States with only n_total reported (no component breakdown):")
_print_group('Total only reported (no breakdown available)',
             "   {n} states: {names}")
_print_group('Total only reported (forensic reported separately)',
             "   - With forensic reported separately: {n} states ({names})")

print("\n2. States where n_total matches component calculations:")
_print_group('Used reported total (matches offenders + arrestees)',
             "   - Matches offenders + arrestees: {n} states ({names})")
_print_group('Subtracted forensic from reported total',
             "   - Matches all components (forensic subtracted): {n} states ({names})")
_print_group('Used reported total (matches offenders only)',
             "   - Matches offenders only: {n} states ({names})")

print("\n3. States without n_total reported:")
_print_group('Calculated from offenders + arrestees (no total reported)',
             "   - Calculated from offenders + arrestees: {n} states ({names})")
_print_group('Used offenders count (no total reported, no arrestee data)',
             "   - Used offenders as proxy: {n} states ({names})")

print("\n4. States with unclear calculation (total doesn't match expected patterns):")
unclear_comment = 'Total with discrepancy (calculation unclear)'
if unclear_comment in estimation_groups:
    unclear_states = sdis_enhanced[sdis_enhanced['n_total_estimated_comment'] == unclear_comment]

    def _count_or_zero(column):
        """Return the column for the unclear rows with missing or non-positive counts replaced by 0."""
        values = unclear_states[column]
        return values.where(values > 0, 0)

    # Candidate sums, with missing or non-positive components counted as 0
    sum_offenders = _count_or_zero('n_offenders')
    sum_arrestees = _count_or_zero('n_arrestees')
    sum_forensic = _count_or_zero('n_forensic')
    sum_off_arr = sum_offenders + sum_arrestees
    sum_all = sum_offenders + sum_arrestees + sum_forensic

    # Does the reported total land within the tolerance of any candidate sum?
    reported = unclear_states['n_total_reported']
    matches_off = (reported - sum_offenders).abs() <= tolerance
    matches_off_arr = (reported - sum_off_arr).abs() <= tolerance
    matches_all = (reported - sum_all).abs() <= tolerance
    matches_any = matches_off | matches_off_arr | matches_all

    # Only list states whose total matches none of the candidate sums
    states_to_display = unclear_states.loc[~matches_any, 'state'].tolist()

    if states_to_display:
        print(f"   {len(states_to_display)} states: {', '.join(states_to_display)}")
        print("   (These states have totals that don't match any combination of their component counts)")

# Render the enhanced table with the estimate next to the reported figures
print("\n\nFull Enhanced Data with Estimated Totals (All 50 States):")
cols_to_show = ['state', 'n_total_estimated', 'n_total_reported', 'n_offenders', 'n_arrestees',
                'n_forensic', 'n_total_estimated_comment']


def _format_count(x):
    """Render a count with thousands separators, or an empty string when it is missing."""
    return '{:,.0f}'.format(x) if pd.notna(x) else ''


# Every numeric column shares the same formatter
count_columns = ['n_total_estimated', 'n_total_reported', 'n_offenders',
                 'n_arrestees', 'n_forensic']
display(sdis_enhanced[cols_to_show].style.format(
    {column: _format_count for column in count_columns}
))

# Later cells read sdis_data, so point it at the enhanced frame
sdis_data = sdis_enhanced

## Export Enhanced Dataset

This section exports the enhanced dataset with the new n_total_estimated values and documentation.


In [None]:
# Desired column order for the exported file
final_columns = [
    'state',
    'n_total_estimated',
    'n_total_reported',
    'n_arrestees',
    'n_offenders',
    'n_forensic',
    'arrestee_collection',
    'fam_search',
    'collection_statute',
    'n_total_estimated_comment',
    'total_method',
]

# Keep the requested order but drop any column the frame does not have
existing_columns = set(sdis_enhanced.columns)
available_columns = [col for col in final_columns if col in existing_columns]
sdis_final = sdis_enhanced.loc[:, available_columns].copy()

# Write the cleaned table next to the raw input, without the row index
output_path = base_dir.joinpath("output", "sdis", "sdis_clean.csv")
sdis_final.to_csv(output_path, index=False)
print(f"Exported enhanced SDIS dataset to: {output_path}")

# Headline figures for the exported table
print("\n\nFinal Summary Statistics:")
print("=" * 50)
n_rows_final = len(sdis_final)
n_with_estimate = sdis_final['n_total_estimated'].count()  # non-null entries
n_with_reported = sdis_final['n_total_reported'].count()   # non-null entries
total_estimated_profiles = sdis_final['n_total_estimated'].sum()
print(f"Total states in dataset: {n_rows_final}")
print(f"States with n_total_estimated: {n_with_estimate}")
print(f"States with n_total_reported: {n_with_reported}")
print(f"Total profiles (estimated): {total_estimated_profiles:,.0f}")

# Compare reported and estimated totals on rows that have both values
comparison = sdis_final.dropna(subset=['n_total_reported', 'n_total_estimated']).copy()
comparison['difference'] = comparison['n_total_estimated'] - comparison['n_total_reported']
comparison_summary = comparison.loc[
    comparison['difference'] != 0,
    ['state', 'n_total_reported', 'n_total_estimated', 'difference', 'n_total_estimated_comment'],
]

# Only show the table when at least one estimate differs from the reported total
if len(comparison_summary):
    print("\n\nStates where estimated differs from reported total:")
    number_format = '{:,.0f}'
    display(comparison_summary.style.format({
        'n_total_reported': number_format,
        'n_total_estimated': number_format,
        'difference': number_format
    }).hide(axis='index'))