# Kaufman CAD Data Loader

This notebook loads Kaufman County Central Appraisal District property data from fixed-width text files into PostgreSQL.

In [None]:
# Setup and Imports
import sys
from pathlib import Path

# Put the current working directory at the front of sys.path *before* the
# `app.*` imports below are executed.
# NOTE(review): this presumably expects the notebook to be started from the
# project root so that the `app` package resolves — confirm.
project_root = Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from app.utils.logging_config import setup_logger
from app.models.layout import load_layout_config
from app.services.file_reader import read_fixed_width_file, discover_data_files, get_file_path
from app.services.database import DatabaseService
from app.services.loader import DataLoader
from app.config import DATA_DIR, CONFIG_DIR
import pandas as pd

# Logger named "cad_loader", requested at INFO level.
logger = setup_logger("cad_loader", level="INFO")
print(f"Project root: {project_root}")
print("Setup complete!")

## Configuration

In [None]:
# Read the file-layout definitions from CONFIG_DIR/file_layouts.json
layout_path = CONFIG_DIR / "file_layouts.json"
layout_config = load_layout_config(layout_path)

print(f"Tax Year: {layout_config.taxYear}")
print(f"\nConfigured file types ({len(layout_config.files)}):")

# For each configured file, report how many columns are not flagged `skip`.
for file_cfg in layout_config.files:
    kept_names = []
    for col in file_cfg.columns:
        if col.skip:
            continue
        kept_names.append(col.name)
    print(f"  {file_cfg.fileName}: {len(kept_names)} active columns")

In [None]:
# Database configuration
import os
from urllib.parse import quote_plus

# Connection settings. Every value may be overridden with an environment
# variable so real credentials do not have to be stored in the notebook; the
# fallbacks are the local-development values this notebook has always used.
db_config = {
    "host": os.environ.get("CAD_DB_HOST", "localhost"),
    "port": int(os.environ.get("CAD_DB_PORT", "5432")),
    "database": os.environ.get("CAD_DB_NAME", "kaufman_cad"),
    "user": os.environ.get("CAD_DB_USER", "cad_user"),
    "password": os.environ.get("CAD_DB_PASSWORD", "cad_password"),
}

# Build the URL used by the pandas query cells from db_config so the two can
# never point at different databases. User and password are URL-escaped
# because characters such as "@", ":" or "/" would otherwise corrupt the URL.
conn_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
db_service = DatabaseService(db_config)

# Test connection: open and immediately release one connection. Failures are
# reported rather than raised so the notebook keeps its best-effort behaviour.
try:
    with db_service.get_connection() as conn:
        print("✅ Database connection successful")
except Exception as e:
    print(f"❌ Connection failed: {e}")

## Discover Data Files

In [None]:
# List the data files found in DATA_DIR for the layout's file prefix
available_files = discover_data_files(DATA_DIR, layout_config.filePrefix)
file_count = len(available_files)

print(f"Data directory: {DATA_DIR}")
print(f"\nAvailable files ({file_count}):")
for data_file in available_files:
    print(f"  - {data_file}")

## Load Reference Tables

In [None]:
# Reference (lookup) tables, loaded in the order listed
reference_tables = [
    "HEADER",
    "STATE_CODE",
    "COUNTRY_CODE",
    "ABSTRACT_SUBDV",
    "AGENT",
    "ENTITY",
]

# The loader created here is reused by the main-table cell further down.
loader = DataLoader(db_service, layout_config, DATA_DIR)

for table in reference_tables:
    result = loader.load_file(table)
    # Anything other than "SUCCESS" is shown as a failure.
    if result["status"] == "SUCCESS":
        status = "✅"
    else:
        status = "❌"
    print(f"{status} {table}: {result['records']:,} records")

## Load Main Data Tables

In [None]:
# Main data tables, loaded in the order listed
main_tables = [
    "INFO",
    "LAND_DETAIL",
    "IMPROVEMENT_INFO",
    "IMPROVEMENT_DETAIL",
    "IMPROVEMENT_DETAIL_ATTR",
    "LAWSUIT",
    "MOBILE_HOME_INFO",
    "TAX_DEFERRAL_INFO",
    "UDI",
]

for table in main_tables:
    # Announce each table before loading it.
    print(f"\nLoading {table}...")
    result = loader.load_file(table)
    if result["status"] == "SUCCESS":
        status = "✅"
    else:
        status = "❌"
    print(f"{status} {table}: {result['records']:,} records in {result['duration']:.1f}s")

## Verify Loaded Data

In [None]:
# Report the row count of every destination table, followed by a grand total
tables = [
    "appraisal_header",
    "appraisal_state_code",
    "appraisal_country_code",
    "appraisal_abstract_subdv",
    "appraisal_agent",
    "appraisal_entity",
    "appraisal_info",
    "appraisal_land_detail",
    "appraisal_improvement_info",
    "appraisal_improvement_detail",
    "appraisal_improvement_detail_attr",
    "appraisal_lawsuit",
    "appraisal_mobile_home_info",
    "appraisal_tax_deferral_info",
    "appraisal_udi",
]

rule = "-" * 50

print("Table Record Counts:")
print(rule)
total = 0
for table in tables:
    # Query and print one table at a time so partial output survives a failure.
    count = db_service.get_table_count(table)
    total = total + count
    print(f"{table:40} {count:>8,}")
print(rule)
print(f"{'TOTAL':40} {total:>8,}")

## Query Examples

In [None]:
# First ten properties (by prop_id) that have an owner name
query = """
SELECT prop_id, prop_type_cd, prop_val_yr, owner_name, 
       situs_street, situs_city, situs_zip
FROM cad.appraisal_info 
WHERE owner_name IS NOT NULL
ORDER BY prop_id
LIMIT 10
"""
df = pd.read_sql(sql=query, con=conn_string)
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Number of properties per property-type code, largest group first
query = """
SELECT prop_type_cd, COUNT(*) as count
FROM cad.appraisal_info
WHERE prop_type_cd IS NOT NULL
GROUP BY prop_type_cd
ORDER BY count DESC
"""
df = pd.read_sql(sql=query, con=conn_string)
print("Property counts by type:")
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# The 15 situs cities with the most properties (NULL and empty cities excluded)
query = """
SELECT situs_city, COUNT(*) as count
FROM cad.appraisal_info
WHERE situs_city IS NOT NULL AND situs_city != ''
GROUP BY situs_city
ORDER BY count DESC
LIMIT 15
"""
df = pd.read_sql(sql=query, con=conn_string)
print("Top 15 cities by property count:")
# Bare expression so the notebook renders the frame as the cell output.
df