In [1]:
# Kaufman CAD Data Loader - Testing Notebook
# This notebook is for prototyping and testing the data loading functions

import sys
from pathlib import Path

# Add app to path
project_root = Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print("Path configured successfully!")

Project root: /Users/tapiwamaruni/Documents/projects/housing1
Path configured successfully!


## 1. Setup and Imports
Import the application modules and set up logging.

In [2]:
# Import application modules
from app.utils.logging_config import setup_logger
from app.models.layout import load_layout_config, LayoutConfig
from app.services.file_reader import (
    read_fixed_width_file, 
    read_all_records,
    get_file_path,
    discover_data_files
)
from app.services.database import DatabaseService
from app.services.loader import DataLoader
from app.config import DATA_DIR, CONFIG_DIR

# Setup logger
logger = setup_logger("cad_loader", level="INFO")
print("Modules imported successfully!")

Modules imported successfully!


## 2. Load and Explore Configuration
Load the file layout configuration and explore available file types.

In [3]:
# Load the configuration
config_path = CONFIG_DIR / "file_layouts.json"
layout_config = load_layout_config(config_path)

print(f"Description: {layout_config.description}")
print(f"Version: {layout_config.version}")
print(f"Tax Year: {layout_config.taxYear}")
print(f"File Prefix: {layout_config.filePrefix}")
print(f"\nConfigured file types ({len(layout_config.files)}):")
for fc in layout_config.files:
    print(f"  - {fc.fileName}: {fc.description} ({len(fc.columns)} columns)")

Description: Kaufman County CAD 2025 Appraisal Data Export Layout Configuration
Version: 1.0.0
Tax Year: 2025
File Prefix: 2025-10-27_002174_APPRAISAL_

Configured file types (17):
  - HEADER: Export metadata and header information (11 columns)
  - INFO: Main property/parcel information (45 columns)
  - ENTITY: Taxing entity codes (3 columns)
  - ENTITY_INFO: Property-entity relationships (9 columns)
  - ENTITY_TOTALS: Aggregate totals by entity (5 columns)
  - LAND_DETAIL: Land segment details (16 columns)
  - IMPROVEMENT_INFO: Improvement summary records (11 columns)
  - IMPROVEMENT_DETAIL: Detailed improvement component records (8 columns)
  - IMPROVEMENT_DETAIL_ATTR: Improvement attribute records (6 columns)
  - ABSTRACT_SUBDV: Abstract and subdivision codes (2 columns)
  - AGENT: Agent/representative information (8 columns)
  - STATE_CODE: State classification codes (2 columns)
  - COUNTRY_CODE: Country codes reference (2 columns)
  - LAWSUIT: Lawsuit/protest information (5 column

## 3. Discover Available Data Files
Find what data files exist in the data directory.

In [4]:
# Discover available files
print(f"Data directory: {DATA_DIR}")
print(f"Directory exists: {DATA_DIR.exists()}")

available_files = discover_data_files(DATA_DIR, layout_config.filePrefix)
print(f"\nAvailable data files ({len(available_files)}):")
for f in available_files:
    print(f"  - {f}")

Data directory: /Users/tapiwamaruni/Documents/projects/housing1/Kaufman-CAD-2025-Certified-Full-Roll-Download-updated-with-Supp-5
Directory exists: True
2025-12-07 17:05:45 - cad_loader - INFO - Discovered 17 data files

Available data files (17):
  - ABSTRACT_SUBDV
  - AGENT
  - COUNTRY_CODE
  - ENTITY
  - ENTITY_INFO
  - ENTITY_TOTALS
  - HEADER
  - IMPROVEMENT_DETAIL
  - IMPROVEMENT_DETAIL_ATTR
  - IMPROVEMENT_INFO
  - INFO
  - LAND_DETAIL
  - LAWSUIT
  - MOBILE_HOME_INFO
  - STATE_CODE
  - TAX_DEFERRAL_INFO
  - UDI

Available data files (17):
  - ABSTRACT_SUBDV
  - AGENT
  - COUNTRY_CODE
  - ENTITY
  - ENTITY_INFO
  - ENTITY_TOTALS
  - HEADER
  - IMPROVEMENT_DETAIL
  - IMPROVEMENT_DETAIL_ATTR
  - IMPROVEMENT_INFO
  - INFO
  - LAND_DETAIL
  - LAWSUIT
  - MOBILE_HOME_INFO
  - STATE_CODE
  - TAX_DEFERRAL_INFO
  - UDI


## 4. Test File Reader
Test reading a small sample from a data file.

In [5]:
# Test reading Abstract/Subdivision file (small reference table)
file_type = "ABSTRACT_SUBDV"
file_config = layout_config.get_file_config(file_type)

if file_config:
    file_path = get_file_path(DATA_DIR, layout_config.filePrefix, file_type)
    print(f"Reading: {file_path.name}")
    print(f"Table: {file_config.tableName}")
    print(f"Columns: {[c.name for c in file_config.columns]}")
    
    # Read first 10 records
    records = read_all_records(file_path, file_config, max_records=10)
    print(f"\nFirst {len(records)} records:")
    for r in records:
        print(f"  {r}")

Reading: 2025-10-27_002174_APPRAISAL_ABSTRACT_SUBDV.TXT
Table: appraisal_abstract_subdv
Columns: ['abs_subdv_cd', 'abs_subdv_desc']
2025-12-07 17:06:07 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_ABSTRACT_SUBDV.TXT
2025-12-07 17:06:07 - cad_loader - INFO - Reached max records limit: 10
2025-12-07 17:06:07 - cad_loader - INFO - Reached max records limit: 10
2025-12-07 17:06:07 - cad_loader - INFO - Processed 10 records from 2025-10-27_002174_APPRAISAL_ABSTRACT_SUBDV.TXT

First 10 records:
  {'abs_subdv_cd': 'A0001', 'abs_subdv_desc': 'R ANTHONY'}
  {'abs_subdv_cd': 'A0002', 'abs_subdv_desc': 'C ASKINS'}
  {'abs_subdv_cd': 'A0003', 'abs_subdv_desc': 'J S ABLES'}
  {'abs_subdv_cd': 'A0004', 'abs_subdv_desc': 'J ABLES'}
  {'abs_subdv_cd': 'A0005', 'abs_subdv_desc': 'J ANGLIN'}
  {'abs_subdv_cd': 'A0006', 'abs_subdv_desc': 'E ABLES'}
  {'abs_subdv_cd': 'A0007', 'abs_subdv_desc': 'L H ADAMS'}
  {'abs_subdv_cd': 'A0008', 'abs_subdv_desc': 'S ATWOOD'}
  {'abs_subdv_cd': 'A0

In [7]:
# Test reading main property INFO file (sample only)
file_type = "INFO"
file_config = layout_config.get_file_config(file_type)

if file_config:
    file_path = get_file_path(DATA_DIR, layout_config.filePrefix, file_type)
    print(f"Reading sample from: {file_path.name}")
    
    # Read first 3 records
    records = read_all_records(file_path, file_config, max_records=3)
    
    # Display key fields
    print(f"\nSample property records ({len(records)}):")
    for r in records:
        print(f"\n  Property ID: {r.get('prop_id')}")
        print(f"  Owner: {r.get('owner_name')}")
        print(f"  Address: {r.get('situs_street')}, {r.get('situs_city')}")
        appraised_val = r.get('appraised_val') or 0
        print(f"  Appraised Value: ${appraised_val:,}")

Reading sample from: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 17:07:26 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 17:07:26 - cad_loader - INFO - Reached max records limit: 3
2025-12-07 17:07:26 - cad_loader - INFO - Processed 3 records from 2025-10-27_002174_APPRAISAL_INFO.TXT

Sample property records (3):

  Property ID: 1
  Owner: None
  Address: None, CARROLLTON
  Appraised Value: $0

  Property ID: 2
  Owner: None
  Address: None, KAUFMAN
  Appraised Value: $0

  Property ID: 3
  Owner: None
  Address: None, KAUFMAN
  Appraised Value: $0
2025-12-07 17:07:26 - cad_loader - INFO - Reached max records limit: 3
2025-12-07 17:07:26 - cad_loader - INFO - Processed 3 records from 2025-10-27_002174_APPRAISAL_INFO.TXT

Sample property records (3):

  Property ID: 1
  Owner: None
  Address: None, CARROLLTON
  Appraised Value: $0

  Property ID: 2
  Owner: None
  Address: None, KAUFMAN
  Appraised Value: $0

  Property ID: 3
  Owner: None
  Addr

## 5. Convert to Pandas DataFrame
Convert parsed records to a DataFrame for analysis.

In [8]:
import pandas as pd

# Read a larger sample of land details
file_type = "LAND_DETAIL"
file_config = layout_config.get_file_config(file_type)

if file_config:
    file_path = get_file_path(DATA_DIR, layout_config.filePrefix, file_type)
    records = read_all_records(file_path, file_config, max_records=100)
    
    df = pd.DataFrame(records)
    print(f"DataFrame shape: {df.shape}")
    print(f"\nColumns: {list(df.columns)}")
    print(f"\nData types:\n{df.dtypes}")
    df.head(10)

2025-12-07 17:08:05 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_LAND_DETAIL.TXT
2025-12-07 17:08:05 - cad_loader - INFO - Reached max records limit: 100
2025-12-07 17:08:05 - cad_loader - INFO - Processed 100 records from 2025-10-27_002174_APPRAISAL_LAND_DETAIL.TXT
DataFrame shape: (100, 16)

Columns: ['prop_id', 'tax_year', 'land_seg_id', 'land_type_cd', 'land_type_desc', 'state_cd', 'ag_flag', 'land_sqft', 'land_acres', 'mkt_val', 'prod_val', 'land_class', 'soil_cd', 'appraised_val', 'ag_apply_cd', 'adj_cd']

Data types:
prop_id             int64
tax_year            int64
land_seg_id         int64
land_type_cd       object
land_type_desc     object
state_cd           object
ag_flag            object
land_sqft          object
land_acres          int64
mkt_val             int64
prod_val            int64
land_class         object
soil_cd            object
appraised_val     float64
ag_apply_cd        object
adj_cd             object
dtype: object
2025-12-07 17:08:05 -

## 6. Database Connection Test
Test connection to PostgreSQL database (requires Docker container running).

In [9]:
# Test database connection
# Make sure Docker container is running: docker-compose up -d

db_service = DatabaseService()
connected = db_service.test_connection()

if connected:
    print("✅ Database connection successful!")
else:
    print("❌ Database connection failed.")
    print("Make sure to run: docker-compose up -d")

2025-12-07 17:08:05 - cad_loader - INFO - Database connection successful
✅ Database connection successful!
✅ Database connection successful!


## 7. Initialize Database Schema
Run the SQL script to create all tables.

In [10]:
# Initialize database schema
sql_file = project_root / "sql" / "001_create_schema.sql"

if db_service.test_connection():
    success = db_service.execute_sql_file(str(sql_file))
    if success:
        print("✅ Database schema created successfully!")
    else:
        print("❌ Failed to create schema")
else:
    print("⚠️ Cannot initialize schema - no database connection")

2025-12-07 17:08:05 - cad_loader - INFO - Database connection successful
2025-12-07 17:08:05 - cad_loader - INFO - Executed SQL file: /Users/tapiwamaruni/Documents/projects/housing1/sql/001_create_schema.sql
✅ Database schema created successfully!
2025-12-07 17:08:05 - cad_loader - INFO - Executed SQL file: /Users/tapiwamaruni/Documents/projects/housing1/sql/001_create_schema.sql
✅ Database schema created successfully!


## 8. Load Data Using DataLoader
Use the DataLoader service to load files into the database.

In [11]:
# Initialize the data loader
loader = DataLoader()

# Show available files
print("Available files to load:")
for ft in loader.get_available_files():
    print(f"  - {ft}")

Available files to load:
2025-12-07 17:08:05 - cad_loader - INFO - Loaded layout config: Kaufman County CAD 2025 Appraisal Data Export Layout Configuration
2025-12-07 17:08:05 - cad_loader - INFO - Discovered 17 data files
  - ABSTRACT_SUBDV
  - AGENT
  - COUNTRY_CODE
  - ENTITY
  - ENTITY_INFO
  - ENTITY_TOTALS
  - HEADER
  - IMPROVEMENT_DETAIL
  - IMPROVEMENT_DETAIL_ATTR
  - IMPROVEMENT_INFO
  - INFO
  - LAND_DETAIL
  - LAWSUIT
  - MOBILE_HOME_INFO
  - STATE_CODE
  - TAX_DEFERRAL_INFO
  - UDI
2025-12-07 17:08:05 - cad_loader - INFO - Discovered 17 data files
  - ABSTRACT_SUBDV
  - AGENT
  - COUNTRY_CODE
  - ENTITY
  - ENTITY_INFO
  - ENTITY_TOTALS
  - HEADER
  - IMPROVEMENT_DETAIL
  - IMPROVEMENT_DETAIL_ATTR
  - IMPROVEMENT_INFO
  - INFO
  - LAND_DETAIL
  - LAWSUIT
  - MOBILE_HOME_INFO
  - STATE_CODE
  - TAX_DEFERRAL_INFO
  - UDI


In [12]:
# Load a single file (small reference table as test)
result = loader.load_file("ABSTRACT_SUBDV")
print(f"Load result: {result}")

2025-12-07 17:08:05 - cad_loader - INFO - Truncated table: cad.appraisal_abstract_subdv
2025-12-07 17:08:05 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_ABSTRACT_SUBDV.TXT
2025-12-07 17:08:05 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_ABSTRACT_SUBDV.TXT
2025-12-07 17:08:05 - cad_loader - INFO - Processed 2458 records from 2025-10-27_002174_APPRAISAL_ABSTRACT_SUBDV.TXT
2025-12-07 17:08:05 - cad_loader - INFO - Processed 2458 records from 2025-10-27_002174_APPRAISAL_ABSTRACT_SUBDV.TXT
2025-12-07 17:08:05 - cad_loader - INFO - Completed inserting 2458 records into cad.appraisal_abstract_subdv
2025-12-07 17:08:05 - cad_loader - INFO - Completed inserting 2458 records into cad.appraisal_abstract_subdv
Load result: {'file_type': 'ABSTRACT_SUBDV', 'status': 'SUCCESS', 'records_loaded': 2458, 'error': None, 'duration_seconds': 0.258335}
Load result: {'file_type': 'ABSTRACT_SUBDV', 'status': 'SUCCESS', 'records_loaded': 2458, 'error': None, 'duration_seco

In [13]:
# Load all reference tables first (smaller files)
reference_tables = ["STATE_CODE", "COUNTRY_CODE", "AGENT", "ENTITY", "ENTITY_TOTALS"]
results = loader.load_all_files(file_types=reference_tables)

summary = loader.get_load_summary(results)
print(f"\nLoad Summary:")
print(f"  Total files: {summary['total_files']}")
print(f"  Successful: {summary['successful']}")
print(f"  Failed: {summary['failed']}")
print(f"  Total records: {summary['total_records']:,}")

2025-12-07 17:08:05 - cad_loader - INFO - Loading 5 file types
2025-12-07 17:08:05 - cad_loader - INFO - Processing: STATE_CODE
2025-12-07 17:08:05 - cad_loader - INFO - Processing: STATE_CODE
2025-12-07 17:08:06 - cad_loader - INFO - Truncated table: cad.appraisal_state_code
2025-12-07 17:08:06 - cad_loader - INFO - Truncated table: cad.appraisal_state_code
2025-12-07 17:08:06 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_STATE_CODE.TXT
2025-12-07 17:08:06 - cad_loader - INFO - Processed 79 records from 2025-10-27_002174_APPRAISAL_STATE_CODE.TXT
2025-12-07 17:08:06 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_STATE_CODE.TXT
2025-12-07 17:08:06 - cad_loader - INFO - Processed 79 records from 2025-10-27_002174_APPRAISAL_STATE_CODE.TXT
2025-12-07 17:08:06 - cad_loader - INFO - Completed inserting 79 records into cad.appraisal_state_code
2025-12-07 17:08:06 - cad_loader - INFO - Completed inserting 79 records into cad.appraisal_state_code
2025-12-07 17

## 9. Load All Data
Load all files into the database. **Warning: This may take several minutes for large files.**

In [15]:
# Load ALL files - this will take some time for large files
# Uncomment to run full load

results = loader.load_all_files(truncate=True)
summary = loader.get_load_summary(results)
print(f"\nFull Load Summary:")
print(f"  Total files: {summary['total_files']}")
print(f"  Successful: {summary['successful']}")
print(f"  Failed: {summary['failed']}")
print(f"  Total records: {summary['total_records']:,}")
print(f"  Total time: {summary['total_duration']:.2f} seconds")

print("⚠️ Full load is commented out. Uncomment to run.")

2025-12-07 17:10:27 - cad_loader - INFO - Loading 17 file types
2025-12-07 17:10:27 - cad_loader - INFO - Processing: HEADER
2025-12-07 17:10:27 - cad_loader - INFO - Processing: HEADER
2025-12-07 17:10:27 - cad_loader - INFO - Truncated table: cad.appraisal_header
2025-12-07 17:10:27 - cad_loader - INFO - Truncated table: cad.appraisal_header
2025-12-07 17:10:27 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 17:10:27 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 17:10:27 - cad_loader - INFO - Processed 1 records from 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 17:10:27 - cad_loader - INFO - Completed inserting 1 records into cad.appraisal_header
2025-12-07 17:10:27 - cad_loader - INFO - Processed 1 records from 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 17:10:27 - cad_loader - INFO - Completed inserting 1 records into cad.appraisal_header
2025-12-07 17:10:27 - cad_loader - INFO - Loaded 1 recor

## 10. Verify Loaded Data
Query the database to verify data was loaded correctly.

In [16]:
# Check record counts in loaded tables
tables_to_check = [
    "appraisal_abstract_subdv",
    "appraisal_state_code", 
    "appraisal_country_code",
    "appraisal_agent",
    "appraisal_entity",
    "appraisal_entity_totals"
]

print("Table record counts:")
for table in tables_to_check:
    count = db_service.get_table_count(table)
    print(f"  {table}: {count:,} records")

Table record counts:
  appraisal_abstract_subdv: 2,458 records
  appraisal_state_code: 79 records
  appraisal_country_code: 7 records
  appraisal_agent: 998 records
  appraisal_entity: 65 records
  appraisal_entity_totals: 0 records


In [17]:
# Query sample data from database using pandas
if db_service.test_connection():
    import psycopg2
    from app.config import DATABASE_CONFIG
    
    conn_string = f"postgresql://{DATABASE_CONFIG['user']}:{DATABASE_CONFIG['password']}@{DATABASE_CONFIG['host']}:{DATABASE_CONFIG['port']}/{DATABASE_CONFIG['database']}"
    
    # Query abstract/subdivision table
    query = "SELECT * FROM cad.appraisal_abstract_subdv LIMIT 10"
    df = pd.read_sql(query, conn_string)
    print("Sample data from appraisal_abstract_subdv:")
    display(df)

2025-12-07 17:27:09 - cad_loader - INFO - Database connection successful
Sample data from appraisal_abstract_subdv:
Sample data from appraisal_abstract_subdv:


Unnamed: 0,abs_subdv_cd,abs_subdv_desc,created_at
0,A0001,R ANTHONY,2025-12-07 23:10:28.238474
1,A0002,C ASKINS,2025-12-07 23:10:28.238474
2,A0003,J S ABLES,2025-12-07 23:10:28.238474
3,A0004,J ABLES,2025-12-07 23:10:28.238474
4,A0005,J ANGLIN,2025-12-07 23:10:28.238474
5,A0006,E ABLES,2025-12-07 23:10:28.238474
6,A0007,L H ADAMS,2025-12-07 23:10:28.238474
7,A0008,S ATWOOD,2025-12-07 23:10:28.238474
8,A0009,S ANDERSON,2025-12-07 23:10:28.238474
9,A0010,H ABLE,2025-12-07 23:10:28.238474


In [18]:
# Query property info table
query = "SELECT * FROM cad.appraisal_info LIMIT 10"
df = pd.read_sql(query, conn_string)
print("Sample data from appraisal_info:")
display(df)

Sample data from appraisal_info:


Unnamed: 0,prop_id,prop_type_cd,prop_val_yr,sup_num,exemption_cd,exemption_desc,geo_id,owner_id,owner_name,confidential_flag,...,land_timber_mkt,land_timber_use,impr_hstd_val,impr_non_hstd_val,personal_val,mineral_val,appraised_val,hs_cap_val,assessed_val,created_at


In [19]:
# Debug: Check load results for INFO file
print("Load results from full load:")
for r in results:
    status = "✅" if r['status'] == 'SUCCESS' else "❌"
    error = f" - Error: {r.get('error')}" if r.get('error') else ""
    print(f"  {status} {r['file_type']}: {r['records_loaded']} records{error}")

Load results from full load:
  ✅ HEADER: 1 records
  ✅ STATE_CODE: 79 records
  ✅ COUNTRY_CODE: 7 records
  ✅ ABSTRACT_SUBDV: 2458 records
  ✅ AGENT: 998 records
  ✅ ENTITY: 65 records
  ❌ ENTITY_TOTALS: 0 records - Error: duplicate key value violates unique constraint "appraisal_entity_totals_pkey"
DETAIL:  Key (entity_cd)=(0000000000) already exists.

  ❌ INFO: 0 records - Error: null value in column "prop_val_yr" of relation "appraisal_info" violates not-null constraint
DETAIL:  Failing row contains (5463, M, null, 202500000000000, 1M EX, PROPOSED 2025 SB4/SB23 EXEMPTION CHANGES, null, null, null, null, null, null, 99.0151.0000.0930.97.03.00                        00, 0000229399HENDERSON DAVID L JR & JOHNNIE M, F000000000000, 6376 CO RD 152, null, null, null, null, null, null, KAUFMAN, null, TX                                                     75142    ..., KAUFMAN                       75142     IMP ONLY ON: PID 210134,..., null, null, null, null, null, null, null, null, null, nu

In [20]:
# Analyze the actual INFO file structure
info_file = DATA_DIR / f"{layout_config.filePrefix}INFO.TXT"

with open(info_file, 'r') as f:
    line = f.readline()

# Show character positions for first 50 chars
print("Character analysis (first 50 chars):")
print("Position: ", end="")
for i in range(50):
    print(i % 10, end="")
print()
print("Char:     ", end="")
print(line[:50])
print()

# Identify key fields by position
print(f"Pos 0-11 (prop_id): '{line[0:12]}'")
print(f"Pos 12 (prop_type_cd): '{line[12]}'")
print(f"Pos 13-16: '{line[13:17]}'")
print(f"Pos 17 (digit): '{line[17]}'")
print(f"Pos 18-21 (year?): '{line[18:22]}'")
print(f"Total line length: {len(line)}")

Character analysis (first 50 chars):
Position: 01234567890123456789012345678901234567890123456789
Char:     000000000001R    02025000000000000                

Pos 0-11 (prop_id): '000000000001'
Pos 12 (prop_type_cd): 'R'
Pos 13-16: '    '
Pos 17 (digit): '0'
Pos 18-21 (year?): '2025'
Total line length: 9263


In [21]:
# Reload the updated configuration
import importlib
import app.models.layout as layout_module
importlib.reload(layout_module)

from app.models.layout import load_layout_config
layout_config = load_layout_config(CONFIG_DIR / "file_layouts.json")

# Test parsing INFO with new config
file_config = layout_config.get_file_config("INFO")
print(f"INFO columns (first 10):")
for col in file_config.columns[:10]:
    skip = " (SKIP)" if col.skip else ""
    print(f"  {col.index}: {col.name} - length {col.length}{skip}")

# Test reading with new layout
file_path = get_file_path(DATA_DIR, layout_config.filePrefix, "INFO")
records = read_all_records(file_path, file_config, max_records=3)

print(f"\nTest parse results:")
for r in records:
    print(f"  prop_id={r.get('prop_id')}, prop_val_yr={r.get('prop_val_yr')}, owner={r.get('owner_name')[:30] if r.get('owner_name') else 'N/A'}...")

INFO columns (first 10):
  0: prop_id - length 12
  1: prop_type_cd - length 1
  2: filler_type - length 4 (SKIP)
  3: sup_num - length 1 (SKIP)
  4: prop_val_yr - length 4
  5: sup_num_rest - length 15
  4: exemption_cd - length 10
  5: exemption_desc - length 80
  6: filler_1 - length 200 (SKIP)
  7: geo_id - length 30
2025-12-07 17:41:42 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 17:41:42 - cad_loader - INFO - Reached max records limit: 3
2025-12-07 17:41:42 - cad_loader - INFO - Processed 3 records from 2025-10-27_002174_APPRAISAL_INFO.TXT

Test parse results:
  prop_id=1, prop_val_yr=2025, owner=N/A...
  prop_id=2, prop_val_yr=2025, owner=N/A...
  prop_id=3, prop_val_yr=2025, owner=N/A...


In [22]:
# Recreate the schema with updated structure
# First drop the appraisal_info table to recreate it

if db_service.test_connection():
    with db_service.get_connection() as conn:
        with conn.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS cad.appraisal_info CASCADE")
        conn.commit()
    print("✅ Dropped appraisal_info table")
    
    # Recreate schema
    success = db_service.execute_sql_file(str(project_root / "sql" / "001_create_schema.sql"))
    if success:
        print("✅ Schema recreated")

2025-12-07 17:41:57 - cad_loader - INFO - Database connection successful
✅ Dropped appraisal_info table
2025-12-07 17:41:57 - cad_loader - INFO - Executed SQL file: /Users/tapiwamaruni/Documents/projects/housing1/sql/001_create_schema.sql
✅ Schema recreated


In [23]:
# Reload the loader with updated config and load INFO file
import importlib
import app.services.loader as loader_module
importlib.reload(loader_module)

from app.services.loader import DataLoader
loader = DataLoader()

# Load just the INFO file
result = loader.load_file("INFO", truncate=True)
print(f"\nINFO load result:")
print(f"  Status: {result['status']}")
print(f"  Records loaded: {result['records_loaded']:,}")
if result.get('error'):
    print(f"  Error: {result['error']}")

2025-12-07 17:47:34 - cad_loader - INFO - Loaded layout config: Kaufman County CAD 2025 Appraisal Data Export Layout Configuration
2025-12-07 17:47:34 - cad_loader - INFO - Truncated table: cad.appraisal_info
2025-12-07 17:47:34 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 17:47:38 - cad_loader - INFO - Inserted 10000 records into appraisal_info
2025-12-07 17:47:43 - cad_loader - INFO - Inserted 20000 records into appraisal_info
2025-12-07 17:47:47 - cad_loader - INFO - Inserted 30000 records into appraisal_info
2025-12-07 17:47:50 - cad_loader - INFO - Inserted 40000 records into appraisal_info
2025-12-07 17:47:54 - cad_loader - INFO - Inserted 50000 records into appraisal_info
2025-12-07 17:47:59 - cad_loader - INFO - Inserted 60000 records into appraisal_info
2025-12-07 17:48:03 - cad_loader - INFO - Inserted 70000 records into appraisal_info
2025-12-07 17:48:07 - cad_loader - INFO - Inserted 80000 records into appraisal_info
2025-12-07 17:48:0

In [24]:
# Check appraisal_info table now
count = db_service.get_table_count("appraisal_info")
print(f"appraisal_info record count: {count:,}")

# Query sample
query = "SELECT prop_id, prop_type_cd, prop_val_yr, owner_name, situs_street, appraised_val FROM cad.appraisal_info LIMIT 10"
df = pd.read_sql(query, conn_string)
print("\nSample data:")
display(df)

appraisal_info record count: 0

Sample data:


Unnamed: 0,prop_id,prop_type_cd,prop_val_yr,owner_name,situs_street,appraised_val


In [25]:
# Reload database module and retry INFO load
import importlib
import app.services.database as db_module
importlib.reload(db_module)

from app.services.database import DatabaseService
db_service = DatabaseService()

# Drop and recreate appraisal_info with nullable prop_id
with db_service.get_connection() as conn:
    with conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS cad.appraisal_info CASCADE")
    conn.commit()
print("Dropped appraisal_info")

db_service.execute_sql_file(str(project_root / "sql" / "001_create_schema.sql"))
print("Schema recreated")

Dropped appraisal_info
2025-12-07 17:49:43 - cad_loader - INFO - Executed SQL file: /Users/tapiwamaruni/Documents/projects/housing1/sql/001_create_schema.sql
Schema recreated


In [26]:
# Reload loader and load INFO with updated database service
import app.services.loader as loader_module
importlib.reload(loader_module)

from app.services.loader import DataLoader
loader = DataLoader()

result = loader.load_file("INFO", truncate=True)
print(f"\nINFO load result:")
print(f"  Status: {result['status']}")
print(f"  Records: {result['records_loaded']:,}")
print(f"  Duration: {result.get('duration_seconds', 0):.1f}s")
if result.get('error'):
    print(f"  Error: {result['error'][:200]}...")

2025-12-07 17:50:13 - cad_loader - INFO - Loaded layout config: Kaufman County CAD 2025 Appraisal Data Export Layout Configuration
2025-12-07 17:50:13 - cad_loader - INFO - Truncated table: cad.appraisal_info
2025-12-07 17:50:13 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 17:50:17 - cad_loader - INFO - Inserted 10000 records into appraisal_info
2025-12-07 17:50:21 - cad_loader - INFO - Inserted 20000 records into appraisal_info
2025-12-07 17:50:25 - cad_loader - INFO - Inserted 30000 records into appraisal_info
2025-12-07 17:50:29 - cad_loader - INFO - Inserted 40000 records into appraisal_info
2025-12-07 17:50:33 - cad_loader - INFO - Inserted 50000 records into appraisal_info
2025-12-07 17:50:37 - cad_loader - INFO - Inserted 60000 records into appraisal_info
2025-12-07 17:50:41 - cad_loader - INFO - Inserted 70000 records into appraisal_info
2025-12-07 17:50:45 - cad_loader - INFO - Inserted 80000 records into appraisal_info
2025-12-07 17:50:5

In [27]:
# Verify INFO data loaded correctly
count = db_service.get_table_count("appraisal_info")
print(f"✅ appraisal_info: {count:,} records")

# Sample query
query = """
SELECT prop_id, prop_type_cd, prop_val_yr, exemption_cd, geo_id, owner_name 
FROM cad.appraisal_info 
WHERE prop_id IS NOT NULL
LIMIT 10
"""
df = pd.read_sql(query, conn_string)
print("\nSample property records:")
display(df)

✅ appraisal_info: 104,369 records

Sample property records:


Unnamed: 0,prop_id,prop_type_cd,prop_val_yr,exemption_cd,geo_id,owner_name
0,1,R,2025,,,
1,2,R,2025,,,
2,3,R,2025,X P,,
3,4,R,2025,X P,,
4,5,R,2025,X P,,
5,6,R,2025,,,
6,7,R,2025,X P,,
7,8,R,2025,,,
8,9,R,2025,,,
9,10,R,2025,X P,,


In [29]:
# Check actual columns in the table
query = """
SELECT column_name 
FROM information_schema.columns 
WHERE table_schema = 'cad' AND table_name = 'appraisal_info'
ORDER BY ordinal_position
"""
columns_df = pd.read_sql(query, conn_string)
print(f"Total columns: {len(columns_df)}")
print("\nColumns in appraisal_info:")
for i, col in enumerate(columns_df['column_name'].tolist()):
    print(f"{i+1:3}. {col}")

Total columns: 46

Columns in appraisal_info:
  1. id
  2. prop_id
  3. prop_type_cd
  4. prop_val_yr
  5. sup_num_rest
  6. exemption_cd
  7. exemption_desc
  8. geo_id
  9. owner_id
 10. owner_name
 11. confidential_flag
 12. py_owner_id
 13. py_owner_name
 14. addr_line1
 15. addr_line2
 16. city
 17. state
 18. country
 19. zip
 20. confidential_flag2
 21. delivery_point
 22. exemption_flag
 23. situs_street
 24. situs_city
 25. situs_zip
 26. legal_desc
 27. legal_desc2
 28. legal_acreage
 29. abs_subdv_cd
 30. neighborhood_cd
 31. block
 32. tract_or_lot
 33. land_hstd_val
 34. land_non_hstd_val
 35. land_ag_mkt_val
 36. land_ag_use_val
 37. land_timber_mkt
 38. land_timber_use
 39. impr_hstd_val
 40. impr_non_hstd_val
 41. personal_val
 42. mineral_val
 43. appraised_val
 44. hs_cap_val
 45. assessed_val
 46. created_at


In [30]:
# Sample records from INFO table
query = """
SELECT prop_id, prop_type_cd, prop_val_yr, owner_name, legal_desc, situs_street, 
       appraised_val, assessed_val, land_hstd_val
FROM cad.appraisal_info 
WHERE prop_id IS NOT NULL
LIMIT 5
"""
df = pd.read_sql(query, conn_string)
df

Unnamed: 0,prop_id,prop_type_cd,prop_val_yr,owner_name,legal_desc,situs_street,appraised_val,assessed_val,land_hstd_val
0,1,R,2025,,TX ...,CAR,,,
1,2,R,2025,,TX ...,KAU,,,
2,3,R,2025,,TX ...,KAU,,,
3,4,R,2025,,TX ...,KAU,,,
4,5,R,2025,,TX ...,TER,,,


In [32]:
# Check a raw line from the file vs what we stored
from pathlib import Path
CAD_DATA_DIR = "/Users/tapiwamaruni/Documents/projects/housing1/Kaufman-CAD-2025-Certified-Full-Roll-Download-updated-with-Supp-5"

info_file = Path(CAD_DATA_DIR) / "2025-10-27_002174_APPRAISAL_INFO.TXT"
with open(info_file, 'r', encoding='utf-8') as f:
    first_line = f.readline()

print(f"Line length: {len(first_line.rstrip())}")
print(f"\nFirst 500 chars:\n{first_line[:500]}")
print(f"\nCharacter positions 0-100:")
for i in range(0, 100, 10):
    print(f"  {i:3}-{i+10:3}: '{first_line[i:i+10]}'")

Line length: 9262

First 500 chars:
000000000001R    02025000000000000                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

Character positions 0-100:
    0- 10: '0000000000'
   10- 20: '01R    020'
   20- 30: '2500000000'
   30- 40: '0000      '
   40- 50: '          '
   50- 60: '          '
   60- 70: '          '
   70- 80: '          '
   80- 90: '          '
   90-100: '          '


In [33]:
# Find non-empty data in the first line
for i in range(0, len(first_line), 100):
    chunk = first_line[i:i+100]
    if chunk.strip():  # Only show non-empty chunks
        print(f"\nPositions {i}-{i+100}:")
        print(f"  '{chunk}'")


Positions 0-100:
  '000000000001R    02025000000000000                                                                  '

Positions 500-600:
  '                                              99.0001.0000.0005.00.06.00                        0000'

Positions 600-700:
  '00202803BARAY RAUL ANTONIO                                                    F000000000000         '

Positions 700-800:
  '                                                     1823 BALMORAL DR                               '

Positions 800-900:
  '                                                                         CARROLLTON                 '

Positions 900-1000:
  '                       TX                                                US   75006      FF         '

Positions 1000-1100:
  '           Y                                     FM RD 2578                                         '

Positions 1100-1200:
  '         TERRELL                       75160     R ANTHONY, TRACT 5.00; 26.232 ACRES, & HOUSE      

In [34]:
# Let's check what data we actually got stored - look at non-null values
query = """
SELECT prop_id, prop_type_cd, prop_val_yr, geo_id, owner_id, owner_name, 
       situs_street, legal_desc, appraised_val
FROM cad.appraisal_info 
WHERE owner_name IS NOT NULL AND owner_name != ''
LIMIT 5
"""
try:
    df = pd.read_sql(query, conn_string)
    print(f"Records with owner_name: {len(df)}")
    if len(df) > 0:
        display(df)
except Exception as e:
    print(f"Error: {e}")

# Also check if ALL records have NULL owner_name
query2 = """
SELECT COUNT(*) as total,
       COUNT(owner_name) as with_owner,
       COUNT(appraised_val) as with_appraised
FROM cad.appraisal_info
"""
df2 = pd.read_sql(query2, conn_string)
print(f"\nData quality check:")
print(df2)

Records with owner_name: 3


Unnamed: 0,prop_id,prop_type_cd,prop_val_yr,geo_id,owner_id,owner_name,situs_street,legal_desc,appraised_val
0,,E,,US 7,,Y RUNNING ...,,0000000000001846S3087 22-,0
1,,I,,US 75126,,Y LAVACA,,0000000000001034S3886 21-003N,0
2,,E,,,75160.0,FF Y ...,,0000000000002204S4190,0



Data quality check:
    total  with_owner  with_appraised
0  104369           3              40


In [35]:
# Analyze the INFO file to find actual column positions based on known data
# From first line analysis we know:
# Owner name "BARAY RAUL ANTONIO" appears around position 600

# Read multiple lines to find patterns
with open(info_file, 'r', encoding='utf-8') as f:
    lines = [f.readline() for _ in range(5)]

# Look for owner name position more precisely
for i, line in enumerate(lines[:3]):
    print(f"\n=== Line {i+1} ===")
    # Known positions from layout doc (adjusted to 0-based)
    # prop_id should be 0-12, owner_name should be around 365-435
    print(f"prop_id (0-12): '{line[0:12]}'")
    print(f"prop_type_cd (12-13): '{line[12:13]}'")
    
    # Try to find prop_val_yr - should be near position 14-19 in layout
    # But we found it at 18-22 in the file
    print(f"Position 13-22: '{line[13:22]}'")
    
    # Owner name according to layout doc should be at 366-436 (1-based = 365-435 0-based)
    print(f"Position 365-435 (owner_name per doc): '{line[365:435].strip()}'")
    
    # But from our analysis, owner starts around 603
    print(f"Position 600-680 (where we saw owner): '{line[600:680].strip()}'")
    
    # Look for the actual owner name in this line
    if "BARAY" in line or "TEXAS" in line or line[600:680].strip():
        print(f"  -> Found data at positions 600-680")


=== Line 1 ===
prop_id (0-12): '000000000001'
prop_type_cd (12-13): 'R'
Position 13-22: '    02025'
Position 365-435 (owner_name per doc): ''
Position 600-680 (where we saw owner): '00202803BARAY RAUL ANTONIO                                                    F0'
  -> Found data at positions 600-680

=== Line 2 ===
prop_id (0-12): '000000000002'
prop_type_cd (12-13): 'R'
Position 13-22: '    02025'
Position 365-435 (owner_name per doc): ''
Position 600-680 (where we saw owner): '00010003HOLMES FRANCES A                                                      F0'
  -> Found data at positions 600-680

=== Line 3 ===
prop_id (0-12): '000000000003'
prop_type_cd (12-13): 'R'
Position 13-22: '    02025'
Position 365-435 (owner_name per doc): ''
Position 600-680 (where we saw owner): '00055953LYNCH KENNETH JR & CHERYL                                             F0'
  -> Found data at positions 600-680


In [36]:
# Map out the actual structure by looking at data patterns in the first line
line = lines[0]

# We know:
# 0-12: prop_id (12 chars)
# 12-13: prop_type_cd (1 char)
# 13-17: some filler (4 chars)
# 17-18: sup indicator? (1 char)
# 18-22: prop_val_yr (4 chars)

# Now let's look for the owner_id and owner_name
# At position 600: "00202803BARAY RAUL ANTONIO"
# owner_id is 8 chars (00202803), owner_name follows for about 70 chars

# Let's trace back from position 600 to find what's between position 22 and 600

print("Detailed positional analysis:")
print(f"{'Start':>6} {'End':>6} {'Len':>4} Content")
print("-" * 80)

# Define the sections we need to map
positions = [
    (0, 12, "prop_id"),
    (12, 13, "prop_type_cd"),
    (13, 17, "filler1"),
    (17, 18, "sup_ind"),
    (18, 22, "prop_val_yr"),
    (22, 37, "unknown1"),  # 15 chars for sup_num?
    (37, 47, "unknown2"),  # exemption_cd?
    (47, 127, "unknown3"),  # exemption_desc (80 chars)?
    (127, 327, "filler2"),  # 200 chars filler?
    (327, 357, "geo_id"),   # 30 chars
    (357, 369, "unknown4"),  # Should this be owner_id?
]

# Let me just show what's at key expected positions per doc
key_positions = [
    (0, 12, "prop_id"),
    (12, 13, "prop_type_cd"),
    (13, 18, "prop_val_yr+sup"),
    (18, 33, "sup_num rest"),
    (33, 43, "exemption_cd"),
    (323, 353, "geo_id"),
    (353, 365, "owner_id"),
    (365, 435, "owner_name"),
    (600, 608, "actual_owner_id"),
    (608, 678, "actual_owner_name"),
]

for start, end, name in key_positions:
    content = line[start:end]
    stripped = content.strip()
    print(f"{start:6} {end:6} {end-start:4} {name:20} '{stripped[:50]}'{'...' if len(stripped) > 50 else ''}")

Detailed positional analysis:
 Start    End  Len Content
--------------------------------------------------------------------------------
     0     12   12 prop_id              '000000000001'
    12     13    1 prop_type_cd         'R'
    13     18    5 prop_val_yr+sup      '0'
    18     33   15 sup_num rest         '202500000000000'
    33     43   10 exemption_cd         '0'
   323    353   30 geo_id               ''
   353    365   12 owner_id             ''
   365    435   70 owner_name           ''
   600    608    8 actual_owner_id      '00202803'
   608    678   70 actual_owner_name    'BARAY RAUL ANTONIO'


In [37]:
# Let's use the second line with owner "HOLMES FRANCES A" to trace back
# We know at position 600: "00010003HOLMES FRANCES A"
# This is owner_id (8 chars) + owner_name (70 chars)

# Let me trace non-whitespace data backward from position 600
line = lines[1]  # Second line

print("=== Finding data clusters ===")
print(f"Line length: {len(line)}")

# Find all non-empty segments
i = 0
segments = []
while i < len(line):
    # Skip whitespace
    while i < len(line) and line[i] == ' ':
        i += 1
    if i >= len(line):
        break
    # Found start of data
    start = i
    # Find end of data
    while i < len(line) and line[i] != ' ':
        i += 1
    # Allow one space inside data
    if i < len(line) - 1 and line[i] == ' ' and line[i+1] != ' ':
        i += 1
        while i < len(line) and line[i] != ' ':
            i += 1
    # Continue capturing if multiple words
    lookahead = 0
    while i + lookahead < len(line) and lookahead < 3:
        if line[i + lookahead] != ' ':
            i += lookahead + 1
            while i < len(line) and line[i] != ' ':
                i += 1
            lookahead = 0
        else:
            lookahead += 1
    
    if i - start > 0:
        content = line[start:i].rstrip()
        if content and len(content) > 2:  # Only show significant data
            segments.append((start, i, content[:80]))

print(f"\nFound {len(segments)} data segments:")
for start, end, content in segments[:30]:
    print(f"  {start:4}-{end:4}: '{content}'")

=== Finding data clusters ===
Line length: 9263

Found 49 data segments:
     0-  13: '000000000002R'
    17-  34: '02025000000000000'
   546- 572: '99.0001.0000.0010.00.06.00'
   596- 624: '000000010003HOLMES FRANCES A'
   678- 691: 'F000000000000'
   753- 764: '5474 FM 987'
   873- 880: 'KAUFMAN'
   978- 983: '75142'
  1049-1058: 'FM RD 987'
  1109-1116: 'KAUFMAN'
  1139-1144: '75142'
  1149-1184: 'R ANTHONY, TRACT 10.00; 5.783 ACRES'
  1659-1680: '0000000000057830A0001'
  1685-1691: '28-001'
  1745-1750: '10.00'
  1795-1960: '00000000000000000000000016896200000000000000000000000000000000000000000000000000'
  1993-1997: '1232'
  2013-2016: '481'
  2033-2041: '10311996'
  2058-2070: '000000000000'
  2190-2202: '000000000000'
  2660-2671: 'FFFFFFFFFFF'
  2721-2731: 'FFFFFFFFFF'
  2771-2803: '00000000000000057830000000000000'
  3203-3215: '000000000000'
  3615-3627: '000000000000'
  4032-4047: '000000000000000'
  4090-4126: '000000000000000000000000000000000000'
  4213-4227: '0000000016

In [38]:
# Given the complexity and mismatch between documentation and actual file,
# let's prioritize the key fields we need most for analysis:
# prop_id, prop_val_yr, owner_name, situs_street, legal_desc, appraised_val

# Based on analysis:
# - prop_id: 0-12 (confirmed)
# - prop_type_cd: 12-13 (confirmed)
# - prop_val_yr: somewhere in 13-34, contains "2025"
# - owner_id: 596-608 (12 chars based on segment starting at 596)
# - owner_name: 608-678 (70 chars)
# - situs_street: around 753 (saw "5474 FM 987")
# - legal_desc: around 1149 (saw "R ANTHONY, TRACT 10.00...")

# Let's verify these positions across multiple lines
for i, line in enumerate(lines[:3]):
    print(f"\n=== Line {i+1} ===")
    print(f"prop_id (0-12):     '{line[0:12].strip()}'")
    print(f"prop_type_cd (12):  '{line[12:13]}'")
    print(f"prop_val_yr (17-21):'{line[17:21].strip()}'")  # Skip position 13-16
    print(f"owner_id (596-608): '{line[596:608].strip()}'")
    print(f"owner_name (608-678): '{line[608:678].strip()}'")
    print(f"situs_street (745-785): '{line[745:785].strip()}'")  # Adjusted
    print(f"legal_desc (1145-1295): '{line[1145:1295].strip()}'")  # Adjusted


=== Line 1 ===
prop_id (0-12):     '000000000001'
prop_type_cd (12):  'R'
prop_val_yr (17-21):'0202'
owner_id (596-608): '000000202803'
owner_name (608-678): 'BARAY RAUL ANTONIO'
situs_street (745-785): '1823 BALMORAL DR'
legal_desc (1145-1295): 'R ANTHONY, TRACT 5.00; 26.232 ACRES, & HOUSE'

=== Line 2 ===
prop_id (0-12):     '000000000002'
prop_type_cd (12):  'R'
prop_val_yr (17-21):'0202'
owner_id (596-608): '000000010003'
owner_name (608-678): 'HOLMES FRANCES A'
situs_street (745-785): '5474 FM 987'
legal_desc (1145-1295): 'R ANTHONY, TRACT 10.00; 5.783 ACRES'

=== Line 3 ===
prop_id (0-12):     '000000000003'
prop_type_cd (12):  'R'
prop_val_yr (17-21):'0202'
owner_id (596-608): '000000055953'
owner_name (608-678): 'LYNCH KENNETH JR & CHERYL'
situs_street (745-785): '5448 FM RD 987'
legal_desc (1145-1295): 'R ANTHONY, TRACT 10.01; 15.278 ACRES, & OUTBUILDING'


In [39]:
# Refined positions - let's verify more fields
# Using line 2 which has cleaner data

line = lines[1]

# Build a comprehensive field map
field_map = {
    "prop_id": (0, 12),
    "prop_type_cd": (12, 13),
    # Skip 4 char filler 
    # Skip 1 char sup indicator
    "prop_val_yr": (18, 22),  # 4 chars for year
    # Large unknown gap - need to map more fields
    "owner_id": (596, 608),  # 12 chars
    "owner_name": (608, 678),  # 70 chars
    "confidential_flag": (678, 679),  # 1 char (saw 'F')
    # More fields...
    "situs_street": (745, 785),  # 40 chars
    "situs_city": (873, 903),  # ~30 chars
    "situs_zip": (978, 988),  # ~10 chars 
    "legal_desc": (1145, 1295),  # 150 chars
}

print("Verified field positions:")
for field, (start, end) in field_map.items():
    value = line[start:end].strip()
    print(f"{field:20} ({start:4}-{end:4}, {end-start:3} chars): '{value[:50]}'{'...' if len(value) > 50 else ''}")

# Now find the value fields (appraised_val, etc)
# From segment analysis we saw numeric data around 1795-1960
print(f"\nNumeric area (1795-1960):")
print(f"  '{line[1795:1960]}'")

# Also check the 168962 we saw at position 4213 - that might be an appraised value
print(f"\nValue at 4213-4228: '{line[4213:4228].strip()}'")

Verified field positions:
prop_id              (   0-  12,  12 chars): '000000000002'
prop_type_cd         (  12-  13,   1 chars): 'R'
prop_val_yr          (  18-  22,   4 chars): '2025'
owner_id             ( 596- 608,  12 chars): '000000010003'
owner_name           ( 608- 678,  70 chars): 'HOLMES FRANCES A'
confidential_flag    ( 678- 679,   1 chars): 'F'
situs_street         ( 745- 785,  40 chars): '5474 FM 987'
situs_city           ( 873- 903,  30 chars): 'KAUFMAN'
situs_zip            ( 978- 988,  10 chars): '75142'
legal_desc           (1145-1295, 150 chars): 'R ANTHONY, TRACT 10.00; 5.783 ACRES'

Numeric area (1795-1960):
  '000000000000000000000000168962000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000168962000000000000000000000000168962'

Value at 4213-4228: '00000000168962'


In [40]:
# Create a corrected INFO layout based on actual file analysis
# We'll build it programmatically with verified positions

import json

corrected_info_layout = {
    "fileName": "INFO",
    "tableName": "appraisal_info",
    "description": "Main property/parcel information",
    "columns": [
        {"index": 0, "name": "prop_id", "dataType": "BIGINT", "length": 12, "nullable": False},
        {"index": 1, "name": "prop_type_cd", "dataType": "VARCHAR", "length": 1, "nullable": True},
        {"index": 2, "name": "filler_1", "dataType": "VARCHAR", "length": 5, "nullable": True, "skip": True},
        {"index": 3, "name": "prop_val_yr", "dataType": "INTEGER", "length": 4, "nullable": False},
        # Gap from position 22 to 596 = 574 chars
        {"index": 4, "name": "filler_gap1", "dataType": "VARCHAR", "length": 574, "nullable": True, "skip": True},
        {"index": 5, "name": "owner_id", "dataType": "BIGINT", "length": 12, "nullable": True},
        {"index": 6, "name": "owner_name", "dataType": "VARCHAR", "length": 70, "nullable": True},
        {"index": 7, "name": "confidential_flag", "dataType": "VARCHAR", "length": 1, "nullable": True},
        # Gap from 679 to 745 = 66 chars (contains previous year owner, addresses, etc.)
        {"index": 8, "name": "filler_gap2", "dataType": "VARCHAR", "length": 66, "nullable": True, "skip": True},
        {"index": 9, "name": "situs_street", "dataType": "VARCHAR", "length": 40, "nullable": True},
        # Gap from 785 to 873 = 88 chars
        {"index": 10, "name": "filler_gap3", "dataType": "VARCHAR", "length": 88, "nullable": True, "skip": True},
        {"index": 11, "name": "situs_city", "dataType": "VARCHAR", "length": 30, "nullable": True},
        # Gap from 903 to 978 = 75 chars
        {"index": 12, "name": "filler_gap4", "dataType": "VARCHAR", "length": 75, "nullable": True, "skip": True},
        {"index": 13, "name": "situs_zip", "dataType": "VARCHAR", "length": 10, "nullable": True},
        # Gap from 988 to 1145 = 157 chars
        {"index": 14, "name": "filler_gap5", "dataType": "VARCHAR", "length": 157, "nullable": True, "skip": True},
        {"index": 15, "name": "legal_desc", "dataType": "VARCHAR", "length": 150, "nullable": True},
        # Read rest of line as one big blob for now
        {"index": 16, "name": "remainder", "dataType": "VARCHAR", "length": 7968, "nullable": True, "skip": True},
    ]
}

# Verify the positions add up
total_length = sum(col["length"] for col in corrected_info_layout["columns"])
print(f"Total calculated length: {total_length}")
print(f"Expected line length: 9263")

# Print the running position
running_pos = 0
for col in corrected_info_layout["columns"]:
    end_pos = running_pos + col["length"]
    skip = col.get("skip", False)
    print(f"{running_pos:5}-{end_pos:5}: {col['name']:20} ({col['length']:4} chars) {'SKIP' if skip else ''}")
    running_pos = end_pos

Total calculated length: 9263
Expected line length: 9263
    0-   12: prop_id              (  12 chars) 
   12-   13: prop_type_cd         (   1 chars) 
   13-   18: filler_1             (   5 chars) SKIP
   18-   22: prop_val_yr          (   4 chars) 
   22-  596: filler_gap1          ( 574 chars) SKIP
  596-  608: owner_id             (  12 chars) 
  608-  678: owner_name           (  70 chars) 
  678-  679: confidential_flag    (   1 chars) 
  679-  745: filler_gap2          (  66 chars) SKIP
  745-  785: situs_street         (  40 chars) 
  785-  873: filler_gap3          (  88 chars) SKIP
  873-  903: situs_city           (  30 chars) 
  903-  978: filler_gap4          (  75 chars) SKIP
  978-  988: situs_zip            (  10 chars) 
  988- 1145: filler_gap5          ( 157 chars) SKIP
 1145- 1295: legal_desc           ( 150 chars) 
 1295- 9263: remainder            (7968 chars) SKIP


In [41]:
# Test the corrected layout by parsing records manually
def parse_info_line(line, layout):
    """Parse a line using the corrected layout"""
    record = {}
    position = 0
    for col in layout["columns"]:
        value = line[position:position + col["length"]]
        if not col.get("skip", False):
            # Clean up the value
            value = value.strip()
            if col["dataType"] in ("BIGINT", "INTEGER") and value:
                try:
                    value = int(value)
                except ValueError:
                    value = None
            elif not value:
                value = None
            record[col["name"]] = value
        position += col["length"]
    return record

# Test on first 3 lines
for i, line in enumerate(lines[:3]):
    record = parse_info_line(line, corrected_info_layout)
    print(f"\n=== Record {i+1} ===")
    for key, value in record.items():
        val_str = str(value)[:60] if value else "None"
        print(f"  {key:20}: {val_str}")


=== Record 1 ===
  prop_id             : 1
  prop_type_cd        : R
  prop_val_yr         : 2025
  owner_id            : 202803
  owner_name          : BARAY RAUL ANTONIO
  confidential_flag   : F
  situs_street        : 1823 BALMORAL DR
  situs_city          : CARROLLTON
  situs_zip           : 75006
  legal_desc          : R ANTHONY, TRACT 5.00; 26.232 ACRES, & HOUSE

=== Record 2 ===
  prop_id             : 2
  prop_type_cd        : R
  prop_val_yr         : 2025
  owner_id            : 10003
  owner_name          : HOLMES FRANCES A
  confidential_flag   : F
  situs_street        : 5474 FM 987
  situs_city          : KAUFMAN
  situs_zip           : 75142
  legal_desc          : R ANTHONY, TRACT 10.00; 5.783 ACRES

=== Record 3 ===
  prop_id             : 3
  prop_type_cd        : R
  prop_val_yr         : 2025
  owner_id            : 55953
  owner_name          : LYNCH KENNETH JR & CHERYL
  confidential_flag   : F
  situs_street        : 5448 FM RD 987
  situs_city          : KAUF

In [42]:
# Drop and recreate appraisal_info table with simplified schema
import psycopg2

conn = psycopg2.connect(conn_string)
cur = conn.cursor()

# Drop existing table
cur.execute("DROP TABLE IF EXISTS cad.appraisal_info CASCADE")

# Create new simplified table
create_sql = """
CREATE TABLE IF NOT EXISTS cad.appraisal_info (
    id SERIAL PRIMARY KEY,
    prop_id BIGINT,
    prop_type_cd VARCHAR(1),
    prop_val_yr INTEGER,
    owner_id BIGINT,
    owner_name VARCHAR(70),
    confidential_flag VARCHAR(1),
    situs_street VARCHAR(40),
    situs_city VARCHAR(30),
    situs_zip VARCHAR(10),
    legal_desc VARCHAR(150),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
cur.execute(create_sql)

# Create indexes
cur.execute("CREATE INDEX IF NOT EXISTS idx_appraisal_info_prop_id ON cad.appraisal_info(prop_id)")
cur.execute("CREATE INDEX IF NOT EXISTS idx_appraisal_info_year ON cad.appraisal_info(prop_val_yr)")

conn.commit()
cur.close()
conn.close()

print("✅ appraisal_info table recreated with simplified schema")

✅ appraisal_info table recreated with simplified schema


In [45]:
# Reload the services with updated layout config
import importlib
import sys

# Force reload of all our modules
modules_to_reload = [m for m in sys.modules.keys() if m.startswith('app.')]
for mod_name in modules_to_reload:
    if mod_name in sys.modules:
        del sys.modules[mod_name]

# Re-import
from app.models.layout import load_layout_config
from app.services.file_reader import read_fixed_width_file
from app.services.database import DatabaseService

# Reinitialize
layout_config = load_layout_config(Path("/Users/tapiwamaruni/Documents/projects/housing1/config/file_layouts.json"))
db_service = DatabaseService(conn_string)

# Check the new INFO layout
info_config = layout_config.get_file_config("INFO")
print(f"INFO layout now has {len(info_config.columns)} column definitions")
print(f"Non-skip columns: {[c.name for c in info_config.columns if not c.skip]}")

INFO layout now has 17 column definitions
Non-skip columns: ['prop_id', 'prop_type_cd', 'prop_val_yr', 'owner_id', 'owner_name', 'confidential_flag', 'situs_street', 'situs_city', 'situs_zip', 'legal_desc']


In [46]:
# Load INFO data with corrected layout
from pathlib import Path
import time

CAD_DATA_DIR = Path("/Users/tapiwamaruni/Documents/projects/housing1/Kaufman-CAD-2025-Certified-Full-Roll-Download-updated-with-Supp-5")
info_file = CAD_DATA_DIR / "2025-10-27_002174_APPRAISAL_INFO.TXT"

print(f"Loading INFO data from: {info_file}")
print(f"Table: {info_config.tableName}")

# Generate records
start_time = time.time()
records = list(read_fixed_width_file(info_file, info_config))
read_time = time.time() - start_time
print(f"Read {len(records):,} records in {read_time:.1f}s")

# Show sample records
print("\nSample records:")
for i, rec in enumerate(records[:3]):
    print(f"\n  Record {i+1}:")
    for k, v in rec.items():
        if v is not None:
            print(f"    {k}: {v}")

Loading INFO data from: /Users/tapiwamaruni/Documents/projects/housing1/Kaufman-CAD-2025-Certified-Full-Roll-Download-updated-with-Supp-5/2025-10-27_002174_APPRAISAL_INFO.TXT
Table: appraisal_info
2025-12-07 17:58:23 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 17:58:26 - cad_loader - INFO - Processed 104369 records from 2025-10-27_002174_APPRAISAL_INFO.TXT
Read 104,369 records in 3.3s

Sample records:

  Record 1:
    prop_id: 1
    prop_type_cd: R
    prop_val_yr: 2025
    owner_id: 202803
    owner_name: BARAY RAUL ANTONIO
    confidential_flag: F
    situs_street: 1823 BALMORAL DR
    situs_city: CARROLLTON
    situs_zip: 75006
    legal_desc: R ANTHONY, TRACT 5.00; 26.232 ACRES, & HOUSE

  Record 2:
    prop_id: 2
    prop_type_cd: R
    prop_val_yr: 2025
    owner_id: 10003
    owner_name: HOLMES FRANCES A
    confidential_flag: F
    situs_street: 5474 FM 987
    situs_city: KAUFMAN
    situs_zip: 75142
    legal_desc: R ANTHONY, TRACT 10.0

In [50]:
# Insert INFO records into database
import time

# Reinitialize database service with proper config
db_config = {
    "host": "localhost",
    "port": 5432,
    "database": "kaufman_cad",
    "user": "cad_user",
    "password": "cad_password"
}
db_service = DatabaseService(db_config)

# Get column names (excluding skip columns)
column_names = [c.name for c in info_config.columns if not c.skip]
print(f"Inserting {len(records):,} records with columns: {column_names}")

# Use streaming insert
start_time = time.time()
result = db_service.insert_records_streaming(
    records_generator=iter(records),
    file_config=info_config,
    batch_size=5000
)
insert_time = time.time() - start_time

print(f"\n✅ Insert complete!")
print(f"   Records inserted: {result}")
print(f"   Time: {insert_time:.1f}s")
print(f"   Rate: {result/insert_time:,.0f} records/second")

Inserting 104,369 records with columns: ['prop_id', 'prop_type_cd', 'prop_val_yr', 'owner_id', 'owner_name', 'confidential_flag', 'situs_street', 'situs_city', 'situs_zip', 'legal_desc']
2025-12-07 17:59:32 - cad_loader - INFO - Inserted 10000 records into appraisal_info
2025-12-07 17:59:33 - cad_loader - INFO - Inserted 20000 records into appraisal_info
2025-12-07 17:59:35 - cad_loader - INFO - Inserted 30000 records into appraisal_info
2025-12-07 17:59:36 - cad_loader - INFO - Inserted 40000 records into appraisal_info
2025-12-07 17:59:37 - cad_loader - INFO - Inserted 50000 records into appraisal_info
2025-12-07 17:59:38 - cad_loader - INFO - Inserted 60000 records into appraisal_info
2025-12-07 17:59:39 - cad_loader - INFO - Inserted 70000 records into appraisal_info
2025-12-07 17:59:40 - cad_loader - INFO - Inserted 80000 records into appraisal_info
2025-12-07 17:59:41 - cad_loader - INFO - Inserted 90000 records into appraisal_info
2025-12-07 17:59:42 - cad_loader - INFO - Insert

In [51]:
# Verify loaded data
print("✅ Data Verification\n")

# Get count
count_query = "SELECT COUNT(*) FROM cad.appraisal_info"
count = pd.read_sql(count_query, conn_string).iloc[0, 0]
print(f"Total records: {count:,}")

# Check data quality
quality_query = """
SELECT 
    COUNT(*) as total,
    COUNT(prop_id) as with_prop_id,
    COUNT(owner_name) as with_owner,
    COUNT(situs_street) as with_street,
    COUNT(legal_desc) as with_legal,
    COUNT(DISTINCT prop_val_yr) as years
FROM cad.appraisal_info
"""
quality = pd.read_sql(quality_query, conn_string)
print("\nData completeness:")
print(quality.T)

# Sample records
sample_query = """
SELECT prop_id, prop_type_cd, prop_val_yr, owner_name, 
       situs_street, situs_city, situs_zip, 
       SUBSTRING(legal_desc, 1, 50) as legal_desc_short
FROM cad.appraisal_info 
WHERE owner_name IS NOT NULL
ORDER BY prop_id
LIMIT 5
"""
sample = pd.read_sql(sample_query, conn_string)
print("\nSample records:")
sample

✅ Data Verification

Total records: 104,369

Data completeness:
                   0
total         104369
with_prop_id  104361
with_owner    103867
with_street   103534
with_legal    104326
years              1

Sample records:


Unnamed: 0,prop_id,prop_type_cd,prop_val_yr,owner_name,situs_street,situs_city,situs_zip,legal_desc_short
0,1,R,2025,BARAY RAUL ANTONIO,1823 BALMORAL DR,CARROLLTON,75006,"R ANTHONY, TRACT 5.00; 26.232 ACRES, & HOUSE"
1,2,R,2025,HOLMES FRANCES A,5474 FM 987,KAUFMAN,75142,"R ANTHONY, TRACT 10.00; 5.783 ACRES"
2,3,R,2025,LYNCH KENNETH JR & CHERYL,5448 FM RD 987,KAUFMAN,75142,"R ANTHONY, TRACT 10.01; 15.278 ACRES, & OUTBUI..."
3,4,R,2025,HOLMES FRANCES A,5474 FM 987,KAUFMAN,75142,"R ANTHONY, TRACT 15.00; 5. ACRES, & HOUSE"
4,5,R,2025,RAINVILLE WILLIAM E,7073 CO RD 275,TERRELL,75160,"R ANTHONY, TRACT 20.00; 1.0 ACRES, & HOUSE"
