In [1]:
# Setup and Imports
import sys
import os
from pathlib import Path
import json
import subprocess
import psycopg2
from psycopg2.extras import RealDictCursor

# Make the notebook's working directory importable so the `app.*` packages resolve.
project_root = Path.cwd()
if sys.path.count(str(project_root)) == 0:
    # Prepend so this checkout takes precedence over any other entry on sys.path.
    sys.path.insert(0, str(project_root))

from app.models.layout import load_layout_config
from app.config import DATA_DIR, CONFIG_DIR

# Database config
# Each setting can be overridden through an environment variable so real
# credentials never need to be committed with the notebook. The fallbacks are
# the values this notebook has always used for the local development database.
DB_CONFIG = {
    "host": os.environ.get("CAD_DB_HOST", "localhost"),
    "port": int(os.environ.get("CAD_DB_PORT", "5432")),
    "database": os.environ.get("CAD_DB_NAME", "kaufman_cad"),
    "user": os.environ.get("CAD_DB_USER", "cad_user"),
    "password": os.environ.get("CAD_DB_PASSWORD", "cad_password"),
}

# Echo the resolved locations so a reader can confirm which checkout and data set are in use.
print(
    f"Project root: {project_root}",
    f"Data dir: {DATA_DIR}",
    f"Config dir: {CONFIG_DIR}",
    sep="\n",
)

Project root: /Users/tapiwamaruni/Documents/projects/housing1
Data dir: /Users/tapiwamaruni/Documents/projects/housing1/Kaufman-CAD-2025-Certified-Full-Roll-Download-updated-with-Supp-5
Config dir: /Users/tapiwamaruni/Documents/projects/housing1/config


In [11]:
# Load Configuration
# Reads the layout definition from file_layouts.json under CONFIG_DIR.
# `layout_config` is a notebook-level global: validate_load() and
# investigate_missing_info_records() read it, so this cell must run first.
layout_config = load_layout_config(CONFIG_DIR / "file_layouts.json")
print(f"Loaded configuration version: {layout_config.version}")

def get_file_line_count(file_path):
    """Count the lines in a file, including a final line with no trailing newline.

    The count is done in pure Python rather than by shelling out to ``wc -l``,
    so it works on any platform and does not undercount by one when the last
    record is not newline-terminated.

    Args:
        file_path: ``pathlib.Path`` of the file to count.

    Returns:
        The number of lines, or 0 when the file does not exist or cannot be
        read (the read error is printed).
    """
    if not file_path.exists():
        return 0
    try:
        newline_total = 0
        last_byte = b""
        # Read fixed-size binary chunks so large files are never held in memory
        # and no text decoding is needed just to count line terminators.
        with open(file_path, "rb") as handle:
            for chunk in iter(lambda: handle.read(1 << 20), b""):
                newline_total += chunk.count(b"\n")
                last_byte = chunk[-1:]
        # A non-empty file whose last byte is not a newline still ends with a
        # line that the newline count alone would miss.
        if last_byte and last_byte != b"\n":
            newline_total += 1
        return newline_total
    except OSError as e:
        print(f"Error counting lines in {file_path}: {e}")
        return 0

def get_table_row_count(table_name):
    """Count the rows of a table in the ``cad`` schema.

    Args:
        table_name: Unqualified table name. It is interpolated directly into
            the SQL text, so it must come from trusted configuration and never
            from user input.

    Returns:
        The row count, or -1 when the connection or the query fails (the error
        is printed).
    """
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        try:
            # psycopg2's `with conn` only ends the transaction; it does not
            # close the connection, hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    # Use 'cad' schema
                    cur.execute(f"SELECT COUNT(*) FROM cad.{table_name}")
                    return cur.fetchone()[0]
        finally:
            conn.close()
    except Exception as e:
        print(f"Error counting rows in {table_name}: {e}")
        return -1

def validate_load():
    """Print a table comparing each source file's line count with its table's row count.

    Iterates over ``layout_config.files``. A file that is absent from DATA_DIR
    is reported as SKIPPED; otherwise the row is labelled MATCH, MISMATCH, or
    DB ERROR (when the row count lookup returned -1).
    """
    print("\n=== Validation Report ===")
    print(f"{'File Type':<20} | {'File Lines':<12} | {'DB Rows':<10} | {'Status':<10}")
    print("-" * 60)

    for spec in layout_config.files:
        source_path = DATA_DIR / f"{layout_config.filePrefix}{spec.fileName}.TXT"

        # Nothing to compare when the source file is not on disk.
        if not source_path.exists():
            print(f"{spec.fileName:<20} | {'MISSING':<12} | {'N/A':<10} | SKIPPED")
            continue

        line_total = get_file_line_count(source_path)
        row_total = get_table_row_count(spec.tableName)

        # -1 is the sentinel get_table_row_count returns on failure; it takes
        # precedence over the match/mismatch comparison.
        if row_total == -1:
            outcome = "DB ERROR"
        elif line_total == row_total:
            outcome = "MATCH"
        else:
            outcome = "MISMATCH"

        print(f"{spec.fileName:<20} | {line_total:<12} | {row_total:<10} | {outcome}")

# Run validation: prints one report row per file listed in layout_config.files.
validate_load()

Loaded configuration version: 2.0.0

=== Validation Report ===
File Type            | File Lines   | DB Rows    | Status    
------------------------------------------------------------
HEADER               | 1            | 1          | MATCH
INFO                 | 104369       | 104369     | MATCH
ENTITY               | 65           | 65         | MATCH
INFO                 | 104369       | 104369     | MATCH
ENTITY               | 65           | 65         | MATCH
ENTITY_INFO          | 511086       | 104361     | MISMATCH
ENTITY_TOTALS        | 65           | 28         | MISMATCH
LAND_DETAIL          | 102733       | 102733     | MATCH
ENTITY_INFO          | 511086       | 104361     | MISMATCH
ENTITY_TOTALS        | 65           | 28         | MISMATCH
LAND_DETAIL          | 102733       | 102733     | MATCH
IMPROVEMENT_INFO     | 101635       | 101635     | MATCH
IMPROVEMENT_INFO     | 101635       | 101635     | MATCH
IMPROVEMENT_DETAIL   | 347682       | 347682     | MATCH
IMPR

In [7]:
# Verify Code Mappings in INFO table
def verify_code_mappings():
    """Report the distinct ``prop_type_cd`` values stored in ``cad.appraisal_info``.

    Prints each value with its record count, then prints a success message if
    at least one value is a full name ('Real', 'Personal', 'Mineral',
    'Automobile') and a warning otherwise. Any database error is printed
    rather than raised.
    """
    print("\n=== Code Mapping Verification (INFO Table) ===")
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        try:
            # psycopg2's `with conn` only ends the transaction; it does not
            # close the connection, hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    # Check distinct property types in cad.appraisal_info
                    cur.execute("SELECT DISTINCT prop_type_cd, COUNT(*) FROM cad.appraisal_info GROUP BY prop_type_cd ORDER BY prop_type_cd")
                    rows = cur.fetchall()

                    print("Distinct Property Types found in DB:")
                    for row in rows:
                        print(f"  '{row[0]}': {row[1]} records")

                    # Check if we see 'Real' instead of 'R'. One mapped name is
                    # enough to pass; unmapped codes alongside it are not flagged.
                    has_mapped_values = any(r[0] in ['Real', 'Personal', 'Mineral', 'Automobile'] for r in rows if r[0])
                    if has_mapped_values:
                        print("\n✅ Code mappings appear to be applied (Full names found).")
                    else:
                        print("\n⚠️ Code mappings might NOT be applied (Codes found). Check loader configuration.")
        finally:
            conn.close()

    except Exception as e:
        print(f"Error verifying mappings: {e}")

# Run the property-type mapping check; it prints its findings and returns nothing.
verify_code_mappings()


=== Code Mapping Verification (INFO Table) ===
Distinct Property Types found in DB:
  'E': 2 records
  'I': 1 records
  'Mineral': 813 records
  'Personal': 5031 records
  'Real': 66413 records
  'None': 2 records

✅ Code mappings appear to be applied (Full names found).


In [10]:
# Investigate Missing Records in INFO Table
from app.services.file_reader import parse_line

def investigate_missing_info_records():
    """Find INFO file records whose property ID is absent from ``cad.appraisal_info``.

    Loads every ``prop_id`` from the database, scans the INFO file line by
    line, and counts lines whose leading property ID is either not an integer
    or not present in the database. Prints the total and up to five examples.
    Errors while querying the database or reading the file are printed and end
    the investigation early. Returns nothing.
    """
    # The property ID occupies the first 12 characters of each line.
    prop_id_width = 12
    # Cap on how many example records are collected and printed.
    max_examples = 5

    print("\n=== Investigating Missing INFO Records ===")

    info_config = layout_config.get_file_config("INFO")
    file_name = f"{layout_config.filePrefix}INFO.TXT"
    file_path = DATA_DIR / file_name

    if not file_path.exists():
        print(f"File not found: {file_path}")
        return

    # 1. Get all Prop IDs from DB
    print("Fetching Property IDs from database...")
    db_prop_ids = set()
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        try:
            # psycopg2's `with conn` only ends the transaction; it does not
            # close the connection, hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    cur.execute("SELECT prop_id FROM cad.appraisal_info")
                    # Convert to string so IDs compare against the normalized file IDs.
                    db_prop_ids = {str(r[0]) for r in cur.fetchall()}
        finally:
            conn.close()
    except Exception as e:
        print(f"Error fetching DB records: {e}")
        return

    print(f"Found {len(db_prop_ids)} records in DB.")

    # 2. Scan file for missing IDs
    print(f"Scanning file: {file_name}...")
    missing_examples = []
    missing_count = 0

    try:
        with open(file_path, 'r', encoding=layout_config.encoding) as f:
            for line_num, line in enumerate(f, 1):
                raw_prop_id = line[0:prop_id_width].strip()

                # Skip empty lines
                if not raw_prop_id:
                    continue

                # Normalize through int() so zero-padded IDs such as
                # '000000000001' match the database value '1'.
                try:
                    prop_id_str = str(int(raw_prop_id))
                except ValueError:
                    # A non-integer ID is itself a likely reason the record is absent.
                    if len(missing_examples) < max_examples:
                        missing_examples.append({
                            "line_num": line_num,
                            "raw_id": raw_prop_id,
                            "reason": "Invalid Prop ID (Not an integer)",
                            "line_snippet": line[:100]
                        })
                    missing_count += 1
                    continue

                if prop_id_str in db_prop_ids:
                    continue

                missing_count += 1
                if len(missing_examples) >= max_examples:
                    continue

                # Parse the full line so the example shows what the loader would see.
                try:
                    parsed = parse_line(line, info_config)
                    missing_examples.append({
                        "line_num": line_num,
                        "raw_id": raw_prop_id,
                        "reason": "Valid ID but missing in DB",
                        "parsed_sample": {k: v for k, v in parsed.items() if k in ['prop_id', 'prop_type_cd', 'owner_name', 'situs_street']},
                        "line_snippet": line[:100]
                    })
                except Exception as e:
                    missing_examples.append({
                        "line_num": line_num,
                        "raw_id": raw_prop_id,
                        "reason": f"Parse Error: {e}",
                        "line_snippet": line[:100]
                    })

    except Exception as e:
        print(f"Error reading file: {e}")
        return

    print(f"Total missing records found in scan: {missing_count}")

    print("\n--- Examples of Missing Records ---")
    for ex in missing_examples:
        print(f"\nLine {ex['line_num']}: Prop ID '{ex['raw_id']}'")
        print(f"Reason: {ex['reason']}")
        if 'parsed_sample' in ex:
            print(f"Parsed Data: {ex['parsed_sample']}")
        print(f"Snippet: {ex['line_snippet']}...")

# Run the scan; it prints the missing-record total followed by a few examples.
investigate_missing_info_records()


=== Investigating Missing INFO Records ===
Fetching Property IDs from database...
Found 72258 records in DB.
Scanning file: 2025-10-27_002174_APPRAISAL_INFO.TXT...
Total missing records found in scan: 32112

--- Examples of Missing Records ---

Line 1: Prop ID '000000000001'
Reason: Valid ID but missing in DB
Parsed Data: {'prop_id': 1, 'prop_type_cd': 'R    020250000000000', 'owner_name': 'BARAY RAUL ANTONIO', 'situs_street': 'FM RD 2578'}
Snippet: 000000000001R    02025000000000000                                                                  ...

Line 3: Prop ID '000000000003'
Reason: Valid ID but missing in DB
Parsed Data: {'prop_id': 3, 'prop_type_cd': 'R    020250000000000', 'owner_name': 'LYNCH KENNETH JR & CHERYL', 'situs_street': 'FM RD 987'}
Snippet: 000000000003R    02025000000000001M EX        PROPOSED 2025 SB4/SB23 EXEMPTION CHANGES              ...

Line 4: Prop ID '000000000004'
Reason: Valid ID but missing in DB
Parsed Data: {'prop_id': 4, 'prop_type_cd': 'R    0202

In [None]:
# Manual check of ENTITY_INFO count
# psycopg2's `with conn` only ends the transaction; it does not close the
# connection, so close it explicitly to avoid leaving a session open for the
# life of the kernel.
conn = psycopg2.connect(**DB_CONFIG)
try:
    with conn:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM cad.appraisal_entity_info")
            count = cur.fetchone()[0]
            print(f"ENTITY_INFO count: {count}")
finally:
    conn.close()