# Kaufman CAD Data Loader

This notebook loads Kaufman County Central Appraisal District property data from fixed-width text files into PostgreSQL.

In [110]:
# Setup and Imports
import sys
from pathlib import Path

project_root = Path.cwd()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from app.utils.logging_config import setup_logger
from app.models.layout import load_layout_config
from app.services.file_reader import read_fixed_width_file, discover_data_files, get_file_path
from app.services.database import DatabaseService
from app.services.loader import DataLoader
from app.config import DATA_DIR, CONFIG_DIR
import pandas as pd

logger = setup_logger("cad_loader", level="INFO")
print(f"Project root: {project_root}")
print("Setup complete!")

Project root: /Users/tapiwamaruni/Documents/projects/housing1
Setup complete!


## Configuration

In [112]:
# ============================================
# STEP 2: Load Configuration (Force Reload)
# ============================================
import importlib
import app.models.layout
import app.services.file_reader
import app.services.loader

importlib.reload(app.models.layout)
importlib.reload(app.services.file_reader)
importlib.reload(app.services.loader)

from app.models.layout import load_layout_config
from app.services.loader import DataLoader

layout_config = load_layout_config(CONFIG_DIR / "file_layouts.json")

print(f"Tax Year: {layout_config.taxYear}")
print(f"\nConfigured file types ({len(layout_config.files)}):")
for fc in layout_config.files:
    print(f"  {fc.fileName}: {len(fc.active_columns)} active columns")

Tax Year: 2025

Configured file types (17):
  HEADER: 11 active columns
  INFO: 16 active columns
  ENTITY: 2 active columns
  ENTITY_INFO: 9 active columns
  ENTITY_TOTALS: 5 active columns
  LAND_DETAIL: 16 active columns
  IMPROVEMENT_INFO: 11 active columns
  IMPROVEMENT_DETAIL: 8 active columns
  IMPROVEMENT_DETAIL_ATTR: 6 active columns
  ABSTRACT_SUBDV: 2 active columns
  AGENT: 8 active columns
  STATE_CODE: 2 active columns
  COUNTRY_CODE: 2 active columns
  LAWSUIT: 5 active columns
  MOBILE_HOME_INFO: 8 active columns
  TAX_DEFERRAL_INFO: 5 active columns
  UDI: 4 active columns


In [113]:
# Database configuration
import psycopg2

db_config = {
    "host": "localhost",
    "port": 5432,
    "database": "kaufman_cad",
    "user": "cad_user",
    "password": "cad_password"
}

conn_string = "postgresql://cad_user:cad_password@localhost:5432/kaufman_cad"
db_service = DatabaseService(db_config)

# Create a persistent connection for analysis
conn = psycopg2.connect(**db_config)
conn.autocommit = True
print("‚úÖ Database connection successful")

‚úÖ Database connection successful


## Discover Data Files

In [None]:
# Find available data files
available_files = discover_data_files(DATA_DIR, layout_config.filePrefix)
print(f"Data directory: {DATA_DIR}")
print(f"\nAvailable files ({len(available_files)}):")
for f in available_files:
    print(f"  - {f}")

## Load Reference Tables

In [96]:
# Load reference tables (small lookup tables)
reference_tables = ["HEADER", "STATE_CODE", "COUNTRY_CODE", "ABSTRACT_SUBDV", "AGENT", "ENTITY"]

# Fix DataLoader instantiation: pass arguments by name or in correct order
# __init__(self, config_path=None, data_dir=None, db_service=None)
loader = DataLoader(
    config_path=CONFIG_DIR / "file_layouts.json",
    data_dir=DATA_DIR,
    db_service=db_service
)

for table in reference_tables:
    result = loader.load_file(table)
    status = "‚úÖ" if result["status"] == "SUCCESS" else "‚ùå"
    print(f"{status} {table}: {result['records_loaded']:,} records")

2025-12-07 22:01:47 - cad_loader - INFO - Loaded layout config: Kaufman County CAD 2025 Appraisal Data Export Layout Configuration
2025-12-07 22:01:47 - cad_loader - INFO - Truncated table: cad.appraisal_header
2025-12-07 22:01:47 - cad_loader - INFO - Truncated table: cad.appraisal_header
2025-12-07 22:01:47 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 22:01:47 - cad_loader - INFO - Processed 1 records from 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 22:01:47 - cad_loader - INFO - Completed inserting 1 records into cad.appraisal_header
2025-12-07 22:01:47 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 22:01:47 - cad_loader - INFO - Processed 1 records from 2025-10-27_002174_APPRAISAL_HEADER.TXT
2025-12-07 22:01:47 - cad_loader - INFO - Completed inserting 1 records into cad.appraisal_header
‚úÖ HEADER: 1 records
‚úÖ HEADER: 1 records
2025-12-07 22:01:47 - cad_loader - INFO - Truncated table: cad.appr

## Load Main Data Tables

In [109]:
# Reload modules to pick up logging changes
importlib.reload(app.services.database)
importlib.reload(app.services.loader)

# Re-import the class from the reloaded module
from app.services.database import DatabaseService
from app.services.loader import DataLoader

# Re-instantiate services
db_service = DatabaseService(db_config)
loader = DataLoader(
    config_path=CONFIG_DIR / "file_layouts.json",
    data_dir=DATA_DIR,
    db_service=db_service
)

# Load remaining tables
main_tables = [
    "ENTITY_INFO", "ENTITY_TOTALS"
]

for table in main_tables:
    print(f"\nLoading {table}...")
    result = loader.load_file(table)
    status = "‚úÖ" if result["status"] == "SUCCESS" else "‚ùå"
    duration = result.get('duration_seconds', 0)
    print(f"{status} {table}: {result['records_loaded']:,} records in {duration:.1f}s")
    if result["status"] != "SUCCESS":
        print(f"  Error: {result.get('error')}")


Loading ENTITY_INFO...
2025-12-08 04:38:55 - cad_loader - INFO - Loaded layout config: Kaufman County CAD 2025 Appraisal Data Export Layout Configuration


2025-12-08 04:38:55 - cad_loader - INFO - Truncated table: cad.appraisal_entity_info
2025-12-08 04:38:55 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_ENTITY_INFO.TXT
2025-12-08 04:38:55 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_ENTITY_INFO.TXT
2025-12-08 04:38:55 - cad_loader - ERROR - Error inserting record: duplicate key value violates unique constraint "appraisal_entity_info_pkey"
DETAIL:  Key (prop_id, tax_year, entity_id)=(1, 202, 50000) already exists.

2025-12-08 04:38:55 - cad_loader - ERROR - Error inserting record: duplicate key value violates unique constraint "appraisal_entity_info_pkey"
DETAIL:  Key (prop_id, tax_year, entity_id)=(1, 202, 50000) already exists.

2025-12-08 04:38:55 - cad_loader - ERROR - Error inserting record: duplicate key value violates unique constraint "appraisal_entity_info_pkey"
DETAIL:  Key (prop_id, tax_year, entity_id)=(1, 202, 50000) already exists.

2025-12-08 04:38:55 - cad_loader - ERROR - Error insert

In [106]:
# Check column lengths in DB
with psycopg2.connect(**db_config) as conn:
    with conn.cursor() as cur:
        cur.execute("""
            SELECT column_name, character_maximum_length 
            FROM information_schema.columns 
            WHERE table_schema = 'cad' AND table_name = 'appraisal_info'
            ORDER BY character_maximum_length
        """)
        rows = cur.fetchall()
        print("Column Lengths in DB:")
        for r in rows:
            print(f"  {r[0]}: {r[1]}")

Column Lengths in DB:
  confidential_flag: 1
  situs_zip: 10
  mail_zip: 10
  prop_type_cd: 20
  mail_country: 20
  situs_city: 30
  mail_state: 50
  mail_city: 50
  situs_street: 60
  owner_name: 70
  mail_addr_line2: 80
  mail_addr_line1: 80
  legal_desc: 150
  created_at: None
  prop_id: None
  prop_val_yr: None
  owner_id: None
  id: None


In [107]:
# Fix legal_desc length mismatch
try:
    with psycopg2.connect(**db_config) as conn:
        with conn.cursor() as cur:
            print("Altering appraisal_info.legal_desc to VARCHAR(340)...")
            cur.execute("ALTER TABLE cad.appraisal_info ALTER COLUMN legal_desc TYPE VARCHAR(340)")
            conn.commit()
            print("Schema updated.")
except Exception as e:
    print(f"Schema update failed: {e}")

Altering appraisal_info.legal_desc to VARCHAR(340)...
Schema updated.


## Verify Loaded Data

In [None]:
# Check record counts for all tables
tables = [
    "appraisal_header", "appraisal_state_code", "appraisal_country_code",
    "appraisal_abstract_subdv", "appraisal_agent", "appraisal_entity",
    "appraisal_info", "appraisal_land_detail", "appraisal_improvement_info",
    "appraisal_improvement_detail", "appraisal_improvement_detail_attr",
    "appraisal_lawsuit", "appraisal_mobile_home_info", 
    "appraisal_tax_deferral_info", "appraisal_udi"
]

print("Table Record Counts:")
print("-" * 50)
total = 0
for table in tables:
    count = db_service.get_table_count(table)
    total += count
    print(f"{table:40} {count:>8,}")
print("-" * 50)
print(f"{'TOTAL':40} {total:>8,}")

## Query Examples

In [None]:
# Sample property records
query = """
SELECT prop_id, prop_type_cd, prop_val_yr, owner_name, 
       situs_street, situs_city, situs_zip
FROM cad.appraisal_info 
WHERE owner_name IS NOT NULL
ORDER BY prop_id
LIMIT 10
"""
df = pd.read_sql(query, conn_string)
df

In [None]:
# Property count by type
query = """
SELECT prop_type_cd, COUNT(*) as count
FROM cad.appraisal_info
WHERE prop_type_cd IS NOT NULL
GROUP BY prop_type_cd
ORDER BY count DESC
"""
df = pd.read_sql(query, conn_string)
print("Property counts by type:")
df

In [None]:
# Properties by city
query = """
SELECT situs_city, COUNT(*) as count
FROM cad.appraisal_info
WHERE situs_city IS NOT NULL AND situs_city != ''
GROUP BY situs_city
ORDER BY count DESC
LIMIT 15
"""
df = pd.read_sql(query, conn_string)
print("Top 15 cities by property count:")
df

## Gateway Parks Analysis

Analyze homes in Gateway Parks subdivision to identify:
1. Total properties in the subdivision
2. Owner-occupied vs non-owner-occupied (investor) properties
3. Owner occupancy is determined by comparing owner's mailing address with the property (situs) address

In [64]:
# First, recreate the appraisal_info table with mailing address columns
# This is needed to compare owner's mailing address vs property address

import psycopg2

conn = psycopg2.connect(conn_string)
cur = conn.cursor()

# Drop and recreate with new schema
cur.execute("DROP TABLE IF EXISTS cad.appraisal_info CASCADE")
cur.execute("""
CREATE TABLE cad.appraisal_info (
    id SERIAL PRIMARY KEY,
    prop_id BIGINT,
    prop_type_cd VARCHAR(1),
    prop_val_yr INTEGER,
    owner_id BIGINT,
    owner_name VARCHAR(70),
    confidential_flag VARCHAR(1),
    mail_addr_line1 VARCHAR(80),
    mail_addr_line2 VARCHAR(80),
    mail_city VARCHAR(50),
    mail_state VARCHAR(50),
    mail_country VARCHAR(20),
    mail_zip VARCHAR(10),
    situs_street VARCHAR(60),
    situs_city VARCHAR(30),
    situs_zip VARCHAR(10),
    legal_desc VARCHAR(150),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
cur.execute("CREATE INDEX idx_info_prop_id ON cad.appraisal_info(prop_id)")
cur.execute("CREATE INDEX idx_info_legal_desc ON cad.appraisal_info USING gin(to_tsvector('english', legal_desc))")
conn.commit()
cur.close()
conn.close()

print("‚úÖ appraisal_info table recreated with mailing address columns")

‚úÖ appraisal_info table recreated with mailing address columns


In [63]:
# ============================================
# Reload modules and verify column positions
# ============================================
import importlib
import app.services.loader
import app.services.file_reader
importlib.reload(app.services.file_reader)
importlib.reload(app.services.loader)

from app.services.loader import DataLoader
from app.services.file_reader import read_fixed_width_file, get_file_path

# Reload layout config
layout_config = load_layout_config(CONFIG_DIR / "file_layouts.json")
info_config = layout_config.get_file_config("INFO")

print("Active columns for INFO:")
for col in info_config.active_columns:
    print(f"  {col.name}: length={col.length}")

# Test parsing a single line
info_file = CAD_DATA_DIR / "2025-10-27_002174_APPRAISAL_INFO.TXT"
with open(info_file, 'r') as f:
    first_line = f.readline()

# Manual parse to verify positions
from app.services.file_reader import parse_line
record = parse_line(first_line, info_config)
print("\n--- Sample Record (first line) ---")
for key, val in record.items():
    print(f"  {key}: '{val}'")

Active columns for INFO:
  prop_id: length=12
  prop_type_cd: length=1
  prop_val_yr: length=4
  owner_id: length=12
  owner_name: length=70
  confidential_flag: length=1
  mail_addr_line1: length=80
  mail_addr_line2: length=40
  mail_city: length=50
  mail_state: length=50
  mail_country: length=5
  mail_zip: length=10
  situs_street: length=60
  situs_city: length=30
  situs_zip: length=10
  legal_desc: length=340

--- Sample Record (first line) ---
  prop_id: '1'
  prop_type_cd: 'R'
  prop_val_yr: '2025'
  owner_id: '202803'
  owner_name: 'BARAY RAUL ANTONIO'
  confidential_flag: 'F'
  mail_addr_line1: '1823 BALMORAL DR'
  mail_addr_line2: 'None'
  mail_city: 'CARROLLTON'
  mail_state: 'TX'
  mail_country: 'US'
  mail_zip: '75006'
  situs_street: 'FM RD 2578'
  situs_city: 'TERRELL'
  situs_zip: '75160'
  legal_desc: 'R ANTHONY, TRACT 5.00; 26.232 ACRES, & HOUSE                                                                                                                          

In [65]:
# Load INFO data with mailing address columns
from pathlib import Path
import time

CAD_DATA_DIR = Path("/Users/tapiwamaruni/Documents/projects/housing1/Kaufman-CAD-2025-Certified-Full-Roll-Download-updated-with-Supp-5")
info_file = CAD_DATA_DIR / "2025-10-27_002174_APPRAISAL_INFO.TXT"

db_service = DatabaseService(db_config)

print(f"Loading INFO data from: {info_file.name}")
start_time = time.time()

# Read and insert records
records = list(read_fixed_width_file(info_file, info_config))
result = db_service.insert_records_streaming(
    records_generator=iter(records),
    file_config=info_config,
    batch_size=5000
)

print(f"\n‚úÖ Loaded {result:,} records in {time.time() - start_time:.1f}s")

Loading INFO data from: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 18:50:48 - cad_loader - INFO - Reading file: 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 18:50:51 - cad_loader - INFO - Processed 104369 records from 2025-10-27_002174_APPRAISAL_INFO.TXT
2025-12-07 18:54:35 - cad_loader - INFO - Completed inserting 72262 records into cad.appraisal_info

‚úÖ Loaded 72,262 records in 227.7s


In [66]:
# Verify the new columns have data
query = """
SELECT 
    COUNT(*) as total,
    COUNT(mail_addr_line1) as with_mail_addr,
    COUNT(mail_city) as with_mail_city,
    COUNT(situs_street) as with_situs
FROM cad.appraisal_info
"""
stats = pd.read_sql(query, conn_string)
print("Data quality check:")
print(stats.T)

# Sample record to verify addresses are loading correctly
sample_query = """
SELECT prop_id, owner_name, 
       mail_addr_line1, mail_city, mail_state, mail_zip,
       situs_street, situs_city, situs_zip
FROM cad.appraisal_info 
WHERE mail_addr_line1 IS NOT NULL
LIMIT 5
"""
sample = pd.read_sql(sample_query, conn_string)
print("\nSample records with mailing addresses:")
sample

Data quality check:
                    0
total           72262
with_mail_addr  71806
with_mail_city  71872
with_situs      67931

Sample records with mailing addresses:


Unnamed: 0,prop_id,owner_name,mail_addr_line1,mail_city,mail_state,mail_zip,situs_street,situs_city,situs_zip
0,2,HOLMES FRANCES A,5474 FM 987,KAUFMAN,TX,75142,FM RD 987,KAUFMAN,75142
1,8,CLAYTON LUCINDA,7025 CO RD 275,TERRELL,TX,75160,CO RD 275,TERRELL,75160
2,13,HARPER LAURA J TRUST &,1500 S HOUSTON ST,KAUFMAN,TX,75142,FM RD 987,KAUFMAN,75142
3,14,EVANS RODNEY & DEEDRA,5376 FM RD 987,KAUFMAN,TX,75142,FM RD 987,KAUFMAN,75142
4,22,SHORT RICHARD L,6330 CO RD 275,TERRELL,TX,75160,FM RD 987,TERRELL,75160


In [133]:
# ============================================
# GATEWAY PARKS - Find all properties in subdivision
# ============================================

gateway_query = """
SELECT 
    i.prop_id,
    i.owner_name,
    i.mail_addr_line1 as mail_address,
    i.mail_city,
    i.mail_state,
    i.mail_zip,
    i.situs_street,
    i.situs_city,
    i.situs_zip,
    i.legal_desc,
    i.prop_val_yr,
    -- Year built is not available in this dataset (not populated in IMPROVEMENT_INFO table)
    NULL as year_built,
    -- Get appraised value (assessed value) from entity info
    MAX(e.assessed_val) as appraised_value,
    -- Create full property address
    CASE 
        WHEN UPPER(i.mail_city) = 'FORNEY' THEN 
            CONCAT(i.mail_addr_line1, ', ', 'FORNEY', ', TX ', COALESCE(i.situs_zip, i.mail_zip))
        ELSE 
            CONCAT(COALESCE(i.situs_street, ''), ', ', COALESCE(i.situs_city, 'FORNEY'), ', TX ', COALESCE(i.situs_zip, ''))
    END as full_property_address
FROM cad.appraisal_info i
LEFT JOIN cad.appraisal_entity_info e ON i.prop_id = e.prop_id AND i.prop_val_yr = e.tax_year
WHERE UPPER(i.legal_desc) LIKE '%GATEWAY PARK%'
GROUP BY 
    i.prop_id, i.owner_name, i.mail_addr_line1, i.mail_city, i.mail_state, 
    i.mail_zip, i.situs_street, i.situs_city, i.situs_zip, i.legal_desc, i.prop_val_yr
ORDER BY i.situs_street
"""

with conn.cursor() as cur:
    cur.execute(gateway_query)
    columns = [desc[0] for desc in cur.description]
    rows = cur.fetchall()
    
gateway_parks = pd.DataFrame(rows, columns=columns)
print(f"üèòÔ∏è Found {len(gateway_parks):,} properties in Gateway Parks subdivision")
print(f"\nProperty address cities: {gateway_parks['situs_city'].value_counts().to_dict()}")

# Show sample with property addresses and values
print("\nüìç Sample Properties with Details:")
display_cols = ['prop_id', 'owner_name', 'appraised_value', 'full_property_address']
print(gateway_parks[display_cols].head(15).to_string())

# Show value statistics
print(f"\nüí∞ Appraised Value Statistics:")
print(f"  Average: ${gateway_parks['appraised_value'].mean():,.0f}")
print(f"  Median: ${gateway_parks['appraised_value'].median():,.0f}")
print(f"  Min: ${gateway_parks['appraised_value'].min():,.0f}")
print(f"  Max: ${gateway_parks['appraised_value'].max():,.0f}")

print(f"\n‚ö†Ô∏è  NOTE: Year built is not available in this dataset")

gateway_parks[['prop_id', 'owner_name', 'appraised_value', 'mail_city', 'situs_street']].head(10)


üèòÔ∏è Found 1,286 properties in Gateway Parks subdivision

Property address cities: {'FORNEY': 749, 'ATHENS': 1}

üìç Sample Properties with Details:
    prop_id                                   owner_name  appraised_value                                              full_property_address
0    207880                                SOLIS GERARDO           369912                                    1864 ARBOR DR, FORNEY, TX 75126
1    207881                          DAVIS CHEREA & GARY           422000                                    1860 ARBOR DR, FORNEY, TX 75126
2    207882             ASHUARRAH NELSON T & MAGDALENE C           396000                                    1856 ARBOR DR, FORNEY, TX 75126
3    207883  THRIVING MANAGEMENT A SERIES LLC SERIES FN2           347000  ARBOR                                             DR, FORNEY, TX 
4    207884                               SIMMONS DONALD           397385                                    1848 ARBOR DR, FORNEY, TX 75126
5

Unnamed: 0,prop_id,owner_name,appraised_value,mail_city,situs_street
0,207880,SOLIS GERARDO,369912,FORNEY,ARBOR ...
1,207881,DAVIS CHEREA & GARY,422000,FORNEY,ARBOR ...
2,207882,ASHUARRAH NELSON T & MAGDALENE C,396000,FORNEY,ARBOR ...
3,207883,THRIVING MANAGEMENT A SERIES LLC SERIES FN2,347000,DALLAS,ARBOR ...
4,207884,SIMMONS DONALD,397385,FORNEY,ARBOR ...
5,207885,DUONG ANDY,358640,FORNEY,ARBOR ...
6,207886,JACKSON BRANDY &,353584,FORNEY,ARBOR ...
7,207887,DILWORTH ANTHONY & EULA,330942,FORNEY,ARBOR ...
8,207888,MENDOZA RACHEL D & RAMON D,357404,FORNEY,ARBOR ...
9,207889,ATCHISON TIMOTHY D &,409586,FORNEY,ARBOR ...


In [134]:
# ============================================
# OWNER OCCUPANCY ANALYSIS
# Logic: If mailing city = FORNEY, owner likely lives at the property
# If mailing city is elsewhere, property is likely investor-owned
# ============================================

# Gateway Parks is in FORNEY, TX
PROPERTY_CITY = "FORNEY"

def determine_occupancy(row):
    """
    Determine owner occupancy based on mailing address city.
    If owner's mailing address is in FORNEY, they likely live there.
    If mailing address is elsewhere, it's likely an investment property.
    """
    mail_city = str(row['mail_city']).upper().strip() if pd.notna(row['mail_city']) else ""
    
    # Handle no mailing city
    if not mail_city:
        return "Unknown"
    
    # Check if mailing city matches property city
    if mail_city == PROPERTY_CITY:
        return "Owner-Occupied"
    else:
        return "Investor/Non-Owner"

gateway_parks['occupancy_status'] = gateway_parks.apply(determine_occupancy, axis=1)

# Summary statistics
print("üìä Gateway Parks Owner Occupancy Analysis")
print("=" * 50)
print(f"\nProperty Location: {PROPERTY_CITY}, TX")
print(f"Total Properties: {len(gateway_parks):,}")
print("\n" + "=" * 50)
print("Occupancy Classification (based on mailing address city):")
print("=" * 50)

occupancy_counts = gateway_parks['occupancy_status'].value_counts()
for status, count in occupancy_counts.items():
    pct = count / len(gateway_parks) * 100
    print(f"  {status}: {count:,} ({pct:.1f}%)")

print("\n" + "=" * 50)
print("Owner mailing city breakdown:")
print("=" * 50)
print(gateway_parks['mail_city'].value_counts().head(15))

üìä Gateway Parks Owner Occupancy Analysis

Property Location: FORNEY, TX
Total Properties: 1,286

Occupancy Classification (based on mailing address city):
  Owner-Occupied: 873 (67.9%)
  Investor/Non-Owner: 406 (31.6%)
  Unknown: 7 (0.5%)

Owner mailing city breakdown:
mail_city
FORNEY            873
FARMERS BRANCH     58
PLANO              46
DALLAS             27
SCOTTSDALE         25
ALLEN              22
FRISCO             22
IRVING             13
FARMERS            11
MARIETTA           10
PROSPER             9
CARROLLTON          6
HOUSTON             6
FRIENDSWOOD         5
ROWLETT             4
Name: count, dtype: int64


In [135]:
# ============================================
# INVESTOR PROPERTY ANALYSIS
# Who are the biggest investors in Gateway Parks?
# ============================================

investors = gateway_parks[gateway_parks['occupancy_status'] == 'Investor/Non-Owner'].copy()

print(f"üè¢ Investor-Owned Properties in Gateway Parks")
print("=" * 60)
print(f"\nTotal investor properties: {len(investors):,}")
print(f"Percentage of subdivision: {len(investors)/len(gateway_parks)*100:.1f}%")

# Top investors by number of properties
print("\n" + "=" * 60)
print("Top Property Owners (by count):")
print("=" * 60)
top_owners = investors.groupby('owner_name').size().sort_values(ascending=False).head(15)
for owner, count in top_owners.items():
    print(f"  {owner}: {count} properties")

# Where are investors located?
print("\n" + "=" * 60)
print("Investor Locations (mailing city):")
print("=" * 60)
investor_cities = investors['mail_city'].value_counts().head(15)
for city, count in investor_cities.items():
    print(f"  {city}: {count} properties")

üè¢ Investor-Owned Properties in Gateway Parks

Total investor properties: 406
Percentage of subdivision: 31.6%

Top Property Owners (by count):
  ASHTON DALLAS RESIDENTIAL LLC: 67 properties
  K HOVNANIAN DFW GATEWAY PARKS LLC: 23 properties
  GATEWAY PARKS HOA INC: 22 properties
  K HOVNANIAN DFW GATEWAY PARK LLC: 9 properties
  IDF1 SFR PROPCO A LLC: 8 properties
  PRNL RESIDENTIAL BUYER LLC: 7 properties
  STARLIGHT HOMES TEXAS LLC: 6 properties
  TRIANGLE TRIO VENTURES LLC: 5 properties
  CSH PROPERTY ONE LLC: 3 properties
  ADDAYPALLY SHRAVAN K & PADMINI SAMPATH: 3 properties
  PATHAK ALOK K: 2 properties
  SFR II TEXAS SUB 2021-3 LLC: 2 properties
  PONUGUPATI GOPALA K ETAL: 2 properties
  KHAN MOHAMMAD S: 2 properties
  MISHRA NACHIKETA &: 2 properties

Investor Locations (mailing city):
  FARMERS BRANCH: 58 properties
  PLANO: 46 properties
  DALLAS: 27 properties
  SCOTTSDALE: 25 properties
  ALLEN: 22 properties
  FRISCO: 22 properties
  IRVING: 13 properties
  FARMERS: 11 

In [123]:
# ============================================
# DETAILED VIEW: Investor Properties
# ============================================

# Display investor properties with details
print("üìã Investor-Owned Properties (sorted by owner)")
print("=" * 80)

display_cols = ['owner_name', 'mail_address', 'mail_city', 'mail_state', 'prop_val_yr', 'year_built', 'appraised_value']
investors_display = investors[display_cols].sort_values('owner_name')
investors_display.head(30)


üìã Investor-Owned Properties (sorted by owner)


Unnamed: 0,owner_name,mail_address,mail_city,mail_state,prop_val_yr,year_built,appraised_value
941,1829 PUEBLO STREET TRUST,166 GEARY STR ...,SAN FRANCISCO,CA,2025,,598000000000031
1154,2024 1 IH BORROWER LP,5420 LBJ FWY ...,DALLAS,TX,2025,,392000000000035
993,3BSAS INVESTMENTS LLC,1804 FARINGDON DR,PLANO,TX,2025,,535000000000028
930,ADDAYPALLY SHRAVAN K & PADMINI SAMPATH,13467 PROUD CLAIRON ST,FRISCO,TX,2025,,518500000000038
947,ADDAYPALLY SHRAVAN K & PADMINI SAMPATH,13467 PROUD CLARION ST,FRISCO,TX,2025,,500000000000034
489,ADDAYPALLY SHRAVAN K & PADMINI SAMPATH,13467 PROUD CLARION ST,FRISCO,TX,2025,,500000000000034
963,AHMAD MOSHEER Y,1216 FARGO DR,MCKINNEY,TX,2025,,630900000000038
1057,AJMERI SALIM A,42668 LOBELIA PL,CHANTILLY,VA,2025,,412300000000035
35,ALI KARIM,4313 TALL KNIGHT LN,CARROLLTON,TX,2025,,400000000000036
264,ALI KARIM,4313 TALL KNIGHT LN,CARROLLTON,TX,2025,,800000000000040


In [124]:
# ============================================
# CORPORATE vs INDIVIDUAL INVESTORS
# ============================================

def classify_owner_type(name):
    """Classify owner as corporate or individual"""
    if pd.isna(name):
        return "Unknown"
    name = name.upper()
    corp_indicators = ['LLC', 'LP', 'INC', 'CORP', 'TRUST', 'PARTNERS', 'PROPERTIES', 
                       'HOMES', 'RESIDENTIAL', 'HOLDINGS', 'VENTURES', 'HOA']
    for indicator in corp_indicators:
        if indicator in name:
            return "Corporate/Entity"
    return "Individual"

investors['owner_type'] = investors['owner_name'].apply(classify_owner_type)

print("üè¢ Owner Type Analysis")
print("=" * 60)
owner_type_counts = investors['owner_type'].value_counts()
for otype, count in owner_type_counts.items():
    pct = count / len(investors) * 100
    print(f"  {otype}: {count} ({pct:.1f}%)")

# Corporate owners detail
print("\n" + "=" * 60)
print("Corporate/Entity Owners:")
print("=" * 60)
corp_owners = investors[investors['owner_type'] == 'Corporate/Entity']
corp_summary = corp_owners.groupby('owner_name').size().sort_values(ascending=False)
for owner, count in corp_summary.head(10).items():
    print(f"  {owner}: {count} properties")

üè¢ Owner Type Analysis
  Corporate/Entity: 226 (55.7%)
  Individual: 180 (44.3%)

Corporate/Entity Owners:
  ASHTON DALLAS RESIDENTIAL LLC: 67 properties
  K HOVNANIAN DFW GATEWAY PARKS LLC: 23 properties
  GATEWAY PARKS HOA INC: 22 properties
  K HOVNANIAN DFW GATEWAY PARK LLC: 9 properties
  IDF1 SFR PROPCO A LLC: 8 properties
  PRNL RESIDENTIAL BUYER LLC: 7 properties
  STARLIGHT HOMES TEXAS LLC: 6 properties
  TRIANGLE TRIO VENTURES LLC: 5 properties
  CSH PROPERTY ONE LLC: 3 properties
  SFR II TEXAS SUB 2021-3 LLC: 2 properties


In [136]:
# ============================================
# SUMMARY & EXPORT
# ============================================

print("üìä GATEWAY PARKS SUBDIVISION SUMMARY")
print("=" * 60)
print(f"\nTotal Properties: {len(gateway_parks):,}")
print(f"\nOwnership Breakdown:")

# Create summary
summary_data = {
    'Category': ['Owner-Occupied', 'Investor-Owned', 'Unknown'],
    'Count': [
        (gateway_parks['occupancy_status'] == 'Owner-Occupied').sum(),
        (gateway_parks['occupancy_status'] == 'Investor/Non-Owner').sum(),
        (gateway_parks['occupancy_status'] == 'Unknown').sum()
    ]
}
summary_df = pd.DataFrame(summary_data)
summary_df['Percentage'] = (summary_df['Count'] / len(gateway_parks) * 100).round(1)
print(summary_df.to_string(index=False))

# Export to CSV
output_file = project_root / "gateway_parks_analysis.csv"
gateway_parks.to_csv(output_file, index=False)
print(f"\n‚úÖ Full dataset exported to: {output_file}")

# Also export investors list
investors_file = project_root / "gateway_parks_investors.csv"
investors.to_csv(investors_file, index=False)
print(f"‚úÖ Investors list exported to: {investors_file}")

üìä GATEWAY PARKS SUBDIVISION SUMMARY

Total Properties: 1,286

Ownership Breakdown:
      Category  Count  Percentage
Owner-Occupied    873        67.9
Investor-Owned    406        31.6
       Unknown      7         0.5

‚úÖ Full dataset exported to: /Users/tapiwamaruni/Documents/projects/housing1/gateway_parks_analysis.csv
‚úÖ Investors list exported to: /Users/tapiwamaruni/Documents/projects/housing1/gateway_parks_investors.csv


In [None]:
# ============================================
# EXPORT: Save Gateway Parks analysis to CSV
# ============================================

# Save full dataset with occupancy classification
output_file = project_root / "gateway_parks_analysis.csv"
gateway_parks.to_csv(output_file, index=False)
print(f"‚úÖ Saved analysis to: {output_file}")

# Summary by occupancy status
summary = gateway_parks.groupby('occupancy_status').agg({
    'prop_id': 'count',
}).rename(columns={'prop_id': 'property_count'})
summary['percentage'] = (summary['property_count'] / len(gateway_parks) * 100).round(1)
print("\nüìä Summary:")
summary