In [10]:
# Real Estate Data Processing for Central New Jersey

## Setup and Import Libraries

import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pathlib import Path

# Optional: Set display options for better visibility
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)


In [18]:

## Load Data Files

def load_csv_files(file_paths):
    """
    Load multiple CSV files into pandas DataFrames
    
    Args:
        file_paths (dict): Dictionary with keys as dataframe names and values as file paths
        
    Returns:
        dict: Dictionary with keys as dataframe names and values as pandas DataFrames
    """
    dataframes = {}
    
    for name, path in file_paths.items():
        try:
            dataframes[name] = pd.read_csv(path)
            print(f"Successfully loaded {name} with {len(dataframes[name])} records")
        except Exception as e:
            print(f"Error loading {name}: {e}")
    
    return dataframes

# Define file paths - adjust these to match your actual file locations
file_paths = {
    'buyers': 'real_estate_synthetic_data/buyers.csv',
    'houses': 'real_estate_synthetic_data/houses_for_sale.csv',
    'past_sales': 'real_estate_synthetic_data/past_sales.csv',
    'call_transcripts': 'real_estate_synthetic_data/basic_call_transcripts.csv',
    # Add any additional CSV files you might have
    # 'realtors': 'realtors.csv',
}

# Load all dataframes
dfs = load_csv_files(file_paths)


Successfully loaded buyers with 5000 records
Successfully loaded houses with 1000 records
Successfully loaded past_sales with 500 records
Successfully loaded call_transcripts with 10000 records


In [19]:
## Explore Data Structure

# Display sample data and column information
for name, df in dfs.items():
    print(f"\n--- {name.upper()} DataFrame ---")
    print(f"Shape: {df.shape}")
    print("Columns:")
    for col in df.columns:
        print(f"  - {col}: {df[col].dtype}")
    print("\nSample Data:")
    display(df.head(2))

## Data Cleaning and Preparation

def clean_and_prepare_data(dfs):
    """
    Clean and prepare dataframes for joining
    
    Args:
        dfs (dict): Dictionary with dataframes
        
    Returns:
        dict: Dictionary with cleaned dataframes
    """
    cleaned_dfs = {}
    
    # Deep copy to avoid modifying original dataframes
    for name, df in dfs.items():
        cleaned_dfs[name] = df.copy(deep=True)
    
    # Convert ID columns to string to ensure proper joining
    if 'buyers' in cleaned_dfs:
        if 'BuyerID' in cleaned_dfs['buyers'].columns:
            cleaned_dfs['buyers']['BuyerID'] = cleaned_dfs['buyers']['BuyerID'].astype(str)
    
    if 'houses' in cleaned_dfs:
        if 'HouseID' in cleaned_dfs['houses'].columns:
            cleaned_dfs['houses']['HouseID'] = cleaned_dfs['houses']['HouseID'].astype(str)
    
    if 'past_sales' in cleaned_dfs:
        if 'HouseID' in cleaned_dfs['past_sales'].columns:
            cleaned_dfs['past_sales']['HouseID'] = cleaned_dfs['past_sales']['HouseID'].astype(str)
        if 'BuyerID' in cleaned_dfs['past_sales'].columns:
            cleaned_dfs['past_sales']['BuyerID'] = cleaned_dfs['past_sales']['BuyerID'].astype(str)
        # Convert SaleDate to datetime
        if 'SaleDate' in cleaned_dfs['past_sales'].columns:
            cleaned_dfs['past_sales']['SaleDate'] = pd.to_datetime(cleaned_dfs['past_sales']['SaleDate'])
    
    # Copy for call_transcripts if it exists and has the same columns
    if 'call_transcripts' in cleaned_dfs:
        if 'HouseID' in cleaned_dfs['call_transcripts'].columns:
            cleaned_dfs['call_transcripts']['HouseID'] = cleaned_dfs['call_transcripts']['HouseID'].astype(str)
        if 'BuyerID' in cleaned_dfs['call_transcripts'].columns:
            cleaned_dfs['call_transcripts']['BuyerID'] = cleaned_dfs['call_transcripts']['BuyerID'].astype(str)
        if 'SaleDate' in cleaned_dfs['call_transcripts'].columns:
            cleaned_dfs['call_transcripts']['SaleDate'] = pd.to_datetime(cleaned_dfs['call_transcripts']['SaleDate'])
    
    return cleaned_dfs

# Clean and prepare data
cleaned_dfs = clean_and_prepare_data(dfs)



--- BUYERS DataFrame ---
Shape: (5000, 14)
Columns:
  - BuyerID: object
  - FullName: object
  - Address: object
  - SSN: object
  - PhoneNumber: object
  - Email: object
  - IncomeBracket: object
  - AnnualIncome: int64
  - NetWorth: int64
  - TotalDebt: int64
  - CreditScore: int64
  - DesiredPriceRange_Min: float64
  - DesiredPriceRange_Max: float64
  - PreApprovedAmount: int64

Sample Data:


Unnamed: 0,BuyerID,FullName,Address,SSN,PhoneNumber,Email,IncomeBracket,AnnualIncome,NetWorth,TotalDebt,CreditScore,DesiredPriceRange_Min,DesiredPriceRange_Max,PreApprovedAmount
0,84092373-60de-427b-a0a7-a9f096e107a3,Nathan Sanchez,"4009 Carter Alley Suite 217, West Windsor, NJ 08618",702-78-5163,(307)773-5111x30556,owashington@example.com,Medium,117249,479189,272934,669,450000.0,633268.0,567860
1,42b70d65-9cfd-481c-9b12-4dc862dab833,Cindy Henderson,"635 Steven Meadows, Franklin Township, NJ 08498",455-07-1143,001-926-910-8836x670,adrienneedwards@example.com,Low,68657,10379,73615,723,450000.0,450000.0,480703



--- HOUSES DataFrame ---
Shape: (1000, 10)
Columns:
  - HouseID: object
  - Address: object
  - ListingPrice: int64
  - Bedrooms: int64
  - Bathrooms: float64
  - SquareFootage: int64
  - LotSize_Acres: float64
  - YearBuilt: int64
  - PropertyType: object
  - Status: object

Sample Data:


Unnamed: 0,HouseID,Address,ListingPrice,Bedrooms,Bathrooms,SquareFootage,LotSize_Acres,YearBuilt,PropertyType,Status
0,e9d0206b-debd-4b8a-b68d-cc770fb32c7d,"840 Vega Spurs Apt. 726, Hillsborough, NJ 08761",1859911,2,4.0,3724,1.44,1965,Single Family,For Sale
1,b63d84bb-0fdb-467d-a93b-a4f8268dbd2c,"95522 Lawrence Island Apt. 349, Hillsborough, NJ 08834",3270629,6,6.0,6160,2.04,1994,Single Family,For Sale



--- PAST_SALES DataFrame ---
Shape: (500, 20)
Columns:
  - SaleID: object
  - HouseID: object
  - BuyerID: object
  - SellerFullName: object
  - SellerAddress: object
  - ListingPrice: int64
  - SalePrice: int64
  - SaleDate: object
  - SaleCategory: object
  - SaleDetails: object
  - Buyer_AnnualIncome: int64
  - Buyer_NetWorth: int64
  - Buyer_TotalDebt: int64
  - Buyer_CreditScore: int64
  - House_Address: object
  - House_Bedrooms: int64
  - House_Bathrooms: float64
  - House_SquareFootage: int64
  - House_YearBuilt: int64
  - House_PropertyType: object

Sample Data:


Unnamed: 0,SaleID,HouseID,BuyerID,SellerFullName,SellerAddress,ListingPrice,SalePrice,SaleDate,SaleCategory,SaleDetails,Buyer_AnnualIncome,Buyer_NetWorth,Buyer_TotalDebt,Buyer_CreditScore,House_Address,House_Bedrooms,House_Bathrooms,House_SquareFootage,House_YearBuilt,House_PropertyType
0,31a4b65a-dfa1-4f29-ae22-0c082e5070e6,e8a15cc0-ae28-4ee7-a436-8c13b0ae4948,c1206aa8-7991-4ff1-9c64-8c8c165826bf,Brian Christensen,"75627 Micheal Groves, Woodbridge, NJ 07089",1658266,1596812,2023-05-07,Normal,Standard transaction.,228326,1244831,147890,702,"3959 Ashley Village, Montgomery, NJ 07317",5,4.5,4482,2016,Single Family
1,0d72c875-922e-4a2b-9ea7-98bc2e7254f0,7f2362d3-41d4-42c9-bbfe-d48f233b44a8,ac36acd2-c6d8-4c48-89ac-4d80af9da158,Veronica Holmes,"13658 Cooper Summit Suite 308, Montgomery, NJ 08924",1734526,1620621,2023-01-13,Normal,Standard transaction.,137784,549338,76540,690,"644 Oliver Brook, West Windsor, NJ 07529",6,4.5,5522,2015,Single Family



--- CALL_TRANSCRIPTS DataFrame ---
Shape: (10000, 6)
Columns:
  - TranscriptID: object
  - CallDateTime: object
  - BuyerID: object
  - BrokerName: object
  - MortgageBankerName: object
  - TranscriptText: object

Sample Data:


Unnamed: 0,TranscriptID,CallDateTime,BuyerID,BrokerName,MortgageBankerName,TranscriptText
0,a308472d-5055-4ece-b698-31d4c1885651,2023-09-13 22:48:06.736542,adb735a0-2433-4515-b3f3-8ccb19322c45,Harold Garcia,Tony Vazquez,"Broker: Harold Garcia checking in with Brianna Baker. Any thoughts on the properties we saw last week? \nBuyer: Still considering. The one near Franklin Township park was nice. \nBroker: Got it. Just confirming your details for updates: Name: Brianna Baker, Address: 31247 Miller Fields, Woodbridge, NJ 07655, SSN: 795-94-0501."
1,3ae10cbc-ab58-4945-97fe-7a1e078db431,2024-07-29 10:07:32.989926,c14e2182-d73f-48c3-a066-b5a7d28534f8,Jonathan Nguyen,Thomas Evans,"MB: Thomas Evans here for Richard Anderson. We need to verify income for the pre-approval. \nBuyer: Okay, what do you need? \nMB: Can you confirm your full SSN 444-58-9536 and current residence at 40500 Alvarez Fork Suite 318, Hopewell, NJ 07324 for security?"


In [20]:

## Join Data into a Comprehensive DataFrame

def join_real_estate_data(dfs):
    """
    Join real estate data from multiple dataframes
    
    Args:
        dfs (dict): Dictionary with cleaned dataframes
        
    Returns:
        DataFrame: Comprehensive joined dataframe
    """
    # Start with past_sales which seems to be the central table
    if 'past_sales' not in dfs or len(dfs['past_sales']) == 0:
        print("No past sales data available")
        return pd.DataFrame()
    
    # Create a copy to avoid modifying the original
    joined_df = dfs['past_sales'].copy()
    
    # Join with buyers data if available
    if 'buyers' in dfs and len(dfs['buyers']) > 0:
        # Use left join to keep all past sales records
        joined_df = pd.merge(
            joined_df,
            dfs['buyers'],
            on='BuyerID',
            how='left',
            suffixes=('', '_buyer')
        )
    
    # Join with houses data if available
    if 'houses' in dfs and len(dfs['houses']) > 0:
        # Use left join to keep all past sales records
        joined_df = pd.merge(
            joined_df,
            dfs['houses'],
            on='HouseID',
            how='left',
            suffixes=('', '_house_listing')
        )
    
    # Note: we're not joining with call_transcripts directly since it appears to have the same schema as past_sales
    # Instead, we'll handle call transcripts separately in the create_complete_houses_json function
    
    return joined_df

def create_comprehensive_dataset(dfs):
    """
    Create a more comprehensive dataset that includes both past sales and 
    houses for sale with all related data
    
    Args:
        dfs (dict): Dictionary with cleaned dataframes
        
    Returns:
        DataFrame: Expanded comprehensive dataframe
    """
    # First, create the standard joined dataframe
    past_sales_joined = join_real_estate_data(dfs)
    
    # Now create a dataframe for houses currently for sale
    if 'houses' not in dfs or len(dfs['houses']) == 0:
        print("No houses for sale data available")
        return past_sales_joined
    
    # Try to add houses that are for sale but haven't been sold yet
    # These won't be in the past_sales dataframe
    
    # Get a list of houses that have been sold
    sold_house_ids = set()
    if 'past_sales' in dfs and len(dfs['past_sales']) > 0 and 'HouseID' in dfs['past_sales'].columns:
        sold_house_ids = set(dfs['past_sales']['HouseID'].unique())
    
    # Filter houses for sale to only include those not in past_sales
    if 'houses' in dfs and len(dfs['houses']) > 0 and 'HouseID' in dfs['houses'].columns:
        houses_for_sale = dfs['houses'].copy()
        houses_for_sale['IsActiveListing'] = True
        
        # Add a flag for active listings to the past_sales_joined dataframe
        if len(past_sales_joined) > 0:
            past_sales_joined['IsActiveListing'] = False
        
        # Only include houses that are not in the past_sales dataframe
        houses_not_sold = houses_for_sale[~houses_for_sale['HouseID'].isin(sold_house_ids)]
        
        # Create a dataframe with the same structure as past_sales_joined
        if len(houses_not_sold) > 0:
            # Add empty columns to match past_sales_joined structure
            for col in past_sales_joined.columns:
                if col not in houses_not_sold.columns:
                    houses_not_sold[col] = None
            
            # Combine the two dataframes
            comprehensive_df = pd.concat([past_sales_joined, houses_not_sold], ignore_index=True)
            return comprehensive_df
    
    # If no new houses for sale, just return the past_sales_joined dataframe
    return past_sales_joined

# Join the data for past sales
past_sales_joined = join_real_estate_data(cleaned_dfs)

# Create a more comprehensive dataset with both past sales and active listings
comprehensive_df = create_comprehensive_dataset(cleaned_dfs)


  comprehensive_df = pd.concat([past_sales_joined, houses_not_sold], ignore_index=True)


In [21]:

# Display the result
print("\n--- COMPREHENSIVE JOINED DATAFRAME ---")
print(f"Shape: {comprehensive_df.shape}")
print("Includes both past sales and active listings")
print("Columns:")
for col in comprehensive_df.columns:
    print(f"  - {col}")
print("\nSample Data:")
display(comprehensive_df.head(2))

## Convert to JSON Objects

class NumpyEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that can handle NumPy data types
    """
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, pd.Timestamp):
            return obj.strftime('%Y-%m-%d')
        return super(NumpyEncoder, self).default(obj)

def dataframe_to_json(df, orient='records', indent=2):
    """
    Convert dataframe to JSON
    
    Args:
        df (DataFrame): Pandas DataFrame to convert
        orient (str): Orientation of the JSON output
        indent (int): Indentation level
        
    Returns:
        str: JSON string
    """
    # First convert datetime columns to string to make it JSON serializable
    df_copy = df.copy()
    
    for col in df_copy.select_dtypes(include=['datetime64']).columns:
        df_copy[col] = df_copy[col].dt.strftime('%Y-%m-%d')
    
    # Convert to JSON string using the custom encoder
    json_str = df_copy.to_json(orient=orient, indent=indent, date_format='iso')
    
    # Load the JSON string to get a Python object
    json_obj = json.loads(json_str)
    
    return json_obj, json_str

# Convert the comprehensive dataframe to JSON
json_obj, json_str = dataframe_to_json(comprehensive_df)

# Print the first JSON object to see the structure
print("\n--- SAMPLE JSON OBJECT ---")
print(json.dumps(json_obj[0], indent=2) if json_obj else "No data")



--- COMPREHENSIVE JOINED DATAFRAME ---
Shape: (1500, 43)
Includes both past sales and active listings
Columns:
  - SaleID
  - HouseID
  - BuyerID
  - SellerFullName
  - SellerAddress
  - ListingPrice
  - SalePrice
  - SaleDate
  - SaleCategory
  - SaleDetails
  - Buyer_AnnualIncome
  - Buyer_NetWorth
  - Buyer_TotalDebt
  - Buyer_CreditScore
  - House_Address
  - House_Bedrooms
  - House_Bathrooms
  - House_SquareFootage
  - House_YearBuilt
  - House_PropertyType
  - FullName
  - Address
  - SSN
  - PhoneNumber
  - Email
  - IncomeBracket
  - AnnualIncome
  - NetWorth
  - TotalDebt
  - CreditScore
  - DesiredPriceRange_Min
  - DesiredPriceRange_Max
  - PreApprovedAmount
  - Address_house_listing
  - ListingPrice_house_listing
  - Bedrooms
  - Bathrooms
  - SquareFootage
  - LotSize_Acres
  - YearBuilt
  - PropertyType
  - Status
  - IsActiveListing

Sample Data:


Unnamed: 0,SaleID,HouseID,BuyerID,SellerFullName,SellerAddress,ListingPrice,SalePrice,SaleDate,SaleCategory,SaleDetails,Buyer_AnnualIncome,Buyer_NetWorth,Buyer_TotalDebt,Buyer_CreditScore,House_Address,House_Bedrooms,House_Bathrooms,House_SquareFootage,House_YearBuilt,House_PropertyType,FullName,Address,SSN,PhoneNumber,Email,IncomeBracket,AnnualIncome,NetWorth,TotalDebt,CreditScore,DesiredPriceRange_Min,DesiredPriceRange_Max,PreApprovedAmount,Address_house_listing,ListingPrice_house_listing,Bedrooms,Bathrooms,SquareFootage,LotSize_Acres,YearBuilt,PropertyType,Status,IsActiveListing
0,31a4b65a-dfa1-4f29-ae22-0c082e5070e6,e8a15cc0-ae28-4ee7-a436-8c13b0ae4948,c1206aa8-7991-4ff1-9c64-8c8c165826bf,Brian Christensen,"75627 Micheal Groves, Woodbridge, NJ 07089",1658266,1596812,2023-05-07,Normal,Standard transaction.,228326,1244831,147890,702,"3959 Ashley Village, Montgomery, NJ 07317",5,4.5,4482,2016,Single Family,Michael Brown,"7874 Joshua Islands, Hillsborough, NJ 07990",791-91-2935,+1-506-898-4274x107,luis34@example.org,High,228326,1244831,147890,702,888242.0,1649593.0,1682076,,,,,,,,,,False
1,0d72c875-922e-4a2b-9ea7-98bc2e7254f0,7f2362d3-41d4-42c9-bbfe-d48f233b44a8,ac36acd2-c6d8-4c48-89ac-4d80af9da158,Veronica Holmes,"13658 Cooper Summit Suite 308, Montgomery, NJ 08924",1734526,1620621,2023-01-13,Normal,Standard transaction.,137784,549338,76540,690,"644 Oliver Brook, West Windsor, NJ 07529",6,4.5,5522,2015,Single Family,Colton Berry,"713 Wang Drive, Hillsborough, NJ 07063",049-48-2140,948-299-7607x94398,jill13@example.org,Medium,137784,549338,76540,690,450000.0,773443.0,824244,,,,,,,,,,False



--- SAMPLE JSON OBJECT ---
{
  "SaleID": "31a4b65a-dfa1-4f29-ae22-0c082e5070e6",
  "HouseID": "e8a15cc0-ae28-4ee7-a436-8c13b0ae4948",
  "BuyerID": "c1206aa8-7991-4ff1-9c64-8c8c165826bf",
  "SellerFullName": "Brian Christensen",
  "SellerAddress": "75627 Micheal Groves, Woodbridge, NJ 07089",
  "ListingPrice": 1658266,
  "SalePrice": 1596812,
  "SaleDate": "2023-05-07",
  "SaleCategory": "Normal",
  "SaleDetails": "Standard transaction.",
  "Buyer_AnnualIncome": 228326,
  "Buyer_NetWorth": 1244831,
  "Buyer_TotalDebt": 147890,
  "Buyer_CreditScore": 702,
  "House_Address": "3959 Ashley Village, Montgomery, NJ 07317",
  "House_Bedrooms": 5,
  "House_Bathrooms": 4.5,
  "House_SquareFootage": 4482,
  "House_YearBuilt": 2016,
  "House_PropertyType": "Single Family",
  "FullName": "Michael Brown",
  "Address": "7874 Joshua Islands, Hillsborough, NJ 07990",
  "SSN": "791-91-2935",
  "PhoneNumber": "+1-506-898-4274x107",
  "Email": "luis34@example.org",
  "IncomeBracket": "High",
  "AnnualInc

In [22]:

## Create More Specialized JSON Objects

def create_specialized_json_objects(dfs, comprehensive_df):
    """
    Create specialized JSON objects for different aspects of the real estate data
    
    Args:
        dfs (dict): Dictionary with cleaned dataframes
        comprehensive_df (DataFrame): Comprehensive joined dataframe
        
    Returns:
        dict: Dictionary with different JSON objects
    """
    json_objects = {}
    
    # 1. Buyers with their profile information
    if 'buyers' in dfs and len(dfs['buyers']) > 0:
        json_objects['buyers'], _ = dataframe_to_json(dfs['buyers'])
    
    # 2. Houses for sale with full details
    if 'houses' in dfs and len(dfs['houses']) > 0:
        json_objects['houses_for_sale'], _ = dataframe_to_json(dfs['houses'])
    
    # 3. Past sales with detailed information
    if 'past_sales' in dfs and len(dfs['past_sales']) > 0:
        json_objects['past_sales'], _ = dataframe_to_json(dfs['past_sales'])
    
    # 4. Call transcripts if they exist and are different from past_sales
    if 'call_transcripts' in dfs and len(dfs['call_transcripts']) > 0:
        # Check if call_transcripts has the same data as past_sales
        if not dfs['call_transcripts'].equals(dfs['past_sales']):
            json_objects['call_transcripts'], _ = dataframe_to_json(dfs['call_transcripts'])
    
    # 5. Comprehensive joined data
    if len(comprehensive_df) > 0:
        json_objects['comprehensive'], _ = dataframe_to_json(comprehensive_df)
    
    # 6. Create a unified houses dataset with all related data
    json_objects['complete_houses'] = create_complete_houses_json(dfs, comprehensive_df)
    
    return json_objects

def create_complete_houses_json(dfs, comprehensive_df):
    """
    Create a comprehensive houses JSON dataset that includes:
    - All houses (both from past sales and active listings)
    - All related data joined to them (buyers, sellers, etc.)
    - Call transcripts as arrays within each house object
    
    Args:
        dfs (dict): Dictionary with cleaned dataframes
        comprehensive_df (DataFrame): Comprehensive joined dataframe
        
    Returns:
        list: List of house objects with all related data
    """
    # Start with an empty list to store all house objects
    all_houses = []
    
    # Get all unique house IDs from both active listings and past sales
    house_ids = set()
    
    if 'houses' in dfs and len(dfs['houses']) > 0 and 'HouseID' in dfs['houses'].columns:
        house_ids.update(dfs['houses']['HouseID'].unique())
    
    if 'past_sales' in dfs and len(dfs['past_sales']) > 0 and 'HouseID' in dfs['past_sales'].columns:
        house_ids.update(dfs['past_sales']['HouseID'].unique())
    
    # Process each house
    for house_id in house_ids:
        # Create a house object
        house_obj = {"HouseID": house_id}
        
        # Add house details from houses for sale dataframe if available
        if 'houses' in dfs and len(dfs['houses']) > 0 and 'HouseID' in dfs['houses'].columns:
            house_data = dfs['houses'][dfs['houses']['HouseID'] == house_id]
            if len(house_data) > 0:
                for col in house_data.columns:
                    if col != 'HouseID':  # Already added
                        # Convert NumPy types to Python native types
                        val = house_data.iloc[0][col]
                        if isinstance(val, np.integer):
                            house_obj[col] = int(val)
                        elif isinstance(val, np.floating):
                            house_obj[col] = float(val)
                        elif isinstance(val, np.ndarray):
                            house_obj[col] = val.tolist()
                        elif isinstance(val, pd.Timestamp):
                            house_obj[col] = val.strftime('%Y-%m-%d')
                        else:
                            house_obj[col] = val
                house_obj['IsActiveListing'] = True
            else:
                house_obj['IsActiveListing'] = False
        
        # Add any house details from past sales if available
        if 'past_sales' in dfs and len(dfs['past_sales']) > 0 and 'HouseID' in dfs['past_sales'].columns:
            past_sales = dfs['past_sales'][dfs['past_sales']['HouseID'] == house_id]
            
            if len(past_sales) > 0:
                # Add house-specific fields from past sales if they don't already exist
                for col in past_sales.columns:
                    if col.startswith('House_') and col not in house_obj:
                        # Convert NumPy types to Python native types
                        val = past_sales.iloc[0][col]
                        if isinstance(val, np.integer):
                            house_obj[col.replace('House_', '')] = int(val)
                        elif isinstance(val, np.floating):
                            house_obj[col.replace('House_', '')] = float(val)
                        elif isinstance(val, np.ndarray):
                            house_obj[col.replace('House_', '')] = val.tolist()
                        elif isinstance(val, pd.Timestamp):
                            house_obj[col.replace('House_', '')] = val.strftime('%Y-%m-%d')
                        else:
                            house_obj[col.replace('House_', '')] = val
                
                # Create a sales history array
                sales_history = []
                for _, sale in past_sales.iterrows():
                    sale_obj = {}
                    for col in sale.index:
                        if not col.startswith('House_'):  # Skip house fields already added
                            # Convert NumPy types to Python native types
                            val = sale[col]
                            if isinstance(val, np.integer):
                                sale_obj[col] = int(val)
                            elif isinstance(val, np.floating):
                                sale_obj[col] = float(val)
                            elif isinstance(val, np.ndarray):
                                sale_obj[col] = val.tolist()
                            elif isinstance(val, pd.Timestamp):
                                sale_obj[col] = val.strftime('%Y-%m-%d')
                            else:
                                sale_obj[col] = val
                    
                    # Add buyer details if available
                    if 'buyers' in dfs and len(dfs['buyers']) > 0 and 'BuyerID' in sale_obj:
                        buyer_id = sale_obj['BuyerID']
                        buyer_data = dfs['buyers'][dfs['buyers']['BuyerID'] == buyer_id]
                        if len(buyer_data) > 0:
                            buyer_dict = {}
                            for col in buyer_data.columns:
                                # Convert NumPy types to Python native types
                                val = buyer_data.iloc[0][col]
                                if isinstance(val, np.integer):
                                    buyer_dict[col] = int(val)
                                elif isinstance(val, np.floating):
                                    buyer_dict[col] = float(val)
                                elif isinstance(val, np.ndarray):
                                    buyer_dict[col] = val.tolist()
                                elif isinstance(val, pd.Timestamp):
                                    buyer_dict[col] = val.strftime('%Y-%m-%d')
                                else:
                                    buyer_dict[col] = val
                            sale_obj['BuyerDetails'] = buyer_dict
                    
                    sales_history.append(sale_obj)
                
                house_obj['SalesHistory'] = sales_history
        
        # Add call transcripts as an array if available
        if 'call_transcripts' in dfs and len(dfs['call_transcripts']) > 0 and 'HouseID' in dfs['call_transcripts'].columns:
            transcripts = dfs['call_transcripts'][dfs['call_transcripts']['HouseID'] == house_id]
            
            if len(transcripts) > 0:
                call_transcripts = []
                for _, transcript in transcripts.iterrows():
                    transcript_dict = {}
                    for col in transcript.index:
                        # Convert NumPy types to Python native types
                        val = transcript[col]
                        if isinstance(val, np.integer):
                            transcript_dict[col] = int(val)
                        elif isinstance(val, np.floating):
                            transcript_dict[col] = float(val)
                        elif isinstance(val, np.ndarray):
                            transcript_dict[col] = val.tolist()
                        elif isinstance(val, pd.Timestamp):
                            transcript_dict[col] = val.strftime('%Y-%m-%d')
                        else:
                            transcript_dict[col] = val
                    call_transcripts.append(transcript_dict)
                
                house_obj['CallTranscripts'] = call_transcripts
        
        # Add to the list of all houses
        all_houses.append(house_obj)
    
    return all_houses

# Create specialized JSON objects
specialized_json = create_specialized_json_objects(cleaned_dfs, comprehensive_df)

# Print the keys of the specialized JSON objects
print("\n--- SPECIALIZED JSON OBJECTS ---")
for key in specialized_json.keys():
    print(f"- {key}: {len(specialized_json[key])} records")

# Take a closer look at the complete_houses JSON structure
if 'complete_houses' in specialized_json and len(specialized_json['complete_houses']) > 0:
    print("\n--- SAMPLE COMPLETE HOUSE JSON ---")
    sample_house = specialized_json['complete_houses'][0]
    print(f"House ID: {sample_house.get('HouseID', 'N/A')}")
    print(f"Address: {sample_house.get('Address', sample_house.get('House_Address', 'N/A'))}")
    print(f"Active Listing: {sample_house.get('IsActiveListing', 'N/A')}")
    print(f"Property Type: {sample_house.get('PropertyType', sample_house.get('House_PropertyType', 'N/A'))}")
    
    # Show sales history if available
    if 'SalesHistory' in sample_house and len(sample_house['SalesHistory']) > 0:
        print(f"Number of Past Sales: {len(sample_house['SalesHistory'])}")
        print("Most Recent Sale:")
        recent_sale = sample_house['SalesHistory'][0]
        print(f"  Date: {recent_sale.get('SaleDate', 'N/A')}")
        print(f"  Price: ${recent_sale.get('SalePrice', 'N/A'):,}")
        
        # Show buyer info if available
        if 'BuyerDetails' in recent_sale:
            buyer = recent_sale['BuyerDetails']
            print(f"  Buyer: {buyer.get('FullName', 'N/A')}")
            print(f"  Buyer Income: ${buyer.get('AnnualIncome', 'N/A'):,}")
    
    # Show call transcripts if available
    if 'CallTranscripts' in sample_house and len(sample_house['CallTranscripts']) > 0:
        print(f"Number of Call Transcripts: {len(sample_house['CallTranscripts'])}")
    
    # Print the full JSON for one house (uncomment if needed)
    # print("\nFull JSON Structure for One House:")
    # print(json.dumps(sample_house, indent=2))



--- SPECIALIZED JSON OBJECTS ---
- buyers: 5000 records
- houses_for_sale: 1000 records
- past_sales: 500 records
- call_transcripts: 10000 records
- comprehensive: 1500 records
- complete_houses: 1500 records

--- SAMPLE COMPLETE HOUSE JSON ---
House ID: 7d7e7c96-206e-4d4c-8d53-088b301f99f7
Address: 683 Brady River, Woodbridge, NJ 07598
Active Listing: False
Property Type: Multi-Family
Number of Past Sales: 1
Most Recent Sale:
  Date: 2024-12-05
  Price: $1,525,370
  Buyer: Priscilla Taylor
  Buyer Income: $230,426


In [23]:

## Save JSON Files

def save_json_files(json_objects, output_dir='.'):
    """
    Save JSON objects to files
    
    Args:
        json_objects (dict): Dictionary with JSON objects
        output_dir (str): Output directory
        
    Returns:
        list: List of saved file paths
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    saved_files = []
    
    for name, data in json_objects.items():
        file_path = Path(output_dir) / f"{name}.json"
        
        try:
            with open(file_path, 'w') as f:
                # Use the custom NumpyEncoder to handle NumPy data types
                json.dump(data, f, indent=2, cls=NumpyEncoder)
            
            saved_files.append(str(file_path))
            print(f"Saved {len(data)} records to {file_path}")
        except TypeError as e:
            print(f"Error saving {name}.json: {e}")
            print("Attempting to convert problematic data types...")
            
            # If we're dealing with a list of dictionaries (common case)
            if isinstance(data, list) and all(isinstance(item, dict) for item in data):
                converted_data = []
                for item in data:
                    converted_item = {}
                    for k, v in item.items():
                        # Convert various NumPy types to Python native types
                        if isinstance(v, np.integer):
                            converted_item[k] = int(v)
                        elif isinstance(v, np.floating):
                            converted_item[k] = float(v)
                        elif isinstance(v, np.ndarray):
                            converted_item[k] = v.tolist()
                        elif isinstance(v, pd.Timestamp):
                            converted_item[k] = v.strftime('%Y-%m-%d')
                        else:
                            converted_item[k] = v
                    converted_data.append(converted_item)
                
                # Try again with the converted data
                with open(file_path, 'w') as f:
                    json.dump(converted_data, f, indent=2)
                
                saved_files.append(str(file_path))
                print(f"Successfully saved {len(converted_data)} records to {file_path} after conversion")
            else:
                print(f"Could not save {name}.json. Unsupported data structure.")
    
    return saved_files

# Save JSON files
output_dir = 'json_output'
saved_files = save_json_files(specialized_json, output_dir)


Saved 5000 records to json_output/buyers.json
Saved 1000 records to json_output/houses_for_sale.json
Saved 500 records to json_output/past_sales.json
Saved 10000 records to json_output/call_transcripts.json
Saved 1500 records to json_output/comprehensive.json
Saved 1500 records to json_output/complete_houses.json


In [24]:

## Optional: Data Analysis and Visualization

def analyze_real_estate_data(dfs, comprehensive_df):
    """
    Perform basic analysis on real estate data
    
    Args:
        dfs (dict): Dictionary with cleaned dataframes
        comprehensive_df (DataFrame): Comprehensive joined dataframe
    """
    print("\n--- DATA ANALYSIS ---")
    
    # 1. Price distribution for houses for sale
    if 'houses' in dfs and len(dfs['houses']) > 0 and 'ListingPrice' in dfs['houses'].columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(dfs['houses']['ListingPrice'], kde=True)
        plt.title('Distribution of Listing Prices for Houses For Sale')
        plt.xlabel('Listing Price ($)')
        plt.ylabel('Count')
        plt.show()
    
    # 2. Sale price vs. listing price for past sales
    if 'past_sales' in dfs and len(dfs['past_sales']) > 0:
        if all(col in dfs['past_sales'].columns for col in ['ListingPrice', 'SalePrice']):
            plt.figure(figsize=(10, 6))
            plt.scatter(dfs['past_sales']['ListingPrice'], dfs['past_sales']['SalePrice'])
            plt.plot([dfs['past_sales']['ListingPrice'].min(), dfs['past_sales']['ListingPrice'].max()], 
                    [dfs['past_sales']['ListingPrice'].min(), dfs['past_sales']['ListingPrice'].max()], 
                    'r--')
            plt.title('Sale Price vs. Listing Price')
            plt.xlabel('Listing Price ($)')
            plt.ylabel('Sale Price ($)')
            plt.grid(True)
            plt.show()
            
            # Calculate average discount/premium
            dfs['past_sales']['PriceDiff'] = dfs['past_sales']['SalePrice'] - dfs['past_sales']['ListingPrice']
            dfs['past_sales']['PriceDiffPct'] = (dfs['past_sales']['PriceDiff'] / dfs['past_sales']['ListingPrice']) * 100
            
            avg_price_diff = dfs['past_sales']['PriceDiffPct'].mean()
            print(f"Average price difference: {avg_price_diff:.2f}% from listing price")
    
    # 3. Sales by property type
    if 'comprehensive_df' in locals() and len(comprehensive_df) > 0:
        if 'House_PropertyType' in comprehensive_df.columns:
            plt.figure(figsize=(10, 6))
            property_counts = comprehensive_df['House_PropertyType'].value_counts()
            property_counts.plot(kind='bar')
            plt.title('Sales by Property Type')
            plt.xlabel('Property Type')
            plt.ylabel('Number of Sales')
            plt.xticks(rotation=45)
            plt.show()
    
    # 4. Buyer income vs. house price
    if 'comprehensive_df' in locals() and len(comprehensive_df) > 0:
        if all(col in comprehensive_df.columns for col in ['Buyer_AnnualIncome', 'SalePrice']):
            plt.figure(figsize=(10, 6))
            plt.scatter(comprehensive_df['Buyer_AnnualIncome'], comprehensive_df['SalePrice'])
            plt.title('Buyer Income vs. House Price')
            plt.xlabel('Buyer Annual Income ($)')
            plt.ylabel('Sale Price ($)')
            plt.grid(True)
            
            # Add trend line
            z = np.polyfit(comprehensive_df['Buyer_AnnualIncome'], comprehensive_df['SalePrice'], 1)
            p = np.poly1d(z)
            plt.plot(comprehensive_df['Buyer_AnnualIncome'], p(comprehensive_df['Buyer_AnnualIncome']), "r--")
            
            plt.show()
            
            # Calculate correlation
            corr = comprehensive_df[['Buyer_AnnualIncome', 'SalePrice']].corr().iloc[0,1]
            print(f"Correlation between buyer income and sale price: {corr:.2f}")

# Optional: Uncomment to run analysis
# analyze_real_estate_data(cleaned_dfs, comprehensive_df)

## Conclusion

print("\n--- CONCLUSION ---")
print(f"Successfully processed {len(comprehensive_df)} real estate records")
print(f"Created {len(specialized_json)} different JSON object collections")
print(f"Saved JSON files to: {', '.join(saved_files)}")
print("\nThe 'complete_houses.json' file contains a comprehensive dataset with:")
print("- All houses (both from past sales and current listings)")
print("- All related buyer and seller information")
print("- Sales history as arrays within each house object")
print("- Call transcripts as arrays within each house object")
print("\nYou can now use these JSON files for your application!")



--- CONCLUSION ---
Successfully processed 1500 real estate records
Created 6 different JSON object collections
Saved JSON files to: json_output/buyers.json, json_output/houses_for_sale.json, json_output/past_sales.json, json_output/call_transcripts.json, json_output/comprehensive.json, json_output/complete_houses.json

The 'complete_houses.json' file contains a comprehensive dataset with:
- All houses (both from past sales and current listings)
- All related buyer and seller information
- Sales history as arrays within each house object
- Call transcripts as arrays within each house object

You can now use these JSON files for your application!


In [25]:

## Function to Load and Work with JSON objects

def load_json_files(json_dir='.'):
    """
    Load JSON files into Python objects
    
    Args:
        json_dir (str): Directory with JSON files
        
    Returns:
        dict: Dictionary with JSON objects
    """
    json_objects = {}
    
    json_dir_path = Path(json_dir)
    
    for json_file in json_dir_path.glob('*.json'):
        with open(json_file, 'r') as f:
            json_objects[json_file.stem] = json.load(f)
    
    return json_objects

# Example usage:
# loaded_json = load_json_files('json_output')
# print(f"Loaded JSON files: {', '.join(loaded_json.keys())}")

## Example: How to work with the complete_houses dataset

def filter_houses_example(complete_houses_json, min_price=None, max_price=None, 
                         min_bedrooms=None, property_type=None, has_call_transcripts=False):
    """
    Example function to demonstrate filtering the complete_houses dataset
    
    Args:
        complete_houses_json (list): List of house objects
        min_price (float, optional): Minimum listing price
        max_price (float, optional): Maximum listing price
        min_bedrooms (int, optional): Minimum number of bedrooms
        property_type (str, optional): Property type to filter for
        has_call_transcripts (bool, optional): Only return houses with call transcripts
        
    Returns:
        list: Filtered list of house objects
    """
    filtered_houses = []
    
    for house in complete_houses_json:
        # Get listing price - check in multiple possible locations
        listing_price = house.get('ListingPrice')
        if listing_price is None and 'SalesHistory' in house and len(house['SalesHistory']) > 0:
            listing_price = house['SalesHistory'][0].get('ListingPrice')
        
        # Get bedrooms - check in multiple possible locations
        bedrooms = house.get('Bedrooms')
        if bedrooms is None:
            bedrooms = house.get('House_Bedrooms')
        
        # Apply filters
        if min_price is not None and (listing_price is None or listing_price < min_price):
            continue
            
        if max_price is not None and (listing_price is None or listing_price > max_price):
            continue
            
        if min_bedrooms is not None and (bedrooms is None or bedrooms < min_bedrooms):
            continue
            
        if property_type is not None and house.get('PropertyType') != property_type and house.get('House_PropertyType') != property_type:
            continue
            
        if has_call_transcripts and ('CallTranscripts' not in house or len(house['CallTranscripts']) == 0):
            continue
        
        filtered_houses.append(house)
    
    return filtered_houses

# Example usage:
# filtered_houses = filter_houses_example(specialized_json['complete_houses'], min_price=500000, min_bedrooms=3)
# print(f"Found {len(filtered_houses)} houses matching criteria")