In [1]:
import json

# Load the provided JSON raw data from the attachment
json_file_path = "appraisals_dataset.json"

with open(json_file_path, 'r') as file:
    raw_data = json.load(file)

In [2]:
import pandas as pd
from collections import defaultdict

# Initialize dictionaries to store field names by section and sample values
field_names = {
    'subject': defaultdict(set),
    'comps': defaultdict(set),
    'properties': defaultdict(set)
}

# Extract field names and collect up to 3 unique sample values for each field
for appraisal in raw_data.get('appraisals', []):
    # Subject property
    if 'subject' in appraisal:
        for field, value in appraisal['subject'].items():
            if value and value != '' and value != 'n/a' and value != 'N/A':
                if len(field_names['subject'][field]) < 3:
                    field_names['subject'][field].add(str(value))
    # Comps
    if 'comps' in appraisal:
        for comp in appraisal['comps']:
            for field, value in comp.items():
                if value and value != '' and value != 'n/a' and value != 'N/A':
                    if len(field_names['comps'][field]) < 3:
                        field_names['comps'][field].add(str(value))
    # Properties
    if 'properties' in appraisal:
        for prop in appraisal['properties']:
            for field, value in prop.items():
                if value and value != '' and value != 'n/a' and value != 'N/A':
                    if len(field_names['properties'][field]) < 3:
                        field_names['properties'][field].add(str(value))

# Convert sets to sorted lists for consistent output
for section in field_names:
    for field in field_names[section]:
        field_names[section][field] = sorted(field_names[section][field])

# Create a DataFrame for the mapping template
all_fields = []
for section, fields in field_names.items():
    for field, samples in fields.items():
        all_fields.append({
            'section': section,
            'original_field': field,
            'sample_values': ', '.join(samples),
            'canonical_field': '',  # To be filled manually
            'data_type': ''  # To be filled manually
        })

mapping_df = pd.DataFrame(all_fields)

# Save the DataFrame to CSV
mapping_df.to_csv('field_mapping_template.csv', index=True)

In [3]:
mapping_df.head()  # Display the first few rows as output

Unnamed: 0,section,original_field,sample_values,canonical_field,data_type
0,subject,address,"11 PAUL AVE Ayr ON N0B1E0, 142-950 Oakview Ave...",,
1,subject,subject_city_province_zip,"""Twin Oak Meadows"", Ayr ON N0B1E0, West Chezze...",,
2,subject,effective_date,"Apr/11/2025, Apr/17/2025, May/01/2025",,
3,subject,municipality_district,Halifax Regional Municipality - West Chezzetco...,,
4,subject,site_dimensions,"131' x 154', Condo Common Property, See Schedu...",,


In [4]:
# After filling in the mapping_df CSV (completed version is called "complete_field_mappings.csv")...

from utils import *

# Initialize lists to store processed data
subjects = []
comps = []
properties = []

# Process each appraisal
for appraisal in raw_data.get('appraisals', []):
    # Process subject property
    if 'subject' in appraisal:
        subject_data = appraisal['subject'].copy()
        # Add standardized address
        subject_data.update(process_subject_address(subject_data))
        subjects.append(subject_data)
    
    # Process comp properties
    if 'comps' in appraisal:
        for comp in appraisal['comps']:
            comp_data = comp.copy()
            # Add standardized address
            comp_data.update(process_comp_address(comp_data))
            # Add reference to subject property
            if 'subject' in appraisal and 'address' in appraisal['subject']:
                comp_data['subject_address'] = appraisal['subject']['address']
            comps.append(comp_data)
    
    # Process available properties
    if 'properties' in appraisal:
        for prop in appraisal['properties']:
            prop_data = prop.copy()
            # Add standardized address
            prop_data.update(process_property_address(prop_data))
            # Add reference to subject property
            if 'subject' in appraisal and 'address' in appraisal['subject']:
                prop_data['subject_address'] = appraisal['subject']['address']
            properties.append(prop_data)

# Convert to DataFrames
subjects_df = pd.DataFrame(subjects)
comps_df = pd.DataFrame(comps)
properties_df = pd.DataFrame(properties)

AttributeError: 'NoneType' object has no attribute 'strip'

In [None]:
# Clean the DataFrames (basic cleaning)
subjects_df = clean_dataframe(subjects_df)
comps_df = clean_dataframe(comps_df)
properties_df = clean_dataframe(properties_df)

In [None]:
# Apply specific processing to fields that need special handling
    
# Process sale_price in comps
if 'sale_price' in comps_df.columns:
    comps_df['sale_price'] = comps_df['sale_price'].apply(process_sale_price)

# Process bedroom counts
if 'num_beds' in subjects_df.columns:
    subjects_df['bedrooms'] = subjects_df['num_beds'].apply(process_bedroom_count)
    # Keep original column for reference if needed
    subjects_df.drop('num_beds', axis=1, inplace=True)

if 'bed_count' in comps_df.columns:
    comps_df['bedrooms'] = comps_df['bed_count'].apply(process_bedroom_count)
    # Keep original column for reference if needed
    comps_df.drop('bed_count', axis=1, inplace=True)

# Process bathroom counts
if 'num_baths' in subjects_df.columns:
    bath_results = subjects_df['num_baths'].apply(process_bathroom_count)
    subjects_df['full_baths'] = [result[0] for result in bath_results]
    subjects_df['half_baths'] = [result[1] for result in bath_results]
    # Keep original column for reference if needed
    subjects_df.drop('num_baths', axis=1, inplace=True)

if 'bath_count' in comps_df.columns:
    bath_results = comps_df['bath_count'].apply(process_bathroom_count)
    comps_df['full_baths'] = [result[0] for result in bath_results]
    comps_df['half_baths'] = [result[1] for result in bath_results]
    # Keep original column for reference if needed
    comps_df.drop('bath_count', axis=1, inplace=True)

# Process GLA (Gross Living Area)
if 'gla' in subjects_df.columns:
    subjects_df['gla'] = subjects_df['gla'].apply(process_gla)

if 'gla' in comps_df.columns:
    comps_df['gla'] = comps_df['gla'].apply(process_gla)

if 'gla' in properties_df.columns:
    properties_df['gla'] = properties_df['gla'].apply(process_gla)

In [None]:
# Remove units from numeric fields
numeric_fields_with_units = [
    'distance_to_subject', 
    'lot_size_sf',
    'main_lvl_area',
    'second_lvl_area',
    'third_lvl_area',
    'basement_area',
    'main_level_finished_area',
    'upper_lvl_fin_area'
]

for df in [subjects_df, comps_df, properties_df]:
    for field in numeric_fields_with_units:
        if field in df.columns:
            df[field] = df[field].apply(remove_units_and_symbols)

In [None]:
# Convert column types
mapping_df = pd.read_csv("complete_field_mappings.csv")

subjects_df = convert_column_types(subjects_df, mapping_df[mapping_df['section'] == 'subject'])
comps_df = convert_column_types(comps_df, mapping_df[mapping_df['section'] == 'comps'])
properties_df = convert_column_types(properties_df, mapping_df[mapping_df['section'] == 'properties'])

In [None]:
# Display the first few rows of each DataFrame
print("Subject Properties:")
display(subjects_df.head())

print("\nComp Properties:")
display(comps_df.head())

print("\nAvailable Properties:")
display(properties_df.head())

# Check for missing values
print("Missing values in Subject Properties:")
display(subjects_df.isna().sum())

print("\nMissing values in Comp Properties:")
display(comps_df.isna().sum())

print("\nMissing values in Available Properties:")
display(properties_df.isna().sum())

# Examine the standardized addresses
print("Sample of standardized subject addresses:")
display(subjects_df[['address', 'standardized_full']].head())

print("\nSample of standardized comp addresses:")
display(comps_df[['address', 'standardized_full']].head())

In [None]:
# Save processed data to CSV files for further analysis
subjects_df.to_csv("processed_subjects.csv", index=True)
comps_df.to_csv("processed_comps.csv", index=True)
properties_df.to_csv("processed_properties.csv", index=True)

print("Processed data saved to CSV files.")