- drop duplicates by MLS
- merge columns
- explodes amenities into individual columns
- calculate sqft when range or room dimensions (only) are given

In [158]:
import pandas as pd
import os
import numpy as np
import ast
import re
from pathlib import Path
from collections import Counter


In [159]:
import re

def get_street_address(address):
    if isinstance(address, str):
        # Step 1: Handle missing or invalid addresses
        address = address.strip()
        
        # Step 2: If there is a dash and the part after the dash seems like an apartment number, split
        if "-" in address:
            parts = address.split("-")
            # Check if the part after the dash is likely an apartment number (contains digits)
            if re.search(r'\d', parts[1]):  # If the second part contains digits, treat as apt number
                address = parts[1]  # This assumes the apartment number is the part after the dash
            else:
                address = parts[0]  # If not, we keep the first part (before the dash)

        # Step 3: Remove any content after the opening parenthesis (e.g., neighborhoods)
        address = address.split("(")[0].strip()

        # Step 4: Remove content after "Toronto" (city name) if necessary
        address = address.split("Toronto")[0].strip()

        # Step 5: Convert to lowercase
        return address.lower()
    return ''  # If it's not a valid string, return an empty string

In [160]:
def numeric_price(df, columns_list):
    """$600,000 (str) --> 600000 (float)"""
    df_copy = df.copy()
    
    for column in columns_list:
        # Remove "$" and "," then convert to numeric, invalid parsing will become NaN
        df_copy[column] = df_copy[column].str.replace("$", "", regex=False).str.replace(",", "", regex=False)
        df_copy[column] = pd.to_numeric(df_copy[column], errors='coerce')  # Convert to float, invalid entries become NaN
        
    return df_copy


In [161]:
def numeric_lot_dim(dim):
    """Converts dimension string (e.g., '122 ft, 8 in', '40 ft', '40') to a float representing feet."""
    
    if pd.isna(dim): # For the NaN values (str operations cannot operate on float-like NaN values)
        return np.nan
    if isinstance(dim, (int, float)):
        return float(dim)
    
    dim = dim.strip().lower()

    # Initialize
    feet = 0
    inches = 0

    # Both feet and inches (e.g., "122 ft, 8 in")
    feet_in_inches = re.match(r'(\d+)\s*ft.*?(\d+)\s*in', dim)
    if feet_in_inches:
        feet = int(feet_in_inches.group(1))  # feet
        inches = int(feet_in_inches.group(2))  # inches
    else:
        # Only feet is present (e.g., "40 ft" or "40")
        feet_in_feet_only = re.match(r'(\d+)\s*ft', dim)
        if feet_in_feet_only:
            feet = int(feet_in_feet_only.group(1))
        else:
            # String is just a number (e.g., "40")
            if dim.isdigit():
                feet = int(dim)

    # Convert inches to feet (12 inches = 1 foot)
    total_feet = feet + (inches / 12)
    
    return total_feet

In [162]:
def get_realtor_house_sigma_sqft(row):
    """For Realtor dfs.
    Remove 'sqft'.
    If Square footage is a range, take the average.
    If it is a single value, leave as is."""
    
    # Make sure to treat row as a string and remove 'sqft'
    if isinstance(row, str):
        row = row.replace("sqft", "")\
                    .replace("feet²", "")\
                    .replace("+", "")\
                    .replace("<", "")\
                    .replace("<", "")\
                    .strip()  # Remove 'units' and descriptors and strip spaces

        # Check if it's a range (contains '-')
        if "-" in row:
            lower, upper = row.split("-")
            return (float(lower.strip()) + float(upper.strip())) / 2
        else:
            return float(row)
    else:
        return None

In [163]:
def get_zolo_sqft(row, room_dimensions=None):
    """For Zolo dfs. 
    If Size (sq ft) is NaN, then replaces with the sqft calculated from the room dimensions.
    If Size (sq ft) is a range (i.e., contains "-"), replaces with the average of the bounds."""
    
    # If the 'Size (sq ft)' is a range (contains "-"):
    if isinstance(row, str) and '-' in row:
        lower, upper = row.split("-")
        # Calculate the average of the two bounds
        return (float(lower) + float(upper)) / 2
    
    # If 'Size (sq ft)' is NaN, calculate from room dimensions
    elif pd.isna(row) and room_dimensions:  
        try:
            # Check if 'room_dimensions' is a list-like or string that we can process
            dimensions = ast.literal_eval(room_dimensions)
            square_meters = 0

            # If room dimensions is a list of room sizes (i.e., '200x300', '100x200', etc.)
            if isinstance(dimensions, list):
                for dimension in dimensions:
                    # Only split if 'dimension' contains 'x'
                    length, width = dimension.split("x")
                    square_meters += float(length.strip()) * float(width.strip())
                    
            return square_meters*10.7639
        
        except (ValueError, SyntaxError, TypeError):
            # Handle invalid or malformed room dimensions
            return np.nan

    elif isinstance(row, str):
        # If it's a string that doesn't contain a "-", remove any "+" and return the value as float
        return float(row.replace("+", "").replace(">", "").replace("<", "").strip())
    
    else:
        # If it's a numeric value (already a number), return it
        return row


In [164]:
# Clean the string value associated with the key "size" in each dict the "Room Info" dict for each row.

def clean_size_string(size):
    """For House Sigma dfs"""
    # Remove all non-numeric characters and spaces around the dimensions
    cleaned_dimensions = re.sub(r'[^\d. x]', '', size)
    return cleaned_dimensions

# Calculate the area of each room (where each dict in the "Room Info" list represents a different room)

def get_area(room):
    """House Sigma dfs"""
    # Sometimes the "Room Info" list of dictionaries is an empty list [], or there's no key named "size"
    # for certain rooms in the list.
    if room['size'] is None:
        return 0
    
    # Clean to remove the mï¼‰ special characters after the dimensions for each room
    cleaned_size = clean_size_string(room['size'])
    
    # Use regex to identify form and pattern of dimensions in each dictionary in the list of rooms
    # Provided dimensions are actually in meters.
    match = re.match(r'([\d.]+) x ([\d.]+)', cleaned_size)
    if match:
        length = float(match.group(1))  
        width = float(match.group(2))   
        area_m2 = length * width       
        area_ft2 = area_m2 * 10.7639    
        return area_ft2
    else:
        return 0  
    
# Calculate total sqft (sum of the areas of all rooms in the list of dictionaries)

def get_total_sqft(rooms_list):
    return sum(get_area(room) for room in rooms_list)

In [165]:
def clean_real_estate(realtor, zolo, house_sigma, house_sigma_overview):
    
    """Cleans and processes realtor, zolo, and house_sigma dataframes."""
    
    house_sigma_overview = house_sigma_overview.rename(columns={
        "Column_8" : "lat",
        "Column_9": "long"
    })
    
    
    ##### REALTOR #####
    # Drop NaN MLSs and duplicate MLSs
    # Doubled checked many samples of MLSs and some listings just did not have a leading letter in the MLS.
    
    realtor.dropna(subset=['mls'], inplace=True)
    realtor.drop_duplicates(subset=['mls'], inplace=True)

    # Also cannot use any listings without sqft or address.
    realtor.dropna(subset=['Square Footage'], inplace=True)
    realtor.dropna(subset=['address'], inplace=True)

    # Get clean street address
    realtor["street address"] = realtor["address"].fillna('').apply(get_street_address)

    # Remove rentals
    realtor = realtor[~realtor["price"].str.contains("Monthly | Weekly", case=False, na=False)]


    # Get the numeric price
    realtor = numeric_price(realtor, ["price"])
    
    
    # Get sqft from sqft column (when scraping, there were no listings without sqft that had room dimensions, 
    # so we only scraped the sqft).
    realtor["sqft"] = realtor["Square Footage"].apply(get_realtor_house_sigma_sqft)
    realtor = realtor.drop(columns=["Square Footage"])
    
    # Beds and baths columns. Assume that a "partial bath" is 1/2 bath.
    realtor["beds"] = realtor["Above Grade"].fillna(0).astype(float) + realtor["Below Grade"].fillna(0).astype(float)
    realtor["baths"] = realtor["Total"].fillna(0).astype(float) + 0.5* realtor["Partial"].fillna(0).astype(float)
    realtor["Air Conditioning"] = realtor["Cooling"]
    realtor["Community"] = realtor["Community Name"]
    
    # Pool   
    
    realtor['Building Amenities'] = realtor['Building Amenities'].fillna('')  # Replace NaN with empty string so no float + str error
    
    # realtor['Building Amenities'] = realtor.apply(
    #     lambda row: (row['Building Amenities'] + ", Pool") if pd.notna(row['Pool Type']) else row['Building Amenities'],
    #     axis=1
    # )

    realtor['Building Amenities'] = realtor.apply(
        lambda row: (str(row['Building Amenities']) + ", Pool") if pd.notna(row['Pool Type']) else row['Building Amenities'],
        axis=1
    )
    
    realtor["Building Type"] = realtor.apply(
        lambda row: row["Style"] if row["Building Type"] == "House" else row["Building Type"], axis=1)
    
    
    realtor["Frontage"] = realtor["Frontage"].apply(numeric_lot_dim)
    realtor["Lot Depth"] = realtor["Land Depth"].apply(numeric_lot_dim)
    
    ##### ZOLO #####
    
    # Drop NaN MLSs and duplicate MLSs
    zolo["mls"] = zolo["mls"].astype(str).str[3:].str.replace("®", "", regex=False).str.strip()

    zolo.dropna(subset=['mls'], inplace=True)
    zolo.drop_duplicates(subset=['mls'], inplace=True)

    # Also cannot use any listings without an address.
    zolo.dropna(subset=['address'], inplace=True)

    # Get clean street address
    zolo["street address"] = zolo["address"].fillna('').apply(get_street_address)

    # Remove rentals
    zolo = zolo[~zolo["price"].str.contains("Monthly | Weekly", case=False, na=False)]
 
    # Get the numeric price
    zolo = numeric_price(zolo, ["price"])
   
    # Get sqft from sqft column (when scraping, there were no listings without sqft that had room dimensions, 
    # so we only scraped the sqft).
    
    
    
    # Extract first room dimension (length) if the room dimensions are in a list format
    zolo["sample_split_dimension"] = zolo["room dimensions"].apply(
        lambda x: ast.literal_eval(x)[0].split("x")[0] if isinstance(ast.literal_eval(x), list) and len(ast.literal_eval(x)) > 0 else None
    )

    # Convert the first dimension to float if it's not "N/A" or None, else set to None
    zolo["sample_split_dimension"] = zolo["sample_split_dimension"].apply(
        lambda x: float(x) if ((x != "N/A") and (x is not None)) else None
    )

    # Filter rows where sample_split_dimension is a valid float or Size (sq ft) is not NaN,
    # so that there is a way to get the sqft with either room dimensions or sqft.
    zolo = zolo[
        zolo["sample_split_dimension"].notna() |  # Keep rows where the dimension is a valid float (not NaN)
        zolo["Size (sq ft)"].notna()  # Keep rows where "Size (sq ft)" is not NaN
    ]
    
    zolo["sqft"] = zolo.apply(lambda row: get_zolo_sqft(row["Size (sq ft)"], row.get('room dimensions', None)), axis=1)
    zolo = zolo.drop(columns=["Size (sq ft)"])

    zolo.dropna(subset=['sqft'], inplace=True)
    
    zolo["beds"] = zolo["Bedrooms"].fillna(0).astype(float) + zolo["Bedrooms Plus"].fillna(0).astype(float)
    zolo["baths"] = zolo["Bathrooms"]
    zolo["Heating Type"] = zolo["Heating"]
    
    zolo = zolo.rename(columns={"Pool": "Pool Type"})
    zolo["Pool Type"] = zolo["Pool Type"].fillna("None")
    zolo["Pool Type"] = zolo["Pool Type"].apply(lambda x: "Pool" if pd.notna(x) and x != "None" else None)
    
    zolo['Amenity'] = zolo['Amenity'].fillna('')
    zolo['Amenity'] = zolo.apply(
        lambda row: row['Amenity'] + ", Pool" if pd.notna(row['Pool Type']) else row['Amenity'],
        axis=1
    )

    zolo['Amenity'] = zolo.apply(
        lambda row: (str(row['Amenity']) + ", Pool") if pd.notna(row['Pool Type']) else row['Amenity'],
        axis=1
    )
    
    zolo = zolo.rename(columns={"Type" : "Building Type"})

    zolo["Frontage"] = zolo["Frontage"].apply(numeric_lot_dim)
    zolo["Lot Depth"] = zolo["Lot Depth"].apply(numeric_lot_dim)

    ##### House Sigma #####

    new_row = pd.DataFrame([house_sigma.columns], columns=house_sigma.columns)

    # Step 2: Append the new row at the top of the DataFrame
    new_house_sigma = pd.concat([new_row, house_sigma], ignore_index=True)

    # Step 3: Set new column names (make sure the number of new column names matches the number of columns)
    new_house_sigma.columns = ['Name', 'Property Info', 'Listing Info', 'Room Info', 'description 1', 'description 2', 'link']
        
    new_house_sigma["Property Info"] = new_house_sigma["Property Info"].apply(ast.literal_eval)
    new_house_sigma["Listing Info"] = new_house_sigma["Listing Info"].apply(ast.literal_eval)
    new_house_sigma["Room Info"] = new_house_sigma["Room Info"].apply(ast.literal_eval)

    property_expanded = pd.json_normalize(new_house_sigma['Property Info'])
    listing_expanded = pd.json_normalize(new_house_sigma['Listing Info'])

    # Concatenate the expanded DataFrames with the original DataFrame, excluding the old columns
    df_expanded = pd.concat([new_house_sigma.drop(columns=['Property Info', 'Listing Info']), property_expanded, listing_expanded], axis=1)
    
    df_expanded['calculated_sqft'] = df_expanded['Room Info'].apply(get_total_sqft)


    # Drop the first instance of each duplicate column (property info and listing info had lots of duplicate keys)
    df_expanded = df_expanded.loc[:, ~df_expanded.columns.duplicated(keep='last')]
    
    df_expanded["Size:"] = df_expanded["Size:"].apply(get_realtor_house_sigma_sqft)

    df_expanded["Size:"] = df_expanded["Size:"].fillna(0)
    df_expanded["sqft"] = df_expanded.apply(lambda row: row["calculated_sqft"] if row["Size:"] == 0 else row["Size:"], axis=1)
    df_expanded = df_expanded.loc[df_expanded['sqft'] != 0]

    house_sigma = df_expanded

    house_sigma["mls"] = house_sigma["Listing #:"]
    house_sigma["beds"] = house_sigma["Bedrooms:"]
    house_sigma["baths"] = house_sigma["Bathrooms:"]
    house_sigma["Heating Type"] = house_sigma["Heating Type:"]
    house_sigma["Air Conditioning"] = house_sigma["Cooling:"]
    house_sigma["Community"] = house_sigma["Community:"]
    
    house_sigma["Frontage"] = house_sigma["Frontage:"].apply(numeric_lot_dim)
    house_sigma["Land Depth"] = house_sigma["Depth:"].apply(numeric_lot_dim)
    
    # house_sigma["Amenities:"] = house_sigma["Amenities:"].str.strip().replace({
    #     "Outdoor Pool": "Pool", 
    #     "Indoor Pool": "Pool"
    # })
    
    # house_sigma["Amenities:"] = house_sigma["Amenities:"].apply(
    #     lambda x: x.replace("Outdoor Pool", "Pool").replace("Indoor Pool", "Pool") if isinstance(x, str) else x
    # )

    house_sigma['Amenities:'] = house_sigma.apply(
        lambda row: (
            (str(row['Amenities:']) + ", Pool") if (
                pd.notna(row['Pool:']) or
                (isinstance(row['description 1'], str) and 'pool' in row['description 1'].lower()) or
                (isinstance(row['description 2'], str) and 'pool' in row['description 2'].lower())
            ) else row['Amenities:']
        ),
        axis=1
    )
    
    # house_sigma['Amenities:'] = house_sigma.apply(
    #     lambda row: (str(row['Amenities:']) + ", Pool") if (
    #     pd.notna(row['Pool:']) or
    #     (isinstance(row['description 1'], str) and 'pool' in row['description 1'].lower()) or # descriptions not str means that they are NaN or [] --> can ignore without removing these rows
    #     (isinstance(row['description 2'], str) and 'pool' in row['description 2'].lower())
    #     ) else row['Amenities:'],
    #     axis=1
    # )

    # house_sigma['Amenities:'] = house_sigma.apply(
    #     lambda row: (row['Amenities:'] + ", Pool") if (
    #                                                     pd.notna(row['Pool:']) or
    #                                                     row['description 1'].str.contains("pool", case=False, na=False) or
    #                                                     row['description 2'].str.contains("pool", case=False, na=False)
    #                                                 ) else row['Amenities:'],
    #                                                         axis=1
    # )
    
    # house_sigma['Amenities:'] = house_sigma.apply(
    #     lambda row: (row['Amenities:'] + ", Pool") if (pd.notna(row['Pool:']), |
    #                                                     house_sigma["description 1"].str.contains("pool", case=False), |
    #                                                     house_sigma["description 2"].str.contains("pool", case=False)
    #                                                     ) else row['Amenities:'],
    #                                                     axis=1)
    
    # Replace NaN values explicitly with "No Amenity"
    house_sigma["Amenities:"] = house_sigma["Amenities:"].fillna("No amenity")

    house_sigma_merged = pd.merge(house_sigma_overview, house_sigma, on="link", how="left")
    
    # Filter out listings for rent and convert price to float.
    
    house_sigma_merged = house_sigma_merged[~house_sigma_merged["Listed Price"].str.contains("Monthly | Weekly", case=False, na=False)]
    house_sigma_merged = numeric_price(house_sigma_merged, ["Listed Price", "Sold Price"])
    house_sigma_merged["price"] = house_sigma_merged["Listed Price"]
    
    house_sigma_merged = house_sigma_merged.rename(columns={"Property Type:" : "Building Type"})
    
    return realtor, zolo, house_sigma_merged

In [166]:
# def get_top_amenities(realtor, zolo, house_sigma):
#     """Gets bools for top 10 amenities across all three real estate dfs."""
    
#     # Concatenate the three dfs
#     concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

#     # Function to handle both strings and lists
#     def split_string_or_list(value):
#         # If the value is a string and contains commas, split it
#         if isinstance(value, str):
#             return value.split(',')  # Split by commas to create a list
#         elif isinstance(value, list):
#             return value  # Return the list as is
#         return []  # In case of other types (e.g., NaN or unexpected values)

#     # Exploding the amenities columns, one by one, and handling both strings and lists
#     features_exploded = concatenated['Features'].apply(split_string_or_list).explode().dropna()
#     building_amenities_exploded = concatenated['Building Amenities'].apply(split_string_or_list).explode().dropna()
#     amenity_exploded = concatenated['Amenity'].apply(split_string_or_list).explode().dropna()
#     amenities_exploded = concatenated['Amenities:'].apply(split_string_or_list).explode().dropna()

#     # Concatenate all the exploded lists into a single Series
#     all_amenities = pd.concat([features_exploded, building_amenities_exploded, amenity_exploded, amenities_exploded])

#     # Count the occurrences of each amenity
#     counter = Counter(all_amenities)
    
#     return counter

In [167]:
# def split_string_or_list(value):
#     """Handles both strings and lists, ensuring proper splitting and cleaning."""
#     # If it's a string, ensure proper trimming and splitting
#     if isinstance(value, str):
#         # Convert to lowercase and strip leading/trailing spaces, then split by commas
#         value = value.lower().strip()
#         return value.split(',')  # Split by commas to create a list
#     elif isinstance(value, list):
#         return value  # Return the list as is
#     return []  # In case of NaN or other unexpected types

def split_string_or_list(value):
    """Handles both strings and lists, ensuring proper splitting and cleaning."""
    # If it's a string, ensure proper trimming and splitting
    if isinstance(value, str):
        # Convert to lowercase and strip leading/trailing spaces, then split by commas
        value = value.lower().strip()
        return [v.strip() for v in value.split(',')]  # Split by commas and strip each item
    elif isinstance(value, list):
        return value  # Return the list as is
    return []


def get_top_amenities(realtor, zolo, house_sigma):
    """Gets bools for top amenities across all real estate dfs."""
    
    # Concatenate the three dfs
    concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

    # Exploding the amenities columns, one by one, and handling both strings and lists
    features_exploded = concatenated['Features'].apply(split_string_or_list).explode().dropna()
    building_amenities_exploded = concatenated['Building Amenities'].apply(split_string_or_list).explode().dropna()
    amenity_exploded = concatenated['Amenity'].apply(split_string_or_list).explode().dropna()
    amenities_exploded = concatenated['Amenities:'].apply(split_string_or_list).explode().dropna()

    # Concatenate all the exploded lists into a single Series
    all_amenities = pd.concat([features_exploded, building_amenities_exploded, amenity_exploded, amenities_exploded])

    # Count the occurrences of each amenity (case-insensitive)
    counter = Counter(all_amenities)

    # Print the top 20 and their respective counts.
    print(counter.most_common(20))

    return counter

In [168]:
# def merge_real_estate(realtor, zolo, house_sigma):
#     """Gets bools for top 10 amenities across all three real estate dfs."""
    
#     concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

#     all_amenities = concatenated['Features'].explode().tolist() \
#         + concatenated['Building Amenities'].explode().tolist() \
#         + concatenated['Amenity'].explode().tolist() \
#         + concatenated['Amenities:'].explode().tolist()
    
#     counter = get_top_amenities(realtor, zolo, house_sigma)
#     top_20_amenities = counter.most_common(20)
    
#     top_20_amenity_names = [amenity[0].strip() for amenity in top_20_amenities]
    
#     # Create a new column for each top amenity: 1 if it's in the row, 0 otherwise
#     for amenity in top_20_amenity_names:
#         concatenated[amenity] = concatenated.apply(lambda row: 1 if amenity in row.values else 0, axis=1)
        
#     print("Columns in concatenated dataframe:", concatenated.columns)
#     print("Top 20 amenity names:", top_20_amenity_names)
#     columns_to_select = [
#         "price", 
#         "address", 
#         "mls", 
#         "sqft", 
#         "Community", 
#         "beds", 
#         "baths", 
#         "Air Conditioning", 
#         "Heating"
#     ] + top_20_amenity_names  

#     selected_data = concatenated[columns_to_select]
    
#     return selected_data


In [169]:
def merge_real_estate(realtor, zolo, house_sigma):
    """Gets bools for top 10 amenities across all three real estate dfs."""
    
    # Concatenate all the DataFrames
    concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

    # Get the top amenities from the concatenated DataFrame
    all_amenities = concatenated['Features'].explode().tolist() \
        + concatenated['Building Amenities'].explode().tolist() \
        + concatenated['Amenity'].explode().tolist() \
        + concatenated['Amenities:'].explode().tolist()

    # Get the top 20 amenities based on the count
    counter = get_top_amenities(realtor, zolo, house_sigma)
    top_20_amenities = counter.most_common(21) # Used top 21 because "" shows up as a frequent "amenity".
    
    # Extract the names of the top 20 amenities
    top_20_amenity_names = [amenity[0].strip() for amenity in top_20_amenities]
    print(top_20_amenity_names)
    # Create a new DataFrame with the boolean columns for each amenity
    amenity_columns = pd.DataFrame()

    for amenity in top_20_amenity_names:
        amenity_columns[amenity] = concatenated.apply(lambda row: 1 if amenity in row.values else 0, axis=1)

    # Concatenate the new boolean columns with the original DataFrame
    concatenated = pd.concat([concatenated, amenity_columns], axis=1)

    # Select the relevant columns
    columns_to_select = [
        "price", 
        "address", 
        "mls", 
        "sqft",
        "Community", 
        "beds", 
        "baths",
        "Building Type",
        "Air Conditioning", 
        "Heating Type",
        "lat",
        "long",
        "Frontage",
        "Land Depth",
    ] + top_20_amenity_names  

    selected_data = concatenated[columns_to_select]
    
    return selected_data


In [170]:
def final_cleaning(merged):
    """Remove any remaining NaN values by price, address, MLS, and sqft.
    Drop duplicates by MLS across the three real estate data sources."""
    
    merged = merged.dropna(subset=['price', 'address', 'mls', 'sqft'])
                    
    merged = merged.drop_duplicates(subset=["mls"])
    
    return merged

In [171]:
parent_dir = Path.cwd().parent
realtor = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"realtor_with_coords.csv")
zolo = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"zolo_with_coords.csv")
house_sigma = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"properties_housesigma.csv")
house_sigma_overview = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"housesigma_data_with_coords.csv")

In [172]:
realtor_cleaned, zolo_cleaned, house_sigma_cleaned = clean_real_estate(realtor, zolo, house_sigma, house_sigma_overview)
merged = merge_real_estate(realtor_cleaned, zolo_cleaned, house_sigma_cleaned)



[('', 1989), ('no amenity', 1917), ('pool', 1816), ('visitor parking', 1306), ('party/meeting room', 1187), ('gym', 1060), ('concierge', 1023), ('indoor pool', 638), ('exercise room', 601), ('rooftop deck/garden', 531), ('balcony', 492), ('nan', 423), ('exercise centre', 417), ('sauna', 413), ('storage - locker', 403), ('security/concierge', 371), ('outdoor pool', 348), ('guest suites', 348), ('party room', 323), ('bbqs allowed', 291)]
['', 'no amenity', 'pool', 'visitor parking', 'party/meeting room', 'gym', 'concierge', 'indoor pool', 'exercise room', 'rooftop deck/garden', 'balcony', 'nan', 'exercise centre', 'sauna', 'storage - locker', 'security/concierge', 'outdoor pool', 'guest suites', 'party room', 'bbqs allowed', 'bike storage']


In [173]:
merged.isna().sum()

price                   183
address                   0
mls                    1478
sqft                   1478
Community              1507
beds                   1478
baths                  1478
Building Type          1486
Air Conditioning       1744
Heating Type           1527
lat                      87
long                     87
Frontage               4536
Land Depth             6332
                          0
no amenity                0
pool                      0
visitor parking           0
party/meeting room        0
gym                       0
concierge                 0
indoor pool               0
exercise room             0
rooftop deck/garden       0
balcony                   0
nan                       0
exercise centre           0
sauna                     0
storage - locker          0
security/concierge        0
outdoor pool              0
guest suites              0
party room                0
bbqs allowed              0
bike storage              0
dtype: int64

In [174]:
merged["Pool"]

KeyError: 'Pool'

In [None]:
house_sigma_cleaned["Amenities:"]

0                                                     NaN
1       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
2       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
3       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
4                                                     NaN
                              ...                        
5521                                           No amenity
5522    Concierge,  Exercise Room,  Gym,  Sauna,  Visi...
5523                                                  NaN
5524                                                  NaN
5525                                                  NaN
Name: Amenities:, Length: 5526, dtype: object

In [None]:
house_sigma_cleaned['Amenities:']

0                                                     NaN
1       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
2       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
3       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
4                                                     NaN
                              ...                        
5521                                           No amenity
5522    Concierge,  Exercise Room,  Gym,  Sauna,  Visi...
5523                                                  NaN
5524                                                  NaN
5525                                                  NaN
Name: Amenities:, Length: 5526, dtype: object

In [None]:
house_sigma_cleaned['Amenities:'] = house_sigma_cleaned.apply(
    lambda row: (
        (str(row['Amenities:']) + ", Pool") if (
            pd.notna(row['Pool:']) or
            (isinstance(row['description 1'], str) and 'pool' in row['description 1'].lower()) or
            (isinstance(row['description 2'], str) and 'pool' in row['description 2'].lower())
        ) else row['Amenities:']
    ),
    axis=1
)

In [None]:
house_sigma_cleaned["Amenities:"]

0                                                     NaN
1       Concierge,  Gym,  Pool,  Sauna,  Tennis Court,...
2       Concierge,  Gym,  Pool,  Sauna,  Tennis Court,...
3       Concierge,  Gym,  Pool,  Sauna,  Tennis Court,...
4                                                     NaN
                              ...                        
5521                                           No amenity
5522    Concierge,  Exercise Room,  Gym,  Sauna,  Visi...
5523                                                  NaN
5524                                                  NaN
5525                                                  NaN
Name: Amenities:, Length: 5526, dtype: object

In [None]:
house_sigma_cleaned[house_sigma_cleaned["Amenities:"].str.contains("Pool", na=False)]

Unnamed: 0,link,Listed Price,Sold Price,Type,address,Lat,Long,Column_7,lat,long,...,sqft,mls,beds,baths,Heating Type,Air Conditioning,Community,Frontage,Land Depth,price
1,/on/etobicoke-real-estate/508-40-richview-rd/h...,729888.0,680000.0,Condo Apt,"508 - 40 Richview Rd , Etobicoke - Humber Heights",,,,43.685146,-79.515954,...,1299.5,W9381787,2,2,Heat Pump,Central Air,Humber Heights,,,729888.0
2,/on/etobicoke-real-estate/508-40-richview-rd/h...,729888.0,680000.0,Condo Apt,"508 - 40 Richview Rd , Etobicoke - Humber Heights",,,,43.685146,-79.515954,...,1299.5,W9381787,2,2,Heat Pump,Central Air,Humber Heights,,,729888.0
3,/on/etobicoke-real-estate/508-40-richview-rd/h...,729888.0,680000.0,Condo Apt,"508 - 40 Richview Rd , Etobicoke - Humber Heights",,,,43.685146,-79.515954,...,1299.5,W9381787,2,2,Heat Pump,Central Air,Humber Heights,,,729888.0
11,/on/toronto-real-estate/2611-8-eglinton-ave-e/...,659800.0,650000.0,Condo Apt,"2611 - 8 Eglinton Ave E , Toronto - Mount Plea...",,,,43.707208,-79.397783,...,649.5,C9510047,1,2,Forced Air,Central Air,Mount Pleasant West,,,659800.0
12,/on/toronto-real-estate/2611-8-eglinton-ave-e/...,659800.0,650000.0,Condo Apt,"2611 - 8 Eglinton Ave E , Toronto - Mount Plea...",,,,43.707208,-79.397783,...,649.5,C9510047,1,2,Forced Air,Central Air,Mount Pleasant West,,,659800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5489,/on/north-york-real-estate/3217-121-mcmahon-dr...,820000.0,798000.0,Condo Apt,"3217 - 121 Mcmahon Dr , North York - Bayview V...",,,,43.767647,-79.370696,...,949.5,C9363358,2,2,Forced Air,Central Air,Bayview Village,,,820000.0
5496,/on/toronto-real-estate/2709-30-grand-trunk-cr...,689900.0,660000.0,Condo Apt,"2709 - 30 Grand Trunk Cres , Toronto - Waterfr...",,,,43.642269,-79.382865,...,749.5,C9303034,2,1,Forced Air,Central Air,Waterfront Communities C1,,,689900.0
5497,/on/toronto-real-estate/1901-290-adelaide-st-w...,749998.0,742000.0,Condo Apt,"1901 - 290 Adelaide St W , Toronto - Waterfron...",,,,43.648081,-79.391026,...,649.5,C9248395,2,1,Forced Air,Central Air,Waterfront Communities C1,,,749998.0
5498,/on/north-york-real-estate/1004-8-covington-rd...,885000.0,825000.0,Condo Apt,"1004 - 8 Covington Rd , North York - Englemoun...",,,,43.721021,-79.431171,...,1099.5,C9357634,2,2,Forced Air,Central Air,Englemount-Lawrence,,,885000.0


In [None]:
merged = merged.drop(columns=[""])
merged = merged.loc[:, ~merged.columns.duplicated(keep='first')]
merged[merged["Pool"] ==1]

Unnamed: 0,price,address,mls,sqft,Community,beds,baths,Building Type,Air Conditioning,Heating Type,...,Balcony,Exercise Room,nan,Sauna,Guest Suites,Party Room,Storage - Locker,Exercise Centre,Bbqs Allowed,Security/Concierge
821,12990000.0,101 Dunloe Road,C11228279,5000.000000,Forest Hill South,5.0,7.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0
843,2199000.0,27 Wilmar Road,W11047391,2266.121714,Islington-City Centre West,5.0,4.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0
845,4395000.0,151 Yonge Boulevard,C11046995,4250.000000,Bedford Park-Nortown,5.0,5.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0
847,1499000.0,136 Anthony Road,W10993146,1757.997822,Downsview-Roding-CFB,4.0,2.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0
848,4898000.0,5 Cranleigh Court,W10957537,3607.256293,Edenbridge-Humber Valley,6.0,6.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2601,20800000.0,27 South Drive,C8397592,5000.000000,Rosedale-Moore Park,6.0,9.0,Detached,Central Air,Radiant,...,0,0,0,0,0,0,0,0,0,0
2602,14680000.0,175 Teddington Park Avenue,C8397488,3538.606292,Lawrence Park North,6.0,9.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0
2617,10499000.0,8 Swansdown Drive,C8333054,4108.697957,St. Andrew-Windfields,5.0,10.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0
2618,14980000.0,48 Arjay Crescent,C8329424,5000.000000,Bridle Path-Sunnybrook-York Mills,6.0,9.0,Detached,Central Air,Forced Air,...,0,0,0,0,0,0,0,0,0,0


In [None]:
merged.isna().sum()

price                   183
address                   0
mls                    1478
sqft                   1478
Community              1507
beds                   1478
baths                  1478
Building Type          1486
Air Conditioning       1744
Heating Type           1527
lat                      87
long                     87
No amenity                0
Party/Meeting Room        0
Gym                       0
Rooftop Deck/Garden       0
Balcony                   0
Pool                      0
Exercise Room             0
Sauna                     0
Guest Suites              0
Party Room                0
Storage - Locker          0
Exercise Centre           0
Bbqs Allowed              0
Security/Concierge        0
Concierge                 0
Visitor Parking           0
Recreation Room           0
dtype: int64

In [None]:
final_merged = final_cleaning(merged)

In [None]:
merged["price"].isna().sum()

np.int64(183)

In [None]:
final_merged.shape

(4254, 29)

In [None]:
final_merged.isna().sum()

price                    0
address                  0
mls                      0
sqft                     0
Community               28
beds                     0
baths                    0
Building Type            8
Air Conditioning       182
Heating Type            30
lat                     74
long                    74
No amenity               0
Party/Meeting Room       0
Gym                      0
Rooftop Deck/Garden      0
Balcony                  0
Pool                     0
Exercise Room            0
Sauna                    0
Guest Suites             0
Party Room               0
Storage - Locker         0
Exercise Centre          0
Bbqs Allowed             0
Security/Concierge       0
Concierge                0
Visitor Parking          0
Recreation Room          0
dtype: int64

In [None]:
final_merged.to_csv("prelim_final_merged_need_to_check.csv")

In [None]:
# Listings through the years

parent_dir = Path.cwd().parent
hs0 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 0.csv")
hs2 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 2.csv")
hs3 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 3.csv")
hs4 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 4 - Copy.csv", encoding="utf-8", on_bad_lines='skip')
hs5 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 5.csv")

hs_overview = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"housesigma_data_2003_with_coords.csv")

In [None]:
hs_total = pd.concat([hs0, hs2, hs3, hs4, hs5], ignore_index=True)

In [None]:
hs_total.shape

(87991, 35)

In [None]:
realtor_cleaned, zolo_cleaned, house_sigma_cleaned = clean_real_estate(pd.DataFrame(), pd.DataFrame(), hs_total, hs_overview)
merged = merge_real_estate(realtor_cleaned, zolo_cleaned, house_sigma_cleaned)


KeyError: ['mls']

In [None]:
parent_dir = Path.cwd().parent
realtor = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"realtorcom_listings.csv")
zolo = pd.read_csv(parent_dir/"1_data_extraction"/"newest_zolo_with_mls.csv")
house_sigma = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"properties_housesigma.csv")

In [None]:
# Numeric price
# house_sigma = numeric_price(house_sigma, ["Listed Price", "Sold Price"])

new_row = pd.DataFrame([house_sigma.columns], columns=house_sigma.columns)

# Step 2: Append the new row at the top of the DataFrame
new_house_sigma = pd.concat([new_row, house_sigma], ignore_index=True)

# Step 3: Set new column names (make sure the number of new column names matches the number of columns)
new_house_sigma.columns = ['Name', 'Property Info', 'Listing Info', 'Room Info', 'description 1', 'description 2', 'link']

new_house_sigma["Property Info"] = new_house_sigma["Property Info"].apply(ast.literal_eval)
new_house_sigma["Listing Info"] = new_house_sigma["Listing Info"].apply(ast.literal_eval)
new_house_sigma["Room Info"] = new_house_sigma["Room Info"].apply(ast.literal_eval)

property_expanded = pd.json_normalize(new_house_sigma['Property Info'])
listing_expanded = pd.json_normalize(new_house_sigma['Listing Info'])

# Concatenate the expanded DataFrames with the original DataFrame, excluding the old columns
df_expanded = pd.concat([new_house_sigma.drop(columns=['Property Info', 'Listing Info']), property_expanded, listing_expanded], axis=1)
    
df_expanded['calculated_sqft'] = df_expanded['Room Info'].apply(get_total_sqft)


df_expanded = df_expanded.loc[:, ~df_expanded.columns.duplicated(keep='last')]
size_columns = df_expanded.columns[df_expanded.columns.str.contains("Size:")]
size_columns

df_expanded["Size:"] = df_expanded["Size:"].fillna(0)

df_expanded["sqft"] = df_expanded.apply(lambda row: row["calculated_sqft"] if row["Size:"] == 0 else row["Size:"], axis=1)

house_sigma = df_expanded

house_sigma

Unnamed: 0,Name,Room Info,description 1,description 2,link,Tax:,Property Type:,Maintenance:,Included Utility:,Exposure:,...,Driveway Parking:,Parking Features:,Frontage Length:,Waterfront Features:,View:,Sloping:,Skiing:,Rolling:,calculated_sqft,sqft
0,"Key facts for Unit 508 - 40 Richview Rd, Humbe...","[{'type': 'Dining', 'size': '(4.70 x 3.22 m）',...","Discover this rare, spacious, and beautifully ...",['Spacious and beautifully renovated corner su...,/on/etobicoke-real-estate/508-40-richview-rd/h...,"$2, 265 / 2024",Condo Apt,$1113/month,"water, hydro, heat",Nw,...,,,,,,,,,844.716428,1200-1399 feet²
1,"Key facts for 24 Wallis Cres, Mount Olive-Silv...","[{'type': 'Living', 'size': '(5.48 x 3.05 m）',...",Welcome to 24 Wallis Cres. Make this detached ...,['Detached home located at 24 Wallis Cres in a...,/on/etobicoke-real-estate/24-wallis-cres/home/...,"$3, 039 / 2024",Detached,,,,...,,,,,,,,,825.374776,825.374776
2,"Key facts for Unit 2611 - 8 Eglinton Ave E, Mo...","[{'type': 'Kitchen', 'size': '(7.04 x 3.04 m）'...",Welcome To The Award Winning E-Condos In Highl...,['Award-winning E-Condos located in the desira...,/on/toronto-real-estate/2611-8-eglinton-ave-e/...,"$3, 627 / 2024",Condo Apt,$581/month,,S,...,,,,,,,,,620.095362,600-699 feet²
3,"Key facts for Unit 405 - 35 Fontenay Crt, Eden...","[{'type': 'Den', 'size': '(2.74 x 2.43 m）', 'l...",Boutique Building! One Bedroom + Den; 2 Bathro...,"[""Luxury living in a boutique building featuri...",/on/etobicoke-real-estate/405-35-fontenay-crt/...,"$2, 700 / 2024",Condo Apt,$758/month,"water, heat",Se,...,,,,,,,,,607.534967,700-799 feet²
4,"Key facts for Unit 12 - 51 Florence St, Little...",[],Brockton Commons is an exclusive collection of...,['Brockton Commons consists of 36 exclusive bo...,/on/toronto-real-estate/12-51-florence-st/home...,"$3, 285 / 2023",Condo Townhouse,$400/month,water,,...,,,,,,,,,0.000000,900-999 feet²
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,"Key facts for 282 Westlake Ave, Woodbine-Lumsd...","[{'type': 'Living', 'size': '(5.28 x 3.14 m）',...",Grand Solid Brick Home in Westlake Endless Po...,['Beautiful and well-maintained solid brick ho...,/on/toronto-real-estate/282-westlake-ave/home/...,"$4, 078 / 2024",Detached,,,,...,,,,,,,,,1098.966204,1098.966204
2624,"Key facts for 258 Perth Ave, Dovercourt-Wallac...","[{'type': 'Living', 'size': '(3.05 x 3.20 m）',...","A two storey, end unit, attached row house. Ac...",['Two-storey end unit attached row house with ...,/on/toronto-real-estate/258-perth-ave/home/EXr...,"$4, 371 / 2023",Freehold Townhouse,,,,...,,,,,,,,,1338.706243,1338.706243
2625,"Key facts for 93 Mount Olive Dr, Mount Olive-S...",[],Welcome to an incredible opportunity in the vi...,['Located in the vibrant Mount Olive neighborh...,/on/etobicoke-real-estate/93-mount-olive-dr/ho...,"$3, 369 / 2024",Detached,,,,...,,,,,,,,,0.000000,0.0
2626,"Key facts for 9 Maxwell Ave, Yonge-Eglinton, T...","[{'type': 'Living', 'size': '(6.43 x 3.51 m）',...",This exquisite side-centre hall home in Toront...,"[""Exquisite side-centre hall home located in T...",/on/toronto-real-estate/9-maxwell-ave/home/VgA...,"$8, 998 / 2024",Detached,,,,...,,,,,,,,,1562.018418,1562.018418
