- drop duplicates by MLS
- merge columns
- explodes amenities into individual columns
- calculate sqft when range or room dimensions (only) are given

In [552]:
import pandas as pd
import os
import numpy as np
import ast
import re
from pathlib import Path
from collections import Counter


In [553]:
import re

def get_street_address(address):
    if isinstance(address, str):
        # Step 1: Handle missing or invalid addresses
        address = address.strip()
        
        # Step 2: If there is a dash and the part after the dash seems like an apartment number, split
        if "-" in address:
            parts = address.split("-")
            # Check if the part after the dash is likely an apartment number (contains digits)
            if re.search(r'\d', parts[1]):  # If the second part contains digits, treat as apt number
                address = parts[1]  # This assumes the apartment number is the part after the dash
            else:
                address = parts[0]  # If not, we keep the first part (before the dash)

        # Step 3: Remove any content after the opening parenthesis (e.g., neighborhoods)
        address = address.split("(")[0].strip()

        # Step 4: Remove content after "Toronto" (city name) if necessary
        address = address.split("Toronto")[0].strip()

        # Step 5: Convert to lowercase
        return address.lower()
    return ''  # If it's not a valid string, return an empty string

In [554]:
def numeric_price(df, columns_list):
    """$600,000 (str) --> 600000 (float)"""
    df_copy = df.copy()
    
    for column in columns_list:
        # Remove "$" and "," then convert to numeric, invalid parsing will become NaN
        df_copy[column] = df_copy[column].str.replace("$", "", regex=False).str.replace(",", "", regex=False)
        df_copy[column] = pd.to_numeric(df_copy[column], errors='coerce')  # Convert to float, invalid entries become NaN
        
    return df_copy


In [555]:
def numeric_lot_dim(dim):
    """Converts dimension string (e.g., '122 ft, 8 in', '40 ft', '40') to a float representing feet."""
    
    if pd.isna(dim): # For the NaN values (str operations cannot operate on float-like NaN values)
        return np.nan
    if isinstance(dim, (int, float)):
        return float(dim)
    
    dim = dim.strip().lower()

    # Initialize
    feet = 0
    inches = 0

    # Both feet and inches (e.g., "122 ft, 8 in")
    feet_in_inches = re.match(r'(\d+)\s*ft.*?(\d+)\s*in', dim)
    if feet_in_inches:
        feet = int(feet_in_inches.group(1))  # feet
        inches = int(feet_in_inches.group(2))  # inches
    else:
        # Only feet is present (e.g., "40 ft" or "40")
        feet_in_feet_only = re.match(r'(\d+)\s*ft', dim)
        if feet_in_feet_only:
            feet = int(feet_in_feet_only.group(1))
        else:
            # String is just a number (e.g., "40")
            if dim.isdigit():
                feet = int(dim)

    # Convert inches to feet (12 inches = 1 foot)
    total_feet = feet + (inches / 12)
    
    return total_feet

In [556]:
def get_realtor_house_sigma_sqft(row):
    """For Realtor dfs.
    Remove 'sqft'.
    If Square footage is a range, take the average.
    If it is a single value, leave as is."""
    
    # Make sure to treat row as a string and remove 'sqft'
    if isinstance(row, str):
        row = row.replace("sqft", "")\
                    .replace("feet²", "")\
                    .replace("+", "")\
                    .replace("<", "")\
                    .replace("<", "")\
                    .strip()  # Remove 'units' and descriptors and strip spaces

        # Check if it's a range (contains '-')
        if "-" in row:
            lower, upper = row.split("-")
            return (float(lower.strip()) + float(upper.strip())) / 2
        else:
            return float(row)
    else:
        return None

In [557]:
def get_zolo_sqft(row, room_dimensions=None):
    """For Zolo dfs. 
    If Size (sq ft) is NaN, then replaces with the sqft calculated from the room dimensions.
    If Size (sq ft) is a range (i.e., contains "-"), replaces with the average of the bounds."""
    
    # If the 'Size (sq ft)' is a range (contains "-"):
    if isinstance(row, str) and '-' in row:
        lower, upper = row.split("-")
        # Calculate the average of the two bounds
        return (float(lower) + float(upper)) / 2
    
    # If 'Size (sq ft)' is NaN, calculate from room dimensions
    elif pd.isna(row) and room_dimensions:  
        try:
            # Check if 'room_dimensions' is a list-like or string that we can process
            dimensions = ast.literal_eval(room_dimensions)
            square_meters = 0

            # If room dimensions is a list of room sizes (i.e., '200x300', '100x200', etc.)
            if isinstance(dimensions, list):
                for dimension in dimensions:
                    # Only split if 'dimension' contains 'x'
                    length, width = dimension.split("x")
                    square_meters += float(length.strip()) * float(width.strip())
                    
            return square_meters*10.7639
        
        except (ValueError, SyntaxError, TypeError):
            # Handle invalid or malformed room dimensions
            return np.nan

    elif isinstance(row, str):
        # If it's a string that doesn't contain a "-", remove any "+" and return the value as float
        return float(row.replace("+", "").replace(">", "").replace("<", "").strip())
    
    else:
        # If it's a numeric value (already a number), return it
        return row


In [558]:
# Clean the string value associated with the key "size" in each dict the "Room Info" dict for each row.

def clean_size_string(size):
    """For House Sigma dfs"""
    # Remove all non-numeric characters and spaces around the dimensions
    cleaned_dimensions = re.sub(r'[^\d. x]', '', size)
    return cleaned_dimensions

# Calculate the area of each room (where each dict in the "Room Info" list represents a different room)

def get_area(room):
    """House Sigma dfs"""
    # Sometimes the "Room Info" list of dictionaries is an empty list [], or there's no key named "size"
    # for certain rooms in the list.
    if room['size'] is None:
        return 0
    
    # Clean to remove the mï¼‰ special characters after the dimensions for each room
    cleaned_size = clean_size_string(room['size'])
    
    # Use regex to identify form and pattern of dimensions in each dictionary in the list of rooms
    # Provided dimensions are actually in meters.
    match = re.match(r'([\d.]+) x ([\d.]+)', cleaned_size)
    if match:
        length = float(match.group(1))  
        width = float(match.group(2))   
        area_m2 = length * width       
        area_ft2 = area_m2 * 10.7639    
        return area_ft2
    else:
        return 0  
    
# Calculate total sqft (sum of the areas of all rooms in the list of dictionaries)

def get_total_sqft(rooms_list):
    return sum(get_area(room) for room in rooms_list)

In [559]:
def clean_real_estate(realtor, zolo, house_sigma, house_sigma_overview):
    
    """Cleans and processes realtor, zolo, and house_sigma dataframes."""
    
    house_sigma_overview = house_sigma_overview.rename(columns={
        "Column_8" : "lat",
        "Column_9": "long"
    })
    
    
    ##### REALTOR #####
    # Drop NaN MLSs and duplicate MLSs
    # Doubled checked many samples of MLSs and some listings just did not have a leading letter in the MLS.
    
    realtor.dropna(subset=['mls'], inplace=True)
    realtor.drop_duplicates(subset=['mls'], inplace=True)

    # Also cannot use any listings without sqft or address.
    realtor.dropna(subset=['Square Footage'], inplace=True)
    realtor.dropna(subset=['address'], inplace=True)

    # Get clean street address
    realtor["street address"] = realtor["address"].fillna('').apply(get_street_address)

    # Remove rentals
    realtor = realtor[~realtor["price"].str.contains("Monthly | Weekly", case=False, na=False)]


    # Get the numeric price
    realtor = numeric_price(realtor, ["price"])
    
    
    # Get sqft from sqft column (when scraping, there were no listings without sqft that had room dimensions, 
    # so we only scraped the sqft).
    realtor["sqft"] = realtor["Square Footage"].apply(get_realtor_house_sigma_sqft)
    realtor = realtor.drop(columns=["Square Footage"])
    
    # Beds and baths columns. Assume that a "partial bath" is 1/2 bath.
    realtor["beds"] = realtor["Above Grade"].fillna(0).astype(float) + realtor["Below Grade"].fillna(0).astype(float)
    realtor["baths"] = realtor["Total"].fillna(0).astype(float) + 0.5* realtor["Partial"].fillna(0).astype(float)
    realtor["Air Conditioning"] = realtor["Cooling"]
    realtor["Community"] = realtor["Community Name"]
    
    # Pool   
    
    realtor['Building Amenities'] = realtor['Building Amenities'].fillna('')  # Replace NaN with empty string so no float + str error
    
    # realtor['Building Amenities'] = realtor.apply(
    #     lambda row: (row['Building Amenities'] + ", Pool") if pd.notna(row['Pool Type']) else row['Building Amenities'],
    #     axis=1
    # )

    realtor['Building Amenities'] = realtor.apply(
        lambda row: (str(row['Building Amenities']) + ", Pool") if pd.notna(row['Pool Type']) else row['Building Amenities'],
        axis=1
    )
    
    realtor["Building Type"] = realtor.apply(
        lambda row: row["Style"] if row["Building Type"] == "House" else row["Building Type"], axis=1)
    
    
    realtor["Frontage"] = realtor["Frontage"].apply(numeric_lot_dim)
    realtor["Lot Depth"] = realtor["Land Depth"].apply(numeric_lot_dim)
    
    ##### ZOLO #####
    
    # Drop NaN MLSs and duplicate MLSs
    zolo["mls"] = zolo["mls"].astype(str).str[3:].str.replace("®", "", regex=False).str.strip()

    zolo.dropna(subset=['mls'], inplace=True)
    zolo.drop_duplicates(subset=['mls'], inplace=True)

    # Also cannot use any listings without an address.
    zolo.dropna(subset=['address'], inplace=True)

    # Get clean street address
    zolo["street address"] = zolo["address"].fillna('').apply(get_street_address)

    # Remove rentals
    zolo = zolo[~zolo["price"].str.contains("Monthly | Weekly", case=False, na=False)]
 
    # Get the numeric price
    zolo = numeric_price(zolo, ["price"])
   
    # Get sqft from sqft column (when scraping, there were no listings without sqft that had room dimensions, 
    # so we only scraped the sqft).
    
    
    
    # Extract first room dimension (length) if the room dimensions are in a list format
    zolo["sample_split_dimension"] = zolo["room dimensions"].apply(
        lambda x: ast.literal_eval(x)[0].split("x")[0] if isinstance(ast.literal_eval(x), list) and len(ast.literal_eval(x)) > 0 else None
    )

    # Convert the first dimension to float if it's not "N/A" or None, else set to None
    zolo["sample_split_dimension"] = zolo["sample_split_dimension"].apply(
        lambda x: float(x) if ((x != "N/A") and (x is not None)) else None
    )

    # Filter rows where sample_split_dimension is a valid float or Size (sq ft) is not NaN,
    # so that there is a way to get the sqft with either room dimensions or sqft.
    zolo = zolo[
        zolo["sample_split_dimension"].notna() |  # Keep rows where the dimension is a valid float (not NaN)
        zolo["Size (sq ft)"].notna()  # Keep rows where "Size (sq ft)" is not NaN
    ]
    
    zolo["sqft"] = zolo.apply(lambda row: get_zolo_sqft(row["Size (sq ft)"], row.get('room dimensions', None)), axis=1)
    zolo = zolo.drop(columns=["Size (sq ft)"])

    zolo.dropna(subset=['sqft'], inplace=True)
    
    zolo["beds"] = zolo["Bedrooms"].fillna(0).astype(float) + zolo["Bedrooms Plus"].fillna(0).astype(float)
    zolo["baths"] = zolo["Bathrooms"]
    zolo["Heating Type"] = zolo["Heating"]
    
    zolo = zolo.rename(columns={"Pool": "Pool Type"})
    zolo["Pool Type"] = zolo["Pool Type"].fillna("None")
    zolo["Pool Type"] = zolo["Pool Type"].apply(lambda x: "Pool" if pd.notna(x) and x != "None" else None)
    
    zolo['Amenity'] = zolo['Amenity'].fillna('')
    zolo['Amenity'] = zolo.apply(
        lambda row: row['Amenity'] + ", Pool" if pd.notna(row['Pool Type']) else row['Amenity'],
        axis=1
    )

    zolo['Amenity'] = zolo.apply(
        lambda row: (str(row['Amenity']) + ", Pool") if pd.notna(row['Pool Type']) else row['Amenity'],
        axis=1
    )
    
    zolo = zolo.rename(columns={"Type" : "Building Type"})

    zolo["Frontage"] = zolo["Frontage"].apply(numeric_lot_dim)
    zolo["Lot Depth"] = zolo["Lot Depth"].apply(numeric_lot_dim)

    ##### House Sigma #####

    new_row = pd.DataFrame([house_sigma.columns], columns=house_sigma.columns)

    # Step 2: Append the new row at the top of the DataFrame
    new_house_sigma = pd.concat([new_row, house_sigma], ignore_index=True)

    # Step 3: Set new column names (make sure the number of new column names matches the number of columns)
    new_house_sigma.columns = ['Name', 'Property Info', 'Listing Info', 'Room Info', 'description 1', 'description 2', 'link']
        
    new_house_sigma["Property Info"] = new_house_sigma["Property Info"].apply(ast.literal_eval)
    new_house_sigma["Listing Info"] = new_house_sigma["Listing Info"].apply(ast.literal_eval)
    new_house_sigma["Room Info"] = new_house_sigma["Room Info"].apply(ast.literal_eval)

    property_expanded = pd.json_normalize(new_house_sigma['Property Info'])
    listing_expanded = pd.json_normalize(new_house_sigma['Listing Info'])

    # Concatenate the expanded DataFrames with the original DataFrame, excluding the old columns
    df_expanded = pd.concat([new_house_sigma.drop(columns=['Property Info', 'Listing Info']), property_expanded, listing_expanded], axis=1)
    
    df_expanded['calculated_sqft'] = df_expanded['Room Info'].apply(get_total_sqft)


    # Drop the first instance of each duplicate column (property info and listing info had lots of duplicate keys)
    df_expanded = df_expanded.loc[:, ~df_expanded.columns.duplicated(keep='last')]
    
    df_expanded["Size:"] = df_expanded["Size:"].apply(get_realtor_house_sigma_sqft)

    df_expanded["Size:"] = df_expanded["Size:"].fillna(0)
    df_expanded["sqft"] = df_expanded.apply(lambda row: row["calculated_sqft"] if row["Size:"] == 0 else row["Size:"], axis=1)
    df_expanded = df_expanded.loc[df_expanded['sqft'] != 0]

    house_sigma = df_expanded

    house_sigma["mls"] = house_sigma["Listing #:"]
    house_sigma["beds"] = house_sigma["Bedrooms:"]
    house_sigma["baths"] = house_sigma["Bathrooms:"]
    house_sigma["Heating Type"] = house_sigma["Heating Type:"]
    house_sigma["Air Conditioning"] = house_sigma["Cooling:"]
    house_sigma["Community"] = house_sigma["Community:"]
    
    house_sigma["Frontage"] = house_sigma["Frontage:"].apply(numeric_lot_dim)
    house_sigma["Land Depth"] = house_sigma["Depth:"].apply(numeric_lot_dim)
    
    # house_sigma["Amenities:"] = house_sigma["Amenities:"].str.strip().replace({
    #     "Outdoor Pool": "Pool", 
    #     "Indoor Pool": "Pool"
    # })
    
    # house_sigma["Amenities:"] = house_sigma["Amenities:"].apply(
    #     lambda x: x.replace("Outdoor Pool", "Pool").replace("Indoor Pool", "Pool") if isinstance(x, str) else x
    # )

    house_sigma['Amenities:'] = house_sigma.apply(
        lambda row: (
            (str(row['Amenities:']) + ", Pool") if (
                pd.notna(row['Pool:']) or
                (isinstance(row['description 1'], str) and 'pool' in row['description 1'].lower()) or
                (isinstance(row['description 2'], str) and 'pool' in row['description 2'].lower())
            ) else row['Amenities:']
        ),
        axis=1
    )
    
    # house_sigma['Amenities:'] = house_sigma.apply(
    #     lambda row: (str(row['Amenities:']) + ", Pool") if (
    #     pd.notna(row['Pool:']) or
    #     (isinstance(row['description 1'], str) and 'pool' in row['description 1'].lower()) or # descriptions not str means that they are NaN or [] --> can ignore without removing these rows
    #     (isinstance(row['description 2'], str) and 'pool' in row['description 2'].lower())
    #     ) else row['Amenities:'],
    #     axis=1
    # )

    # house_sigma['Amenities:'] = house_sigma.apply(
    #     lambda row: (row['Amenities:'] + ", Pool") if (
    #                                                     pd.notna(row['Pool:']) or
    #                                                     row['description 1'].str.contains("pool", case=False, na=False) or
    #                                                     row['description 2'].str.contains("pool", case=False, na=False)
    #                                                 ) else row['Amenities:'],
    #                                                         axis=1
    # )
    
    # house_sigma['Amenities:'] = house_sigma.apply(
    #     lambda row: (row['Amenities:'] + ", Pool") if (pd.notna(row['Pool:']), |
    #                                                     house_sigma["description 1"].str.contains("pool", case=False), |
    #                                                     house_sigma["description 2"].str.contains("pool", case=False)
    #                                                     ) else row['Amenities:'],
    #                                                     axis=1)
    
    # Replace NaN values explicitly with "No Amenity"
    house_sigma["Amenities:"] = house_sigma["Amenities:"].fillna("No amenity")

    house_sigma_merged = pd.merge(house_sigma_overview, house_sigma, on="link", how="left")
    
    # Filter out listings for rent and convert price to float.
    
    house_sigma_merged = house_sigma_merged[~house_sigma_merged["Listed Price"].str.contains("Monthly | Weekly", case=False, na=False)]
    house_sigma_merged = numeric_price(house_sigma_merged, ["Listed Price", "Sold Price"])
    house_sigma_merged["price"] = house_sigma_merged["Listed Price"]
    
    house_sigma_merged = house_sigma_merged.rename(columns={"Property Type:" : "Building Type"})
    
    return realtor, zolo, house_sigma_merged

In [560]:
# def get_top_amenities(realtor, zolo, house_sigma):
#     """Gets bools for top 10 amenities across all three real estate dfs."""
    
#     # Concatenate the three dfs
#     concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

#     # Function to handle both strings and lists
#     def split_string_or_list(value):
#         # If the value is a string and contains commas, split it
#         if isinstance(value, str):
#             return value.split(',')  # Split by commas to create a list
#         elif isinstance(value, list):
#             return value  # Return the list as is
#         return []  # In case of other types (e.g., NaN or unexpected values)

#     # Exploding the amenities columns, one by one, and handling both strings and lists
#     features_exploded = concatenated['Features'].apply(split_string_or_list).explode().dropna()
#     building_amenities_exploded = concatenated['Building Amenities'].apply(split_string_or_list).explode().dropna()
#     amenity_exploded = concatenated['Amenity'].apply(split_string_or_list).explode().dropna()
#     amenities_exploded = concatenated['Amenities:'].apply(split_string_or_list).explode().dropna()

#     # Concatenate all the exploded lists into a single Series
#     all_amenities = pd.concat([features_exploded, building_amenities_exploded, amenity_exploded, amenities_exploded])

#     # Count the occurrences of each amenity
#     counter = Counter(all_amenities)
    
#     return counter

In [561]:
# def split_string_or_list(value):
#     """Handles both strings and lists, ensuring proper splitting and cleaning."""
#     # If it's a string, ensure proper trimming and splitting
#     if isinstance(value, str):
#         # Convert to lowercase and strip leading/trailing spaces, then split by commas
#         value = value.lower().strip()
#         return value.split(',')  # Split by commas to create a list
#     elif isinstance(value, list):
#         return value  # Return the list as is
#     return []  # In case of NaN or other unexpected types

# def split_string_or_list(value):
#     """Handles both strings and lists, ensuring proper splitting and cleaning."""
#     # If it's a string, ensure proper trimming and splitting
#     if isinstance(value, str):
#         # Convert to lowercase and strip leading/trailing spaces, then split by commas
#         value = value.lower().strip()
#         return [v.strip() for v in value.split(',')]  # Split by commas and strip each item
#     elif isinstance(value, list):
#         return value  # Return the list as is
#     return []


# def get_top_amenities(realtor, zolo, house_sigma):
#     """Gets bools for top amenities across all real estate dfs."""
    
#     # Concatenate the three dfs
#     concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

#     # Exploding the amenities columns, one by one, and handling both strings and lists
#     features_exploded = concatenated['Features'].apply(split_string_or_list).explode().dropna()
#     building_amenities_exploded = concatenated['Building Amenities'].apply(split_string_or_list).explode().dropna()
#     amenity_exploded = concatenated['Amenity'].apply(split_string_or_list).explode().dropna()
#     amenities_exploded = concatenated['Amenities:'].apply(split_string_or_list).explode().dropna()

#     # Concatenate all the exploded lists into a single Series
#     all_amenities = pd.concat([features_exploded, building_amenities_exploded, amenity_exploded, amenities_exploded])
#     print(all_amenities.unique())
#     # Count the occurrences of each amenity (case-insensitive)
#     counter = Counter(all_amenities)

#     # Print the top 20 and their respective counts.
#     print(counter.most_common(20))

#     return counter

In [562]:
def split_string_or_list(value):
    """Splits a string into a list or returns the value if it's already a list."""
    if isinstance(value, str):
        # If it's a string, split it by commas and strip whitespace
        return [item.strip() for item in value.split(',')]
    elif isinstance(value, list):
        # If it's already a list, return it as-is
        return value
    else:
        # Handle any other cases (e.g., NaN, None)
        return []

def lump_amenities(amenity):
    """Lumps related amenities into broader categories."""
    # Define groups of related amenities (e.g., pool-related, parking-related)
    amenity_groups = {
        'gym': ['gym', 'exercise room', 'exercise centre'],
        'parking': ['visitor parking', 'paved driveway', 'parking'],
        'balcony': ['balcony', 'rooftop deck/garden'],
        'security': ['security/concierge', 'concierge', 'security'],
        'guest suites': ['guest suite', 'guest suites'],
        'party room': ['party room', 'party/meeting room'],
        'fireplace': ['fireplace(s)'],
        'recreation': ['recreation room', 'recreation centre', 'games room'],
        'green_belt':["conservation/green belt", "'backs on greenbelt"],
        'wheelchair access': ["wheelchair access", "level lot", 'level', 'flat site'],
        'no amenity': ["no amenity", "nan", ""]
    }
    amenity_normalized = amenity.strip().lower()
    
    # Loop through the groups and return the group name if amenity matches
    for group, amenities in amenity_groups.items():

        if amenity_normalized in [a.strip().lower() for a in amenities]:
            return group
        
    return amenity  # If no match, return the original amenity (for ungrouped amenities)

def get_top_amenities(realtor, zolo, house_sigma):
    """Gets bools for top amenities across all real estate dfs."""
    
    # Concatenate the three DataFrames
    concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

    # List of columns that contain amenities in your DataFrames
    amenity_columns = ['Features', 'Building Amenities', 'Amenity', 'Amenities:']
    
    # Exploding the amenities columns, one by one, and handling both strings and lists
    exploded_amenities = []
    for col in amenity_columns:
        # Split and explode the values from each column
        exploded_amenities.append(concatenated[col].apply(split_string_or_list).explode().dropna())
    
    # Concatenate all the exploded lists into a single Series
    all_amenities = pd.concat(exploded_amenities)
    
    # Lump similar amenities into their broader groups
    grouped_amenities = all_amenities.apply(lump_amenities)
    
    # Count the occurrences of each group (case-insensitive)
    counter = Counter(grouped_amenities.str.lower())  # Use lower() to count case-insensitively
    
    # Print the top 20 amenities and their respective counts.
    top_20_amenities = counter.most_common(20)
    print("Top 20 Amenities (Grouped):")
    for amenity, count in top_20_amenities:
        print(f"{amenity}: {count}")
    
    return counter


In [563]:
# def merge_real_estate(realtor, zolo, house_sigma):
#     """Gets bools for top 10 amenities across all three real estate dfs."""
    
#     concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)

#     all_amenities = concatenated['Features'].explode().tolist() \
#         + concatenated['Building Amenities'].explode().tolist() \
#         + concatenated['Amenity'].explode().tolist() \
#         + concatenated['Amenities:'].explode().tolist()
    
#     counter = get_top_amenities(realtor, zolo, house_sigma)
#     top_20_amenities = counter.most_common(20)
    
#     top_20_amenity_names = [amenity[0].strip() for amenity in top_20_amenities]
    
#     # Create a new column for each top amenity: 1 if it's in the row, 0 otherwise
#     for amenity in top_20_amenity_names:
#         concatenated[amenity] = concatenated.apply(lambda row: 1 if amenity in row.values else 0, axis=1)
        
#     print("Columns in concatenated dataframe:", concatenated.columns)
#     print("Top 20 amenity names:", top_20_amenity_names)
#     columns_to_select = [
#         "price", 
#         "address", 
#         "mls", 
#         "sqft", 
#         "Community", 
#         "beds", 
#         "baths", 
#         "Air Conditioning", 
#         "Heating"
#     ] + top_20_amenity_names  

#     selected_data = concatenated[columns_to_select]
    
#     return selected_data


In [564]:
def merge_real_estate(realtor, zolo, house_sigma):
    """Gets bools for top 10 amenities across all three real estate dfs."""
    
    # Concatenate all the DataFrames
    concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)
    # concatenated_lower = concatenated.applymap(lambda x: str(x).lower() if pd.notnull(x) else '')

    # Get the top amenities from the concatenated DataFrame
    # all_amenities = concatenated['Features'].explode().tolist() \
    #     + concatenated['Building Amenities'].explode().tolist() \
    #     + concatenated['Amenity'].explode().tolist() \
    #     + concatenated['Amenities:'].explode().tolist()

    # Get the top 20 amenities based on the count
    counter = get_top_amenities(realtor, zolo, house_sigma)
    top_20_amenities = counter.most_common(21) # Used top 21 because "no amenity" shows up as a frequent "amenity".
    
    # Extract the names of the top 20 amenities
    top_20_amenity_names = [amenity[0].strip() for amenity in top_20_amenities]
    print(top_20_amenity_names)
    # Create a new DataFrame with the boolean columns for each amenity
    amenity_columns = pd.DataFrame()

    for amenity in top_20_amenity_names:
        
        amenity_lower = amenity.lower()

        amenity_columns[amenity] = concatenated.apply(
            lambda row: 1 if any(amenity_lower in str(value).lower() for value in set(row[['Features', 'Building Amenities', 'Amenity', 'Amenities:']].values)) else 0, axis=1
        )
        # Check only the columns that contain relevant amenity info
        # amenity_columns[amenity] = concatenated.apply(
        #     lambda row: 1 if any(amenity_lower in str(value).lower() for value in row[['Features', 'Building Amenities', 'Amenity', 'Amenities:']].values) else 0, axis=1
        # )
        
        
    # Convert both the amenity and the row values to lowercase for case-insensitive comparison
        # amenity_columns[amenity] = concatenated.apply(
        #     lambda row: 1 if any(amenity.lower() in str(value).lower() for value in row.values) else 0, axis=1
        # )
        
        # amenity_columns[amenity] = concatenated.apply(lambda row: print(row.values) or (1 if amenity in row.values else 0), axis=1)

        # amenity_columns[amenity] = concatenated.apply(lambda row: 1 if amenity in row.values.str.lower() else 0, axis=1)

    # Concatenate the new boolean columns with the original DataFrame
    concatenated = pd.concat([concatenated, amenity_columns], axis=1)

    # Select the relevant columns
    columns_to_select = [
        "price", 
        "address", 
        "mls", 
        "sqft",
        "Community", 
        "beds", 
        "baths",
        "Building Type",
        "Air Conditioning", 
        "Heating Type",
        "lat",
        "long",
        "Frontage",
        "Land Depth",
    ] + top_20_amenity_names  

    selected_data = concatenated[columns_to_select]
    
    return selected_data


In [565]:
# def merge_real_estate(realtor, zolo, house_sigma):
#     """Gets bools for top amenities across all three real estate dfs."""
    
#     # Concatenate all the DataFrames
#     concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)
    
#     # Convert the relevant columns to lowercase for case-insensitive comparison
#     amenity_columns_list = ['Features', 'Building Amenities', 'Amenity', 'Amenities:']
    
#     # Apply .lower() to each value in the specified columns
#     concatenated[amenity_columns_list] = concatenated[amenity_columns_list].applymap(
#         lambda x: str(x).lower() if pd.notnull(x) else ''
#     )
    
#     # Get the top amenities from the concatenated DataFrame
#     counter = get_top_amenities(realtor, zolo, house_sigma)
#     top_20_amenities = counter.most_common(21)  # Used top 21 because "no amenity" shows up as a frequent "amenity".
    
#     # Extract the names of the top 20 amenities (group names, not individual amenities)
#     top_20_amenity_names = [amenity[0].strip() for amenity in top_20_amenities]
#     print(top_20_amenity_names)

#     # Create a new DataFrame with the boolean columns for each amenity group
#     amenity_columns = pd.DataFrame()

#     for amenity_group in top_20_amenity_names:
#         # Convert the amenity group to lowercase for case-insensitive comparison
#         amenity_group_lower = amenity_group.lower()

#         # Check if any value in the relevant columns belongs to the current amenity group
#         amenity_columns[amenity_group] = concatenated.apply(
#             lambda row: 1 if any(
#                 lump_amenities(str(value)).lower() == amenity_group_lower
#                 for value in row[amenity_columns_list].dropna().values  # Only check non-null values
#             ) else 0,
#             axis=1
#         )
    
#     # Concatenate the new boolean columns with the original DataFrame
#     concatenated = pd.concat([concatenated, amenity_columns], axis=1)

#     # Select the relevant columns
#     columns_to_select = [
#         "price", 
#         "address", 
#         "mls", 
#         "sqft",
#         "Community", 
#         "beds", 
#         "baths",
#         "Building Type",
#         "Air Conditioning", 
#         "Heating Type",
#         "lat",
#         "long",
#         "Frontage",
#         "Land Depth",
#     ] + top_20_amenity_names  

#     selected_data = concatenated[columns_to_select]
    
#     return selected_data


In [566]:
# def lump_amenities(amenity):
#     """Lumps related amenities into broader categories."""
#     # Define groups of related amenities (e.g., pool-related, parking-related)
#     amenity_groups = {
#         'gym': ['gym', 'exercise room', 'exercise centre'],
#         'parking': ['visitor parking', 'paved driveway', 'parking', 'underground parking'],
#         'balcony': ['balcony', 'rooftop deck/garden'],
#         'security': ['security/concierge', 'concierge', 'security'],
#         'guest suites': ['guest suite', 'guest suites'],
#         'party room': ['party room', 'party/meeting room'],
#         'fireplace': ['fireplace(s)'],
#         'recreation': ['recreation room', 'recreation centre', 'games room'],
#         'green_belt': ["conservation/green belt", "'backs on greenbelt"],
#         'wheelchair access': ["wheelchair access", "level lot", 'level', 'flat site'],
#         'no amenity': ["no amenity", "nan", ""]
#     }
    
#     amenity_normalized = amenity.strip().lower()
    
#     # Loop through the groups and return the group name if amenity matches
#     for group, amenities in amenity_groups.items():
#         if amenity_normalized in [a.strip().lower() for a in amenities]:
#             return group
        
#     return amenity  # If no match, return the original amenity (for ungrouped amenities)

# def check_amenity_groups(concatenated, amenity_groups):
#     """Checks if a listing contains any of the grouped amenities and creates boolean columns for each."""
    
#     # Convert the relevant columns to lowercase for case-insensitive comparison
#     amenity_columns_list = ['Features', 'Building Amenities', 'Amenity', 'Amenities:']
#     concatenated[amenity_columns_list] = concatenated[amenity_columns_list].applymap(
#         lambda x: str(x).lower() if pd.notnull(x) else ''
#     )
    
#     # Create a new DataFrame to store the boolean columns for each group
#     amenity_columns = pd.DataFrame()
    
#     # For each group in the amenity groups, check if any of the values in the relevant columns match
#     for amenity_group in amenity_groups:
#         # Check if any value in the relevant columns belongs to the current amenity group
#         amenity_columns[amenity_group] = concatenated.apply(
#             lambda row: 1 if any(
#                 lump_amenities(str(value)).lower() == amenity_group
#                 for value in row[amenity_columns_list].dropna().values
#             ) else 0,
#             axis=1
#         )

#     # Concatenate the new boolean columns with the original DataFrame
#     concatenated = pd.concat([concatenated, amenity_columns], axis=1)

#     return concatenated




In [567]:
def split_string_or_list(value):
    """Splits a string into a list or returns the value if it's already a list."""
    if isinstance(value, str):
        # If it's a string, split it by commas and strip whitespace
        return [item.strip() for item in value.split(',')]
    elif isinstance(value, list):
        # If it's already a list, return it as-is
        return value
    else:
        # Handle any other cases (e.g., NaN, None)
        return []

def lump_amenities(amenity):
    """Lumps related amenities into broader categories."""
    # Define groups of related amenities (e.g., pool-related, parking-related)
    amenity_groups = {
        'gym': ['gym', 'exercise room', 'exercise centre'],
        'parking': ['visitor parking', 'paved driveway', 'parking'],
        'balcony': ['balcony', 'rooftop deck/garden'],
        'security': ['security/concierge', 'concierge', 'security'],
        'guest suites': ['guest suite', 'guest suites'],
        'party room': ['party room', 'party/meeting room'],
        'fireplace': ['fireplace(s)'],
        'recreation': ['recreation room', 'recreation centre', 'games room'],
        'green_belt':["conservation/green belt", "'backs on greenbelt"],
        'wheelchair access': ["wheelchair access", "level lot", 'level', 'flat site'],
        'no amenity': ["no amenity", "nan", ""]
    }
    amenity_normalized = amenity.strip().lower()
    
    # Loop through the groups and return the group name if amenity matches
    for group, amenities in amenity_groups.items():
        if amenity_normalized in [a.strip().lower() for a in amenities]:
            return group
        
    return amenity  # If no match, return the original amenity (for ungrouped amenities)

def check_amenity_groups(concatenated, amenity_groups):
    """Checks if a listing contains any of the grouped amenities and creates boolean columns for each."""
    
    # Convert the relevant columns to lowercase for case-insensitive comparison
    amenity_columns_list = ['Features', 'Building Amenities', 'Amenity', 'Amenities:']
    concatenated[amenity_columns_list] = concatenated[amenity_columns_list].applymap(
        lambda x: str(x).lower() if pd.notnull(x) else ''
    )
    
    # Create a new DataFrame to store the boolean columns for each group
    amenity_columns = pd.DataFrame()
    
    # For each group in the amenity groups, check if any of the values in the relevant columns match
    for amenity_group in amenity_groups:
        # Check if any value in the relevant columns belongs to the current amenity group
        amenity_columns[amenity_group] = concatenated.apply(
            lambda row: 1 if any(
                lump_amenities(str(value)).lower() == amenity_group
                for value in row[amenity_columns_list].dropna().values
            ) else 0,
            axis=1
        )

    # Concatenate the new boolean columns with the original DataFrame
    concatenated = pd.concat([concatenated, amenity_columns], axis=1)

    return concatenated

def merge_real_estate(realtor, zolo, house_sigma):
    """Merges data and adds boolean columns for top amenities."""
    
    # Concatenate all the DataFrames
    concatenated = pd.concat([realtor, zolo, house_sigma], ignore_index=True)
    concatenated = concatenated.drop_duplicates(subset=["mls"])
    
    # Get the top amenities from the concatenated DataFrame
    counter = get_top_amenities(realtor, zolo, house_sigma)
    top_20_amenities = counter.most_common(21)  # Used top 21 because "no amenity" shows up as a frequent "amenity".
    
    # Extract the names of the top amenities
    top_20_amenity_names = [amenity[0].strip() for amenity in top_20_amenities]
    
    # Create a new DataFrame with the boolean columns for each amenity
    amenity_columns = pd.DataFrame()

    for amenity in top_20_amenity_names:
        amenity_lower = amenity.lower()

        amenity_columns[amenity] = concatenated.apply(
            lambda row: 1 if any(amenity_lower in str(value).lower() for value in set(row[['Features', 'Building Amenities', 'Amenity', 'Amenities:']].values)) else 0, axis=1
        )
        
    # Concatenate the new boolean columns with the original DataFrame
    concatenated = pd.concat([concatenated, amenity_columns], axis=1)

    # Select the relevant columns
    columns_to_select = [
        "price", 
        "address", 
        "mls", 
        "sqft",
        "Community", 
        "beds", 
        "baths",
        "Building Type",
        "Air Conditioning", 
        "Heating Type",
        "lat",
        "long",
        "Frontage",
        "Land Depth",
    ] + top_20_amenity_names  

    selected_data = concatenated[columns_to_select]
    
    return selected_data


In [568]:
def final_cleaning(merged):
    """Remove any remaining NaN values by price, address, MLS, and sqft.
    Drop duplicates by MLS across the three real estate data sources."""
    
    merged = merged.dropna(subset=['price', 'address', 'mls', 'sqft'])
                    
    merged = merged.drop_duplicates(subset=["mls"])
    
    return merged

In [569]:
parent_dir = Path.cwd().parent
realtor = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"realtor_with_coords.csv")
zolo = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"zolo_with_coords.csv")
house_sigma = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"properties_housesigma.csv")
house_sigma_overview = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"housesigma_data_with_coords.csv")

In [570]:
realtor_cleaned, zolo_cleaned, house_sigma_cleaned = clean_real_estate(realtor, zolo, house_sigma, house_sigma_overview)
merged = merge_real_estate(realtor_cleaned, zolo_cleaned, house_sigma_cleaned)



Top 20 Amenities (Grouped):
no amenity: 4329
gym: 2078
pool: 1816
party room: 1510
security: 1394
parking: 1309
balcony: 1023
indoor pool: 638
recreation: 571
sauna: 413
storage - locker: 403
guest suites: 370
outdoor pool: 348
bbqs allowed: 291
bike storage: 277
carpet free: 234
in suite laundry: 200
media room: 171
tennis court: 134
car wash: 82


In [571]:
merged["Building Type"].unique()

array(['Apartment', 'Detached', nan, 'Row / Townhouse', 'Semi-detached',
       'Duplex', 'Triplex', 'Condo Apt', 'Semi-Detached',
       'Att/Row/Twnhouse', 'Multiplex', 'Condo Townhouse',
       'Freehold Townhouse', 'Condo/Apt Unit', 'Co-Op Apt', 'Link',
       'Co-Ownership Apt'], dtype=object)

In [577]:
merged[(merged["pool"]==0) & (merged["Building Type"]=="Apartment")]

Unnamed: 0,price,address,mls,sqft,Community,beds,baths,Building Type,Air Conditioning,Heating Type,...,guest suites,outdoor pool,bbqs allowed,bike storage,carpet free,in suite laundry,media room,tennis court,car wash,squash/racquet court
0,888000.0,1515 - 8 HILLCREST AVENUEToronto (Willowdale E...,C10650046,1099.5,Willowdale East,3.0,2.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,0,0,0,0,0,0
1,639000.0,1502 - 10 YORK STREETToronto (Waterfront Commu...,C10640404,549.5,Waterfront Communities C1,1.0,1.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,0,1,0,0,0,0
3,855000.0,"506 - 12 REAN DRIVEToronto (Bayview Village), ...",C10511289,1299.5,Bayview Village,2.0,2.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,1,0,0,0,0,0
6,849900.0,1603 - 10 YONGE STREETToronto (Waterfront Comm...,C10477072,949.5,Waterfront Communities C1,2.0,2.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,0,0,0,0,0,0
8,648800.0,502 - 5 CONCORDE PLACEToronto (Banbury-Don Mil...,C10477051,1099.5,Banbury-Don Mills,2.0,2.0,Apartment,Central air conditioning,Forced air (Electric),...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802,728000.0,1609 - 2033 KENNEDY ROADToronto (Agincourt Sou...,E10427714,949.5,Agincourt South-Malvern West,3.0,2.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,1,0,0,0,0,0
804,805900.0,"410 - 10 OAK STREETToronto (Regent Park), Onta...",C10428115,549.5,Regent Park,2.0,1.0,Apartment,Central air conditioning,Forced air,...,0,0,0,0,1,1,0,0,0,0
805,888900.0,"906 - 10 OAK STREETToronto (Regent Park), Onta...",C10428176,649.5,Regent Park,3.0,1.0,Apartment,Central air conditioning,Forced air,...,0,0,0,0,1,1,0,0,0,0
807,1199998.0,6116 - 388 YONGE STREET WToronto (Bay Street C...,C10428067,1299.5,Bay Street Corridor,3.0,2.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,0,1,0,0,0,0


In [573]:
merged[merged["party room"]]

KeyError: "None of [Index([0, 0, 1, 0, 0, 0, 0, 1, 1, 0,\n       ...\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n      dtype='int64', length=4421)] are in the [columns]"

In [None]:
merged.columns

Index(['price', 'address', 'mls', 'sqft', 'Community', 'beds', 'baths',
       'Building Type', 'Air Conditioning', 'Heating Type', 'lat', 'long',
       'Frontage', 'Land Depth', 'no amenity', 'gym', 'pool', 'party room',
       'security', 'parking', 'balcony', 'indoor pool', 'recreation', 'sauna',
       'storage - locker', 'guest suites', 'outdoor pool', 'bbqs allowed',
       'bike storage', 'carpet free', 'in suite laundry', 'media room',
       'tennis court', 'car wash', 'squash/racquet court'],
      dtype='object')

In [None]:
house_sigma_cleaned["Amenities:"].unique()

array([nan,
       'Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis Court,  Visitor Parking, Pool',
       'No amenity', 'nan, Pool',
       'Exercise Room,  Outdoor Pool,  Party/Meeting Room,  Tennis Court,  Visitor Parking',
       'Concierge,  Guest Suites,  Gym,  Indoor Pool,  Sauna,  Visitor Parking, Pool',
       'Exercise Room,  Indoor Pool,  Outdoor Pool,  Party/Meeting Room,  Sauna,  Tennis Court, Pool',
       'Gym,  Party/Meeting Room,  Rooftop Deck/Garden', 'Concierge',
       'Concierge,  Gym,  Party/Meeting Room,  Recreation Room,  Squash/Racquet Court,  Visitor Parking',
       'Gym,  Indoor Pool,  Party/Meeting Room,  Squash/Racquet Court,  Tennis Court,  Visitor Parking',
       'Concierge,  Gym,  Party/Meeting Room,  Rooftop Deck/Garden,  Visitor Parking',
       'Concierge,  Gym,  Indoor Pool,  Outdoor Pool,  Party/Meeting Room,  Sauna',
       'Concierge,  Exercise Room,  Party/Meeting Room',
       'Concierge,  Exercise Room,  Gym,  Party/Meeting Room,  Recreation R

In [None]:
merged.isna().sum()

price                    183
address                    0
mls                     1478
sqft                    1478
Community               1507
beds                    1478
baths                   1478
Building Type           1486
Air Conditioning        1744
Heating Type            1527
lat                       87
long                      87
Frontage                4536
Land Depth              6332
no amenity                 0
gym                        0
pool                       0
party room                 0
security                   0
parking                    0
balcony                    0
indoor pool                0
recreation                 0
sauna                      0
storage - locker           0
guest suites               0
outdoor pool               0
bbqs allowed               0
bike storage               0
carpet free                0
in suite laundry           0
media room                 0
tennis court               0
car wash                   0
squash/racquet

In [None]:
house_sigma_cleaned['Amenities:'] = house_sigma_cleaned.apply(
    lambda row: (
        (str(row['Amenities:']) + ", Pool") if (
            pd.notna(row['Pool:']) or
            (isinstance(row['description 1'], str) and 'pool' in row['description 1'].lower()) or
            (isinstance(row['description 2'], str) and 'pool' in row['description 2'].lower())
        ) else row['Amenities:']
    ),
    axis=1
)

In [None]:
house_sigma_cleaned["Amenities:"]

0                                                     NaN
1       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
2       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
3       Concierge,  Gym,  Indoor Pool,  Sauna,  Tennis...
4                                                     NaN
                              ...                        
5521                                           No amenity
5522    Concierge,  Exercise Room,  Gym,  Sauna,  Visi...
5523                                                  NaN
5524                                                  NaN
5525                                                  NaN
Name: Amenities:, Length: 5526, dtype: object

In [None]:
merged.isna().sum()

price                    183
address                    0
mls                     1478
sqft                    1478
Community               1507
beds                    1478
baths                   1478
Building Type           1486
Air Conditioning        1744
Heating Type            1527
lat                       87
long                      87
Frontage                4536
Land Depth              6332
no amenity                 0
gym                        0
pool                       0
party room                 0
security                   0
parking                    0
balcony                    0
indoor pool                0
recreation                 0
sauna                      0
storage - locker           0
guest suites               0
outdoor pool               0
bbqs allowed               0
bike storage               0
carpet free                0
in suite laundry           0
media room                 0
tennis court               0
car wash                   0
squash/racquet

In [None]:
final_merged = final_cleaning(merged)

In [None]:
merged["price"].isna().sum()

np.int64(183)

In [None]:
final_merged.shape

(4254, 35)

In [None]:
final_merged.isna().sum()

price                      0
address                    0
mls                        0
sqft                       0
Community                 28
beds                       0
baths                      0
Building Type              8
Air Conditioning         182
Heating Type              30
lat                       74
long                      74
Frontage                1753
Land Depth              3356
no amenity                 0
gym                        0
pool                       0
party room                 0
security                   0
parking                    0
balcony                    0
indoor pool                0
recreation                 0
sauna                      0
storage - locker           0
guest suites               0
outdoor pool               0
bbqs allowed               0
bike storage               0
carpet free                0
in suite laundry           0
media room                 0
tennis court               0
car wash                   0
squash/racquet

In [None]:
final_merged.to_csv("nov30_evening_merged.csv")

In [None]:
merged.columns

Index(['price', 'address', 'mls', 'sqft', 'Community', 'beds', 'baths',
       'Building Type', 'Air Conditioning', 'Heating Type', 'lat', 'long',
       'Frontage', 'Land Depth', 'no amenity', 'gym', 'pool', 'party room',
       'security', 'parking', 'balcony', 'indoor pool', 'recreation', 'sauna',
       'storage - locker', 'guest suites', 'outdoor pool', 'bbqs allowed',
       'bike storage', 'carpet free', 'in suite laundry', 'media room',
       'tennis court', 'car wash', 'squash/racquet court'],
      dtype='object')

In [None]:
merged[merged["sauna"]==1]

Unnamed: 0,price,address,mls,sqft,Community,beds,baths,Building Type,Air Conditioning,Heating Type,...,guest suites,outdoor pool,bbqs allowed,bike storage,carpet free,in suite laundry,media room,tennis court,car wash,squash/racquet court
0,888000.0,1515 - 8 HILLCREST AVENUEToronto (Willowdale E...,C10650046,1099.5,Willowdale East,3.0,2.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,0,0,0,0,0,0
11,375000.0,407 - 235 GRANDRAVINE DRIVEToronto (Glenfield-...,W10477032,649.5,Glenfield-Jane Heights,1.0,1.0,Apartment,,Radiant heat (Natural gas),...,0,0,0,0,1,1,0,0,0,0
12,539900.0,503 - 117 GERRARD STREET EToronto (Church-Yong...,C10463710,749.5,Church-Yonge Corridor,2.0,1.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,0,0,0,0,0,0
26,650000.0,2301 - 135 ANTIBES DRIVEToronto (Westminster-B...,C10454134,1299.5,Westminster-Branson,2.0,2.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,1,0,0,0,0,0
28,626000.0,1116 - 509 BEECROFT ROADToronto (Willowdale We...,C10454126,749.5,Willowdale West,2.0,1.0,Apartment,Central air conditioning,Forced air (Natural gas),...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8098,498000.0,"1817 - 5 Greystone Walk Dr , Scarborough - Ken...",E9349620,649.5,Kennedy Park,1,1,Condo Apt,Central Air,Forced Air,...,0,1,0,0,0,0,0,0,0,0
8099,498000.0,"1817 - 5 Greystone Walk Dr , Scarborough - Ken...",E9349620,649.5,Kennedy Park,1,1,Condo Apt,Central Air,Forced Air,...,0,1,0,0,0,0,0,0,0,0
8104,657000.0,"2001 - 215 Fort York Blvd , Toronto - Waterfro...",C9389807,649.5,Waterfront Communities C1,1,1,Condo Apt,Central Air,Forced Air,...,1,0,0,0,0,0,0,0,0,0
8105,657000.0,"2001 - 215 Fort York Blvd , Toronto - Waterfro...",C9389807,649.5,Waterfront Communities C1,1,1,Condo Apt,Central Air,Forced Air,...,1,0,0,0,0,0,0,0,0,0


In [None]:
final_merged.columns

Index(['price', 'address', 'mls', 'sqft', 'Community', 'beds', 'baths',
       'Building Type', 'Air Conditioning', 'Heating Type', 'lat', 'long',
       'Frontage', 'Land Depth', 'no amenity', 'gym', 'pool', 'party room',
       'security', 'parking', 'balcony', 'indoor pool', 'recreation', 'sauna',
       'storage - locker', 'guest suites', 'outdoor pool', 'bbqs allowed',
       'bike storage', 'carpet free', 'in suite laundry', 'media room',
       'tennis court', 'car wash', 'squash/racquet court'],
      dtype='object')

In [None]:
# Listings through the years

parent_dir = Path.cwd().parent
hs0 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 0.csv")
hs2 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 2.csv")
hs3 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 3.csv")
hs4 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 4 - Copy.csv", encoding="utf-8", on_bad_lines='skip')
hs5 = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"extracted_houses_housesigma_2003_with_properties 5.csv")

hs_overview = pd.read_csv(parent_dir/"2_data_cleaning"/"cleaned_csv"/"housesigma_data_2003_with_coords.csv")

In [None]:
hs_total = pd.concat([hs0, hs2, hs3, hs4, hs5], ignore_index=True)

In [None]:
hs_total.shape

(87990, 32)

In [None]:
realtor_cleaned, zolo_cleaned, house_sigma_cleaned = clean_real_estate(pd.DataFrame(), pd.DataFrame(), hs_total, hs_overview)
merged = merge_real_estate(realtor_cleaned, zolo_cleaned, house_sigma_cleaned)


KeyError: ['mls']

In [None]:
parent_dir = Path.cwd().parent
realtor = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"realtorcom_listings.csv")
zolo = pd.read_csv(parent_dir/"1_data_extraction"/"newest_zolo_with_mls.csv")
house_sigma = pd.read_csv(parent_dir/"0_raw_data"/"house_data"/"properties_housesigma.csv")

In [None]:
# Numeric price
# house_sigma = numeric_price(house_sigma, ["Listed Price", "Sold Price"])

new_row = pd.DataFrame([house_sigma.columns], columns=house_sigma.columns)

# Step 2: Append the new row at the top of the DataFrame
new_house_sigma = pd.concat([new_row, house_sigma], ignore_index=True)

# Step 3: Set new column names (make sure the number of new column names matches the number of columns)
new_house_sigma.columns = ['Name', 'Property Info', 'Listing Info', 'Room Info', 'description 1', 'description 2', 'link']

new_house_sigma["Property Info"] = new_house_sigma["Property Info"].apply(ast.literal_eval)
new_house_sigma["Listing Info"] = new_house_sigma["Listing Info"].apply(ast.literal_eval)
new_house_sigma["Room Info"] = new_house_sigma["Room Info"].apply(ast.literal_eval)

property_expanded = pd.json_normalize(new_house_sigma['Property Info'])
listing_expanded = pd.json_normalize(new_house_sigma['Listing Info'])

# Concatenate the expanded DataFrames with the original DataFrame, excluding the old columns
df_expanded = pd.concat([new_house_sigma.drop(columns=['Property Info', 'Listing Info']), property_expanded, listing_expanded], axis=1)
    
df_expanded['calculated_sqft'] = df_expanded['Room Info'].apply(get_total_sqft)


df_expanded = df_expanded.loc[:, ~df_expanded.columns.duplicated(keep='last')]
size_columns = df_expanded.columns[df_expanded.columns.str.contains("Size:")]
size_columns

df_expanded["Size:"] = df_expanded["Size:"].fillna(0)

df_expanded["sqft"] = df_expanded.apply(lambda row: row["calculated_sqft"] if row["Size:"] == 0 else row["Size:"], axis=1)

house_sigma = df_expanded

house_sigma

Unnamed: 0,Name,Room Info,description 1,description 2,link,Tax:,Property Type:,Maintenance:,Included Utility:,Exposure:,...,Driveway Parking:,Parking Features:,Frontage Length:,Waterfront Features:,View:,Sloping:,Skiing:,Rolling:,calculated_sqft,sqft
0,"Key facts for Unit 508 - 40 Richview Rd, Humbe...","[{'type': 'Dining', 'size': '(4.70 x 3.22 m）',...","Discover this rare, spacious, and beautifully ...",['Spacious and beautifully renovated corner su...,/on/etobicoke-real-estate/508-40-richview-rd/h...,"$2, 265 / 2024",Condo Apt,$1113/month,"water, hydro, heat",Nw,...,,,,,,,,,844.716428,1200-1399 feet²
1,"Key facts for 24 Wallis Cres, Mount Olive-Silv...","[{'type': 'Living', 'size': '(5.48 x 3.05 m）',...",Welcome to 24 Wallis Cres. Make this detached ...,['Detached home located at 24 Wallis Cres in a...,/on/etobicoke-real-estate/24-wallis-cres/home/...,"$3, 039 / 2024",Detached,,,,...,,,,,,,,,825.374776,825.374776
2,"Key facts for Unit 2611 - 8 Eglinton Ave E, Mo...","[{'type': 'Kitchen', 'size': '(7.04 x 3.04 m）'...",Welcome To The Award Winning E-Condos In Highl...,['Award-winning E-Condos located in the desira...,/on/toronto-real-estate/2611-8-eglinton-ave-e/...,"$3, 627 / 2024",Condo Apt,$581/month,,S,...,,,,,,,,,620.095362,600-699 feet²
3,"Key facts for Unit 405 - 35 Fontenay Crt, Eden...","[{'type': 'Den', 'size': '(2.74 x 2.43 m）', 'l...",Boutique Building! One Bedroom + Den; 2 Bathro...,"[""Luxury living in a boutique building featuri...",/on/etobicoke-real-estate/405-35-fontenay-crt/...,"$2, 700 / 2024",Condo Apt,$758/month,"water, heat",Se,...,,,,,,,,,607.534967,700-799 feet²
4,"Key facts for Unit 12 - 51 Florence St, Little...",[],Brockton Commons is an exclusive collection of...,['Brockton Commons consists of 36 exclusive bo...,/on/toronto-real-estate/12-51-florence-st/home...,"$3, 285 / 2023",Condo Townhouse,$400/month,water,,...,,,,,,,,,0.000000,900-999 feet²
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2623,"Key facts for 282 Westlake Ave, Woodbine-Lumsd...","[{'type': 'Living', 'size': '(5.28 x 3.14 m）',...",Grand Solid Brick Home in Westlake Endless Po...,['Beautiful and well-maintained solid brick ho...,/on/toronto-real-estate/282-westlake-ave/home/...,"$4, 078 / 2024",Detached,,,,...,,,,,,,,,1098.966204,1098.966204
2624,"Key facts for 258 Perth Ave, Dovercourt-Wallac...","[{'type': 'Living', 'size': '(3.05 x 3.20 m）',...","A two storey, end unit, attached row house. Ac...",['Two-storey end unit attached row house with ...,/on/toronto-real-estate/258-perth-ave/home/EXr...,"$4, 371 / 2023",Freehold Townhouse,,,,...,,,,,,,,,1338.706243,1338.706243
2625,"Key facts for 93 Mount Olive Dr, Mount Olive-S...",[],Welcome to an incredible opportunity in the vi...,['Located in the vibrant Mount Olive neighborh...,/on/etobicoke-real-estate/93-mount-olive-dr/ho...,"$3, 369 / 2024",Detached,,,,...,,,,,,,,,0.000000,0.0
2626,"Key facts for 9 Maxwell Ave, Yonge-Eglinton, T...","[{'type': 'Living', 'size': '(6.43 x 3.51 m）',...",This exquisite side-centre hall home in Toront...,"[""Exquisite side-centre hall home located in T...",/on/toronto-real-estate/9-maxwell-ave/home/VgA...,"$8, 998 / 2024",Detached,,,,...,,,,,,,,,1562.018418,1562.018418
