In [3]:
import pandas as pd
import os

In [34]:
realtor = pd.read_csv("real_estate_listings.csv")
realtor.shape

(2976, 72)

In [35]:
# Unique listings based on MLS ID: 1071

realtor["mls"].unique().shape[0]

1071

In [36]:
import re

def extract_street_address(address):
    if isinstance(address, str):
        # Step 1: Handle missing or invalid addresses
        address = address.strip()
        
        # Step 2: If there is a dash and the part after the dash seems like an apartment number, split
        if "-" in address:
            parts = address.split("-")
            # Check if the part after the dash is likely an apartment number (contains digits)
            if re.search(r'\d', parts[1]):  # If the second part contains digits, treat as apt number
                address = parts[1]  # This assumes the apartment number is the part after the dash
            else:
                address = parts[0]  # If not, we keep the first part (before the dash)

        # Step 3: Remove any content after the opening parenthesis (e.g., neighborhoods)
        address = address.split("(")[0].strip()

        # Step 4: Remove content after "Toronto" (city name) if necessary
        address = address.split("Toronto")[0].strip()

        # Step 5: Convert to lowercase
        return address.lower()
    return ''  # If it's not a valid string, return an empty string

# Apply the function to the 'address' column
realtor["street address"] = realtor["address"].fillna('').apply(extract_street_address)

realtor

Unnamed: 0,price,address,mls,office_name,office_type,office_address,Property Type,Building Type,Storeys,Square Footage,...,Construction Status,Construction Material,Waterfront Name,Other Style,Fireplace Fuel,Utility-Hydro,Other,Age Of Building,Business Type,street address
0,"$899,800",50 - 719 LAWRENCE AVENUE WToronto (Yorkdale-Gl...,W10678586,RE/MAX ATRIUM HOME REALTY,Brokerage,"7100 WARDEN AVE #1AMARKHAM, Ontario L3R8B5",Single Family,Row / Townhouse,3.0,1000 - 1199 sqft,...,,,,,,,,,,719 lawrence avenue w
1,"$888,000",1515 - 8 HILLCREST AVENUEToronto (Willowdale E...,C10650046,HOMELIFE NEW WORLD REALTY INC.,Brokerage,"201 CONSUMERS RD., STE. 205TORONTO, Ontario M2...",Single Family,Apartment,,1000 - 1199 sqft,...,,,,,,,,,,8 hillcrest avenue
2,"$639,000",1502 - 10 YORK STREETToronto (Waterfront Commu...,C10640404,RE/MAX REALTRON TNS REALTY INC.,Brokerage,"7800 WOODBINE AVE PHMARKHAM, Ontario L3R2N7",Single Family,Apartment,,500 - 599 sqft,...,,,,,,,,,,10 york street
3,"$1,139,000",185 RICHARD CLARK DRIVEToronto (Downsview-Rodi...,W10562062,IPRO REALTY LTD.,Brokerage,"3079B DUNDAS ST WESTTORONTO, Ontario M6P1Z9",Single Family,House,1.0,,...,,,,,,,,,,185 richard clark drive
4,"$2,150,000","112 ANNDALE DRIVEToronto (Willowdale East), On...",C10564186,HOMELIFE GOLD PACIFIC REALTY INC.,Brokerage,"3601 VICTORIA PARK AVE #401TORONTO, Ontario M1...",Single Family,House,1.5,,...,,,,,,,,,,112 anndale drive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2971,"$745,000",PH2 - 77 MAITLAND PLACEToronto (Cabbagetown-So...,C10430862,RE/MAX REALTRON BARRY COHEN HOMES INC.,Brokerage,"309 YORK MILLS RO UNIT 7TORONTO, Ontario M2L1L3",Single Family,Apartment,,900 - 999 sqft,...,,,,,,,,,,77 maitland place
2972,"$745,000",1909 - 77 HARBOUR SQUAREToronto (Waterfront Co...,C10430973,ROYAL LEPAGE YOUR COMMUNITY REALTY,Brokerage,"187 KING STREET EASTTORONTO, Ontario M5A1J5",Single Family,Apartment,,700 - 799 sqft,...,,,,,,,,,,77 harbour square
2973,"$744,000",2101 - 50 LOMBARD STREETToronto (Church-Yonge ...,C10433752,HARVEY KALLES REAL ESTATE LTD.,Brokerage,"2145 AVENUE ROADTORONTO, Ontario M5M4B2",Single Family,Apartment,,1000 - 1199 sqft,...,,,,,,,,,,50 lombard street
2974,"$740,000",LPH 6 - 1 AVONDALE AVENUEToronto (Willowdale E...,C10440933,RE/MAX EXCEL REALTY LTD.,Brokerage,"120 WEST BEAVER CREEK RD #23RICHMOND HILL, Ont...",Single Family,,,800 - 899 sqft,...,,,,,,,,,,1 avondale avenue


In [37]:
realtor[realtor["mls"].isna()]

Unnamed: 0,price,address,mls,office_name,office_type,office_address,Property Type,Building Type,Storeys,Square Footage,...,Construction Status,Construction Material,Waterfront Name,Other Style,Fireplace Fuel,Utility-Hydro,Other,Age Of Building,Business Type,street address
145,,,,,,,,,,,...,,,,,,,,,,
146,,,,,,,,,,,...,,,,,,,,,,
147,,,,,,,,,,,...,,,,,,,,,,
148,,,,,,,,,,,...,,,,,,,,,,
149,,,,,,,,,,,...,,,,,,,,,,
150,,,,,,,,,,,...,,,,,,,,,,
151,,,,,,,,,,,...,,,,,,,,,,
152,,,,,,,,,,,...,,,,,,,,,,
153,,,,,,,,,,,...,,,,,,,,,,
154,,,,,,,,,,,...,,,,,,,,,,


In [38]:
realtor.dropna(subset=['mls'], inplace=True)
realtor.shape

(2963, 73)

In [40]:
realtor.drop_duplicates(subset=['mls'], inplace=True)
realtor.shape

(1070, 73)

In [41]:
realtor.to_csv("realtor_dropped_duplicates.csv", index=False)

In [21]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="CME538_project_email:laura.xu@mail.utoronto.ca")

realtor["location"] = geolocator.geocode(realtor["street address"])

In [None]:
realtor = pd.read_csv("real_estate_listings.csv")
house_sigma = pd.read_csv("housesigma_data_with_coords.csv")


(4000, 11)

In [None]:
house_sigma_realtor = pd.concat([house_sigma, realtor], axis=0, ignore_index=True)


(6976, 82)

In [35]:
zolo = pd.read_csv("zolo_listings_final_all_features.csv")


In [36]:
zolo.shape

(2176, 241)

In [37]:
zolo["price"] = zolo["price"].str.replace("$", "").str.replace(",", "")

In [38]:
zolo = zolo[zolo["price"].str.isdigit() & zolo["price"].notna()]

In [39]:
zolo.shape

(1784, 241)

In [40]:
zolo

Unnamed: 0,price,address,rooms,room dimensions,room_properties,Status,Type,Style,Size (sq ft),Area,...,Percent Building,Soil Test,Area Influence,Com_cn_fee,Ceil Height (ft),Ceiling Height,Crane,Industrial Area,Industrial Area Units,Central Vac
1,788888,415 - 23 Glebe Road W,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['3.05 x 3.17', '3.47 x 3.93', '3.47 x 3.93', ...","['Laminate, Open Concept, W/O To Balcony', 'La...",Sale,Condo Apt,Apartment,600-699,Toronto,...,,,,,,,,,,
2,649900,15 Lower Jarvis Street,"['4pc Bathroom', 'Primary Bedroom', 'Den', 'Fo...","['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']","['Measurements not available', ""7'10'' x 13'5'...",,,,,,...,,,,,,,,,,
4,349900,212 - 234 Albion Road,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['4.10 x 6.55', '4.10 x 6.55', '2.48 x 5.21', ...","['Combined W/Dining, Broadloom, W/O To Balcony...",Sale,Condo Apt,Apartment,1000-1199,Toronto,...,,,,,,,,,,
5,575000,1408 - 23 Sheppard Avenue E,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['3.20 x 4.60', '3.20 x 4.60', '2.50 x 3.35', ...","['Combined W/Dining, Laminate, W/O To Balcony'...",Sale,Condo Apt,Apartment,500-599,Toronto,...,,,,,,,,,,
7,429000,1109 - 82 Dalhousie Street,"['Living', 'Dining', 'Kitchen']","['3.20 x 4.65', '3.20 x 4.65', '3.20 x 4.65']","['Combined W/Kitchen, Open Concept', 'Laminate...",Sale,Condo Apt,Apartment,0-499,Toronto,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170,5500000,38 Salonica Road,[],[],[],Sale,Detached,2-Storey,,Toronto,...,,,,,,,,,,
2171,5500000,38 Salonica Road,[],[],[],Sale,Detached,2-Storey,,Toronto,...,,,,,,,,,,
2172,1719000,85 Kenilworth Avenue,[],[],[],Sale,Detached,2-Storey,,Toronto,...,,,,,,,,,,
2173,5500000,1 Country Lane,"['Kitchen', 'Dining', 'Living', 'Br', '2nd Br']","['N/A', 'N/A', 'N/A', 'N/A', 'N/A']","['', '', '', '', '']",Sale,Detached,Bungaloft,,Toronto,...,,,,,,,,,,


In [None]:
zolo = zolo[
    (zolo["room dimensions"].apply(lambda x: isinstance(x, list) and 'N/A' not in x)) &  # Ensure 'N/A' is not in lists
    (zolo["room dimensions"].apply(lambda x: isinstance(x, str) and x != 'N/A')) |  # Ensure 'N/A' is not in string values
    (zolo["Size (sq ft)"].notna()) |  # Keep rows where Size (sq ft) is not NaN
    (zolo["room dimensions"] != "[]")  # Ensure room dimensions is not '[]'
]
zolo = zolo[~zolo["room dimensions"].apply(lambda x: isinstance(x, list) and all(val == 'N/A' for val in x))]


# Convert the 'price' column to float
zolo["price"] = zolo["price"].astype(float)

TypeError: 'Series' object is not callable

In [63]:
zolo.shape

(1634, 241)

In [61]:
zolo

Unnamed: 0,price,address,rooms,room dimensions,room_properties,Status,Type,Style,Size (sq ft),Area,...,Percent Building,Soil Test,Area Influence,Com_cn_fee,Ceil Height (ft),Ceiling Height,Crane,Industrial Area,Industrial Area Units,Central Vac
1,788888.0,415 - 23 Glebe Road W,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['3.05 x 3.17', '3.47 x 3.93', '3.47 x 3.93', ...","['Laminate, Open Concept, W/O To Balcony', 'La...",Sale,Condo Apt,Apartment,600-699,Toronto,...,,,,,,,,,,
2,649900.0,15 Lower Jarvis Street,"['4pc Bathroom', 'Primary Bedroom', 'Den', 'Fo...","['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']","['Measurements not available', ""7'10'' x 13'5'...",,,,,,...,,,,,,,,,,
4,349900.0,212 - 234 Albion Road,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['4.10 x 6.55', '4.10 x 6.55', '2.48 x 5.21', ...","['Combined W/Dining, Broadloom, W/O To Balcony...",Sale,Condo Apt,Apartment,1000-1199,Toronto,...,,,,,,,,,,
5,575000.0,1408 - 23 Sheppard Avenue E,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['3.20 x 4.60', '3.20 x 4.60', '2.50 x 3.35', ...","['Combined W/Dining, Laminate, W/O To Balcony'...",Sale,Condo Apt,Apartment,500-599,Toronto,...,,,,,,,,,,
7,429000.0,1109 - 82 Dalhousie Street,"['Living', 'Dining', 'Kitchen']","['3.20 x 4.65', '3.20 x 4.65', '3.20 x 4.65']","['Combined W/Kitchen, Open Concept', 'Laminate...",Sale,Condo Apt,Apartment,0-499,Toronto,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167,4500000.0,47 Baby Point Crescent,"['Living', 'Dining', 'Kitchen', 'Family', '4th...","['4.27 x 7.29', '3.58 x 5.92', '2.74 x 5.92', ...","['W/O To Balcony, Fireplace, Parquet Floor', '...",Sale,Detached,Sidesplit 5,,Toronto,...,,,,,,,,,,
2168,1920000.0,180 Cocksfield Avenue,"['Living', 'Dining', 'Kitchen', 'Breakfast', '...","['3.35 x 6.70', '3.15 x 3.31', '3.48 x 4.70', ...","['Hardwood Floor, Window, Open Concept', 'Hard...",Sale,Detached,Bungalow,,Toronto,...,,,,,,,,,,
2169,22500000.0,45 Bayview Ridge Ridge,"['Dining', 'Library', 'Great Rm', 'Sitting', '...","['5.48 x 7.40', '7.28 x 7.34', '7.34 x 7.65', ...","['Hardwood Floor, Pot Lights, Fireplace', 'Har...",Sale,Detached,2-Storey,,Toronto,...,,,,,,,,,,
2173,5500000.0,1 Country Lane,"['Kitchen', 'Dining', 'Living', 'Br', '2nd Br']","['N/A', 'N/A', 'N/A', 'N/A', 'N/A']","['', '', '', '', '']",Sale,Detached,Bungaloft,,Toronto,...,,,,,,,,,,


In [None]:
import numpy as np 


def get_sqft(df):
    """For Zolo dfs. 
    If Size (sq ft) is NaN, then replaces with the sqft calculated from the room dimensions.
    If Size (sq ft) is a range (i.e., contains "-"), replaces with the average of the bounds."""
    
    # Sqft entry is a range:
    if row.__contains__("-"):
        lower, upper = row.split("-")
        row = 1/2* (lower + upper)
        
    elif row == np.nan:
        dimensions = df["room dimensions"].split(",")
        sqft = []
        for dimension in dimensions:
            length, width = dimensions.split("x")
            sqft.append(float(length.strip()) * float(width.strip()))
        row = sum(sqft)
        
    else:
        row
        
    return row

zolo["Size (sq ft)"] = get_sqft(zolo)

In [48]:
zolo

Unnamed: 0,price,address,rooms,room dimensions,room_properties,Status,Type,Style,Size (sq ft),Area,...,Percent Building,Soil Test,Area Influence,Com_cn_fee,Ceil Height (ft),Ceiling Height,Crane,Industrial Area,Industrial Area Units,Central Vac
1,788888.0,415 - 23 Glebe Road W,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['3.05 x 3.17', '3.47 x 3.93', '3.47 x 3.93', ...","['Laminate, Open Concept, W/O To Balcony', 'La...",Sale,Condo Apt,Apartment,600-699,Toronto,...,,,,,,,,,,
2,649900.0,15 Lower Jarvis Street,"['4pc Bathroom', 'Primary Bedroom', 'Den', 'Fo...","['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A']","['Measurements not available', ""7'10'' x 13'5'...",,,,,,...,,,,,,,,,,
4,349900.0,212 - 234 Albion Road,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['4.10 x 6.55', '4.10 x 6.55', '2.48 x 5.21', ...","['Combined W/Dining, Broadloom, W/O To Balcony...",Sale,Condo Apt,Apartment,1000-1199,Toronto,...,,,,,,,,,,
5,575000.0,1408 - 23 Sheppard Avenue E,"['Living', 'Dining', 'Kitchen', 'Prim Bdrm', '...","['3.20 x 4.60', '3.20 x 4.60', '2.50 x 3.35', ...","['Combined W/Dining, Laminate, W/O To Balcony'...",Sale,Condo Apt,Apartment,500-599,Toronto,...,,,,,,,,,,
7,429000.0,1109 - 82 Dalhousie Street,"['Living', 'Dining', 'Kitchen']","['3.20 x 4.65', '3.20 x 4.65', '3.20 x 4.65']","['Combined W/Kitchen, Open Concept', 'Laminate...",Sale,Condo Apt,Apartment,0-499,Toronto,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167,4500000.0,47 Baby Point Crescent,"['Living', 'Dining', 'Kitchen', 'Family', '4th...","['4.27 x 7.29', '3.58 x 5.92', '2.74 x 5.92', ...","['W/O To Balcony, Fireplace, Parquet Floor', '...",Sale,Detached,Sidesplit 5,,Toronto,...,,,,,,,,,,
2168,1920000.0,180 Cocksfield Avenue,"['Living', 'Dining', 'Kitchen', 'Breakfast', '...","['3.35 x 6.70', '3.15 x 3.31', '3.48 x 4.70', ...","['Hardwood Floor, Window, Open Concept', 'Hard...",Sale,Detached,Bungalow,,Toronto,...,,,,,,,,,,
2169,22500000.0,45 Bayview Ridge Ridge,"['Dining', 'Library', 'Great Rm', 'Sitting', '...","['5.48 x 7.40', '7.28 x 7.34', '7.34 x 7.65', ...","['Hardwood Floor, Pot Lights, Fireplace', 'Har...",Sale,Detached,2-Storey,,Toronto,...,,,,,,,,,,
2173,5500000.0,1 Country Lane,"['Kitchen', 'Dining', 'Living', 'Br', '2nd Br']","['N/A', 'N/A', 'N/A', 'N/A', 'N/A']","['', '', '', '', '']",Sale,Detached,Bungaloft,,Toronto,...,,,,,,,,,,


In [None]:
import numpy as np
import pandas as pd

def get_sqft(row, room_dimension_column):
    """For Zolo dfs. 
    If Size (sq ft) is NaN, then replaces with the sqft calculated from the room dimensions.
    If Size (sq ft) is a range (i.e., contains "-"), replaces with the average of the bounds."""
    
    # If 'Size (sq ft)' is a range (contains "-"):
    if isinstance(row, str) and "-" in row:
        lower, upper = row.split("-")
        lower, upper = float(lower), float(upper)
        return (lower + upper) / 2  # Return the average of the bounds
    
    # If 'Size (sq ft)' is NaN, use room dimensions to calculate the sqft
    elif pd.isna(row):
        # Assuming 'room dimensions' is a string like "10x15, 12x14"
        dimensions = row[room_dimension_column].split(",")
        sqft = []
        for dimension in dimensions:
            length, width = dimension.split("x")
            length = float(length.strip())
            width = float(width.strip())
            sqft.append(length * width)
        return sum(sqft)  # Return the total area calculated
    
    else:
        # If 'Size (sq ft)' is already numeric, return it as is
        return row

# Apply the function to the 'Size (sq ft)' column
zolo["Size (sq ft)"] = zolo.apply(lambda row: get_sqft(row, "room dimensions"), axis=1)