In [1]:
import pandas as pd
import os
import numpy as np
import ast

In [2]:
# Load the listings CSV into `realtor` and display its (rows, columns) shape.
realtor = pd.read_csv("real_estate_listings.csv")
realtor.shape

(2976, 72)

In [3]:
# Distinct values in the "mls" column: 1071.
# Note: Series.unique() counts NaN as a value, and some rows have a missing
# MLS ID, so the number of distinct non-null MLS IDs is one less than this.

realtor["mls"].unique().shape[0]

1071

In [4]:
import re

def get_street_address(address):
    """Extract a lowercase street address from a raw listing address string.

    In the sample rows the raw value looks like
    ``"<unit> - <number> <street>Toronto (<neighbourhood>), Ontario ..."``,
    where the ``<unit> - `` prefix is optional.

    Args:
        address: Raw address value. Anything that is not a ``str``
            (e.g. ``None`` or NaN) is treated as missing.

    Returns:
        The street portion, stripped and lowercased, or ``''`` when
        ``address`` is not a string.
    """
    if not isinstance(address, str):
        return ''

    # Remove the "(neighbourhood), province, postal code" suffix and the city
    # name BEFORE looking for a unit separator. Neighbourhood names can contain
    # hyphens and postal codes contain digits, so doing the dash split first
    # can mistake the suffix for the "<number> <street>" part.
    street = address.strip().split("(")[0]
    street = street.split("Toronto")[0].strip()

    if "-" in street:
        parts = street.split("-")
        # Digits after the dash -> "<unit> - <number> <street>": keep the part
        # after the dash. Otherwise keep what precedes the dash.
        street = parts[1] if re.search(r'\d', parts[1]) else parts[0]

    return street.strip().lower()

# Derive the "street address" column from the raw "address" column;
# missing addresses are replaced with '' before parsing.
raw_addresses = realtor["address"].fillna('')
realtor["street address"] = raw_addresses.apply(get_street_address)

realtor

Unnamed: 0,price,address,mls,office_name,office_type,office_address,Property Type,Building Type,Storeys,Square Footage,...,Construction Status,Construction Material,Waterfront Name,Other Style,Fireplace Fuel,Utility-Hydro,Other,Age Of Building,Business Type,street address
0,"$899,800",50 - 719 LAWRENCE AVENUE WToronto (Yorkdale-Gl...,W10678586,RE/MAX ATRIUM HOME REALTY,Brokerage,"7100 WARDEN AVE #1AMARKHAM, Ontario L3R8B5",Single Family,Row / Townhouse,3.0,1000 - 1199 sqft,...,,,,,,,,,,719 lawrence avenue w
1,"$888,000",1515 - 8 HILLCREST AVENUEToronto (Willowdale E...,C10650046,HOMELIFE NEW WORLD REALTY INC.,Brokerage,"201 CONSUMERS RD., STE. 205TORONTO, Ontario M2...",Single Family,Apartment,,1000 - 1199 sqft,...,,,,,,,,,,8 hillcrest avenue
2,"$639,000",1502 - 10 YORK STREETToronto (Waterfront Commu...,C10640404,RE/MAX REALTRON TNS REALTY INC.,Brokerage,"7800 WOODBINE AVE PHMARKHAM, Ontario L3R2N7",Single Family,Apartment,,500 - 599 sqft,...,,,,,,,,,,10 york street
3,"$1,139,000",185 RICHARD CLARK DRIVEToronto (Downsview-Rodi...,W10562062,IPRO REALTY LTD.,Brokerage,"3079B DUNDAS ST WESTTORONTO, Ontario M6P1Z9",Single Family,House,1.0,,...,,,,,,,,,,185 richard clark drive
4,"$2,150,000","112 ANNDALE DRIVEToronto (Willowdale East), On...",C10564186,HOMELIFE GOLD PACIFIC REALTY INC.,Brokerage,"3601 VICTORIA PARK AVE #401TORONTO, Ontario M1...",Single Family,House,1.5,,...,,,,,,,,,,112 anndale drive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2971,"$745,000",PH2 - 77 MAITLAND PLACEToronto (Cabbagetown-So...,C10430862,RE/MAX REALTRON BARRY COHEN HOMES INC.,Brokerage,"309 YORK MILLS RO UNIT 7TORONTO, Ontario M2L1L3",Single Family,Apartment,,900 - 999 sqft,...,,,,,,,,,,77 maitland place
2972,"$745,000",1909 - 77 HARBOUR SQUAREToronto (Waterfront Co...,C10430973,ROYAL LEPAGE YOUR COMMUNITY REALTY,Brokerage,"187 KING STREET EASTTORONTO, Ontario M5A1J5",Single Family,Apartment,,700 - 799 sqft,...,,,,,,,,,,77 harbour square
2973,"$744,000",2101 - 50 LOMBARD STREETToronto (Church-Yonge ...,C10433752,HARVEY KALLES REAL ESTATE LTD.,Brokerage,"2145 AVENUE ROADTORONTO, Ontario M5M4B2",Single Family,Apartment,,1000 - 1199 sqft,...,,,,,,,,,,50 lombard street
2974,"$740,000",LPH 6 - 1 AVONDALE AVENUEToronto (Willowdale E...,C10440933,RE/MAX EXCEL REALTY LTD.,Brokerage,"120 WEST BEAVER CREEK RD #23RICHMOND HILL, Ont...",Single Family,,,800 - 899 sqft,...,,,,,,,,,,1 avondale avenue


In [5]:
# Show the rows that have no MLS ID.
realtor.loc[realtor["mls"].isna()]

Unnamed: 0,price,address,mls,office_name,office_type,office_address,Property Type,Building Type,Storeys,Square Footage,...,Construction Status,Construction Material,Waterfront Name,Other Style,Fireplace Fuel,Utility-Hydro,Other,Age Of Building,Business Type,street address
145,,,,,,,,,,,...,,,,,,,,,,
146,,,,,,,,,,,...,,,,,,,,,,
147,,,,,,,,,,,...,,,,,,,,,,
148,,,,,,,,,,,...,,,,,,,,,,
149,,,,,,,,,,,...,,,,,,,,,,
150,,,,,,,,,,,...,,,,,,,,,,
151,,,,,,,,,,,...,,,,,,,,,,
152,,,,,,,,,,,...,,,,,,,,,,
153,,,,,,,,,,,...,,,,,,,,,,
154,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Keep only the rows that have an MLS ID, then show the new shape.
realtor = realtor.dropna(subset=['mls'])
realtor.shape

(2963, 73)

In [7]:
# Keep the first row for each MLS ID, then show the new shape.
realtor = realtor.drop_duplicates(subset=['mls'])
realtor.shape

(1070, 73)

In [8]:
# Write the current `realtor` frame to CSV without the index column.
realtor.to_csv("realtor_dropped_duplicates.csv", index=False)

In [9]:
# Load the two source CSVs.
# NOTE(review): `realtor` is re-read from the original listings CSV here, not
# from "realtor_dropped_duplicates.csv", so this frame again contains the rows
# with a missing MLS ID and repeated MLS IDs, and has no "street address"
# column — confirm this is intended.
realtor = pd.read_csv("real_estate_listings.csv")
house_sigma = pd.read_csv("housesigma_data_with_coords.csv")

In [10]:
# Stack the two frames row-wise (pd.concat's default axis) under a fresh 0..n-1 index.
house_sigma_realtor = pd.concat([house_sigma, realtor], ignore_index=True)