In [None]:
import pandas as pd
import os
import numpy as np
import ast

Drop duplicates (by address), calculate the sqft if the listing has a range for the sqft or room dimensions, and drop rows with missing values for either price or sqft

In [None]:
zolo = pd.read_csv("zolo_listings_final_all_features.csv") # Old (missing more values)

In [49]:
zolo = pd.read_csv("new_zolo_listings.csv")

In [50]:
zolo.shape

(2188, 238)

In [51]:
zolo["price"] = zolo["price"].str.replace("$", "").str.replace(",", "")

In [52]:
zolo = zolo[zolo["price"].str.isdigit() & zolo["price"].notna()]

In [53]:
zolo.shape

(1972, 238)

In [54]:
zolo.drop_duplicates(subset=['address'], inplace=True)
zolo.shape

(1952, 238)

In [57]:
# Filter out listings with no way to find the sqft.

import ast

# Extract first room dimension (length) if the room dimensions are in a list format
zolo["sample_split_dimension"] = zolo["room dimensions"].apply(
    lambda x: ast.literal_eval(x)[0].split("x")[0] if isinstance(ast.literal_eval(x), list) and len(ast.literal_eval(x)) > 0 else None
)

# Convert the first dimension to float if it's not "N/A" or None, else set to None
zolo["sample_split_dimension"] = zolo["sample_split_dimension"].apply(
    lambda x: float(x) if x != "N/A" and x is not None else None
)

# Now filter rows where `sample_split_dimension` is a valid float or `Size (sq ft)` is not NaN
zolo = zolo[
    zolo["sample_split_dimension"].notna() |  # Keep rows where the dimension is a valid float (not NaN)
    zolo["Size (sq ft)"].notna()  # Keep rows where "Size (sq ft)" is not NaN
]

In [58]:
zolo.shape

(1775, 239)

In [59]:
zolo[zolo["address"] == "136 Anthony Road"]

Unnamed: 0,price,address,rooms,room dimensions,room_properties,Status,Type,Style,Size (sq ft),Age,...,Percent Building,Soil Test,Com_cn_fee,Ceiling Height,Crane,Industrial Area,Industrial Area Units,Llbo,Central Vac,sample_split_dimension


In [60]:
def get_sqft(row, room_dimensions=None):
    """For Zolo dfs. 
    If Size (sq ft) is NaN, then replaces with the sqft calculated from the room dimensions.
    If Size (sq ft) is a range (i.e., contains "-"), replaces with the average of the bounds."""
    
    # If the 'Size (sq ft)' is a range (contains "-"):
    if isinstance(row, str) and '-' in row:
        lower, upper = row.split("-")
        # Calculate the average of the two bounds
        return (float(lower) + float(upper)) / 2
    
    # If 'Size (sq ft)' is NaN, calculate from room dimensions
    elif pd.isna(row) and room_dimensions:  
        try:
            # Check if 'room_dimensions' is a list-like or string that we can process
            dimensions = ast.literal_eval(room_dimensions)
            square_meters = 0

            # If room dimensions is a list of room sizes (i.e., '200x300', '100x200', etc.)
            if isinstance(dimensions, list):
                for dimension in dimensions:
                    # Only split if 'dimension' contains 'x'
                    length, width = dimension.split("x")
                    square_meters += float(length.strip()) * float(width.strip())
                    
            return square_meters*10.7639
        
        except (ValueError, SyntaxError, TypeError):
            # Handle invalid or malformed room dimensions
            return np.nan

    elif isinstance(row, str):
        # If it's a string that doesn't contain a "-", remove any "+" and return the value as float
        return float(row.replace("+", "").replace(">", "").replace("<", "").strip())
    
    else:
        # If it's a numeric value (already a number), return it
        return row

# Apply the function to the 'Size (sq ft)' column, and pass the 'room dimensions' when needed
zolo["Calculated_sqft"] = zolo.apply(lambda row: get_sqft(row["Size (sq ft)"], row.get('room dimensions', None)), axis=1)


In [61]:
zolo["Calculated_sqft"] = pd.to_numeric(zolo["Calculated_sqft"], errors='coerce')
zolo = zolo[zolo["Calculated_sqft"].notna()]
zolo["Calculated_sqft"].isna().sum()


np.int64(0)

In [62]:
# Make sure all sqft values are floats
zolo["Calculated_sqft"] = zolo["Calculated_sqft"].astype(float)

In [63]:
zolo.shape

(1685, 240)

In [65]:
zolo["address"].isna().sum()

np.int64(0)

In [66]:
zolo.to_csv("cleaned_zolo_from_newest_scraping.csv", index=False)