# 03 — Feature engineering and distances

In [13]:
# Paths & setup
import pandas as pd
import numpy as np
from pathlib import Path
from math import radians, sin, cos, sqrt, atan2
# Locate the project root. `__file__` is only defined when this code runs as a
# script; when it is undefined the NameError branch falls back to the working
# directory (or its parent if the working directory has no `data/` folder).
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
except NameError:
    cwd = Path.cwd()
    if (cwd / 'data').exists():
        PROJECT_ROOT = cwd
    else:
        PROJECT_ROOT = cwd.parent

# Standard data layout: data/{raw,interim,processed}
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    DATA_DIR / sub for sub in ('raw', 'interim', 'processed')
)

# Listings with coordinates; this is the frame every later cell extends.
df = pd.read_csv(INTERIM_DIR / "rentals_with_coords.csv")

# Make sure every data directory exists before later cells write to them.
for d in (RAW_DIR, INTERIM_DIR, PROCESSED_DIR):
    d.mkdir(parents=True, exist_ok=True)

print('Project root:', PROJECT_ROOT)

Project root: c:\Users\zhant\toronto-rentals


## 1) Load TTC stations (no paid geocoding)

In [14]:
# Load TTC subway station coordinates from a local CSV (columns: name, lat, lon).
stations_path = RAW_DIR / 'toronto_subway_stations.csv'
if stations_path.exists():
    stations = pd.read_csv(stations_path)
    # Normalise headers so variants such as ' Name ' or 'LAT' are accepted.
    stations.columns = [c.strip().lower() for c in stations.columns]
    # Name the file that was actually read in the failure message.
    assert {'name','lat','lon'} <= set(stations.columns), (
        f'{stations_path.name} must have columns: name, lat, lon'
    )
    stations = stations[['name','lat','lon']].copy()
    # Coerce coordinates BEFORE dropping missing rows: a non-numeric string is
    # not NaN yet, so it would otherwise survive dropna() and only turn into
    # NaN later, when distances are computed.
    stations[['lat','lon']] = stations[['lat','lon']].apply(pd.to_numeric, errors='coerce')
    stations = stations.dropna()
    print(f'TTC stations loaded: {len(stations)}')
else:
    print('⚠️ TTC stations file not found:', stations_path)
    print('Create a CSV with columns: name,lat,lon and put it here.')
    # Empty frame with the expected schema so downstream cells can still run.
    stations = pd.DataFrame(columns=['name','lat','lon'])

TTC stations loaded: 68


## 2) Haversine distance helper (km)

In [15]:


def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in kilometres. NaN if any input is NaN.
    """
    R = 6371.0  # sphere radius in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) raise ValueError.
    # Plain comparisons are used instead of min()/max() so that a NaN `a`
    # falls through unchanged and still yields a NaN distance.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    return 2 * R * atan2(sqrt(a), sqrt(1 - a))

## 3) Compute nearest TTC station per listing

In [16]:
# Ensure numeric coordinates on both frames (bad values become NaN).
for c in ['lat','lon']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
    stations[c] = pd.to_numeric(stations[c], errors='coerce')

# ndarray.argmin returns the position of the first NaN, so a station with a
# missing coordinate would be reported as "nearest" for every listing.
# Only stations with a complete coordinate pair take part in the search.
usable_stations = stations.dropna(subset=['lat','lon'])

if len(usable_stations) > 0:
    apts_coords = df[['lat','lon']].to_numpy(dtype=float)
    stat_coords = usable_stations[['lat','lon']].to_numpy(dtype=float)
    stat_names = usable_stations['name'].to_numpy()

    # Pre-fill with NaN: listings without usable coordinates keep NaN for both
    # features instead of being labelled with an arbitrary station name.
    nearest_name = np.full(len(df), np.nan, dtype=object)
    nearest_dist = np.full(len(df), np.nan)

    # Positional indices of listings that have a finite lat/lon pair.
    located = np.flatnonzero(np.isfinite(apts_coords).all(axis=1))

    # Work in blocks so the listings-by-stations distance matrix stays small.
    block = 500
    for i in range(0, len(located), block):
        rows = located[i:i+block]
        block_coords = apts_coords[rows]
        dists_block = np.empty((len(rows), len(stat_coords)))
        for j, (slat, slon) in enumerate(stat_coords):
            dists_block[:, j] = [haversine_km(alat, alon, slat, slon) for alat, alon in block_coords]
        idx = dists_block.argmin(axis=1)
        nearest_name[rows] = stat_names[idx]
        nearest_dist[rows] = dists_block[np.arange(len(rows)), idx]

    df['nearest_station'] = nearest_name
    df['dist_km_to_station'] = nearest_dist
else:
    # No usable stations: keep the output schema stable with empty features.
    df['nearest_station'] = np.nan
    df['dist_km_to_station'] = np.nan

print('Added nearest TTC station features.')

Added nearest TTC station features.


## 4) Basic engineered features

In [17]:
# Convert the raw listing attributes to numeric (bad values become NaN).
for c in ['price','sqft','bedrooms','bathrooms']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Ratio features as (new column, numerator, denominator).
# Every source column is optional in the coercion loop above, so each ratio is
# guarded the same way instead of raising KeyError on a missing column.
ratio_features = [
    ('price_per_sqft', 'price', 'sqft'),         # price per sqft
    ('sqft_per_bed', 'sqft', 'bedrooms'),        # sqft per bedroom
    ('bed_bath_ratio', 'bedrooms', 'bathrooms'), # bed/bath ratio
]
for feature, numerator, denominator in ratio_features:
    if {numerator, denominator} <= set(df.columns):
        # A ratio is only defined for a strictly positive denominator;
        # zero, negative or missing denominators give NaN.
        df[feature] = np.where(df[denominator].gt(0), df[numerator] / df[denominator], np.nan)
    else:
        print(f'Skipping {feature}: needs columns {numerator!r} and {denominator!r}.')

# simple room count feature (optional); a missing count is treated as 0
if {'bedrooms','bathrooms'} <= set(df.columns):
    df['rooms_total'] = df['bedrooms'].fillna(0) + df['bathrooms'].fillna(0)

print('Engineered feature columns added.')

Engineered feature columns added.


## 5) Nearest university and distance to Union Station

In [20]:
# Optional: if data/raw/universities.csv exists with columns [name,lat,lon],
# add the nearest university and its distance for every listing.
uni_path = RAW_DIR / 'universities.csv'
if uni_path.exists():
    universities = pd.read_csv(uni_path)
    universities.columns = [c.strip().lower() for c in universities.columns]
    assert {'name','lat','lon'} <= set(universities.columns), (
        f'{uni_path.name} must have columns: name, lat, lon'
    )
    universities = universities[['name','lat','lon']].copy()
    # Coerce before dropping: ndarray.argmin returns the position of the first
    # NaN, so a campus with an unparsable coordinate would otherwise be
    # reported as "nearest" for every listing.
    universities[['lat','lon']] = universities[['lat','lon']].apply(pd.to_numeric, errors='coerce')
    universities = universities.dropna()

    if len(universities) > 0:
        # Numeric copy of the listing coordinates (df itself is not modified).
        apts = df[['lat','lon']].apply(pd.to_numeric, errors='coerce').to_numpy(dtype=float)
        unis = universities[['lat','lon']].to_numpy(dtype=float)
        uni_names = universities['name'].to_numpy()

        # Pre-fill with NaN: listings without usable coordinates keep NaN
        # instead of being labelled with an arbitrary university.
        nearest_uni = np.full(len(df), np.nan, dtype=object)
        nearest_uni_dist = np.full(len(df), np.nan)

        # Positional indices of listings that have a finite lat/lon pair.
        located = np.flatnonzero(np.isfinite(apts).all(axis=1))

        # Work in blocks of 500 listings to bound the distance matrix size.
        for i in range(0, len(located), 500):
            rows = located[i:i+500]
            blk = apts[rows]
            dblk = np.empty((len(blk), len(unis)))
            for j, (ul, un) in enumerate(unis):
                dblk[:, j] = [haversine_km(al, an, ul, un) for al, an in blk]
            idx = dblk.argmin(axis=1)
            nearest_uni[rows] = uni_names[idx]
            nearest_uni_dist[rows] = dblk[np.arange(len(blk)), idx]

        df['nearest_university'] = nearest_uni
        df['dist_km_to_university'] = nearest_uni_dist
        print('Added university distance features.')
    else:
        # argmin over zero universities would raise; skip like a missing file.
        print('universities.csv has no usable rows — skipping university features.')
else:
    print('universities.csv not found — skipping university features.')

Added university distance features.


In [24]:
# Reference point used for the distance-to-Union-Station feature.
union_station_lat, union_station_lon = 43.640497438, -79.3749985


def _km_to_union(row):
    """Distance in km from one listing row to the reference point (NaN if lat/lon is missing)."""
    if pd.isna(row["lat"]) or pd.isna(row["lon"]):
        return np.nan
    return haversine_km(row["lat"], row["lon"], union_station_lat, union_station_lon)


df["distance_to_union"] = df.apply(_km_to_union, axis=1)

## 6) Save processed dataset

In [25]:
# Write the feature table to data/processed without the index column.
outp = PROCESSED_DIR.joinpath('rentals_features.csv')
df.to_csv(outp, index=False)
print('Saved:', outp.resolve())

# Final expression of the cell: the notebook renders the first ten rows.
df.head(10)

Saved: C:\Users\zhant\toronto-rentals\data\processed\rentals_features.csv


Unnamed: 0,title,price,sqft,bedrooms,bathrooms,address,neighbourhood,url,source,url_canon,...,formatted_address,nearest_station,dist_km_to_station,price_per_sqft,sqft_per_bed,bed_bath_ratio,rooms_total,nearest_university,dist_km_to_university,distance_to_union
0,,2995.0,750.0,1.5,1.0,"35 Mariner Terr - Toronto, ON",,,,,...,"35 Mariner Terrace, Toronto, ON M5V 3V9, Canada",St. Andrew,1.006189,3.993333,500.0,1.5,2.5,OCAD University,1.597374,1.377788
1,,2800.0,600.0,1.0,1.0,"224 King Street West - Toronto, ON",,,,,...,"2906-224 King St W, Toronto, ON M5H 0A6, Canada",St. Andrew,0.166247,4.666667,600.0,1.0,2.0,OCAD University,0.862257,1.263594
2,,2095.0,500.0,1.0,1.0,"11 Brunel Court - Toronto, ON",,,,,...,"11 Brunel Ct, Toronto, ON M5V 3Y3, Canada",St. Andrew,1.105247,4.19,500.0,1.0,2.0,OCAD University,1.649729,1.479241
3,,4200.0,1200.0,2.0,2.0,"85 Queens Wharf Road - Toronto, ON",,,,,...,"85 Queens Wharf Rd, Toronto, ON M5V 0J9, Canada",St. Andrew,1.441306,3.5,600.0,1.0,4.0,OCAD University,1.764275,1.924263
4,,2300.0,600.0,1.0,1.0,"81 Navy Wharf Court - Toronto, ON",,,,,...,"81 Navy Wharf Ct, Toronto, ON M5V 3M5, Canada",St. Andrew,0.849278,3.833333,600.0,1.0,2.0,OCAD University,1.447524,1.32425
5,,2925.0,750.0,2.0,1.0,"333 Adelaide Street East - Toronto, ON",,,,,...,"333A Adelaide St E, Toronto, ON M5A 1N2, Canada",King,0.75408,3.9,375.0,2.0,3.0,George Brown College - Waterfront,0.216377,1.347375
6,,3250.0,799.0,2.0,2.0,"488 University Avenue - Toronto, ON",,,,,...,"488 University Ave, Toronto, ON M5G 0C1, Canada",St. Patrick,0.079595,4.067584,399.5,1.0,4.0,OCAD University,0.321941,1.975229
7,,2400.0,599.0,1.0,1.0,"832 Bay Street - Toronto, ON",,,,,...,"Hamghlin Building, 832 Bay St., Toronto, ON M5...",College,0.294904,4.006678,599.0,1.0,2.0,University of Toronto - St. George,0.738075,2.57722
8,,2250.0,525.0,1.0,1.0,"77 & 99 Gerrard Street West - Toronto, ON",,,,,...,"Gerrard St W, Toronto, ON, Canada",College,0.394676,4.285714,525.0,1.0,2.0,Toronto Metropolitan University (TMU),0.55668,2.159392
9,,3099.0,820.0,1.0,1.0,"131 Bloor Street West - Toronto, ON",,,,,...,"131 Bloor St W, Toronto, ON M5S 3L7, Canada",Bay,0.160304,3.779268,820.0,1.0,2.0,University of Toronto - St. George,0.709494,3.441768
