# 02 — Geocoding

In [None]:
# Imports & paths
import os, json, time, hashlib, re
import pandas as pd
import requests
from dotenv import load_dotenv
from pathlib import Path

def find_project_root(start: Path, marker_dir: str = "data") -> Path:
    """Find the closest directory at or above ``start`` that contains ``marker_dir``.

    Args:
        start: A file or directory to begin the search from. A file is
            replaced by its parent directory before searching.
        marker_dir: Name of the sub-directory whose presence identifies the
            project root.

    Returns:
        The first directory, walking upwards from ``start``, that has a
        ``marker_dir`` sub-directory. The filesystem root itself is not
        examined. If nothing matches, the (resolved) starting *directory* is
        returned, so the result is always usable as a directory.
    """
    origin = start.resolve()
    # A script passes its own path; the search must begin in its folder.
    if origin.is_file():
        origin = origin.parent

    cur = origin
    while cur != cur.parent:
        # is_dir() rather than exists(): a plain file that happens to share
        # the marker's name must not be mistaken for the project root.
        if (cur / marker_dir).is_dir():
            return cur
        cur = cur.parent

    # Fallback: the starting directory (never the file itself, otherwise
    # callers that build `root / "data"` would try to create a folder
    # underneath a regular file).
    return origin

# Pick the search origin: the script's own path when __file__ is defined,
# otherwise (e.g. inside Jupyter) the current working directory.
_start = Path(globals()["__file__"]) if "__file__" in globals() else Path.cwd()

# All data folders hang off <project root>/data
PROJECT_ROOT = find_project_root(_start, marker_dir="data")
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
INTERIM_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Create any missing data folders (no-op when they already exist)
for d in (RAW_DIR, INTERIM_DIR, PROCESSED_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Interim dir:", INTERIM_DIR)

In [None]:
# Read the Google Maps API key from the project's .env file
import os
from dotenv import load_dotenv

# Populate the process environment from <project root>/.env
load_dotenv(dotenv_path=PROJECT_ROOT / '.env')

# Stop early when the key is missing or blank
API_KEY = os.getenv('GOOGLE_MAPS_API_KEY')
assert API_KEY and API_KEY.strip(), 'GOOGLE_MAPS_API_KEY not set in .env'

## 1) Load cleaned listings

In [None]:
# Input: the cleaned listings CSV in the interim data folder
clean_path = INTERIM_DIR.joinpath('clean_listings.csv')
assert clean_path.exists(), f'File not found: {clean_path}'

# Load it and show the dimensions plus the first three rows as a sanity check
df = pd.read_csv(clean_path)
print(df.shape)
df.head(3)

In [None]:
# Folder holding cached geocoding responses; created on first use
CACHE_DIR = INTERIM_DIR.joinpath('geocode_cache')
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Google Geocoding API endpoint (JSON output)
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

def normalize_address(addr: str) -> str:
    """Return ``addr`` trimmed, with every run of whitespace squeezed to one space.

    Missing values (whatever ``pd.isna`` reports as missing) yield ``''``.
    Any other input is converted with ``str()`` before cleaning.
    """
    if not pd.isna(addr):
        trimmed = str(addr).strip()
        return re.sub(r'\s+', ' ', trimmed)
    return ''

def cache_key(addr: str) -> Path:
    """Return the cache file path for ``addr``.

    The file name is the SHA-1 hex digest of the lower-cased address, so the
    lookup is case-insensitive; the file lives in ``CACHE_DIR`` with a
    ``.json`` extension.
    """
    lowered = addr.lower()
    digest = hashlib.sha1(lowered.encode()).hexdigest()
    return CACHE_DIR.joinpath(f'{digest}.json')

def geocode(address: str, api_key: str, max_retries: int = 5, base_sleep: float = 0.5):
    """Geocode one address through the Google Geocoding API, with an on-disk cache.

    The address is normalized, padded with ", Toronto" / ", ON" / ", Canada"
    when those are absent, and looked up in the cache before the API is called.

    Args:
        address: Raw address text (may be missing / NaN).
        api_key: Google Maps API key sent with the request.
        max_retries: Maximum number of HTTP attempts for retryable failures.
        base_sleep: Initial back-off in seconds; doubled after every failed
            attempt.

    Returns:
        ``(lat, lon, status, resolved_address)`` on success, otherwise
        ``(None, None, status, None)``. Besides the API's own status strings,
        ``status`` may be ``'EMPTY_ADDRESS'`` (nothing to geocode) or
        ``'RETRY_EXCEEDED'`` (every attempt hit a retryable failure: quota
        limits, server-side errors, network errors or unparsable responses).
    """
    addr = normalize_address(address)
    if not addr:
        return None, None, 'EMPTY_ADDRESS', None

    # Heuristic: ensure city/province for better accuracy
    addr_lower = addr.lower()
    addr_q = addr
    if 'toronto' not in addr_lower:
        addr_q += ', Toronto'
    # Whole-word match: a plain substring test for 'on' is satisfied by
    # "Yonge", "Don Mills", "London", ... and would wrongly skip the province.
    # NOTE: this changes the query (and therefore the cache key) for such
    # addresses, so they are looked up once more instead of reusing old entries.
    if not re.search(r'\b(?:on|ontario)\b', addr_lower):
        addr_q += ', ON'
    addr_q += ', Canada'

    # Cache lookup; an unreadable or incomplete entry is treated as a miss
    cpath = cache_key(addr_q)
    if cpath.exists():
        try:
            cached = json.loads(cpath.read_text(encoding='utf-8'))
        except ValueError:
            cached = None
        if isinstance(cached, dict) and 'lat' in cached and 'lon' in cached:
            return (
                cached['lat'],
                cached['lon'],
                cached.get('status', 'CACHED_OK'),
                cached.get('formatted_address'),
            )

    params = {
        'address': addr_q,
        'key': api_key,
        'components': 'country:CA|administrative_area:ON|locality:Toronto'
    }

    # Statuses worth another attempt: quota throttling and transient
    # server-side failures.
    retryable = {'OVER_QUERY_LIMIT', 'RESOURCE_EXHAUSTED', 'UNKNOWN_ERROR'}

    for attempt in range(max_retries):
        try:
            response = requests.get(GEOCODE_URL, params=params, timeout=20)
            payload = response.json()
        except (requests.RequestException, ValueError):
            # Network failure or a non-JSON body: retry instead of aborting
            # the caller's whole batch.
            payload = None

        if payload is not None:
            status = payload.get('status', '')
            results = payload.get('results')

            if status == 'OK' and results:
                loc = results[0]['geometry']['location']
                fmt = results[0].get('formatted_address')
                cpath.write_text(
                    json.dumps({'lat': loc['lat'], 'lon': loc['lng'], 'status': status, 'formatted_address': fmt}),
                    encoding='utf-8',
                )
                return loc['lat'], loc['lng'], status, fmt

            if status == 'OK':
                # "OK" without any result carries no coordinates; report it as
                # a miss so callers do not count it as a success.
                status = 'ZERO_RESULTS'

            if status not in retryable:
                # Only a genuine "no match" is a property of the address and
                # safe to remember. REQUEST_DENIED, INVALID_REQUEST and the
                # like depend on the key/billing/request, and caching them
                # would keep returning the failure after the cause is fixed.
                if status == 'ZERO_RESULTS':
                    cpath.write_text(
                        json.dumps({'lat': None, 'lon': None, 'status': status, 'formatted_address': None}),
                        encoding='utf-8',
                    )
                return None, None, status, None

        # Exponential back-off; pointless after the final attempt
        if attempt < max_retries - 1:
            time.sleep(base_sleep * (2 ** attempt))

    return None, None, 'RETRY_EXCEEDED', None

In [None]:
# Make sure every output column exists before filling it in
for col in ('lat', 'lon', 'geocode_status', 'formatted_address'):
    if col in df.columns:
        continue
    df[col] = pd.NA

# Rows still lacking a coordinate that also have a non-blank address
mask = (
    (df['lat'].isna() | df['lon'].isna())
    & df['address'].notna()
    & (df['address'].str.strip() != '')
)

pending = df[mask]
print('Rows to geocode:', len(pending))

fail_log = []

for idx, row in pending.iterrows():
    lat, lon, status, fmt = geocode(row['address'], API_KEY)

    # Write the four result fields back onto the main frame
    df.at[idx, 'lat'] = lat
    df.at[idx, 'lon'] = lon
    df.at[idx, 'geocode_status'] = status
    df.at[idx, 'formatted_address'] = fmt

    # Anything other than a success status is recorded for the failure report
    if status != 'OK' and status != 'CACHED_OK':
        fail_log.append({'index': int(idx), 'address': row['address'], 'status': status})

    # polite delay to avoid hitting per-second limits
    time.sleep(0.15)

print('Done. Failures:', len(fail_log))

In [None]:
# Persist the listings together with their geocoding results
out_path = INTERIM_DIR.joinpath('rentals_with_coords.csv')
df.to_csv(out_path, index=False)
print('Saved:', out_path.resolve())

# Write the failure report only when there is something to report
if fail_log:
    fail_path = INTERIM_DIR.joinpath('geocode_failures.csv')
    pd.DataFrame(fail_log).to_csv(fail_path, index=False)
    print('Failures log:', fail_path.resolve())

# Preview: whichever of these columns the frame actually has, first ten rows
cols = [
    name
    for name in ['address','formatted_address','lat','lon','geocode_status','price','sqft','bedrooms','bathrooms']
    if name in df.columns
]
df[cols].head(10)