## Imports

In [1]:
#!/usr/bin/env python3
"""
generate_geojson_from_docs.py

Read a DataFrame of metro names and Google Doc links, export each Doc as HTML,
geocode each metro via Google Maps API, and write out a GeoJSON of summaries.
"""
import os
import re
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from bs4 import BeautifulSoup
from google.oauth2 import service_account
from googleapiclient.discovery import build
import requests
import googlemaps
import gspread
from oauth2client.service_account import ServiceAccountCredentials# Define the scope of the application
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
import googleapiclient.errors

## CONFIGURATION

In [2]:
scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']

# Add credentials to the account
creds = ServiceAccountCredentials.from_json_keyfile_name('autoscraper-380600-0d0c84856d6b.json', scope)

# Authorize the clientsheet 
client = gspread.authorize(creds)

sheet = client.open_by_key('11UHXwJ_A9-kJZANhI3JvqKBGhIlGilmaAuX7NVL7YA4')

# Drive client
drivesvc = build("drive", "v3", credentials=creds)

range_name = 'A1:AO200'

In [3]:
def fetch_data(sheet, worksheet_name, range_name, df_name=None):
    print('Fetching data from Google Sheets...')
    worksheet = sheet.worksheet(worksheet_name)
    data = worksheet.get(range_name)
    df = pd.DataFrame(data)
    df.columns = df.iloc[0]  # Set first row as column headers
    df = df.drop(0).reset_index(drop=True)  # Drop the header row from the dataframe and reset index

    df.drop_duplicates(inplace=True)  # Drop duplicate rows
    print(f'Number of rows in {worksheet_name} worksheet: {len(df)}')
    return df

In [4]:
df = fetch_data(sheet, 'Sheet1', range_name, 'df')

Fetching data from Google Sheets...
Number of rows in Sheet1 worksheet: 18


In [5]:
%store -r google_maps_API_Key
GOOGLE_MAPS_API_KEY = googlemaps.Client(key=google_maps_API_Key)

In [6]:
# Define the geocode function
def geocode(add):
    g = GOOGLE_MAPS_API_KEY.geocode(add)
    if g:
        lat = g[0]["geometry"]["location"]["lat"]
        lng = g[0]["geometry"]["location"]["lng"]
        return (lat, lng)
    else:
        return None

# Apply geocoding to the 'geo_address' column and store the results in 'geocoded' column
df['geocoded'] = df['Metro'].apply(geocode)

In [7]:
# ────────────────────────────────────────────────────────────────
#  HELPERS
# ────────────────────────────────────────────────────────────────
def extract_doc_id(url: str) -> str:
    """Extract the Doc ID from a Google Docs URL."""
    m = re.search(r"/d/([a-zA-Z0-9_-]+)", url)
    if not m:
        raise ValueError(f"Unable to parse document ID from URL: {url}")
    return m.group(1)


def geocode(address: str, api_key: str) -> tuple:
    """Return (lat, lon) for a given address using Google Maps Geocoding API."""
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    resp = requests.get(endpoint, params={"address": address, "key": api_key})
    resp.raise_for_status()
    data = resp.json()
    if data.get("status") != "OK" or not data.get("results"):
        raise ValueError(f"Geocoding failed for '{address}': {data.get('status')}")
    loc = data["results"][0]["geometry"]["location"]
    return loc["lat"], loc["lng"]

In [8]:
df.columns

Index(['Metro', 'Landing Page', 'Overview Draft Doc', 'Last Updated',
       'Last Updated By', 'geocoded'],
      dtype='object', name=0)

In [9]:
# ────────────────────────────────────────────────────────────────
#  PROCESS EACH DOC & BUILD RECORDS
# ────────────────────────────────────────────────────────────────
records = []
for _, row in df.iterrows():
    metro = row['Metro']
    doc_url = row['Overview Draft Doc']
    geocoded = row['geocoded']
    last_updated_date = row['Last Updated'],
    last_updated_author = row['Last Updated By']
    print(f"Processing metro: {metro}")

    # 1) extract Doc ID and export as HTML
    doc_id = extract_doc_id(doc_url)
    html_bytes = drivesvc.files().export(
        fileId=doc_id,
        mimeType="text/html"
    ).execute()
    soup = BeautifulSoup(html_bytes, 'html.parser')
    body = soup.body
    summary_html = ''.join(str(el) for el in body.contents).strip()

    records.append({
        'metro': metro,
        'summary': summary_html,
        'geocoded':geocoded,
        'last_updated_date':last_updated_date,
        'last_updated_author':last_updated_author
    })

if not records:
    raise ValueError("No records processed. Check your DOC_LINKS_CSV.")

Processing metro: New York
Processing metro: South Florida
Processing metro: Los Angeles
Processing metro: Chicago
Processing metro: San Francisco (and San Jose/Silicon Valley)
Processing metro: Texas (Dallas, Houston, Austin, San Antonio)
Processing metro: Boston
Processing metro: Washington, D.C.
Processing metro: Philadelphia
Processing metro: Atlanta
Processing metro: Tampa
Processing metro: Orlando
Processing metro: Charlotte
Processing metro: Nashville
Processing metro: Phoenix
Processing metro: Seattle
Processing metro: Denver
Processing metro: Las Vegas


In [10]:
OUTPUT_GEOJSON = "market_overviews.geojson"

In [12]:
df

Unnamed: 0,Metro,Landing Page,Overview Draft Doc,Last Updated,Last Updated By,geocoded
0,New York,https://therealdeal.com/new-york/,https://docs.google.com/document/d/1V_9byGw5Af...,"May 12, 2025",Mary Diduch,"(40.7127753, -74.0059728)"
1,South Florida,https://therealdeal.com/miami/,https://docs.google.com/document/d/11BBpyeLXZj...,"May 12, 2025",Mary Diduch,"(26.522474, -81.1637245)"
2,Los Angeles,https://therealdeal.com/la/,https://docs.google.com/document/d/1D7MgwzuE3H...,"May 12, 2025",Mary Diduch,"(34.0549076, -118.242643)"
3,Chicago,https://therealdeal.com/chicago/,https://docs.google.com/document/d/18nhSNkDeo-...,"May 12, 2025",Mary Diduch,"(41.8781136, -87.6297982)"
4,San Francisco (and San Jose/Silicon Valley),https://therealdeal.com/san-francisco/,https://docs.google.com/document/d/19AxLoIrrJm...,"May 12, 2025",Mary Diduch,"(37.33874, -121.8852525)"
5,"Texas (Dallas, Houston, Austin, San Antonio)",https://therealdeal.com/texas/,https://docs.google.com/document/d/1YnJCsS-yuD...,"May 12, 2025",Mary Diduch,"(30.267153, -97.7430608)"
6,Boston,https://therealdeal.com/national/boston/,https://docs.google.com/document/d/1w_cSk1I22h...,"May 12, 2025",Mary Diduch,"(42.3555076, -71.0565364)"
7,"Washington, D.C.",https://therealdeal.com/national/washington-dc/,https://docs.google.com/document/d/16JBncASAOo...,"May 12, 2025",Mary Diduch,"(38.9071923, -77.0368707)"
8,Philadelphia,https://therealdeal.com/national/philadelphia/,https://docs.google.com/document/d/1nV6VB0fEEV...,"May 12, 2025",Mary Diduch,"(39.9525839, -75.1652215)"
9,Atlanta,https://therealdeal.com/national/atlanta/,https://docs.google.com/document/d/1SGcedyLLjg...,"May 12, 2025",Mary Diduch,"(33.7501275, -84.3885209)"


In [11]:
# ────────────────────────────────────────────────────────────────
#  BUILD GEOJSON
# ────────────────────────────────────────────────────────────────
gdf = gpd.GeoDataFrame(
    records,
    geometry=[Point(r['lon'], r['lat']) for r in records],
    crs="EPSG:4326"
)

gdf.to_file(f"{OUTPUT_GEOJSON}.geojson", driver="GeoJSON")

print(f"GeoJSON with {len(gdf)} features written to {OUTPUT_GEOJSON}")

KeyError: 'lon'