## Imports

In [1]:
import os
import re
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from bs4 import BeautifulSoup
from google.oauth2 import service_account
from googleapiclient.discovery import build
import requests
import googlemaps
import gspread
from oauth2client.service_account import ServiceAccountCredentials# Define the scope of the application
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
import googleapiclient.errors

## CONFIGURATION

In [2]:
# OAuth scopes requested for the service account: the Sheets feed and Drive.
scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']

# Load the service-account credentials from a JSON key file.
# The path is relative, so the key file must sit in the notebook's working directory.
creds = ServiceAccountCredentials.from_json_keyfile_name('autoscraper-380600-0d0c84856d6b.json', scope)

# Authorize the gspread client with those credentials
client = gspread.authorize(creds)

# Open the source spreadsheet by its key; `sheet` is what fetch_data() reads worksheets from.
sheet = client.open_by_key('11UHXwJ_A9-kJZANhI3JvqKBGhIlGilmaAuX7NVL7YA4')

# Drive v3 client built from the same credentials (used for the Google Docs HTML export)
drivesvc = build("drive", "v3", credentials=creds)

# A1-notation range fetched from the worksheet; rows or columns outside it are not read.
range_name = 'A1:AO200'

In [3]:
def fetch_data(sheet, worksheet_name, range_name, df_name=None):
    """Load a worksheet range into a de-duplicated DataFrame.

    The first row of the range supplies the column headers; every remaining
    row becomes one record.

    Args:
        sheet: Opened spreadsheet object exposing ``worksheet(name)``.
        worksheet_name: Title of the worksheet tab to read.
        range_name: A1-notation range to fetch (e.g. ``'A1:AO200'``).
        df_name: Unused; retained so existing callers keep working.

    Returns:
        DataFrame with duplicate rows removed and a contiguous 0..n-1 index.
        An empty DataFrame is returned when the range yields no rows at all.
    """
    print('Fetching data from Google Sheets...')
    worksheet = sheet.worksheet(worksheet_name)
    rows = worksheet.get(range_name)

    if not rows:
        # No header row to promote: `iloc[0]` below would raise IndexError.
        print(f'Number of rows in {worksheet_name} worksheet: 0')
        return pd.DataFrame()

    table = pd.DataFrame(rows)
    # Use a plain list so the resulting columns index does not inherit the
    # header row's label (0) as its name.
    header = table.iloc[0].tolist()

    df = (
        table.iloc[1:]
        .set_axis(header, axis=1)
        .drop_duplicates()
        # Reset AFTER de-duplication so dropped rows leave no gaps in the index.
        .reset_index(drop=True)
    )
    print(f'Number of rows in {worksheet_name} worksheet: {len(df)}')
    return df

# ────────────────────────────────────────────────────────────────
# HELPERS
# ────────────────────────────────────────────────────────────────
def extract_doc_id(url: str) -> str:
    """Extract the document ID from a Google Docs URL.

    Args:
        url: Link containing a ``/d/<id>`` path segment.

    Returns:
        The ID segment that follows ``/d/``.

    Raises:
        ValueError: If ``url`` is not a string (for example a blank sheet
            cell that arrived as ``None``/NaN) or contains no ``/d/<id>``
            segment.
    """
    # Non-string input is rejected here so the caller gets the same ValueError
    # as for any other unparseable link, rather than a TypeError from `re`.
    m = re.search(r"/d/([a-zA-Z0-9_-]+)", url) if isinstance(url, str) else None
    if not m:
        raise ValueError(f"Unable to parse document ID from URL: {url}")
    return m.group(1)

def export_doc_html(doc_id: str) -> str:
    """Export a Google Doc as HTML via the Drive API and return its body markup.

    Only the children of ``<body>`` are returned, so the document's
    ``<head>`` (styles, metadata) is left out while the body's own markup is
    kept as-is.

    Args:
        doc_id: Drive file ID of the document to export.

    Returns:
        The serialized contents of ``<body>``, stripped of surrounding
        whitespace. If the export contains no ``<body>`` element, the whole
        parsed document is serialized instead.
    """
    html_bytes = drivesvc.files().export(
        fileId=doc_id,
        mimeType="text/html"
    ).execute()
    soup = BeautifulSoup(html_bytes, "html.parser")
    # `soup.body` is None when the markup has no <body> tag; fall back to the
    # parsed document itself instead of failing on `None.contents`.
    container = soup.body if soup.body is not None else soup
    return "".join(str(el) for el in container.contents).strip()

def geocode(address: str, api_key: str, timeout: float = 10.0) -> tuple:
    """Return (lat, lon) for an address using the Google Maps Geocoding API.

    NOTE: a later cell in this notebook defines another function named
    ``geocode`` with a different signature; once that cell runs, it replaces
    this one.

    Args:
        address: Free-form address or place name to look up.
        api_key: Google Maps API key sent with the request.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        ``(lat, lng)`` of the first result.

    Raises:
        requests.HTTPError: If the HTTP response status indicates an error.
        ValueError: If the API status is not ``"OK"`` or no results came back.
    """
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    resp = requests.get(
        endpoint,
        params={"address": address, "key": api_key},
        timeout=timeout,
    )
    resp.raise_for_status()
    data = resp.json()
    if data.get("status") != "OK" or not data.get("results"):
        raise ValueError(f"Geocoding failed for '{address}': {data.get('status')}")
    loc = data["results"][0]["geometry"]["location"]
    return loc["lat"], loc["lng"]


In [4]:
# Load the 'Sheet1' worksheet into `df`; later cells read its 'Metro', 'Overview Draft Doc',
# 'Landing Page', 'Last Updated' and 'Last Updated By' columns.
# The trailing 'df' argument is fetch_data's `df_name` parameter, which the function does not use.
df = fetch_data(sheet, 'Sheet1', range_name, 'df')

Fetching data from Google Sheets...
Number of rows in Sheet1 worksheet: 18


In [5]:
# Restore `google_maps_API_Key` from IPython's %store database. It must have been saved
# beforehand with `%store google_maps_API_Key`; a later cell passes it as the googlemaps client key.
%store -r google_maps_API_Key

In [6]:
# Geocode each Metro once via googlemaps.Client.
# Only `gmaps` is bound to the client: rebinding `google_maps_API_Key` to it as well
# would make a second run of this cell pass a Client object as the API key.
gmaps = googlemaps.Client(key=google_maps_API_Key)


def _metro_latlon(metro):
    """Return (lat, lng) of the first geocoding result for `metro`."""
    hits = gmaps.geocode(metro)
    if not hits:
        # An empty result list would otherwise surface as a bare IndexError.
        raise ValueError(f"Geocoding returned no results for '{metro}'")
    loc = hits[0]['geometry']['location']
    # Look the keys up explicitly rather than relying on the dict's value order.
    return loc['lat'], loc['lng']


df['geocoded'] = df['Metro'].apply(_metro_latlon)

# ────────────────────────────────────────────────────────────────
#  BUILD RECORDS & WRITE GEOJSON
# ────────────────────────────────────────────────────────────────

records = []
for _, row in df.iterrows():
    metro    = row['Metro']
    doc_url  = row['Overview Draft Doc']
    lat, lon = row['geocoded']           # (lat, lng) tuple produced above

    print(f"Processing {metro}")
    doc_id       = extract_doc_id(doc_url)
    summary_html = export_doc_html(doc_id)

    records.append({
        'Metro':            metro,
        'Landing Page':     row['Landing Page'],
        'summary':          summary_html,
        'Last Updated':     row['Last Updated'],
        'Last Updated By':  row['Last Updated By'],
        'geometry':         Point(lon, lat)   # shapely points are (x, y) = (lon, lat)
    })

if not records:
    raise RuntimeError("No records!")

# Build the GeoDataFrame straight from the records; each one carries its own 'geometry'.
gdf = gpd.GeoDataFrame(records, crs="EPSG:4326")
gdf.to_file("market_overviews.geojson", driver="GeoJSON")
print(f"Wrote {len(gdf)} features")

Processing New York


NameError: name 'export_doc_html' is not defined

In [None]:
# Define the geocode function
# NOTE: this reuses the name `geocode`; once this cell runs it replaces the earlier
# (address, api_key) helper of the same name.
def geocode(add):
    """Geocode `add` with the notebook's googlemaps client.

    Args:
        add: Address or place name to look up.

    Returns:
        ``(lat, lng)`` of the first result, or ``None`` when the lookup
        returns no results.
    """
    # The googlemaps client is bound to `gmaps`; `GOOGLE_MAPS_API_KEY` is never
    # defined in this notebook, so calling through it raised NameError.
    g = gmaps.geocode(add)
    if g:
        loc = g[0]["geometry"]["location"]
        return (loc["lat"], loc["lng"])
    return None

# Geocode every value in the 'Metro' column and store the result in the 'geocoded' column
# (geocode() returns a (lat, lng) tuple, or None when nothing matched)
df['geocoded'] = df['Metro'].apply(geocode)

In [None]:
# ────────────────────────────────────────────────────────────────
#  PROCESS EACH DOC & BUILD RECORDS
# ────────────────────────────────────────────────────────────────
records = []
for _, row in df.iterrows():
    metro = row['Metro']
    doc_url = row['Overview Draft Doc']
    geocoded = row['geocoded']
    print(f"Processing metro: {metro}")

    # geocode() yields None when nothing matched; such a row has no point to emit.
    if geocoded is None:
        print(f"Skipping {metro}: no geocoding result")
        continue
    # Unpack the coordinates; Point() below previously referenced names that were never assigned.
    lat, lng = geocoded

    # Extract the Doc ID and export the document body as HTML via the shared helper.
    doc_id = extract_doc_id(doc_url)
    summary_html = export_doc_html(doc_id)

    records.append({
        'Metro':            metro,
        'Landing Page':     row['Landing Page'],
        'summary':          summary_html,        # HTML string of the doc body
        'Last Updated':     row['Last Updated'],
        'Last Updated By':  row['Last Updated By'],
        'geometry':         Point(lng, lat)      # shapely points are (x, y) = (lng, lat)
    })


if not records:
    raise ValueError("No records processed. Check the worksheet rows and their geocoding results.")

In [None]:
# Path of the GeoJSON export; the value already carries the .geojson extension.
OUTPUT_GEOJSON = "market_overviews.geojson"

In [None]:
# ────────────────────────────────────────────────────────────────
#  BUILD GEOJSON
# ────────────────────────────────────────────────────────────────
# Every record already carries its own 'geometry' Point, so use that column directly.
# Rebuilding the points from df['geocoded'] breaks whenever the records and the
# DataFrame rows are not one-to-one, or a geocode result is missing.
gdf = gpd.GeoDataFrame(
    records,
    geometry="geometry",
    crs="EPSG:4326"
)

# OUTPUT_GEOJSON already ends in .geojson; appending the extension again wrote
# "<name>.geojson.geojson" while the message below reported a different path.
gdf.to_file(OUTPUT_GEOJSON, driver="GeoJSON")

print(f"GeoJSON with {len(gdf)} features written to {OUTPUT_GEOJSON}")