# Download and Format Lawmatics Events

This notebook downloads all events from the Lawmatics API, flattens them into a tabular format, applies data types, and then loads them into a BigQuery table for further processing.

## Install Requirements

In [None]:
!pip install google.cloud google-cloud-secret-manager



## Retrieve Lawmatics API Token

In [None]:
#!/usr/bin/env python3
from google.cloud import secretmanager
from hashlib import sha1



def access_secret(project_id, secret_id, version_id="latest"):
    """
    Fetch one version of a secret from Google Cloud Secret Manager.

    Args:
        project_id: Google Cloud project that owns the secret
        secret_id: Identifier of the secret within that project
        version_id: Which version to read (defaults to "latest")

    Returns:
        The secret payload decoded from UTF-8 into a string
    """
    secret_client = secretmanager.SecretManagerServiceClient()

    # Fully qualified resource path of the requested secret version
    version_path = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"

    # Ask the service for that version and decode its raw payload bytes
    secret_version = secret_client.access_secret_version(request={"name": version_path})
    raw_bytes = secret_version.payload.data
    return raw_bytes.decode("UTF-8")

# Retrieve the Lawmatics API token. `secret_value` is read by the API request
# cell further down, so this cell must succeed before the rest can run.
if __name__ == "__main__":
    project_id = '566451752897'
    secret_id = 'mlo-lawmaticsApiKey'

    try:
        secret_value = access_secret(project_id, secret_id)
    except Exception as e:
        print(f"Error accessing secret: {e}")
        # Re-raise so the run stops here; swallowing the error would leave
        # `secret_value` undefined and fail later with an unrelated NameError.
        raise

    # Print only a SHA-1 fingerprint so the token itself never appears in the output
    secret_value_sha1 = sha1(secret_value.encode())
    secret_value_digest = secret_value_sha1.hexdigest()
    print(f"Secret retrieved successfully: (sha1) {secret_value_digest}")

Secret retrieved successfully: (sha1) c32003bafa4577ecdbce1036cc7cedc098ae378a


## Get API JSON Data
Retrieve paginated data from a REST API using a non-expiring OAuth2 access token and transform it into a tabular format for analysis.

### Set up the API request

#### Subtask:
Define the API endpoint, headers (including the access token), and any initial parameters for the first page of data.


In [None]:
# Events collection endpoint of the Lawmatics REST API
api_endpoint = 'https://api.lawmatics.com/v1/events'

# Bearer-token authentication using the secret retrieved in the previous section
headers = dict(Authorization=f'Bearer {secret_value}')

# Query parameters for the first page of results
params = dict(
    page=1,
    per_page=100,  # Or any other appropriate value based on API documentation
)

### Implement pagination logic

#### Subtask:
Handle the API's pagination, making requests for subsequent pages until all data is retrieved.


In [12]:
import requests
import time

all_data = []
page_num = 1
max_retries = 3
retry_delay = 5  # seconds to wait between retry attempts
request_timeout = 30  # seconds; without a timeout a stalled connection would hang the cell indefinitely

# api_endpoint, headers and params are already defined in a previous cell

# Remove the Cookie header as it's not necessary according to API documentation
headers.pop('Cookie', None)

while True:
    params['page'] = page_num

    # Fetch the current page, retrying when the request itself fails
    page_data = None
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            # Allow redirects by default (requests handles --location)
            response = requests.get(
                api_endpoint,
                headers=headers,
                params=params,
                timeout=request_timeout,
            )
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            page_data = response.json()
            break  # Success: leave the retry loop
        except requests.exceptions.RequestException as e:
            last_error = e
            print(f"Error during API request (Attempt {attempt}/{max_retries}): {e}")
            if attempt < max_retries:
                time.sleep(retry_delay)
    else:
        # Every attempt failed. Stop the run instead of continuing with a
        # partial `all_data`, which later cells would load over the existing table.
        print("Max retries reached. Exiting.")
        raise RuntimeError(
            f"Failed to fetch page {page_num} after {max_retries} attempts"
        ) from last_error

    # The loop expects a JSON object whose 'data' key holds the list of records.
    # Anything else is treated as fatal: re-requesting the same page could never
    # make progress and would loop forever.
    if not (isinstance(page_data, dict) and isinstance(page_data.get('data'), list)):
        print("API response does not contain expected 'data' key with a list.")
        print(f"Response content: {page_data}")
        raise RuntimeError(f"Unexpected API response structure on page {page_num}")

    current_page_items = page_data['data']
    if not current_page_items:
        # An empty page marks the end of the result set
        print("Empty data list received, assuming end of data.")
        break

    all_data.extend(current_page_items)
    page_num += 1
    time.sleep(1)  # Add a small delay to avoid overwhelming the API

print(f"Retrieved {len(all_data)} total records.")

Empty data list received, assuming end of data.
Retrieved 1336 total records.


## Process and convert data to tabular format

Convert the records retrieved from the API (a list of JSON objects) into a tabular format, namely a pandas DataFrame.


In [13]:
import pandas as pd

# One row per record returned by the API
df = pd.DataFrame(all_data)
display(df.head())
# DataFrame.info() prints its summary and returns None, so call it directly;
# wrapping it in display() would also render a stray "None".
df.info()

Unnamed: 0,id,type,attributes,relationships
0,3445077,event,"{'name': 'Life and Legacy Planning Session', '...","{'event_type': {'data': {'id': '26427', 'type'..."
1,3444089,event,"{'name': 'Trust Review - Randy Culpepper', 'de...","{'event_type': {'data': {'id': '28575', 'type'..."
2,3443004,event,"{'name': 'Trust Review - Becky Haley', 'descri...","{'event_type': {'data': {'id': '28575', 'type'..."
3,3438307,event,{'name': 'Life and Legacy Planning Session - C...,"{'event_type': {'data': {'id': '26427', 'type'..."
4,3438037,event,{'name': 'Signing Ceremony - Dennis Scott Shau...,"{'event_type': {'data': {'id': '26429', 'type'..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             1336 non-null   object
 1   type           1336 non-null   object
 2   attributes     1336 non-null   object
 3   relationships  1336 non-null   object
dtypes: object(4)
memory usage: 41.9+ KB


None

### Extract nested data from columns

Extract relevant information from the 'attributes' and 'relationships' columns and create new columns in the DataFrame.

In [14]:
# Expand the dicts in 'attributes' into one column per key
if 'attributes' not in df.columns:
    # No 'attributes' column: assume an earlier run of this cell already
    # expanded it, so use an empty placeholder for the later concatenation step.
    attributes_df = pd.DataFrame()
else:
    attributes_df = df['attributes'].apply(pd.Series)
    # The nested column is no longer needed once it has been expanded
    df = df.drop(columns=['attributes'])


# Extract relevant data from the 'relationships' column if it exists
def extract_relationship_data(relationship):
    """
    Pull the linked-resource identifiers out of one 'relationships' value.

    Args:
        relationship: The value of a record's 'relationships' field; anything
            that is not a dict yields all-None results

    Returns:
        A tuple (event_type_id, event_type_type, eventable_id); each element is
        None when the corresponding nested entry is missing or not a dict
    """
    def _linked_data(key):
        # Return relationship[key]['data'] when both levels are dicts, else {}
        if not isinstance(relationship, dict):
            return {}
        wrapper = relationship.get(key)
        if not isinstance(wrapper, dict):
            return {}
        payload = wrapper.get('data')
        return payload if isinstance(payload, dict) else {}

    event_type = _linked_data('event_type')
    eventable = _linked_data('eventable')
    return event_type.get('id'), event_type.get('type'), eventable.get('id')

# Columns derived from the nested 'relationships' field, in their final order
relationship_cols = ['event_type_id', 'event_type_type', 'eventable_id']

if 'relationships' in df.columns:
    relationships_data = df['relationships'].apply(extract_relationship_data)

    # Each element is an (event_type_id, event_type_type, eventable_id) tuple;
    # spread its positions into separate columns
    df['event_type_id'] = relationships_data.apply(lambda x: x[0])
    df['event_type_type'] = relationships_data.apply(lambda x: x[1])
    df['eventable_id'] = relationships_data.apply(lambda x: x[2])

    # Drop the original 'relationships' column after processing
    df = df.drop(columns=['relationships'])


# Assemble the final frame as: id, type, <attribute columns>, <relationship columns>
if not attributes_df.empty:
    # First run: attributes were just expanded into attributes_df
    df = pd.concat([df[['id', 'type']], attributes_df, df[relationship_cols]], axis=1)
else:
    # Re-run: 'attributes' was already expanded, so its columns already live in df.
    # Treat every column that is not an id/type/relationship column as an attribute column.
    attributes_extracted_cols = [col for col in df.columns if col not in ['id', 'type'] + relationship_cols]
    ordered_cols = ['id', 'type'] + attributes_extracted_cols + relationship_cols
    # Keep only the columns that exist, preserving the desired order.
    # (A set intersection would discard the ordering built above.)
    df = df.reindex(columns=[col for col in ordered_cols if col in df.columns])


# Display the first few rows of the updated DataFrame to verify the changes
display(df.head())
# DataFrame.info() prints its summary and returns None, so call it directly;
# wrapping it in display() would also render a stray "None".
df.info()

Unnamed: 0,id,type,name,description,time_zone,start_date,end_date,all_day,created_at,updated_at,canceled_at,event_type_id,event_type_type,eventable_id
0,3445077,event,Life and Legacy Planning Session,Matter Phone: (970) 672-6907,America/Denver,2026-02-03T08:00:00.000-08:00,2026-02-03T10:00:00.000-08:00,False,2026-01-13T11:51:46.150-08:00,2026-01-13T11:51:46.150-08:00,,26427,event_type,15338944
1,3444089,event,Trust Review - Randy Culpepper,Matter Phone: 303-596-0200,America/Denver,2026-01-27T12:30:00.000-08:00,2026-01-27T13:30:00.000-08:00,False,2026-01-13T10:03:19.617-08:00,2026-01-13T11:40:58.734-08:00,,28575,event_type,9336863
2,3443004,event,Trust Review - Becky Haley,Matter Phone: 970-867-9299,America/Denver,2026-02-03T12:00:00.000-08:00,2026-02-03T13:00:00.000-08:00,False,2026-01-13T08:17:38.211-08:00,2026-01-13T11:40:58.874-08:00,,28575,event_type,10058824
3,3438307,event,Life and Legacy Planning Session - Carl Rogensues,Matter Phone: 970-988-6564,America/Denver,2026-01-26T15:00:00.000-08:00,2026-01-26T17:00:00.000-08:00,False,2026-01-12T10:35:15.914-08:00,2026-01-13T11:40:58.582-08:00,,26427,event_type,15324261
4,3438037,event,Signing Ceremony - Dennis Scott Shaulis,Matter Phone: (505) 508-9057,America/Denver,2026-02-03T10:00:00.000-08:00,2026-02-03T11:00:00.000-08:00,,2026-01-12T09:59:59.378-08:00,2026-01-12T11:28:24.399-08:00,,26429,event_type,14847571


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               1336 non-null   object
 1   type             1336 non-null   object
 2   name             1336 non-null   object
 3   description      1336 non-null   object
 4   time_zone        1336 non-null   object
 5   start_date       1335 non-null   object
 6   end_date         1335 non-null   object
 7   all_day          543 non-null    object
 8   created_at       1336 non-null   object
 9   updated_at       1336 non-null   object
 10  canceled_at      236 non-null    object
 11  event_type_id    1336 non-null   object
 12  event_type_type  1336 non-null   object
 13  eventable_id     1333 non-null   object
dtypes: object(14)
memory usage: 146.3+ KB


None

In [15]:
# Sanity check: report whether the flattened frame has an 'eventable_id' column
if 'eventable_id' not in df.columns:
    print("The 'eventable_id' column does NOT exist in the DataFrame.")
else:
    print("The 'eventable_id' column exists in the DataFrame.")
    # Count the rows in which 'eventable_id' is populated
    non_null_count = df['eventable_id'].notna().sum()
    print(f"Number of non-null values in 'eventable_id': {non_null_count}")

# List every column name for manual verification
column_names = df.columns.tolist()
print("\nDataFrame columns:")
print(column_names)

The 'eventable_id' column exists in the DataFrame.
Number of non-null values in 'eventable_id': 1333

DataFrame columns:
['id', 'type', 'name', 'description', 'time_zone', 'start_date', 'end_date', 'all_day', 'created_at', 'updated_at', 'canceled_at', 'event_type_id', 'event_type_type', 'eventable_id']


## Load the formatted DataFrame into a BigQuery table

In [16]:
import pandas_gbq

# Destination for the load. Note: this reassigns `project_id`, which earlier
# held the Secret Manager project.
project_id = 'www-prod-389819'
table_id = 'mlo_kpi.staging_lm_events'

# if_exists='replace' overwrites the existing table, so refuse to load an empty
# frame: an upstream fetch problem must not wipe the data already in the table.
if df.empty:
    raise ValueError(f"Refusing to replace {table_id} with an empty DataFrame.")

# Load the DataFrame to BigQuery, replacing existing data
pandas_gbq.to_gbq(
    df,
    table_id,
    project_id=project_id,
    if_exists='replace'
)

print(f"DataFrame loaded to {table_id} in project {project_id} successfully, replacing existing data.")

100%|██████████| 1/1 [00:00<00:00, 6250.83it/s]

DataFrame loaded to mlo_kpi.staging_lm_events in project www-prod-389819 successfully, replacing existing data.



