# Download and Format Lawmatics Custom Fields

This notebook downloads all custom field definitions from the Lawmatics API, flattens them into a tabular format, applies data types, then loads them into a BigQuery table for further processing.

## Install Requirements

In [1]:
!pip install google.cloud google-cloud-secret-manager



## Retrieve Lawmatics API Token

In [2]:
#!/usr/bin/env python3
from google.cloud import secretmanager
from hashlib import sha1



def access_secret(project_id, secret_id, version_id="latest"):
    """
    Read one version of a secret from Google Cloud Secret Manager.

    Args:
        project_id: Google Cloud project that owns the secret
        secret_id: Identifier of the secret inside that project
        version_id: Version to read; "latest" is used when omitted

    Returns:
        The secret payload decoded to a string
    """
    sm_client = secretmanager.SecretManagerServiceClient()

    # Fully qualified resource path of the requested secret version
    version_path = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"

    reply = sm_client.access_secret_version(request={"name": version_path})

    # The payload is delivered as bytes; hand it back as text
    payload_bytes = reply.payload.data
    return payload_bytes.decode("UTF-8")

# Retrieve the Lawmatics API token. A later cell reads `secret_value` to build the
# Authorization header, so a failure here must stop the run.
if __name__ == "__main__":
    project_id = '566451752897'
    secret_id = 'mlo-lawmaticsApiKey'

    try:
        secret_value = access_secret(project_id, secret_id)
    except Exception as e:
        print(f"Error accessing secret: {e}")
        # Re-raise: continuing would either fail later with a NameError on `secret_value`
        # or silently reuse a stale token left in the kernel by an earlier run.
        raise

    # Print only a SHA-1 fingerprint so the token itself never appears in the cell output.
    secret_value_sha1 = sha1(secret_value.encode())
    secret_value_digest = secret_value_sha1.hexdigest()
    print(f"Secret retrieved successfully: (sha1) {secret_value_digest}")

Secret retrieved successfully: (sha1) c32003bafa4577ecdbce1036cc7cedc098ae378a


## Get API JSON Data
Retrieve paginated data from a REST API using a non-expiring OAuth2 access token and transform it into a tabular format for analysis.

### Set up api request

#### Subtask:
Define the API endpoint, headers (including the access token), and any initial parameters for the first page of data.


In [3]:
# Endpoint queried by the pagination cell below.
api_endpoint = 'https://api.lawmatics.com/v1/custom_fields'

# Bearer-token authentication using the token retrieved from Secret Manager.
headers = dict(Authorization=f'Bearer {secret_value}')

# Query parameters for the first request; the pagination loop overwrites `page` before each call.
# per_page can be set to any other appropriate value based on the API documentation.
params = dict(fields='all', page=1, per_page=100)

### Implement pagination logic

#### Subtask:
Handle the API's pagination, making requests for subsequent pages until all data is retrieved.


In [4]:
import requests
import time

all_data = []
page_num = 1
max_retries = 3
retry_delay = 5  # seconds to wait before retrying a failed request
request_timeout = 30  # seconds; without an explicit timeout a stalled connection can block the cell indefinitely

# api_endpoint, headers and params are defined in a previous cell.

# Remove the Cookie header as it's not necessary according to API documentation
headers.pop('Cookie', None)

while True:
    params['page'] = page_num

    # Fetch the current page, retrying failed requests up to max_retries times.
    for attempt in range(1, max_retries + 1):
        try:
            # Redirects are followed by default.
            response = requests.get(
                api_endpoint, headers=headers, params=params, timeout=request_timeout
            )
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            page_data = response.json()
            break  # Success: leave the retry loop
        except requests.exceptions.RequestException as e:
            print(f"Error during API request (Attempt {attempt}/{max_retries}): {e}")
            if attempt == max_retries:
                # Abort instead of continuing with a partial result: the load cell at the
                # end of the notebook replaces the BigQuery table with whatever was collected.
                raise RuntimeError(
                    f"Max retries reached while requesting page {page_num}."
                ) from e
            time.sleep(retry_delay)

    # The loop relies on the response being an object with a 'data' list. Fail fast on any
    # other shape: without that list there is no way to advance or to detect the last page.
    current_page_items = page_data.get('data') if isinstance(page_data, dict) else None
    if not isinstance(current_page_items, list):
        raise RuntimeError(
            "API response does not contain expected 'data' key with a list. "
            f"Response content: {repr(page_data)[:500]}"
        )

    # An empty page is taken as the end-of-data signal.
    if not current_page_items:
        print("Empty data list received, assuming end of data.")
        break

    all_data.extend(current_page_items)
    page_num += 1
    time.sleep(1)  # Add a small delay to avoid overwhelming the API

print(f"Retrieved {len(all_data)} total records.")

Empty data list received, assuming end of data.
Retrieved 496 total records.


## Process and convert data to tabular format

Convert the retrieved data from the API response (likely JSON) into a tabular format, such as a pandas DataFrame.


In [5]:
import pandas as pd

# Build a DataFrame from the list of records collected by the pagination cell.
df = pd.DataFrame(all_data)
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the output.
df.info()

Unnamed: 0,id,type,attributes
0,518291,custom_field,"{'name': 'Tangible Personal Property', 'field_..."
1,516148,custom_field,"{'name': 'LLPS_inv_Date of Wedding', 'field_ty..."
2,472941,custom_field,{'name': 'Send Engagement Agreement through Ne...
3,453991,custom_field,"{'name': 'Estate Plan Design Meeting Date', 'f..."
4,518361,custom_field,{'name': 'LLPS_Real Estate_Do you have Real Es...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          496 non-null    object
 1   type        496 non-null    object
 2   attributes  496 non-null    object
dtypes: object(3)
memory usage: 11.8+ KB


None

### Extract nested data from columns

Flatten the nested 'attributes' column into individual columns, drop the redundant top-level 'type' column, convert the timestamp columns to datetimes, and clean the column names for BigQuery.

In [7]:
import pandas as pd
import re
import json
# Import json_normalize from pandas for flattening nested JSON/dictionaries
from pandas import json_normalize

# Work on a copy so the frame built from the raw API records is not modified in place
df_processed = df.copy()


def safe_json_loads(x):
    """Decode a JSON string; invalid JSON becomes None and non-string values pass through unchanged."""
    if not isinstance(x, str):
        return x  # e.g. already a list/dict or None
    try:
        return json.loads(x)
    except json.JSONDecodeError:
        return None


# Flatten the dictionaries in 'attributes' into their own frame (left empty when the column is absent)
if 'attributes' in df_processed.columns:
    attributes_df = json_normalize(df_processed['attributes'])

    # Keep the attribute-level 'type' apart from the top-level one;
    # rename() leaves the frame's columns untouched when the key is missing
    attributes_df = attributes_df.rename(columns={'type': 'attribute_type'})

    # Parse any 'list_options' values that are still JSON strings
    if 'list_options' in attributes_df.columns:
        attributes_df['list_options'] = attributes_df['list_options'].apply(safe_json_loads)
else:
    attributes_df = pd.DataFrame()

# Drop the nested 'attributes' column now that it has been flattened, together with
# the top-level 'type' column (noted in the original as holding the same value in every row)
redundant_columns = [name for name in ('attributes', 'type') if name in df_processed.columns]
if redundant_columns:
    df_processed = df_processed.drop(columns=redundant_columns)

# Put the remaining top-level columns and the flattened attributes side by side
frames_to_join = [df_processed.reset_index(drop=True)]
if not attributes_df.empty:
    frames_to_join.append(attributes_df.reset_index(drop=True))

df = pd.concat(frames_to_join, axis=1, join='outer')

# Convert the timestamp columns that are present to timezone-aware (UTC) datetimes;
# values that cannot be parsed are coerced to NaT
for col in ('created_at', 'updated_at'):
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)

# Convert NaN values to None for all columns
df = df.where(df.notna(), None)

# Function to clean column names for BigQuery compatibility
def clean_column_name(col_name):
    """
    Turn an arbitrary column label into a BigQuery-compatible column name.

    Spaces become underscores, every character other than ASCII letters,
    digits and underscores is removed, the result is lower-cased, prefixed
    with an underscore when it starts with a digit, and cut to 300 characters.

    Args:
        col_name: The original column label. Non-string labels (for example
            integer positions) are converted with str() first.

    Returns:
        The cleaned name, or 'unnamed' when no valid character is left, so
        the result is never an empty string.
    """
    # Accept non-string labels instead of failing on .replace()
    cleaned = str(col_name).replace(' ', '_')
    # Remove characters that are not alphanumeric or underscores, then lowercase
    cleaned = re.sub(r'[^0-9A-Za-z_]', '', cleaned).lower()
    # An empty string is not a usable column name; fall back to a placeholder
    if not cleaned:
        return 'unnamed'
    # Ensure it doesn't start with a number (BigQuery requirement)
    if cleaned[0].isdigit():
        cleaned = '_' + cleaned
    # Truncate to BigQuery's max length (300 characters) if necessary
    return cleaned[:300]

# Apply column name cleaning to all columns in the DataFrame
df.columns = [clean_column_name(col) for col in df.columns]

# Different labels can collapse to the same cleaned name (e.g. 'a.b' and 'ab');
# stop here with a clear message rather than passing ambiguous columns on to the load step.
duplicated_names = df.columns[df.columns.duplicated()].unique().tolist()
if duplicated_names:
    raise ValueError(f"Duplicate column names after cleaning: {duplicated_names}")

# Display the first few rows of the updated DataFrame to verify the changes
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the output.
df.info()

Unnamed: 0,id,name,field_type,visibility,attribute_type,list_options,practice_area,created_at,updated_at,lookup_type
0,518291,Tangible Personal Property,list,hidden,PracticeArea,"[{'id': '444243', 'name': 'To Living Children'...",Estate Planning,2024-11-11 16:09:38.125000+00:00,2024-11-11 16:09:38.125000+00:00,
1,516148,LLPS_inv_Date of Wedding,date,default,Prospect,,,2024-11-07 16:02:13.557000+00:00,2025-05-05 21:30:25.622000+00:00,
2,472941,Send Engagement Agreement through New Client P...,boolean,hidden,Prospect,,,2024-07-31 22:01:21.596000+00:00,2024-07-31 22:01:21.596000+00:00,
3,453991,Estate Plan Design Meeting Date,date,starred,Prospect,,,2024-06-16 13:42:02.101000+00:00,2024-06-16 14:58:11.624000+00:00,
4,518361,LLPS_Real Estate_Do you have Real Estate you w...,list,hidden,PracticeArea,"[{'id': '444312', 'name': 'Yes'}, {'id': '4443...",Estate Planning,2024-11-11 18:26:44.505000+00:00,2024-11-11 18:26:44.505000+00:00,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   id              496 non-null    object             
 1   name            496 non-null    object             
 2   field_type      496 non-null    object             
 3   visibility      496 non-null    object             
 4   attribute_type  496 non-null    object             
 5   list_options    138 non-null    object             
 6   practice_area   100 non-null    object             
 7   created_at      496 non-null    datetime64[ns, UTC]
 8   updated_at      496 non-null    datetime64[ns, UTC]
 9   lookup_type     70 non-null     object             
dtypes: datetime64[ns, UTC](2), object(8)
memory usage: 38.9+ KB


None

In [9]:
# Show the dtype pandas assigned to the 'list_options' column before the load step.
list_options_dtype = df['list_options'].dtype
print(list_options_dtype)

object


## Load the formatted data frame into a BQ table

In [10]:
import pandas_gbq

# Destination of the load: the GCP project and the table given as "dataset.table"
project_id = 'www-prod-389819'
table_id = 'mlo_kpi.rel_custom_fields'

# Write the DataFrame to BigQuery; if_exists='replace' replaces the existing data
pandas_gbq.to_gbq(df, destination_table=table_id, project_id=project_id, if_exists='replace')

print(f"DataFrame loaded to {table_id} in project {project_id} successfully, replacing existing data.")

100%|██████████| 1/1 [00:00<00:00, 7463.17it/s]

DataFrame loaded to mlo_kpi.rel_custom_fields in project www-prod-389819 successfully, replacing existing data.



