# Download and Format Lawmatics Time Entries

This notebook downloads all time entries from Lawmatics, flattens them into a tabular format, applies data types, then loads them into a BigQuery datastore table for further processing.

## Install Requirements

In [8]:
!pip install google.cloud google-cloud-secret-manager



## Retrieve Lawmatics API Token

In [9]:
#!/usr/bin/env python3
from google.cloud import secretmanager
from hashlib import sha1



def access_secret(project_id, secret_id, version_id="latest"):
    """
    Fetch one secret version from Google Cloud Secret Manager as text.

    Args:
        project_id: Google Cloud project that owns the secret
        secret_id: Name of the secret inside that project
        version_id: Version of the secret to read (defaults to "latest")

    Returns:
        The secret payload decoded as a UTF-8 string
    """
    # Fully qualified resource path of the requested secret version
    resource_path = "projects/{}/secrets/{}/versions/{}".format(
        project_id, secret_id, version_id
    )

    # One client per call; ask it for the version and unwrap the payload
    payload = (
        secretmanager.SecretManagerServiceClient()
        .access_secret_version(request={"name": resource_path})
        .payload
    )

    # The payload is delivered as bytes; callers expect a str
    return payload.data.decode("UTF-8")

# Example usage
# In a notebook kernel __name__ is "__main__", so this runs when the cell executes.
if __name__ == "__main__":
    project_id = '566451752897'
    secret_id = 'LM_API_TOKEN'

    try:
        secret_value = access_secret(project_id, secret_id)
    except Exception as e:
        print(f"Error accessing secret: {e}")
        # Re-raise instead of swallowing: without secret_value the later cell
        # that builds the Authorization header fails with an unrelated NameError.
        raise

    # Print only a digest so the token itself never appears in the saved output.
    secret_value_sha1 = sha1(secret_value.encode())
    secret_value_digest = secret_value_sha1.hexdigest()
    print(f"Secret retrieved successfully: (sha1) {secret_value_digest}")

Secret retrieved successfully: (sha1) 9793ec7db1c271bf0898ec6a4423c80a863799e6


## Get API JSON Data
Retrieve paginated data from a REST API using a non-expiring OAuth2 access token and transform it into a tabular format for analysis.

### Set up api request

#### Subtask:
Define the API endpoint, headers (including the access token), and any initial parameters for the first page of data.


In [10]:
# Lawmatics endpoint that lists time entries
api_endpoint = 'https://api.lawmatics.com/v1/time_entries'

# Bearer-token authentication using the secret retrieved earlier
headers = dict(Authorization=f'Bearer {secret_value}')

# Query string for the first request
params = dict(
    fields='all',
    page=1,
    per_page=100,  # Or any other appropriate value based on API documentation
)

### Implement pagination logic

#### Subtask:
Handle the API's pagination, making requests for subsequent pages until all data is retrieved.


In [11]:
import requests
import time

all_data = []
page_num = 1
max_retries = 3
retry_delay = 5 # seconds
request_timeout = 30 # seconds; requests applies no timeout by default, so a stalled connection would hang the cell

# api_endpoint is already defined in a previous cell
# headers and params are already defined in a previous cell

# Remove the Cookie header as it's not necessary according to API documentation
headers.pop('Cookie', None)


def _fetch_page(page):
    """
    Request a single page of results, retrying failed requests.

    Args:
        page: Page number to request (written into params['page'])

    Returns:
        The decoded JSON body of the response

    Raises:
        requests.exceptions.RequestException: when all max_retries attempts fail
    """
    params['page'] = page
    for attempt in range(1, max_retries + 1):
        try:
            # Allow redirects by default (requests handles --location)
            response = requests.get(
                api_endpoint, headers=headers, params=params, timeout=request_timeout
            )
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error during API request (Attempt {attempt}/{max_retries}): {e}")
            if attempt == max_retries:
                print("Max retries reached. Exiting.")
                # Fail loudly: continuing would leave all_data partially filled,
                # and the load step later replaces the destination table.
                raise
            time.sleep(retry_delay)


while True:
    page_data = _fetch_page(page_num)

    # The API is expected to return the records as a list under the 'data' key.
    current_page_items = page_data.get('data') if isinstance(page_data, dict) else None
    if not isinstance(current_page_items, list):
        print("API response does not contain expected 'data' key with a list.")
        print(f"Response content: {page_data}")
        # Stop here; previously this case re-requested the same page forever.
        raise RuntimeError(
            f"Unexpected response structure on page {page_num}; aborting download."
        )

    if not current_page_items:  # An empty 'data' list marks the end of the result set
        print("Empty data list received, assuming end of data.")
        break

    all_data.extend(current_page_items)
    page_num += 1
    time.sleep(1) # Add a small delay to avoid overwhelming the API

print(f"Retrieved {len(all_data)} total records.")

Empty data list received, assuming end of data.
Retrieved 588 total records.


## Process and convert data to tabular format

Convert the records retrieved from the API (parsed from the JSON responses) into a tabular format as a pandas DataFrame.


In [12]:
import pandas as pd

# One row per API record; nested values stay as dict-valued columns at this stage.
df = pd.DataFrame(all_data)
display(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
df.info()

Unnamed: 0,id,type,attributes,relationships
0,1892963,time_entry,{'description': 'Document Preparation and fili...,"{'staff': {'data': {'id': '19961', 'type': 'us..."
1,1891485,time_entry,"{'description': 'Meeting', 'duration': 0.75, '...","{'staff': {'data': {'id': '19961', 'type': 'us..."
2,1890462,time_entry,"{'description': 'Phone call', 'duration': 0.5,...","{'staff': {'data': {'id': '19961', 'type': 'us..."
3,1890460,time_entry,"{'description': 'Document Review', 'duration':...","{'staff': {'data': {'id': '19961', 'type': 'us..."
4,1888410,time_entry,"{'description': 'L&L Stage 4 - Delivery', 'dur...","{'staff': {'data': {'id': '19961', 'type': 'us..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588 entries, 0 to 587
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             588 non-null    object
 1   type           588 non-null    object
 2   attributes     588 non-null    object
 3   relationships  588 non-null    object
dtypes: object(4)
memory usage: 18.5+ KB


None

### Extract nested data from columns

Extract relevant information from the 'attributes' and 'relationships' columns and create new columns in the DataFrame.

In [13]:
import pandas as pd

# Expand the 'attributes' column into one column per attribute key and remove
# the original column; fall back to an empty frame when the column is absent.
if 'attributes' in df.columns:
    attributes_df = df['attributes'].apply(pd.Series)
    df = df.drop(columns=['attributes'])
else:
    attributes_df = pd.DataFrame()


def _join_non_null(values):
    """Comma-join the usable values as strings; return None when none remain."""
    # Coerce to str so numeric ids do not make str.join raise TypeError.
    kept = [str(value) for value in values if value is not None and value != '']
    return ','.join(kept) if kept else None


# Generic extractor for the 'relationships' column.
# Returns one flat dictionary per row.
def extract_relationship_data_generic(relationship):
    """
    Flatten one 'relationships' mapping into scalar id/type fields.

    Args:
        relationship: Mapping of relationship name to a dict holding a 'data'
            entry; any non-dict input is treated as "no relationships".

    Returns:
        A dict with, per relationship name ``key``:
        - ``{key}_id`` / ``{key}_type`` when 'data' is a single dict
        - ``{key}_ids`` / ``{key}_types`` (comma-joined strings, or None when
          nothing usable is present) when 'data' is a list
        Relationships without a 'data' entry, or whose 'data' is neither a
        dict nor a list (e.g. None), contribute no keys.
    """
    extracted = {}
    if not isinstance(relationship, dict):
        return extracted

    for key, value in relationship.items():
        if not (isinstance(value, dict) and 'data' in value):
            continue

        data_item = value['data']
        if isinstance(data_item, dict):
            # For single related items, extract id and type
            extracted[f"{key}_id"] = data_item.get('id')
            extracted[f"{key}_type"] = data_item.get('type')
        elif isinstance(data_item, list):
            # For lists of related items, join IDs and types; non-dict entries are skipped
            members = [item for item in data_item if isinstance(item, dict)]
            extracted[f"{key}_ids"] = _join_non_null(item.get('id') for item in members)
            extracted[f"{key}_types"] = _join_non_null(item.get('type') for item in members)
    return extracted


relationships_df_expanded = pd.DataFrame() # Initialize as empty
if 'relationships' in df.columns:
    # Apply the generic extraction function (one flat dict per row)
    relationships_extracted_series = df['relationships'].apply(extract_relationship_data_generic)

    # Convert the Series of dictionaries into a DataFrame
    # This will create columns dynamically based on the keys present in the dictionaries
    relationships_df_expanded = pd.DataFrame(relationships_extracted_series.tolist())

    # Drop the original 'relationships' column
    df = df.drop(columns=['relationships'])

# Final concatenation of all parts
# Every part gets a fresh 0..n-1 index so rows line up positionally.
all_parts = []

# Base dataframe: the top-level columns left after 'attributes' and 'relationships' were dropped
all_parts.append(df.reset_index(drop=True))

# Add the attributes dataframe if it's not empty
if not attributes_df.empty:
    all_parts.append(attributes_df.reset_index(drop=True))

# Add the relationships dataframe if it's not empty
if not relationships_df_expanded.empty:
    all_parts.append(relationships_df_expanded.reset_index(drop=True))

# Concatenate all parts along columns, filling missing values with NaN.
df = pd.concat(all_parts, axis=1, join='outer')

# Display the first few rows of the updated DataFrame to verify the changes
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in display() (that would add a stray "None" to the output).
df.info()

Unnamed: 0,id,type,description,duration,duration_rounded,running_since,started_at,completed_at,is_billable,is_flat_fee,...,staff_id,staff_type,created_by_id,created_by_type,activity_type_id,activity_type_type,contactable_id,contactable_type,invoice_id,invoice_type
0,1892963,time_entry,Document Preparation and filing (multiple dates),2.3,2.3,,2026-01-12T13:29:57.180-08:00,2026-01-12T15:47:57.180-08:00,True,False,...,19961,user,19961,user,14498,activity_type,10786551,prospect,,
1,1891485,time_entry,Meeting,0.75,0.75,,2026-01-12T09:55:08.723-08:00,2026-01-12T10:40:08.723-08:00,True,False,...,19961,user,19963,user,14506,activity_type,15323307,prospect,,
2,1890462,time_entry,Phone call,0.5,0.5,,2026-01-06T07:37:32.390-08:00,2026-01-06T08:07:32.390-08:00,True,False,...,19961,user,19963,user,14507,activity_type,7074791,prospect,,
3,1890460,time_entry,Document Review,2.0,2.0,,2026-01-05T07:37:32.390-08:00,2026-01-05T09:37:32.390-08:00,True,False,...,19961,user,19963,user,14499,activity_type,7074791,prospect,,
4,1888410,time_entry,L&L Stage 4 - Delivery,0.0,0.0,,2026-01-09T13:50:10.771-08:00,2026-01-09T13:50:10.771-08:00,True,True,...,19961,user,19963,user,14504,activity_type,13055584,prospect,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588 entries, 0 to 587
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  588 non-null    object 
 1   type                588 non-null    object 
 2   description         525 non-null    object 
 3   duration            588 non-null    float64
 4   duration_rounded    588 non-null    float64
 5   running_since       0 non-null      object 
 6   started_at          574 non-null    object 
 7   completed_at        573 non-null    object 
 8   is_billable         588 non-null    bool   
 9   is_flat_fee         588 non-null    bool   
 10  rate_cents          588 non-null    int64  
 11  created_at          588 non-null    object 
 12  updated_at          588 non-null    object 
 13  staff_id            588 non-null    object 
 14  staff_type          588 non-null    object 
 15  created_by_id       588 non-null    object 
 16  created_

None

## Load the formatted DataFrame into a BigQuery table

In [14]:
import pandas_gbq

project_id = 'www-prod-389819'
table_id = 'ohlaw.staging_lm_time_entries'

# if_exists='replace' overwrites the destination table, so refuse to proceed
# with an empty frame rather than wiping the existing data.
if df.empty:
    raise ValueError(f"Refusing to replace {table_id}: the DataFrame is empty.")

# Load the DataFrame to BigQuery, replacing existing data
pandas_gbq.to_gbq(
    df,
    table_id,
    project_id=project_id,
    if_exists='replace'
)

print(f"DataFrame loaded to {table_id} in project {project_id} successfully, replacing existing data.")

100%|██████████| 1/1 [00:00<00:00, 6842.26it/s]

DataFrame loaded to ohlaw.staging_lm_time_entries in project www-prod-389819 successfully, replacing existing data.



