# Download and Format Lawmatics Contacts

This notebook downloads all contacts from Lawmatics, flattens them into a tabular format, applies data types, then loads them into a BigQuery staging table for further processing.

## Install Requirements

In [1]:
!pip install google.cloud google-cloud-secret-manager

Collecting google.cloud
  Downloading google_cloud-0.34.0-py2.py3-none-any.whl.metadata (2.7 kB)
Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: google.cloud
Successfully installed google.cloud-0.34.0


## Retrieve Lawmatics API Token

In [2]:
#!/usr/bin/env python3
from google.cloud import secretmanager
from hashlib import sha1



def access_secret(project_id, secret_id, version_id="latest"):
    """
    Fetch one secret version from Google Cloud Secret Manager as text.

    Args:
        project_id: Identifier of the Google Cloud project that owns the secret
        secret_id: Name of the secret inside that project
        version_id: Which version to read (defaults to "latest")

    Returns:
        The secret payload decoded as a UTF-8 string
    """
    # Fully qualified resource path of the requested secret version
    resource_name = "projects/{}/secrets/{}/versions/{}".format(
        project_id, secret_id, version_id
    )

    # A fresh client is created for every call
    secret_version = secretmanager.SecretManagerServiceClient().access_secret_version(
        request={"name": resource_name}
    )

    # The payload arrives as bytes; hand back text
    return secret_version.payload.data.decode("UTF-8")

# Retrieve the Lawmatics API token. Later cells read `secret_value` to build the
# Authorization header, so a failure here must stop the run instead of being swallowed.
if __name__ == "__main__":
    project_id = '566451752897'
    secret_id = 'mlo-lawmaticsApiKey'

    try:
        secret_value = access_secret(project_id, secret_id)
    except Exception as e:
        print(f"Error accessing secret: {e}")
        # Re-raise: continuing would leave `secret_value` undefined (fresh kernel)
        # or stale (warm kernel) for the API request cells below.
        raise

    # Print only a digest so the token itself never appears in the notebook output.
    secret_value_sha1 = sha1(secret_value.encode())
    secret_value_digest = secret_value_sha1.hexdigest()
    print(f"Secret retrieved successfully: (sha1) {secret_value_digest}")

Secret retrieved successfully: (sha1) c32003bafa4577ecdbce1036cc7cedc098ae378a


## Get API JSON Data
Retrieve paginated data from a REST API using a non-expiring OAuth2 access token and transform it into a tabular format for analysis.

### Set up API request

#### Subtask:
Define the API endpoint, headers (including the access token), and any initial parameters for the first page of data.


In [3]:
# Contacts collection of the Lawmatics REST API
api_endpoint = 'https://api.lawmatics.com/v1/contacts'

# Bearer-token authentication; `secret_value` comes from the secret-retrieval cell
headers = dict(Authorization=f'Bearer {secret_value}')

# Query string for the first request; the pagination cell overwrites 'page' on each iteration.
# per_page can be set to any other value the API documentation allows.
params = dict(fields='all', page=1, per_page=100)

### Implement pagination logic

#### Subtask:
Handle the API's pagination, making requests for subsequent pages until all data is retrieved.


In [4]:
import requests
import time

# Download every page of contacts into `all_data`.
#
# The loop stops normally when the API returns an empty 'data' list. Any other way of
# stopping (retries exhausted, unexpected response shape) raises, because a later cell
# replaces the BigQuery table with whatever is in `all_data`; a partial download must
# not get that far.
all_data = []
page_num = 1
max_retries = 3
retry_delay = 5  # seconds to wait between retry attempts
request_timeout = 30  # seconds; without a timeout requests.get can block indefinitely

# api_endpoint, headers and params are defined in the "Set up API request" cell.

# The API documentation does not call for a Cookie header, so make sure none is sent.
headers.pop('Cookie', None)

while True:
    params['page'] = page_num

    # Try the current page up to max_retries times.
    for attempt in range(1, max_retries + 1):
        try:
            # Redirects are followed by default (equivalent of curl --location).
            response = requests.get(
                api_endpoint, headers=headers, params=params, timeout=request_timeout
            )
            response.raise_for_status()  # Raise for 4xx / 5xx status codes
            page_data = response.json()
            break  # Success: leave the retry loop
        except requests.exceptions.RequestException as e:
            print(f"Error during API request (Attempt {attempt}/{max_retries}): {e}")
            if attempt < max_retries:
                time.sleep(retry_delay)
    else:
        # The for-loop finished without a successful request.
        print("Max retries reached. Exiting.")
        raise RuntimeError(
            f"Giving up on page {page_num} after {max_retries} attempts; "
            f"{len(all_data)} records were retrieved before the failure."
        )

    # The pages are expected to look like {'data': [...], ...}. Previously an unexpected
    # shape only left the retry loop, so the outer loop requested the same page forever.
    if not (isinstance(page_data, dict) and isinstance(page_data.get('data'), list)):
        print("API response does not contain expected 'data' key with a list.")
        print(f"Response content: {page_data}")
        raise RuntimeError(f"Unexpected API response structure on page {page_num}.")

    current_page_items = page_data['data']
    if not current_page_items:
        print("Empty data list received, assuming end of data.")
        break  # No more data

    all_data.extend(current_page_items)
    page_num += 1
    time.sleep(1)  # Small delay to avoid overwhelming the API

print(f"Retrieved {len(all_data)} total records.")

Empty data list received, assuming end of data.
Retrieved 8982 total records.


## Process and convert data to tabular format

Convert the records retrieved from the API (parsed from its JSON responses) into a tabular format, namely a pandas DataFrame.


In [5]:
import pandas as pd

# One row per API record; nested dicts stay in single columns until the extraction cell.
df = pd.DataFrame(all_data)
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() rendered a stray "None" below the summary.
df.info()

Unnamed: 0,id,type,attributes,relationships
0,24819750,contact,"{'first_name': 'Steve', 'last_name': 'Johnson'...","{'prospects': {'data': []}, 'tags': {'data': [..."
1,24817857,contact,"{'first_name': 'Stephen', 'last_name': 'Johnso...","{'prospects': {'data': [{'id': '15338944', 'ty..."
2,24805411,contact,"{'first_name': 'Terie', 'last_name': 'Russell'...","{'prospects': {'data': [{'id': '15335337', 'ty..."
3,24805399,contact,"{'first_name': 'Roxanne', 'last_name': 'Cunnin...","{'prospects': {'data': [{'id': '15335323', 'ty..."
4,24802896,contact,"{'first_name': 'Charles', 'last_name': 'Patti'...","{'prospects': {'data': [{'id': '15332766', 'ty..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8982 entries, 0 to 8981
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             8982 non-null   object
 1   type           8982 non-null   object
 2   attributes     8982 non-null   object
 3   relationships  8982 non-null   object
dtypes: object(4)
memory usage: 280.8+ KB


None

### Extract nested data from columns

Extract relevant information from the 'attributes' and 'relationships' columns and create new columns in the DataFrame.

In [6]:
import pandas as pd

# Spread each record's 'attributes' value into its own columns, when that column is present
if 'attributes' not in df.columns:
    attributes_df = pd.DataFrame()  # Nothing to expand
else:
    attributes_df = df['attributes'].apply(pd.Series)
    # The nested column is redundant once it has been expanded
    df = df.drop(columns=['attributes'])


# Flattens one 'relationships' value into a dictionary of plain columns (one dict per row)
def extract_relationship_data_generic(relationship):
    """
    Flatten a record's relationships mapping into scalar columns.

    For every key whose value is a dict containing 'data':
      * 'data' is a dict  -> "<key>_id" and "<key>_type" take its 'id' and 'type'.
      * 'data' is a list  -> "<key>_ids" and "<key>_types" hold the comma-joined
        'id' / 'type' values of the dict items, or None when there are none.
      * anything else (e.g. None) adds no columns for that key.

    Args:
        relationship: The relationships value of one record; non-dict input is ignored.

    Returns:
        A dict mapping generated column names to extracted values (empty for non-dict input).
    """
    def _join(values):
        # None when nothing was collected; otherwise drop missing/empty entries and
        # coerce to str so numeric ids do not make str.join raise TypeError.
        if not values:
            return None
        return ','.join(str(v) for v in values if v is not None and v != '')

    extracted = {}
    if not isinstance(relationship, dict):
        return extracted

    for key, value in relationship.items():
        if not (isinstance(value, dict) and 'data' in value):
            continue
        data_item = value['data']
        if isinstance(data_item, dict):
            # Single related item: keep its id and type as-is
            extracted[f"{key}_id"] = data_item.get('id')
            extracted[f"{key}_type"] = data_item.get('type')
        elif isinstance(data_item, list):
            # Several related items: one comma-separated string per field
            related = [item for item in data_item if isinstance(item, dict)]
            extracted[f"{key}_ids"] = _join([item.get('id') for item in related])
            extracted[f"{key}_types"] = _join([item.get('type') for item in related])
    return extracted


# Flatten the 'relationships' column into its own frame, when that column is present
if 'relationships' not in df.columns:
    relationships_df_expanded = pd.DataFrame()  # Nothing to expand
else:
    # One dictionary of extracted values per row
    relationships_extracted_series = df['relationships'].apply(extract_relationship_data_generic)

    # Columns are created from whatever keys appear in the row dictionaries
    relationships_df_expanded = pd.DataFrame(relationships_extracted_series.tolist())

    # The nested column is redundant once it has been flattened
    df = df.drop(columns=['relationships'])


# Turn the 'custom_fields' column into one column per custom field, when it is present.
# NOTE(review): this looks at `df` only. 'attributes' was expanded into `attributes_df`
# above, so a 'custom_fields' key nested under attributes would not be seen here —
# confirm where the API places custom fields.
if 'custom_fields' not in df.columns:
    custom_fields_df_expanded = pd.DataFrame()  # Nothing to expand
else:
    def extract_custom_fields(custom_fields_list):
        """Map each field's 'name' to its 'formatted_value'; non-list input gives {}."""
        if not isinstance(custom_fields_list, list):
            return {}
        # Entries that are not dicts, or lack either key, are skipped
        return {
            field['name']: field['formatted_value']
            for field in custom_fields_list
            if isinstance(field, dict) and 'name' in field and 'formatted_value' in field
        }

    custom_fields_extracted_series = df['custom_fields'].apply(extract_custom_fields)
    custom_fields_df_expanded = pd.DataFrame(custom_fields_extracted_series.tolist())

    # The nested column is redundant once it has been expanded
    df = df.drop(columns=['custom_fields'])


# Final concatenation of all parts.
# Every part gets a fresh 0..n-1 index so rows line up positionally when joined side by side.
# The base frame (remaining top-level columns such as 'id' and 'type') always comes first;
# the expanded frames are appended only when they actually contain data.
all_parts = [df.reset_index(drop=True)]
all_parts += [
    expanded.reset_index(drop=True)
    for expanded in (attributes_df, relationships_df_expanded, custom_fields_df_expanded)
    if not expanded.empty
]

# Concatenate along columns; join='outer' keeps every row index and fills gaps with NaN.
# NOTE(review): concat keeps duplicate column names if two parts share one — confirm
# none collide before the frame is loaded into BigQuery.
df = pd.concat(all_parts, axis=1, join='outer')

# Show the first rows and the column summary to verify the result.
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() rendered a stray "None" below the summary.
df.info()

Unnamed: 0,id,type,first_name,last_name,email,email_address,phone,phone_number,address,birthdate,...,folders_ids,folders_types,created_by_id,created_by_type,custom_contact_type_id,custom_contact_type_type,client_id,client_type,company_id,company_type
0,24819750,contact,Steve,Johnson,sbjohnson001@gmail.com,sbjohnson001@gmail.com,970-672-6907,970-672-6907,,,...,,,23805,user,25081.0,custom_contact_type,,,,
1,24817857,contact,Stephen,Johnson,sbjohnson001@gmail.com,sbjohnson001@gmail.com,(970) 672-6907,(970) 672-6907,,,...,,,17204,user,,,,,,
2,24805411,contact,Terie,Russell,terie54@msn.com,terie54@msn.com,(970) 381-4527,(970) 381-4527,,,...,,,17204,user,,,,,,
3,24805399,contact,Roxanne,Cunningham,hflams47@gmail.com,hflams47@gmail.com,(970) 552-9003,(970) 552-9003,,,...,,,17204,user,,,,,,
4,24802896,contact,Charles,Patti,charlespatti007@gmail.com,charlespatti007@gmail.com,(720) 833-7700,(720) 833-7700,,,...,,,17204,user,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8982 entries, 0 to 8981
Data columns (total 62 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           8982 non-null   object 
 1   type                         8982 non-null   object 
 2   first_name                   8982 non-null   object 
 3   last_name                    6748 non-null   object 
 4   email                        4896 non-null   object 
 5   email_address                4896 non-null   object 
 6   phone                        4594 non-null   object 
 7   phone_number                 4594 non-null   object 
 8   address                      2825 non-null   object 
 9   birthdate                    1069 non-null   object 
 10  name_prefix                  7 non-null      object 
 11  middle_name                  1470 non-null   object 
 12  name_suffix                  23 non-null     object 
 13  informal_name     

None

## Load the formatted data frame into a BQ table

In [7]:
import pandas_gbq

# Destination of the load: BigQuery project and dataset.table
project_id = 'www-prod-389819'
table_id = 'mlo_kpi.staging_lm_contacts'

# if_exists='replace' discards the table's current contents, so never overwrite it
# with an empty frame (for example after a failed or skipped download).
if df.empty:
    raise ValueError(f"Refusing to replace {table_id} in project {project_id}: the DataFrame is empty.")

# Load the DataFrame to BigQuery, replacing existing data
pandas_gbq.to_gbq(
    df,
    table_id,
    project_id=project_id,
    if_exists='replace'
)

print(f"DataFrame loaded to {table_id} in project {project_id} successfully, replacing existing data.")

100%|██████████| 1/1 [00:00<00:00, 6250.83it/s]

DataFrame loaded to mlo_kpi.staging_lm_contacts in project www-prod-389819 successfully, replacing existing data.



