# Executive Order Tagging

Use the document clusters to derive descriptive tags and a title for each cluster.

In [7]:
import json
import pandas
import os
from typing import List
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field
from concurrent.futures import ThreadPoolExecutor, as_completed

## Combine Datasets

Combine some datasets so we can understand which presidents and parties are in each cluster.

In [2]:
# Load the three input tables.
clusters = pandas.read_csv('data/executive_orders/clusters.csv')
sentiment = pandas.read_csv('data/executive_orders/sentiment.csv')
presidents = pandas.read_csv('data/executive_orders/presidents.csv')

# Derive the join key: the last path component, cut at its first dot.
clusters['name'] = clusters['path'].apply(
    lambda path: path.rpartition('/')[2].partition('.')[0]
)

# Join clusters to sentiment on `name`, then join the result to the
# presidents table on `president`.
combined = (
    clusters
    .merge(sentiment, on='name')
    .merge(presidents, on='president')
)

# Persist the joined table so later cells can reload it.
combined.to_csv('data/executive_orders/combined.csv', index=False)

In [3]:
combined = pandas.read_csv('data/executive_orders/combined.csv')

# One row per cluster; `president` and `party` each become a list of the
# distinct values seen in that cluster (order comes from a set, so it is
# not guaranteed).
grouped = (
    combined
    .groupby('cluster')
    .agg({
        column: lambda values: list(set(values))
        for column in ('president', 'party')
    })
    .reset_index()
)
grouped

Unnamed: 0,cluster,president,party
0,0,"[Benjamin Harrison, John Tyler, Franklin D. Ro...","[Whig, Democratic, No Party, Republican]"
1,1,"[Ronald Reagan, Harry S Truman, Donald J. Trum...","[Democratic, Republican]"
2,2,"[Theodore Roosevelt, Calvin Coolidge, Woodrow ...","[Democratic, Republican]"
3,3,"[Harry S Truman, Woodrow Wilson, Gerald R. For...","[Democratic, Republican]"
4,4,"[Theodore Roosevelt, Calvin Coolidge, Harry S ...","[Democratic, Republican]"
5,5,"[Ronald Reagan, Harry S Truman, Donald J. Trum...","[Democratic, Republican]"
6,6,"[Harry S Truman, Donald J. Trump (1st Term), J...","[Democratic, Republican]"
7,7,"[Theodore Roosevelt, Calvin Coolidge, Harry S ...","[Democratic, Republican]"
8,8,"[Ronald Reagan, Donald J. Trump (1st Term), Ly...","[Democratic, Republican]"
9,9,"[Herbert Hoover, Franklin D. Roosevelt]","[Democratic, Republican]"


## Sampling

Sample documents from each cluster and have the model analyze them, as a first step toward describing the cluster.

In [9]:
# Chat model shared by every tagging request in this notebook.
gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    # Low temperature; presumably chosen to keep tags/titles stable across
    # samples -- TODO confirm.
    temperature=0.1,
    max_retries=3
)

# Directory where sample_all_clusters writes one JSON file per
# (cluster, sample size, sample index) and from which the results are
# read back later. The trailing slash matters: file names are appended
# with plain string concatenation.
tags_dir = 'data/executive_orders/tags/'

# Structured-output schema passed to `model.with_structured_output` in
# build_tags_pipeline. The fields follow the same order as the prompt's
# steps: analyze first, then assign tags, then give a title.
# NOTE(review): documented with comments rather than a class docstring
# because a docstring would presumably end up in the schema sent to the
# model -- confirm before converting.
class Tags(BaseModel):
    # Free-text analysis of the sampled documents.
    analysis: str = Field(description='Analysis of the texts.')
    # Tags the model assigns to the cluster.
    tags: List[str] = Field(description='Tags assigned to the cluster..')
    # Short title for the cluster.
    title: str = Field(description='Title of the cluster.')

def build_tags_pipeline(model, n_texts: int):
    """Build a prompt-to-structured-output chain for ``n_texts`` documents.

    The prompt consists of fixed instructions followed by one section per
    document. Section ``k`` (1-based heading) contains the template
    variable ``document_{k-1}``, so the chain must be invoked with the keys
    ``document_0`` through ``document_{n_texts - 1}``.

    Args:
        model: Chat model exposing ``with_structured_output``.
        n_texts: Number of document slots to include in the prompt.

    Returns:
        The prompt template piped into the model configured to return
        ``Tags`` with ``include_raw=True``.
    """
    instructions = [
        'Your task is to understand why the given documents were assigned to the same cluster of documents of the same type.',
        '- First analyze the documents for common themes, topics, and sentiment.',
        '- Then assign tags that would apply to all given documents and unique to this cluster based on your analysis.',
        '- Finally give the cluster a title based on the analysis and derived tags to distinguish it from the other clusters.',
        '- Do not use tags or titles that describe the document type, such as "Executive Order" or "Presidential Orders".',
        '- Do not use generic phrases like "Policies", "Regulations", "Orders", or "Actions" in titles, be specific about what is in common in the documents.'
    ]

    # Each document contributes a heading line and a placeholder line; the
    # doubled braces render as a literal `{document_<index>}` variable.
    document_sections = [
        line
        for index in range(n_texts)
        for line in (f'# Document {index + 1}', f'{{document_{index}}}')
    ]

    template = ChatPromptTemplate.from_template(
        '\n'.join(instructions + document_sections)
    )
    structured_model = model.with_structured_output(Tags, include_raw=True)
    return template | structured_model

def sample_in_cluster(cluster: int, n_texts: int) -> List[str]:
    """Return the text of up to ``n_texts`` randomly chosen documents.

    Rows of the module-level ``combined`` frame whose ``cluster`` equals
    ``cluster`` are sampled without replacement. Each sampled ``path`` is
    loaded from ``data/executive_orders/raw/`` as JSON.

    Args:
        cluster: Cluster id to sample from.
        n_texts: Maximum number of documents to return; capped at the
            cluster's size.

    Returns:
        One string per sampled document: the entries of the JSON
        ``"content"`` field joined with newlines.
    """
    population = combined[combined['cluster'] == cluster]['path']
    sampled = population.sample(min(n_texts, population.shape[0]))

    documents = []
    for p in sampled:
        # Read as UTF-8 explicitly: the default encoding of open() depends
        # on the platform locale and can mis-decode non-ASCII JSON text.
        with open('data/executive_orders/raw/' + p, encoding='utf-8') as f:
            eo = json.load(f)
        # The file is already closed here; only the parsed data is needed.
        documents.append('\n'.join(eo["content"]))

    return documents

def sample_all_clusters(sample_size: int, sample_i: int) -> List[str]:
    """Tag one random sample from every cluster and save each result.

    For each cluster id in the module-level ``grouped`` frame, this samples
    up to ``sample_size`` documents, runs them through the tagging
    pipeline, and writes the parsed result plus request and token-usage
    details to ``{tags_dir}{cluster}_{sample_size}_{sample_i}.json``.

    Args:
        sample_size: Maximum number of documents sampled per cluster.
        sample_i: Index of this sampling run; used only in the output file
            name so repeated runs do not overwrite each other.

    Returns:
        Summary lines: a ``Sample size: N`` header followed by one
        ``<cluster>: <title>`` line per cluster. A cluster whose response
        could not be parsed gets a line describing the failure instead, and
        no file is written for it.
    """
    results = [f'Sample size: {sample_size}']

    # Use the cluster ids that actually exist instead of assuming they are
    # exactly 0..n-1. tolist() yields plain Python ints, which json.dump
    # can serialize (numpy integers are rejected).
    for i in grouped['cluster'].tolist():
        sampled = sample_in_cluster(i, sample_size)
        response = build_tags_pipeline(gemini, len(sampled)).invoke(
            {f'document_{d_i}': d for d_i, d in enumerate(sampled)}
        )

        parsed = response['parsed']
        if parsed is None:
            # With include_raw=True a parsing failure is reported in the
            # response rather than raised. Record it and move on so one bad
            # response does not abort the remaining clusters.
            results.append(
                f"{i}: <no structured output: {response.get('parsing_error')}>"
            )
            continue

        result = parsed.model_dump() | {
            'request': {
                'cluster': i,
                'sample_size': sample_size
            },
            'stats': {
                'input_tokens': response['raw'].usage_metadata['input_tokens'],
                'output_tokens': response['raw'].usage_metadata['output_tokens'],
            }
        }

        with open(tags_dir + f'{i}_{sample_size}_{sample_i}.json', 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)

        # No quote reuse inside the f-string: that only parses on 3.12+.
        results.append(f'{i}: {parsed.title}')

    return results

In [None]:
sample_sizes = [10, 25, 50]
base_samples = 2

# Build (sample_size, run_index) pairs. Smaller samples get more runs:
# base_samples * 50 // sample_size gives 10, 4 and 2 runs for sizes
# 10, 25 and 50 respectively.
sample_params = [
    (sample_size, i)
    for sample_size in sample_sizes
    for i in range(base_samples * 50 // sample_size)
]

# Run the sampling jobs two at a time and print each job's summary as soon
# as it finishes (completion order, not submission order).
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [
        executor.submit(sample_all_clusters, size, run)
        for size, run in sample_params
    ]

    for future in as_completed(futures):
        print('\n'.join(future.result()))


## Finalize Cluster Titles

Go over all the samples and finalize a title for each cluster.

In [19]:
# Load every saved tagging result from tags_dir into one list.
samples = []

for tag_file in os.listdir(tags_dir):
    # Skip anything in the directory that is not a JSON result file.
    if not tag_file.endswith('.json'):
        continue
    with open(tags_dir + tag_file) as f:
        samples.append(json.load(f))

samples

[{'analysis': 'The documents involve the transfer, restoration, or reservation of land, often for specific purposes such as military use, lighthouse services, or public use. Many documents describe land in Hawaii, but also include locations such as Puerto Rico, California, Alaska, and the Philippines. The documents often include specific coordinates and detailed descriptions of the land parcels.',
  'tags': ['Land Transfer',
   'Land Restoration',
   'Land Reservation',
   'Geographic Locations',
   'Coordinate Descriptions'],
  'title': 'Land Management Actions',
  'request': {'cluster': 17, 'sample_size': 10},
  'stats': {'input_tokens': 6210, 'output_tokens': 84}},
 {'analysis': 'The documents primarily concern modifications, revocations, and withdrawals of public lands, often related to power sites, water reserves, oil shale, and other resource management. They specify land descriptions using meridian coordinates and reference various acts of Congress authorizing these actions. The