# Executive Order Analysis

Use the clusters to analyze executive orders to build tags and cluster titles.

In [2]:
import json
import pandas
import os
from typing import List, Dict
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field
from concurrent.futures import ThreadPoolExecutor, as_completed

In [6]:
# Directory where the per-sample cluster analyses are written and later read back.
analysis_dir = 'data/executive_orders/cluster_analysis/'

# Cluster assignments; the 'cluster' and 'path' columns are read by sample_in_cluster.
clusters_df = pandas.read_csv('data/executive_orders/clusters.csv')

# Shared Gemini chat model handed to every pipeline built in this notebook.
gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.2,
    max_retries=3,
    max_output_tokens=8000
)

# Structured-output schema for describing one cluster: a free-text analysis,
# a list of tags, and a title.
# The Field descriptions are runtime strings (presumably forwarded to the model
# as part of the output schema -- confirm), so they are documented here with
# `#` comments rather than edited.
class SampleAnalysis(BaseModel):
    analysis: str = Field(description='Analysis of the texts.')
    # NOTE(review): the description below ends with a doubled period.
    tags: List[str] = Field(description='Tags assigned to the cluster..')
    title: str = Field(description='Title of the cluster.')

# SampleAnalysis extended with the id of the cluster it describes; this is the
# element type of RefinedAnalysis.clusters.
class SampleAnalysisWithCluster(SampleAnalysis):
    cluster: int = Field(description='Cluster the analysis is for.')

# Structured-output schema for the cross-cluster refinement step: a list with
# one SampleAnalysisWithCluster entry per cluster.
class RefinedAnalysis(BaseModel):
    clusters: List[SampleAnalysisWithCluster] = Field(description='Refined analysis for each cluster in input order.')

In [None]:
def sample_to_markdown(sample):
    """
    Render an analysis record as a small markdown document.

    `sample` is anything indexable by the keys 'analysis', 'tags' and 'title'
    (a dict or a DataFrame row both work); 'tags' must be an iterable of
    strings. The result has three '##' sections in the order Analysis, Tags,
    Title, with the tags joined by ', '.
    """
    sections = (
        ('## Analysis', sample['analysis']),
        ('## Tags', ', '.join(sample['tags'])),
        ('## Title', sample['title']),
    )
    # Flatten (heading, body) pairs into one line sequence.
    return '\n'.join(line for section in sections for line in section)

## Sampling

Sample documents within a cluster to analyze and start describing it.

In [None]:
def build_tags_pipeline(model, n_texts: int):
    """
    Build the prompt | model chain that describes one sample of cluster documents.

    The prompt holds fixed instructions followed by `n_texts` document sections.
    Section k is headed '# Document k+1' and filled from the input variable
    `document_k` (0-based). The model is wrapped with structured output for
    SampleAnalysis with include_raw=True.
    """
    instructions = [
        'Your task is to understand why the given documents were assigned to the same cluster of documents of the same type.',
        '- First analyze the documents for common themes, topics, and sentiment.',
        '- Then assign tags that would apply to all given documents and unique to this cluster based on your analysis.',
        '- Finally give the cluster a title based on the analysis and derived tags to distinguish it from the other clusters.',
        '- Do not use tags or titles that describe the document type, such as "Executive Order" or "Presidential Orders".',
        '- Do not use generic phrases like "Policies", "Regulations", "Orders", or "Actions" in titles, be specific about what is in common in the documents.'
    ]
    # Headings are 1-based for the reader; template variables stay 0-based.
    document_sections = [
        line
        for idx in range(n_texts)
        for line in (f'# Document {idx + 1}', f'{{document_{idx}}}')
    ]

    template = ChatPromptTemplate.from_template('\n'.join(instructions + document_sections))
    structured_model = model.with_structured_output(SampleAnalysis, include_raw=True)
    return template | structured_model

def sample_in_cluster(cluster: int, n_texts: int):
    """
    Randomly sample documents from one cluster and return their text.

    Paths are taken from the 'path' column of the module-level `clusters_df`
    for rows whose 'cluster' equals `cluster`, and each is read as JSON from
    'data/executive_orders/raw/'.

    Args:
        cluster: Cluster id to sample from.
        n_texts: Maximum number of documents to draw; if the cluster has fewer
            rows, all of them are used.

    Returns:
        A list of strings, one per sampled document, each being the entries of
        the file's "content" field joined with newlines.
    """
    population = clusters_df.loc[clusters_df['cluster'] == cluster, 'path']
    sampled = population.sample(min(n_texts, len(population)))

    documents = []
    for path in sampled:
        # Decode explicitly as UTF-8 instead of relying on the platform default.
        with open('data/executive_orders/raw/' + path, encoding='utf-8') as f:
            eo = json.load(f)
        documents.append('\n'.join(eo["content"]))

    return documents

def sample_all_clusters(sample_size: int, sample_i: int):
    """
    Run one sampling pass over every cluster and persist each analysis.

    For each cluster, up to `sample_size` documents are sampled and sent
    through the tags pipeline. The parsed analysis plus request and token-usage
    metadata is written to
    `analysis_dir + '<cluster>_<sample_size>_<sample_i>.json'`.

    Args:
        sample_size: Number of documents to sample per cluster.
        sample_i: Index of this pass for the given sample size; only used to
            keep output file names distinct.

    Returns:
        A list of strings: a 'Sample size: N' header followed by one
        '<cluster>: <title>' line per cluster.
    """
    # Cluster ids come straight from clusters_df; the `grouped` frame that used
    # to bound this loop is not defined anywhere in this notebook.
    # NOTE(review): the previous bound implied ids 0..n-1 -- confirm that
    # clusters.csv holds no ids outside that range that should be skipped.
    # int() keeps the ids JSON-serialisable (pandas yields numpy integers).
    cluster_ids = sorted(int(c) for c in clusters_df['cluster'].unique())

    # Ensure the output directory exists before spending a model call.
    os.makedirs(analysis_dir, exist_ok=True)

    results = [f'Sample size: {sample_size}']
    for cluster_id in cluster_ids:
        sampled = sample_in_cluster(cluster_id, sample_size)
        pipeline = build_tags_pipeline(gemini, len(sampled))
        response = pipeline.invoke({f'document_{d_i}': d for d_i, d in enumerate(sampled)})

        parsed = response['parsed']
        usage = response['raw'].usage_metadata
        result = parsed.model_dump() | {
            'request': {
                'cluster': cluster_id,
                'sample_size': sample_size
            },
            'stats': {
                'input_tokens': usage['input_tokens'],
                'output_tokens': usage['output_tokens'],
            }
        }

        out_path = analysis_dir + f'{cluster_id}_{sample_size}_{sample_i}.json'
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)

        # Using a local avoids nesting the same quote type inside the f-string,
        # which only parses on Python 3.12+.
        results.append(f'{cluster_id}: {parsed.title}')

    return results

In [None]:
# Documents per sample, and the number of passes at the largest size.
sample_sizes = [10, 25, 50]
base_samples = 2

# One (sample_size, pass index) pair per pass. Each size gets
# base_samples * 50 // sample_size passes, so smaller samples are repeated
# more often.
sample_params = [
    (sample_size, run)
    for sample_size in sample_sizes
    for run in range(base_samples * 50 // sample_size)
]

# Run the passes two at a time and print each summary as it completes.
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(sample_all_clusters, size, run) for size, run in sample_params]

    for finished in as_completed(futures):
        print('\n'.join(finished.result()))


## Finalize Cluster Titles

Go over all the samples and finalize a title for each cluster.

In [4]:
# Load every per-sample analysis written by the sampling step into one table.
samples = []
# os.listdir returns names in arbitrary order; sorting keeps the sample order
# (and therefore the prompts built from it later) reproducible between runs.
for tag_file in sorted(name for name in os.listdir(analysis_dir) if name.endswith('.json')):
    with open(analysis_dir + tag_file, encoding='utf-8') as f:
        sample = json.load(f)

    # Keep only the fields the later steps use, flattening the nested request.
    samples.append({
        'analysis': sample['analysis'],
        'tags': sample['tags'],
        'title': sample['title'],
        'cluster': sample['request']['cluster'],
        'sample_size': sample['request']['sample_size']
    })

samples_df = pandas.DataFrame(samples)

### Collate samples within each cluster

In [None]:
def build_cluster_pipeline(model, n_texts: int):
    """
    Build the prompt | model chain that merges several sample analyses of one cluster.

    The prompt holds fixed instructions followed by `n_texts` sample sections.
    Section k is headed '# Sample k+1' and filled from the input variable
    `sample_k` (0-based). The model is wrapped with structured output for
    SampleAnalysis with include_raw=True.
    """
    instructions = [
        'Your task is build the final analysis, tags, and title for the document cluster given analysis across multiple samples of the cluster.',
        '- Construct a meta-analysis of the differences and similarities in the analysis, tags, and titles from the invdividual samples.',
        '- Based on the meta-analysis, refine the existing tags from the individual samples and coalesce them into a final set of tags.',
        '- Based on the meta-analysis and revised tags, refine the existing titles from the individual samples and come up with a final title.',
        '- Do not use generic phrases like "Policies", "Executive Orders", or "Executive Actions" in titles, be specific about what makes the cluster unique.'
    ]
    # Headings are 1-based for the reader; template variables stay 0-based.
    sample_sections = [
        line
        for idx in range(n_texts)
        for line in (f'# Sample {idx + 1}', f'{{sample_{idx}}}')
    ]

    template = ChatPromptTemplate.from_template('\n'.join(instructions + sample_sections))
    structured_model = model.with_structured_output(SampleAnalysis, include_raw=True)
    return template | structured_model

In [None]:
# Merge the sample analyses of each cluster into a single analysis per cluster.
coalesced_analysis = []
for cluster in samples_df['cluster'].unique():
    # Renumber rows from 0 so the row index doubles as the `sample_{i}`
    # placeholder number expected by the prompt template.
    in_cluster = samples_df['cluster'] == cluster
    cluster_samples = samples_df[in_cluster].reset_index(drop=True)

    pipeline = build_cluster_pipeline(gemini, cluster_samples.shape[0])
    prompt_inputs = {
        f'sample_{i}': sample_to_markdown(sample)
        for i, sample in cluster_samples.iterrows()
    }
    response = pipeline.invoke(prompt_inputs)

    # Parsed fields first, then the cluster id and token usage alongside them.
    coalesced = {
        **response['parsed'].model_dump(),
        'cluster': int(cluster),
        'stats': {
            'input_tokens': response['raw'].usage_metadata['input_tokens'],
            'output_tokens': response['raw'].usage_metadata['output_tokens'],
        },
    }
    print(f'{cluster}: {coalesced["title"]}')

    coalesced_analysis.append(coalesced)

29: Military and Veteran Personnel: Regulation Amendments and Management
1: Amendments to Executive Orders: Government Structure and Personnel
47: Executive Actions Modifying Government Regulations and Policies
39: International Trade, Immigration, and Canal Regulations
8: Presidential Directives on National and International Governance
6: Executive Actions: Directing National Policy and Governance
34: Customs Ports of Entry and Collection Districts: Modifications and Adjustments
26: Governmental Actions on National Issues
12: Adjustments to Existing Executive Orders and Delegations of Authority
28: National Industrial Recovery Act (NIRA) Implementation and Industry Regulation
27: Native American Land Reservations: Establishment, Modification, and Administration
19: Presidential Exemptions to Mandatory Retirement for Government Employees
10: Land Reservation and Withdrawal Management
0: Military, Reconstruction, and Territorial Governance Directives, Civil War Era to Early 20th Century

In [None]:
# Persist the per-cluster coalesced analyses; the refinement step below reads
# this file back.
with open('data/executive_orders/coalesced_clusters.json', 'w') as f:
    json.dump(coalesced_analysis, f, indent=4)

### Refine across clusters.

With a single collated analysis for each cluster, refine each one by considering all of the other clusters.

In [7]:
def build_refine_pipeline(model, n_texts: int):
    """
    Build the prompt | model chain that refines all cluster analyses together.

    The prompt holds fixed instructions followed by `n_texts` cluster sections.
    Section k is headed '# Cluster k' and filled from the input variable
    `cluster_k`, both 0-based. The model is wrapped with structured output for
    RefinedAnalysis with include_raw=True.
    """
    instructions = [
        'Your task is to consider the coalesced analysis across all clusters of documents to refine into a final analysis.',
        '- The given analysis only considers sampled document sets within each cluster, revise the analysis considering every other cluster to focus on what is unique.',
        '- Considering the revised analysis and tags of all other clusters, refine the tags for each cluster.',
        '- With the revised analysis and tags and considering all other clusters, refine the title to help characterize the current cluster and differentiate it from the rest.',
        '- Do not use generic phrases like "Policies", "Executive Orders", or "Executive Actions" in titles, be specific about what makes the cluster unique.',
    ]
    # Heading and template variable share the same 0-based number here.
    cluster_sections = [
        line
        for idx in range(n_texts)
        for line in (f'# Cluster {idx}', f'{{cluster_{idx}}}')
    ]

    template = ChatPromptTemplate.from_template('\n'.join(instructions + cluster_sections))
    structured_model = model.with_structured_output(RefinedAnalysis, include_raw=True)
    return template | structured_model


In [10]:
# Reload the coalesced analyses, ordered by cluster id.
with open('data/executive_orders/coalesced_clusters.json') as f:
    initial_coalesced = sorted(json.load(f), key=lambda entry: entry['cluster'])

# NOTE(review): build_refine_pipeline declares placeholders cluster_0 ..
# cluster_{n-1}, while the keys below use the stored cluster ids. The two only
# line up when the ids are exactly 0..n-1 -- confirm.
refine_pipeline = build_refine_pipeline(gemini, len(initial_coalesced))
refined_coalesced = refine_pipeline.invoke({
    f"cluster_{entry['cluster']}": sample_to_markdown(entry)
    for entry in initial_coalesced
})

In [11]:
# Persist the refined per-cluster analyses as a JSON list of plain dicts.
with open('data/executive_orders/refined_clusters.json', 'w') as f:
    refined_records = [entry.model_dump() for entry in refined_coalesced['parsed'].clusters]
    json.dump(refined_records, f, indent=4)