# Executive Order Cleanup

In [1]:
import json
import os
import pandas
from concurrent.futures import ThreadPoolExecutor, as_completed
import queue
from typing import List
from langchain_ollama import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain_core.language_models.chat_models import BaseChatModel
from pydantic import BaseModel, Field

## Masking

Given how prevalent presidential names are in language, LLMs will have learned specific sentiments about them. To keep names from biasing the embedding and sentiment analysis of executive orders, build a pipeline to mask out names from the last few signature lines of each EO.

In [None]:
def make_model(url):
    """Create a ChatOllama client for the Ollama server at ``url``.

    Every client uses the same settings (the "qwen2.5" model, temperature
    0.0, ``num_predict`` 9000, ``retries`` 3); only the server address varies.
    """
    settings = {
        'base_url': url,
        'model': "qwen2.5",
        'temperature': 0.0,
        'num_predict': 9000,
        'retries': 3,
    }
    return ChatOllama(**settings)

# Pool of chat models, one per Ollama endpoint. Workers take a model with
# get() and hand it back with put() when they are done with it.
model_pool = queue.Queue()
for ollama_url in (
    "http://tr-pro.l.co:11434",
    "http://tr-pro.l.co:11435",
    "http://localhost:11434",
):
    model_pool.put(make_model(ollama_url))

In [None]:
# Schema for the structured output requested from the model in make_pipeline.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- a pydantic docstring presumably ends up in the schema description
# handed to the model, which would change the request; confirm before adding one.
class MaskedResponse(BaseModel):
    # the masked strings; callers compare the length against the number of input lines
    lines: List[str] = Field(description="Masked lines of input.")

def make_pipeline(model: BaseChatModel):
    """Build the name-masking chain for ``model``.

    The chain is a prompt piped into ``model`` with structured output.
    It must be invoked with a dict holding two template variables:

    - ``name``: the name to mask.
    - ``input``: the lines to mask (callers pass a JSON-encoded list of strings).

    Because ``include_raw=True`` is requested, the result is a dict that
    carries both the raw model message (``'raw'``) and the parsed
    ``MaskedResponse`` (``'parsed'``); ``process_response`` reads both.
    """
    prompt = ChatPromptTemplate.from_template(
        '\n '.join([
            'Mask all versions, full and abbreviated, of the name "{name}" with "[NAME]" from the inputs. For example: ',
            # the opening quote was missing in this example, unlike every other one
            '- ["{name} THE WHITE HOUSE, September 30, 1993"] should be masked as ["[NAME] THE WHITE HOUSE, September 30, 1993"]',
            '- ["{name}, Assistant-Adjutant General."] should be masked as ["[NAME], Assistant-Adjutant General."]',
            '- ["{name}", "The White House,July 30, 1932."] should be masked as ["[NAME]", "The White House,July 30, 1932."]',
            '- ["Approved:", "{name}."] should be masked as ["Approved:", "[NAME]."]',
            '- ["{name}."] should be masked as ["[NAME]."]',
            # the trailing comma matters: without it Python glues this string
            # to the next one and "Inputs:" loses its own line
            'Ensure every input string has a corresponding string in the response.',
            'Inputs: {input}',
        ])
    )

    return prompt | model.with_structured_output(MaskedResponse, include_raw=True)

In [4]:
def process_response(result):
    """Flatten one pipeline result into a plain dict suitable for a log row.

    ``result`` is the dict produced by the masking pipeline: ``'parsed'``
    holds the structured response (or a falsy value when there is none) and
    ``'raw'`` holds the model message with its ``response_metadata`` and
    ``usage_metadata`` mappings.

    Returns a dict with the masked lines (``None`` when nothing was parsed)
    followed by the timing and token-count fields copied from the metadata.
    """
    parsed = result['parsed']
    row = {'masked': parsed.lines if parsed else None}

    response_meta = result['raw'].response_metadata
    for field in (
        'model',
        'done',
        'total_duration',
        'load_duration',
        'prompt_eval_duration',
        'eval_duration',
    ):
        row[field] = response_meta[field]

    usage_meta = result['raw'].usage_metadata
    for field in ('input_tokens', 'output_tokens'):
        row[field] = usage_meta[field]

    return row

In [None]:
# try it out
# Borrow one model from the pool and run a handful of hand-picked signature
# blocks through the masking pipeline as a smoke test.
model = model_pool.get()
try:
    masked = make_pipeline(model).batch([
        {
            'name': 'William J. Clinton',
            'input': json.dumps([
                "Sec. 4. This order shall take effect immediately.",
                "William J. ClintonTHE WHITE HOUSE,September 30, 1993."
            ])
        },
        {
            'name': 'William J. Clinton',
            'input': json.dumps([
                "Sec. 4. This order shall take effect immediately.",
                "William Clinton\nTHE WHITE HOUSE\n,September 30, 1993."
            ])
        },
        {
            'name': 'George Bush',
            'input': json.dumps([
                "GEORGE BUSH\nThe White House,\nDecember 3, 1992."
            ])
        },
        {
            'name': 'Ulysses S. Grant',
            'input': json.dumps([
                "E. D. TOWNSEND, Assistant-Adjutant General."
            ])
        },
        {
            'name': 'Ulysses S. Grant',
            'input': json.dumps([
                "Approved:",
                "U. S. Grant."
            ])
        },
        {
            'name': 'Donald J. Trump',
            'input': json.dumps([
                "DONALD J. TRUMP",
                "The White House,",
                "February 19, 2025."
            ])
        },
        {
            'name': 'Franklin D. Roosevelt',
            'input': json.dumps([
                "FDR",
                "The White House,April 7, 1934."
            ])
        },
        {
            'name': 'George Washington',
            'input': json.dumps([
                "By the President",
                "George Washington",
                "Thomas Jefferson[APP Note: Secretary of State.]"
            ])
        },
        {
            'name': 'Herbert Hoover',
            'input': json.dumps([
                "HERBERT HOOVER",
                "The White House,July 30, 1932."
            ])
        },
    ])
finally:
    # always hand the model back, even if the batch call fails; otherwise the
    # pool stays one model short for every later cell
    model_pool.put(model)

pandas.DataFrame(map(process_response, masked))

Unnamed: 0,masked,model,done,total_duration,load_duration,prompt_eval_duration,eval_duration,input_tokens,output_tokens
0,[Sec. 4. This order shall take effect immediat...,qwen2.5,True,939400561,33521893,138000000,749000000,382,50
1,[Sec. 4. This order shall take effect immediat...,qwen2.5,True,1644145153,41287426,43000000,660000000,384,50
2,"[[NAME]\nThe White House,\nDecember 3, 1992.]",qwen2.5,True,726999866,33183590,44000000,636000000,368,38
3,"[[NAME], Assistant-Adjutant General.]",qwen2.5,True,1301686880,38818643,45000000,469000000,365,31
4,"[Approved:, [NAME].]",qwen2.5,True,1152407238,38498454,29000000,395000000,360,26
5,"[[NAME], The White House,, February 19, 2025.]",qwen2.5,True,799421951,37118831,118000000,608000000,375,41
6,"[[NAME], The White House,April 7, 1934.]",qwen2.5,True,786130652,33777478,129000000,595000000,376,40
7,"[By the President, [NAME], [NAME][APP Note: Se...",qwen2.5,True,1392878128,40218654,33000000,559000000,370,38
8,"[[NAME], The White House,July 30, 1932.]",qwen2.5,True,1682791436,42005772,30000000,500000000,372,40


## Cleanup

In [None]:
# directory the raw executive order JSON files are read from
source_dir = 'data/executive_orders/raw/'
# directory the cleaned (and masked) JSON files are written to, under the same file names
target_dir = 'data/executive_orders/cleaned/'
# CSV file that receives one result row per processed executive order
log_path = 'data/executive_orders/cleaned_log.csv'

def process_eo(eo_path):
    """Clean one raw executive order and mask the name in its signature lines.

    Reads ``eo_path`` (a file name relative to ``source_dir``), normalizes
    the byline, splits multi-line content strings into separate lines, asks
    a pooled model to mask the name in the trailing signature lines, and
    writes the result under the same file name in ``target_dir``.

    A model is taken from ``model_pool`` for the duration of the call and is
    always returned, even when processing fails.

    Returns a dict with the file ``'path'``, the model metadata produced by
    ``process_response`` when masking was attempted, and an ``'error'``
    message when masking failed or was rejected. In that case the file is
    still written, with its signature lines left unmasked.

    Raises whatever reading, parsing or writing the JSON file raises; only
    failures of the masking step itself are caught and logged.
    """
    # block until a model becomes available
    model = model_pool.get()

    result_row = {
        'path': eo_path
    }
    try:
        with open(os.path.join(source_dir, eo_path), encoding='utf-8') as source_file:
            eo = json.load(source_file)

        # clean the byline by replacing \u2010 with -
        eo['president_byline'] = eo['president_byline'].replace('\u2010', '-')

        # content is an array of strings; flat-map strings with new lines into separate strings
        content = [line for s in eo['content'] for line in s.splitlines()]
        eo['content'] = content

        # starting from the end, pick lines with fewer than 20 words, up to the last 5 lines.
        # Slicing with [-5:] also covers documents with 5 lines or fewer, whose
        # earliest line the previous range() bound never looked at.
        signature_lines = []
        for cur_line in reversed(content[-5:]):
            if len(cur_line.split()) >= 20:
                break
            signature_lines.append(cur_line)

        # mask if there are lines to mask
        if signature_lines:
            # try to mask, use unmasked if it fails
            try:
                # lines were collected bottom-up; restore document order
                signature_lines.reverse()

                # mask the signature lines. The prompt template needs both
                # 'name' and 'input'; without 'name' every call failed.
                # NOTE(review): assumes president_byline holds the signer's
                # name in the same form as the try-out examples -- confirm
                # against the raw data.
                masked_response = process_response(
                    make_pipeline(model).invoke({
                        'name': eo['president_byline'],
                        'input': json.dumps(signature_lines)
                    })
                )

                result_row = result_row | masked_response.copy()
                result_row.pop('masked', None)

                masked_lines = masked_response.get('masked') or []

                # skip if the LLM had an obviously wrong solution
                if len(masked_lines) == len(signature_lines):
                    # replace the signature lines with the masked lines
                    eo['content'] = content[:-len(signature_lines)] + masked_lines
                else:
                    # record the rejection instead of dropping it silently
                    result_row['error'] = (
                        f'expected {len(signature_lines)} masked lines, '
                        f'got {len(masked_lines)}; signature left unmasked'
                    )
            except Exception as e:
                result_row['error'] = str(e)

        # write the cleaned eo to the target directory
        with open(os.path.join(target_dir, eo_path), 'w', encoding='utf-8') as target_file:
            json.dump(eo, target_file, indent=4)

    finally:
        model_pool.put(model)

    return result_row

In [None]:
# load all the raw EOs and do some cleanup
eo_paths = sorted(os.listdir(source_dir))

# make sure the output directory exists before any worker tries to write to it
os.makedirs(target_dir, exist_ok=True)

results = []
# one worker per pooled model; queue.Queue has no __len__, so ask for qsize()
with ThreadPoolExecutor(max_workers=model_pool.qsize()) as executor:
    # remember which path each future belongs to so failures can be logged
    futures = {
        executor.submit(process_eo, path): path for path in eo_paths
    }

    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception as e:
            # a single unreadable file should not abort the run and lose the
            # log of everything else; record it and carry on
            results.append({'path': futures[future], 'error': str(e)})

pandas.DataFrame(results).to_csv(log_path, index=False)