# Executive Order Embeddings

In [None]:
%run notebooks/Setup.ipynb

import pandas
import numpy
import re
import json
from sentence_transformers import SentenceTransformer
import os
import pandas
from concurrent.futures import ThreadPoolExecutor
import queue
from sklearn.cluster import KMeans

In [None]:
# CPU variant, kept disabled inside a bare string literal:
"""
models = [
    [0, SentenceTransformer("Salesforce/SFR-Embedding-2_R")]
]
"""

# GPU variant: one model instance per CUDA device, each stored as a
# [device index, model] pair.
num_gpus = 1
models = [
    [
        gpu_index,
        SentenceTransformer("Salesforce/SFR-Embedding-2_R", device=f'cuda:{gpu_index}'),
    ]
    for gpu_index in range(num_gpus)
]

## Embed EOs

In [None]:
# Collect the executive-order files to embed. os.listdir returns entries in
# arbitrary order, so sort them to keep batch membership stable across runs.
eo_dir = 'data/executive_orders/raw/'
eo_paths = sorted(os.listdir(eo_dir))

def batched(list_in, batch_size):
    """Split a sequence into consecutive slices of at most ``batch_size`` items.

    Args:
        list_in: Any sliceable sequence with a length (e.g. a list).
        batch_size: Maximum number of items per batch; must be >= 1.

    Returns:
        A list of slices of ``list_in`` in their original order. Every batch
        holds ``batch_size`` items except possibly the last one; an empty
        input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A non-positive size used to either divide by zero or quietly produce
    # empty/garbage batches, dropping the data; fail loudly instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    return [list_in[start:start + batch_size] for start in range(0, len(list_in), batch_size)]

# Group the executive-order file names into batches of up to 10
eo_paths_batches = batched(list_in=eo_paths, batch_size=10)

In [None]:
# how to embed
# Make sure the output directory exists. exist_ok=True replaces the separate
# existence check, which could race with another process creating the path.
os.makedirs('data/executive_orders/embeddings', exist_ok=True)

def embed_batch(batch_i, batch, model):
    """Embed one batch of executive orders and write the vectors to a CSV.

    Each file name in ``batch`` is read from ``eo_dir`` as JSON, the entries
    of its ``'content'`` field are joined with newlines into one text, and
    all texts are encoded with a single ``model.encode`` call. The result is
    written to ``data/executive_orders/embeddings/batch_{batch_i}.csv`` with
    a ``file`` column inserted ahead of the embedding columns (the DataFrame
    index is written as well).

    Args:
        batch_i: Batch index; used only to name the output file.
        batch: File names relative to ``eo_dir``.
        model: Object exposing ``encode(list_of_texts)``; its return value is
            passed straight to ``pandas.DataFrame``.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    eos = []
    for path in batch:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, which can mis-decode non-ASCII text.
        with open(os.path.join(eo_dir, path), 'r', encoding='utf-8') as f:
            eo_doc = json.load(f)
        # NOTE(review): assumes 'content' is a list of strings - confirm;
        # a plain string here would be joined character by character.
        eos.append('\n'.join(eo_doc['content']))

    # str.join always returns str, so the texts need no further conversion.
    embeddings = model.encode(eos)

    df = pandas.DataFrame(embeddings)
    df.insert(0, 'file', batch)
    df.to_csv(f'data/executive_orders/embeddings/batch_{batch_i}.csv')

In [None]:
# perform the embeddings
# Seed a thread-safe queue with every loaded model so that worker threads
# can check one out at a time.
model_pool = queue.Queue()
for model in models:
    # The queue is unbounded, so a non-blocking put can never fail.
    model_pool.put_nowait(model)

def process_batch(i, batch):
    """Embed batch ``i`` using a model borrowed from ``model_pool``.

    Waits until a ``[index, model]`` entry is available in the pool, runs
    ``embed_batch`` with that model, prints a progress line, and always
    hands the entry back to the pool, even when embedding raises.
    """
    # Queue.get blocks here until some other worker returns a model
    model = model_pool.get(block=True)
    try:
        _, encoder = model
        embed_batch(i, batch, encoder)
        print(f"Embedded batch {i} with model {model[0]}")
    finally:
        model_pool.put(model)

# Run every batch through the pool, one worker thread per model. The futures
# are kept because an exception raised inside a worker is only stored on its
# future; dropping the future would hide the failure entirely.
with ThreadPoolExecutor(max_workers=len(models)) as executor:
    batch_futures = [
        (i, executor.submit(process_batch, i, batch))
        for i, batch in enumerate(eo_paths_batches)
    ]

# Leaving the with-block waits for all workers, so every future is done here.
failed_batches = []
for i, future in batch_futures:
    error = future.exception()
    if error is not None:
        print(f"Embedding batch {i} failed: {error!r}")
        failed_batches.append((i, error))

# Stop the notebook rather than continue with missing embedding files.
if failed_batches:
    first_error = failed_batches[0][1]
    failed_ids = [failed_i for failed_i, _ in failed_batches]
    raise RuntimeError(
        f"{len(failed_batches)} batch(es) failed to embed: {failed_ids}"
    ) from first_error

## Clustering

In [19]:
# Sort the listing: os.listdir order is arbitrary, and the order in which the
# batch files are loaded determines the row order of the combined data, which
# in turn affects how reproducible the seeded clustering is.
embeddings_dir = 'data/executive_orders/embeddings/'
embeddings_paths = sorted(os.listdir(embeddings_dir))

def load_csv(file_name):
    """Read one CSV from ``embeddings_dir``, using its first column as the index."""
    csv_path = embeddings_dir + file_name
    frame = pandas.read_csv(csv_path, index_col=0)
    return frame

# Read all batch files on a thread pool; map() yields results in input order.
with ThreadPoolExecutor() as executor:
    embeddings_dfs = [*executor.map(load_csv, embeddings_paths)]

# Stack the batches into a single frame with a fresh 0..n-1 index.
embeddings_df = pandas.concat(embeddings_dfs, ignore_index=True)
# Everything after the first column becomes the numeric matrix.
embeddings = embeddings_df.iloc[:, 1:].to_numpy()

In [29]:
# Partition the embedding vectors into k clusters with a fixed seed
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(embeddings)
embeddings_df['cluster'] = clusters

# Save the file -> cluster mapping, ordered by cluster and then by file name
output_path = 'data/executive_orders/clusters.csv'
(
    embeddings_df[["file", "cluster"]]
    .sort_values(by=["cluster", "file"])
    .to_csv(output_path, index=False)
)