# Executive Order Embeddings

In [1]:
%run notebooks/Setup.ipynb

import pandas
import numpy
import re
import json
from sentence_transformers import SentenceTransformer
import os
import pandas
from concurrent.futures import ThreadPoolExecutor
import queue

In [None]:
# Number of CUDA devices to load the embedding model onto.
num_gpus = 2
# One [device_index, model] pair per device; each SentenceTransformer instance
# is created with device 'cuda:<device_index>'.
models = [
    [i, SentenceTransformer("Salesforce/SFR-Embedding-2_R", device=f'cuda:{i}')] for i in range(num_gpus)
]


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Directory holding the raw executive-order JSON documents.
eo_dir = 'data/executive_orders/raw/'
# os.listdir returns entries in arbitrary order; sort them so that the batch
# numbering (and therefore the batch_<n>.csv file names) is reproducible
# across runs and machines.
eo_paths = sorted(os.listdir(eo_dir))

def batched(list_in, batch_size):
    """Split a sliceable sequence into consecutive chunks.

    Args:
        list_in: Sequence supporting ``len()`` and slicing.
        batch_size: Maximum number of items per chunk.

    Returns:
        A list of slices of ``list_in``, each holding at most ``batch_size``
        items; only the last chunk may be shorter.
    """
    # Ceiling division: number of chunks needed to cover every item.
    num_batches = (len(list_in) + batch_size - 1) // batch_size
    chunks = []
    for batch_index in range(num_batches):
        start = batch_index * batch_size
        chunks.append(list_in[start:start + batch_size])
    return chunks

# Group the file names into batches of at most 10 paths each.
eo_paths_batches = batched(eo_paths, 10)

In [5]:
# Create the output directory for the embedding CSVs. exist_ok=True makes this
# a no-op when the directory is already there and avoids the race between a
# separate existence check and the creation.
os.makedirs('data/executive_orders/embeddings', exist_ok=True)

def embed_batch(batch_i, batch, model):
    """Embed one batch of executive-order files and save the result as CSV.

    Each file named in ``batch`` is read from ``eo_dir`` and parsed as JSON;
    the entries of its ``content`` field are joined with newlines to form the
    text that is embedded.

    Args:
        batch_i: Batch number, used only to name the output file.
        batch: List of file names (relative to ``eo_dir``) to embed.
        model: Object with an ``encode(list_of_str)`` method; the result is
            passed to ``pandas.DataFrame`` (presumably one embedding row per
            input text -- confirm against the model's documentation).

    Side effects:
        Writes ``data/executive_orders/embeddings/batch_{batch_i}.csv`` with
        a ``file`` column followed by the embedding columns. The output
        directory is expected to exist already.
    """
    eos = []
    for path in batch:
        # JSON text is UTF-8; be explicit so the result does not depend on the
        # platform's default locale encoding.
        with open(os.path.join(eo_dir, path), 'r', encoding='utf-8') as f:
            eo_doc = json.load(f)
        eos.append('\n'.join(eo_doc['content']))

    # str.join already yields str objects, so the texts need no conversion.
    embeddings = model.encode(eos)

    df = pandas.DataFrame(embeddings)
    # Put the source file name first so every row is self-describing.
    df.insert(0, 'file', batch)
    df.to_csv(f'data/executive_orders/embeddings/batch_{batch_i}.csv')

In [None]:
# Pool of available [device_index, model] entries; workers take an entry from
# the queue before embedding and put it back afterwards.
model_pool = queue.Queue()
for model in models:
    model_pool.put(model)

def process_batch(i, batch):
    """Embed one batch using a model borrowed from ``model_pool``.

    Args:
        i: Batch number, forwarded to ``embed_batch``.
        batch: List of file names to embed.

    The borrowed pool entry is always put back on the queue, even when
    embedding raises.
    """
    # Queue.get() blocks until some entry is available.
    pooled = model_pool.get()
    try:
        device_index, encoder = pooled
        embed_batch(i, batch, encoder)
        print(f"Embedded batch {i} with model {device_index}")
    finally:
        model_pool.put(pooled)

# Run every batch on a thread pool with one worker per loaded model, keeping
# each future so that failures can be reported afterwards.
futures = {}
with ThreadPoolExecutor(max_workers=len(models)) as executor:
    for i, batch in enumerate(eo_paths_batches):
        futures[i] = executor.submit(process_batch, i, batch)

# Leaving the `with` block waits for all submitted tasks. An exception raised
# inside a worker is stored on its future instead of being propagated, so
# surface it here; otherwise a failed batch would go unnoticed and its CSV
# would simply be missing.
for batch_number, future in futures.items():
    error = future.exception()
    if error is not None:
        print(f"Batch {batch_number} failed: {error!r}")

In [10]:
%%time

# Time embedding of batch 1 on the first model. The batch number passed in
# must match the batch being embedded: it names the output file, and using 10
# here would overwrite batch_10.csv with batch 1's embeddings.
embed_batch(1, eo_paths_batches[1], models[0][1])

CPU times: user 31.2 s, sys: 4.79 s, total: 36 s
Wall time: 36.2 s


In [11]:
%%time

# Time embedding of batch 10 on the first model; this rewrites batch_10.csv.
embed_batch(10, eo_paths_batches[10], models[0][1])

CPU times: user 5.42 s, sys: 1.15 s, total: 6.56 s
Wall time: 6.54 s
