In [None]:
import multiprocessing as mp
import tempfile
import json
import os

import boto3
from botocore import UNSIGNED, config
import pymongo

# Botocore configuration that turns request signing off; passed to every
# boto3 handle created in this notebook.
unsigned = config.Config(signature_version=UNSIGNED)

In [None]:
# Resource-level S3 handle (unsigned requests) and the "mirrulations" bucket
# object; `bucket.download_file` is what later cells use to fetch objects.
s3 = boto3.resource("s3", config=unsigned)
bucket = s3.Bucket("mirrulations")

In [None]:
# Build two lists by listing the bucket with a "/" delimiter:
#   agencies - top-level prefixes with the trailing "/" stripped
#   dockets  - second-level prefixes, kept as full "<agency>/<docket>/" keys
# inspired by https://stackoverflow.com/questions/54833895/how-to-get-top-level-folders-in-an-s3-bucket-using-boto3
client = boto3.client('s3', config=unsigned)
paginator = client.get_paginator('list_objects')
result = paginator.paginate(Bucket='mirrulations', Delimiter='/')

agencies = [
    entry.get("Prefix").rstrip("/")
    for entry in result.search('CommonPrefixes')
    if entry is not None
]

dockets = []
for i, agency in enumerate(agencies):
    # Single-line progress indicator: pad with spaces, then return the carriage.
    print(f"[{i}/{len(agencies)}]({agency})", end=" "*100 + "\r")
    result = paginator.paginate(Bucket='mirrulations', Delimiter='/', Prefix=f"{agency}/")
    dockets.extend(
        entry.get("Prefix")
        for entry in result.search("CommonPrefixes")
        if entry is not None
    )

In [None]:
# Display the first collected docket prefix as a quick check of the listing.
dockets[0]

In [None]:
def getSubKeys(path, fullKey=True):
    """List the immediate sub-prefixes ("sub-folders") of ``path`` in the bucket.

    The listing is paginated, so prefixes beyond the first response page are
    included, and a prefix without any sub-prefixes yields an empty list
    instead of raising ``KeyError``.

    Args:
        path: Key prefix to list under (callers pass it with a trailing "/").
        fullKey: When true, return each common prefix exactly as S3 reports
            it. When false, return only the second-to-last "/"-separated
            component, i.e. the sub-folder's own name.

    Returns:
        list[str]: The sub-prefixes in listing order; ``[]`` when none exist.
    """
    pages = client.get_paginator("list_objects").paginate(
        Bucket="mirrulations",
        Prefix=path,
        Delimiter="/",
    )
    prefixes = [
        entry["Prefix"]
        for page in pages
        # "CommonPrefixes" is absent from a page that has no sub-prefixes.
        for entry in (page.get("CommonPrefixes") or [])
    ]
    if fullKey:
        return prefixes
    # Common prefixes end in "/", so the folder name is the second-to-last part.
    return [prefix.split("/")[-2] for prefix in prefixes]

def getFileKeys(path):
    """List the object keys found directly under ``path`` in the bucket.

    The listing uses "/" as delimiter, so keys nested below a further "/" are
    not returned. It is paginated, so keys beyond the first response page are
    included, and an empty prefix yields an empty list instead of raising
    ``KeyError``.

    Args:
        path: Key prefix to list under (callers pass it with a trailing "/").

    Returns:
        list[str]: The object keys in listing order; ``[]`` when none exist.
    """
    pages = client.get_paginator("list_objects").paginate(
        Bucket="mirrulations",
        Prefix=path,
        Delimiter="/",
    )
    return [
        metadata["Key"]
        for page in pages
        # "Contents" is absent from a page that holds no objects.
        for metadata in (page.get("Contents") or [])
    ]

In [None]:
def getContents(base_path, subkey):
    """Return the file keys listed under ``base_path`` + ``subkey`` + "/"."""
    return getFileKeys(base_path + f"{subkey}/")

def getSubContents(base_path, subkey):
    """Return the file keys one folder level below ``base_path`` + ``subkey`` + "/".

    Every sub-prefix of that folder is listed in turn and the resulting file
    keys are concatenated in listing order.
    """
    root = base_path + f"{subkey}/"
    return [
        file_key
        for sub_prefix in getSubKeys(root, fullKey=True)
        for file_key in getFileKeys(sub_prefix)
    ]

def getFileName(path):
    """Return the last "/"-separated part of ``path``, cut at its first "."."""
    return path.rpartition("/")[2].partition(".")[0]
def getFileData(paths, dataExtractor=json.load):
    """Download each key in ``paths`` and collect its parsed contents by ID.

    The ID of a key is its file name (see ``getFileName``) up to the first
    "_", so several files sharing that leading part are grouped together.

    Args:
        paths: Iterable of object keys in the bucket.
        dataExtractor: Callable given the opened text file; its return value
            is what gets stored. Defaults to ``json.load``.

    Returns:
        dict[str, list]: ID -> extracted values, in the order of ``paths``.

    Raises:
        Whatever the download or ``dataExtractor`` raises; the scratch file is
        removed either way.
    """
    paths = list(paths)  # iterated twice below; also accepts one-shot iterables
    keyed_paths = [(getFileName(path).split("_")[0], path) for path in paths]
    data = {comment_key: [] for comment_key, _ in keyed_paths}
    if not keyed_paths:
        return data

    # One scratch file is reused for every download. The descriptor is closed
    # right away so nothing holds the file open while it is being overwritten.
    descriptor, temp_name = tempfile.mkstemp()
    os.close(descriptor)
    try:
        for comment_key, path in keyed_paths:
            bucket.download_file(path, temp_name)
            # Read as UTF-8 explicitly rather than the platform default.
            # NOTE(review): assumes the stored files are UTF-8 - confirm.
            with open(temp_name, "r", encoding="utf-8") as file:
                data[comment_key].append(dataExtractor(file))
    finally:
        # Clean up even when a download or extraction fails part-way.
        os.remove(temp_name)
    return data

In [None]:
def addUpdate(dict_, key, val):
    """Merge mapping ``val`` into ``dict_[key]``, creating that entry as ``{}`` first if absent."""
    dict_.setdefault(key, {}).update(val)
    
def updateJson(dict_, base_path, key):
    """Merge the JSON files of folder ``key`` under ``base_path`` into ``dict_``.

    Only keys whose text after the last "." is "json" are loaded. For every
    ID returned by ``getFileData``, the first loaded object is merged into
    ``dict_[ID]``.
    """
    json_paths = [
        path
        for path in getContents(base_path, key)
        if path.split(".")[-1] == "json"
    ]
    for ID, payloads in getFileData(json_paths).items():
        addUpdate(dict_, ID, payloads[0])

def updateText(dict_, base_path, key):
    """Attach the raw text of the files below folder ``key`` to ``dict_``.

    Files are taken from every sub-folder of ``base_path`` + ``key`` + "/".
    For each ID, the list of file contents is stored under ``dict_[ID]["text"]``.
    """
    def read_whole_file(file):
        return file.read()

    text_paths = getSubContents(base_path, key)
    texts_by_id = getFileData(text_paths, dataExtractor=read_whole_file)
    for ID, text in texts_by_id.items():
        addUpdate(dict_, ID, {"text": text})

In [None]:
def updateCollection(obj, collection):
    """Insert the records of ``obj`` whose "id" is not yet in ``collection``.

    Every record in ``obj`` (a mapping of ID -> record dict) first gets its
    "id" field set to its ID, in place. Records for which the collection
    already counts a document with that "id" are skipped; the remainder is
    inserted in one ``insert_many`` call, which is omitted when nothing is new.
    """
    for ID, record in obj.items():
        record["id"] = ID

    new_records = [
        record
        for record in obj.values()
        if collection.count_documents({"id": record["id"]}) == 0
    ]
    if new_records:
        collection.insert_many(new_records)

In [None]:
# Docket prefix in "<agency>/<docket id>/" form, used for a single trial import.
testDocketPath = "USTR/USTR-2015-0010/"

In [None]:
def storeDocketInfo(docketPath):
    """Load one docket's records from the bucket and store them in MongoDB.

    Looks under ``<docketPath>text-<docket id>/`` and, for each folder that is
    present, gathers:

    * ``comments`` and ``comments_extracted_text`` into ``raw_comments``
    * ``documents`` and ``documents_extracted_text`` into ``raw_documents``
    * ``docket`` into ``raw_dockets``

    Only records whose "id" is not already stored are inserted (see
    ``updateCollection``).

    Args:
        docketPath: Docket prefix of the form "<agency>/<docket id>/"; the
            trailing "/" is required because the docket id is taken as the
            second-to-last "/"-separated component.
    """
    # Kept out of the f-string: reusing the same quote character inside a
    # replacement field is a syntax error before Python 3.12.
    docket_id = docketPath.split("/")[-2]
    base_path = f"{docketPath}text-{docket_id}/"
    fields = getSubKeys(base_path, fullKey=False)

    bson_comments = {}
    bson_documents = {}
    bson_docket = {}
    # (target records, folder with JSON metadata, folder with extracted text or None)
    sources = (
        (bson_comments, "comments", "comments_extracted_text"),
        (bson_documents, "documents", "documents_extracted_text"),
        (bson_docket, "docket", None),
    )
    for records, json_field, text_field in sources:
        if json_field in fields:
            updateJson(records, base_path, json_field)
        if text_field is not None and text_field in fields:
            updateText(records, base_path, text_field)

    # The client is created per call and closed on exit so that repeated calls
    # (e.g. one per docket) do not accumulate open connections.
    with pymongo.MongoClient() as mongo:
        db = mongo.mirrulations
        updateCollection(bson_comments, db.raw_comments)
        updateCollection(bson_documents, db.raw_documents)
        updateCollection(bson_docket, db.raw_dockets)

# Trial run: import the single test docket.
storeDocketInfo(testDocketPath)

In [None]:
# Import every collected docket using a pool of 4 worker processes;
# `pool.map` returns only once all calls have finished.
with mp.Pool(4) as pool:
    pool.map(storeDocketInfo, dockets)