In [1]:
import boto3
import csv
import codecs

In [7]:
import json

In [35]:
# General AWS setup
def setup_s3(path, filename):
    """
    Build a boto3 S3 resource authenticated with the credentials stored in a local file.

    path: directory that holds the credentials file
    filename: name of the credentials file inside `path`

    Returns the boto3 S3 resource, created for the 'us-east-1' region.
    """
    access_key, secret_key = aws_credentials_from_file(path, filename)

    return boto3.resource(
        service_name='s3',
        region_name='us-east-1',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
    )

def aws_credentials_from_file(path, filename):
    """
    Read an AWS access key id / secret access key pair from a JSON file.

    The file must contain a JSON object that maps the access key id to its secret,
    i.e. {"<aws_id>": "<aws_secret>"}. If the object has several entries, the last one is used.

    path: directory that holds the credentials file
    filename: name of the credentials file inside `path`

    Returns a tuple (aws_id, aws_secret).
    Raises ValueError if the file does not contain a non-empty JSON object.
    """
    with open(f"{path}/{filename}", 'r') as f:
        key_dict = json.load(f)

    # Without this check an empty file content ({} or []) surfaced as an UnboundLocalError
    if not isinstance(key_dict, dict) or not key_dict:
        raise ValueError(
            f"No AWS credentials found in {path}/{filename}: "
            "expected a JSON object mapping the access key id to the secret access key."
        )

    # Same pair the original loop ended on: the last entry of the object
    aws_id, aws_secret = list(key_dict.items())[-1]
    return aws_id, aws_secret


def move_s3_object(obj_name, obj_old_folder, obj_new_folder, s3):
    """
    Moves an object from a given S3 folder to another by copying it to the new folder and then deleting it from the old one

    obj_name: name of the object, without any folder prefix
    obj_old_folder: folder (key prefix) the object currently lives in
    obj_new_folder: folder (key prefix) the object is moved to
    s3: boto3 S3 resource

    Errors are printed instead of raised. The original is only deleted after the copy succeeded.
    """
    old_key = f"{obj_old_folder}/{obj_name}"
    new_key = f"{obj_new_folder}/{obj_name}"
    try:
        # copy_from is called on the DESTINATION object; CopySource names the existing one.
        # (Previously the two were swapped, so the old key was overwritten/deleted without a move.)
        s3.Object(BUCKET_NAME, new_key).copy_from(CopySource=f"{BUCKET_NAME}/{old_key}")
        s3.Object(BUCKET_NAME, old_key).delete()
    except Exception as e:
        print(f"Error while moving {obj_name} from {obj_old_folder} to {obj_new_folder}.")
        print(e)
        

############################################# 
# Methods for sentence splitting
#############################################
def load_text_files_for_language(language, s3):
    """
    Read every object under "<language>_documents/text_files/new" into memory at once.

    language: language prefix of the documents folder, e.g. "english"
    s3: boto3 S3 resource

    Returns a dict mapping file id -> UTF-8 decoded text. The file id is the object key with the
    folder prefix and ".txt" removed, so it keeps the "/" that separated folder and file name.
    """
    folder = f"{language}_documents/text_files/new"
    summaries = s3.Bucket(BUCKET_NAME).objects.all().filter(Prefix=folder)
    return {
        summary.key.replace(folder, "").replace(".txt", ""): summary.get()['Body'].read().decode('utf-8')
        for summary in summaries
        # keys ending in "/" are skipped (presumably folder placeholder objects)
        if not summary.key.endswith("/")
    }
   
def load_text_files_for_language1(language, s3):
    """
    Generator variant of load_text_files_for_language: yields one file at a time.

    language: language prefix of the documents folder, e.g. "english"
    s3: boto3 S3 resource

    Yields (file_id, text) tuples, where file_id is the object key with the folder prefix and
    ".txt" removed, and text is the UTF-8 decoded content of the object.
    """
    folder = f"{language}_documents/text_files/new"

    for summary in s3.Bucket(BUCKET_NAME).objects.all().filter(Prefix=folder):
        # keys ending in "/" are skipped (presumably folder placeholder objects)
        if summary.key.endswith("/"):
            continue
        raw_bytes = summary.get()['Body'].read()
        yield summary.key.replace(folder, "").replace(".txt", ""), raw_bytes.decode('utf-8')
    
def output_sents(sents, f_name, f_uuid, language, s3):
    """
    Upload the sentences of one text file, together with its metadata, as a JSON object in the S3 bucket.

    sents: the sentences of the file (stored under "sentences"; its length becomes "n_sentences")
    f_name: original file name; the part after its last "." is stored as the file format
    f_uuid: id of the file, used as the top-level JSON key and in the object name
    language: language prefix of the documents folder, e.g. "english"
    s3: boto3 S3 resource
    """
    metadata = {
        "n_sentences": len(sents),
        "file_name": f_name,
        "file_format": f_name.rsplit(".", 1)[-1],
    }
    payload = json.dumps({f_uuid: {"metadata": metadata, "sentences": sents}}, indent=4)

    target_key = f"{language}_documents/sentences/{f_uuid}_sents.json"
    s3.Object(BUCKET_NAME, target_key).put(Body=payload)
    
    
############################################# 
# Methods for assisted labeling
#############################################
def labeled_sentences_from_dataset(dataset):
    """
    Flatten a dataset of documents into a single dict of sentences.

    dataset: dict whose values are documents, each holding its sentences under the "sentences" key

    Returns one dict with the sentence entries of all documents merged; when the same key occurs
    in several documents, the entry of the later document wins.
    """
    merged = {}

    for doc_sentences in (doc['sentences'] for doc in dataset.values()):
        merged.update(doc_sentences)

    return merged

def load_sentences_for_language(language, s3, init_doc, end_doc):
    """
    Load the sentences of a slice of the sentence files of a language into one dict.

    language: language prefix of the documents folder, e.g. "english"
    s3: boto3 S3 resource
    init_doc: position (inclusive) of the first object to load, counted over the listing of
              "<language>_documents/sentences" (objects whose key ends in "/" are counted too)
    end_doc: position (exclusive) at which loading stops

    Returns a dict with the sentence entries of all selected files merged together.
    """
    policy_dict = {}
    sents_folder = f"{language}_documents/sentences"

    for i, obj in enumerate(s3.Bucket(BUCKET_NAME).objects.all().filter(Prefix=sents_folder)):
        # Positions only grow, so nothing past end_doc can match: stop listing the bucket
        if i >= end_doc:
            break
        if i >= init_doc and not obj.key.endswith("/"):
            # Merge in place instead of rebuilding the whole dict for every document
            policy_dict.update(json.loads(obj.get()['Body'].read()))

    return labeled_sentences_from_dataset(policy_dict)

def load_sentences_for_language1(language, s3, init_doc, end_doc):
    """
    Generator variant of load_sentences_for_language: yields the sentences of one file at a time.

    language: language prefix of the documents folder, e.g. "english"
    s3: boto3 S3 resource
    init_doc: position (inclusive) of the first object to load, counted over the listing of
              "<language>_documents/sentences" (objects whose key ends in "/" are counted too)
    end_doc: position (exclusive) at which loading stops

    Yields, for every selected file, a dict with the sentence entries of that file.
    """
    sents_folder = f"{language}_documents/sentences"

    for i, obj in enumerate(s3.Bucket(BUCKET_NAME).objects.all().filter(Prefix=sents_folder)):
        # Positions only grow, so nothing past end_doc can match: stop listing the bucket
        if i >= end_doc:
            break
        if i >= init_doc and not obj.key.endswith("/"):
            yield labeled_sentences_from_dataset(json.loads(obj.get()['Body'].read()))


def save_results_as_separate_csv(results_dictionary, queries_dictionary, init_doc, results_limit, aws_id, aws_secret,
                                 path="s3://wri-nlp-policy/english_documents/assisted_labeling"):
    """
    Write the top results of every query to its own CSV file in S3.

    results_dictionary: dict mapping each query to its result rows; every row must match the
                        columns (sentence_id, similarity_score, text)
    queries_dictionary: dict mapping each query to the label used in the output file name
    init_doc: value appended to the file name (see the name pattern below)
    results_limit: maximum number of rows written per query
    aws_id, aws_secret: AWS credentials handed to pandas through `storage_options`
    path: S3 URL prefix under which the CSV files are created

    Files are named "<path>/query_<label>_<position>_results_<init_doc>.csv".
    """
    # Imported here because the notebook has no top-level pandas import
    import pandas as pd

    col_headers = ["sentence_id", "similarity_score", "text"]
    storage_options = {"key": aws_id, "secret": aws_secret}

    for i, (query, results) in enumerate(results_dictionary.items()):
        filename = f"{path}/query_{queries_dictionary[query]}_{i}_results_{init_doc}.csv"
        top_results = pd.DataFrame(results, columns=col_headers).head(results_limit)
        top_results.to_csv(filename, storage_options=storage_options)

############################################# 
# Methods for reading document metadata
#############################################
def doc_ids_per_country(country, s3):
    """
    Collect the text document file IDs of a country from its metadata CSV in the S3 bucket.

    The CSV is read from "metadata/<country>_metadata.csv". The file id is the file name
    without the file extension ("23sd45fg.txt" without the ".txt").

    country: country name used in the metadata file name
    s3: boto3 S3 resource

    Returns a list with one file id per CSV row, in file order.
    """
    metadata_obj = s3.Object(bucket_name=BUCKET_NAME, key=f"metadata/{country}_metadata.csv")
    decoded_stream = codecs.getreader("utf-8")(metadata_obj.get()['Body'])

    # The fourth column is taken as the file name; [:-4] drops its last four characters,
    # i.e. a four-character extension such as ".txt"
    return [row[3][:-4] for row in csv.reader(decoded_stream)]

############################################# 
# Methods for getting abbreviations
#############################################
def get_abbreviations(language, s3):
    """
    Fetch the abbreviations of a language from "abbreviations/<language>_abbreviations.txt" in the S3 bucket.

    language: language name used in the abbreviations file name
    s3: boto3 S3 resource

    Returns the set of strings obtained by splitting the UTF-8 decoded file on "\\n".
    """
    abbreviations_obj = s3.Object(bucket_name=BUCKET_NAME, key=f"abbreviations/{language}_abbreviations.txt")
    raw_bytes = abbreviations_obj.get()['Body'].read()
    return {entry for entry in raw_bytes.decode('utf-8').split("\n")}

In [4]:
# Name of the S3 bucket that the S3 helper functions of this notebook read from and write to
BUCKET_NAME = "wri-nlp-policy"

In [14]:
# Create s3 object
# NOTE(review): the credentials directory is an absolute path on one developer's machine;
# it has to be adapted before the notebook can run elsewhere.
s3 = setup_s3("/Users/dafirebanks/Documents", "credentials.json")

### Comparing performance of normal functions vs generator functions

In [19]:
from time import perf_counter 

#### 1. For loading text files

In [59]:
# Time the eager loader: the complete dict is built before the first file is printed
start_t = perf_counter()
i = 0  # stays 0 when no file is found
for i, (file_id, text) in enumerate(load_text_files_for_language("english", s3).items(), start=1):
    print("File_id:", file_id)
    print("Text:", text[:100])
    print("=======================================")
    if i == 10000:  # print at most 10000 files
        break
stop_t = perf_counter()
print("Time taken:", stop_t - start_t)

File_id: /0087b716bb5c4b4d8f8496f106c195e6f027e88d
Text: THE WILD LIFE (PROTECTION) AMENDMENT ACT, 2006
(ACT No. 39 OF 2006)
AN
ACT
further to amend the Wild
File_id: /016420ae47f42d91eb7171bd5a62805abe6f9ff5
Text: 137 (72)
PTSTENT 2101-97, TORER 21, 2002
4 (TT)
4 (")
(2) The words and expressions used but not def
File_id: /017dc3e6fb7b98d975918798afb86462ee89001f
Text: THE TELANGANA PROHIBITION OF COW SLAUGHTER AND
ANIMAL PRESERVATION ACT, 1977.
(ACT NO. 11 OF 1977)
A
File_id: /0180163dc32afed62e39d6c63d637222bc7bdd3f
Text: -1-
GOVERNMENT OF GOA
Department of Animal Husbandry
Directorate of Animal Husbandry & Veterinary Se
File_id: /01c9d2721eaa2fd73f926ae68e27441f355e6023
Text: t
198
THE ARUNACHAL PRADESH WATER
RESOURCES
REGULATORY AUTHORITY ACT, 2006
(ACT NO. 15 OF 2006)
(Rec
File_id: /01e8604f38982ab507d8d687bcfe96ef8e564bd1
Text: MA8BA
Registered No.
A
(uter
lo suld bevionall)
(MOITA
799
sill
und
of
The Assam Gazette
of
not
EXTR
File_id: /025b5dd84bdd834cc79f1db41af5b8a6c621fc4b
T

In [60]:
# Time the generator loader: every file is printed as soon as it has been downloaded
start_t1 = perf_counter()
i = 0  # stays 0 when no file is found
for i, (file_id, text) in enumerate(load_text_files_for_language1("english", s3), start=1):
    print("File_id:", file_id)
    print("Text:", text[:100])
    print("=======================================")
    if i == 10000:  # print at most 10000 files
        break
stop_t1 = perf_counter()
print("Time taken:", stop_t1 - start_t1)

File_id: /0087b716bb5c4b4d8f8496f106c195e6f027e88d
Text: THE WILD LIFE (PROTECTION) AMENDMENT ACT, 2006
(ACT No. 39 OF 2006)
AN
ACT
further to amend the Wild
File_id: /016420ae47f42d91eb7171bd5a62805abe6f9ff5
Text: 137 (72)
PTSTENT 2101-97, TORER 21, 2002
4 (TT)
4 (")
(2) The words and expressions used but not def
File_id: /017dc3e6fb7b98d975918798afb86462ee89001f
Text: THE TELANGANA PROHIBITION OF COW SLAUGHTER AND
ANIMAL PRESERVATION ACT, 1977.
(ACT NO. 11 OF 1977)
A
File_id: /0180163dc32afed62e39d6c63d637222bc7bdd3f
Text: -1-
GOVERNMENT OF GOA
Department of Animal Husbandry
Directorate of Animal Husbandry & Veterinary Se
File_id: /01c9d2721eaa2fd73f926ae68e27441f355e6023
Text: t
198
THE ARUNACHAL PRADESH WATER
RESOURCES
REGULATORY AUTHORITY ACT, 2006
(ACT NO. 15 OF 2006)
(Rec
File_id: /01e8604f38982ab507d8d687bcfe96ef8e564bd1
Text: MA8BA
Registered No.
A
(uter
lo suld bevionall)
(MOITA
799
sill
und
of
The Assam Gazette
of
not
EXTR
File_id: /025b5dd84bdd834cc79f1db41af5b8a6c621fc4b
T

#### 2. For loading sentences

In [61]:
# Time the eager sentence loader on a fixed slice of the sentence files
start_t = perf_counter()
init_at_doc = 14778
end_at_doc = 16420

i = 0  # stays 0 when no sentence is found
sentences = load_sentences_for_language("english", s3, init_at_doc, end_at_doc)
for i, (sent_id, sent_map) in enumerate(sentences.items(), start=1):
    print("Sent_id:", sent_id)
    print("Text:", sent_map['text'])
    print("=======================================")
    if i == 1000:  # print at most 1000 sentences
        break
stop_t = perf_counter()
print("Time taken:", stop_t - start_t)

Sent_id: e616b24e900271b89cde0761f3e4ac92f809b4bc_sent_0
Sent_id: e616b24e900271b89cde0761f3e4ac92f809b4bc_sent_1
Text: The purpose of these  notices is to give interested persons an opportunity to participate in  the rule making prior to the adoption of the final rules.
Sent_id: e616b24e900271b89cde0761f3e4ac92f809b4bc_sent_2
Sent_id: e616b24e900271b89cde0761f3e4ac92f809b4bc_sent_3
Text: ----------------------------------------------------------------------- SUMMARY: The United States Department of Agriculture (USDA) is announcing that it has withdrawn certain proposed rules that were either published in the Federal Register more than 3 years ago without subsequent action or determined to no longer be candidates for final action.
Sent_id: e616b24e900271b89cde0761f3e4ac92f809b4bc_sent_4
Text: USDA is taking this action to reduce its regulatory backlog and focus its resources on higher priority actions.
Sent_id: e616b24e900271b89cde0761f3e4ac92f809b4bc_sent_5
Text: The Department's acti

In [64]:
# Time the generator sentence loader on the same slice of the sentence files
start_t1 = perf_counter()

init_at_doc = 14778
end_at_doc = 16420
i = 0  # stays 0 when nothing is yielded
for i, sent in enumerate(load_sentences_for_language1("english", s3, init_at_doc, end_at_doc), start=1):
    try:
        # Each yielded dict holds the sentences of one file; only its first entry is printed
        sent_id, sent_map = next(iter(sent.items()))
        print("Sent_id:", sent_id)
        print("Text:", sent_map['text'])
        print("=======================================")
    except Exception as e:
        print("ERROR. Sent:", sent)
        print(e)
    if i == 1000:  # look at no more than 1000 files
        break
stop_t1 = perf_counter()
print("Time taken:", stop_t1 - start_t1)

Sent_id: e616b24e900271b89cde0761f3e4ac92f809b4bc_sent_0
Sent_id: e616eb3ac356422a046856562a1a7a53504efd10_sent_0
Text: Federal Register, Volume 83 Issue 243 (Wednesday, December 19, 2018) [Federal Register Volume 83, Number 243 (Wednesday, December 19, 2018)] [Rules and Regulations] [Page 65077] From the Federal Register Online via the Government Publishing Office [[URL] [FR Doc No: 2018-27436] ----------------------------------------------------------------------- NUCLEAR REGULATORY COMMISSION 10 CFR Part 72 [NRC-2018-0075] RIN 3150-AK12 List of Approved Spent Fuel Storage Casks: NAC International NAC- UMS[supreg] Universal Storage System, Certificate of Compliance No 1015, Amendment No 6 AGENCY: Nuclear Regulatory Commission.
Sent_id: e617eade05c5d028ef50bacb70a08a44514efe42_sent_0
Text: Federal Register, Volume 82 Issue 91 (Friday, May 12, 2017) [Federal Register Volume 82, Number 91 (Friday, May 12, 2017)] [Notices] [Page 22135] From the Federal Register Online via the Government 