# English preprocessor/sentence splitting

In [1]:
import os
os.chdir("../../../")
os.getcwd()

'/Users/dafirebanks/Projects/policy-data-analyzer'

In [2]:
from tasks.text_preprocessing import *

In [3]:
# 1. Setup amazon client
language = "english"
bucket_name = "wri-nlp-policy"
creds_filepath = "/Users/dafirebanks/Documents/credentials.json"

s3_client = S3Client(creds_filepath=creds_filepath, bucket_name=bucket_name, language=language)

In [8]:
# 2. Make sure we're getting the right files
i = 0
for file_id, text in s3_client.load_text_files(language):
    print("File_id:", file_id)
    print("Text:", text[:100])
    print("=======================================")
    i += 1
    if i == 2:
        break


File_id: /000bd482c20f22b801193f1897ed1eca89318a53
Text: SECRETARIA DE DESARROLLO URBANO Y OBRAS
PUBLICAS DEL ESTADO DE QUERETARO
DEPARTAMENTO DE CONCURSOS
R
File_id: /000d91e74dd312e2f8d2f28bd154c3cab6a8bc10
Text: 
  CONVENIO de Coordinación en materia de reasignación de recursos que celebran la Secretaría de Com


In [14]:
# 3. Start preprocessing
tokenizer = nltk.data.load(f"tokenizers/punkt/{language}.pickle")
abbrevs = {"no", "sec", "cong", "dist", "doc"}
min_num_words = 5

new_text_files_folder = f"{language}_documents/text_files/new"
processed_text_files_folder = f"{language}_documents/text_files/processed"

i = 0
print_every = 100
error_files = []

for file_id, text in s3_client.load_text_files(language):
    try:
        file_id = file_id.replace("/", "")
        preprocessed_text = preprocess_english_text(text)
        sents = get_nltk_sents(preprocessed_text, tokenizer, abbrevs)
        postprocessed_sents = format_sents_for_output(remove_short_sents(sents, min_num_words), file_id)
        s3_client.store_sentences(postprocessed_sents, file_id, language)
        s3_client.move_object(file_id + ".txt", new_text_files_folder, processed_text_files_folder)

    except Exception as e:
        error_files.append({file_id: e})

    i += 1
    
    # For testing and early stopping, uncomment this
#     if i == 2:
#         break

    if i % print_every == 0:
        print("----------------------------------------------")
        print(f"Processing {i} documents...")
        print(f"Number of errors so far: {len(error_files)}")
        print("----------------------------------------------")

In [None]:
# If you want to store the errors
with open(f"../output/{language}_sentence_splitting_errors.json", "w") as f:
    json.dump(error_files, f)

# Text preprocessing

## Design
*Input*: Text file (.txt) containing the text extracted from HTML, PDF, Word, etc.

*Output*: JSON file with sentences ready for use, with respective ID (if needed, we can keep both the original sentence and the processed sentence before/after splitting respectively)

- *Sample output template for document with id 23effs8765*:
```
{"23effs8765": 
    {
        "metadata": {
            "n_sentences": 23, 
            "language": "English"
         },
        "sentences": {
            "23effs8765_sent_0": {
                "text": "Here is a sample sentence that is NOT an incentive",
                "labels": [0]
            },
            "23effs8765_sent_1": {
                "text": "This sentence should be an incentive",
                "labels": [1]
            }
        }
    }
}
```


## Pipeline:

- **1st component:** Few, basic rules created to deal with acronyms ("U.T.M"), bullet points ("(3)") and abreviations ("ord."). Differs per country, state or local level - this is to adapt to variability of format. The creation of rules will be as standardized as possible, so that the process of creating them is easy regardless of country/state.
    - Dictionary of abbreviations and acronyms
    - 1-3 rules for the characters that come before/after a period, to avoid confusing sentence splitting model
    - 1-3 rules for ensuring good processing of bullet points as sentences/phrases
- **2nd component:** Pre-built sentence splitter (NLTK or spaCy)

## Sentence splitting rules

### USA

*Notes from preliminary analysis:*
- Can filter out anything up to "ACTION: Final rule." or "-------------------" 
- We need to figure out how laws and docket numbers ("Docket No. FWS-R4-ES-2018-0074.") are represented, congressmen ("Cong."), sessions ("Sess."), district ("Dist.") numbers, etc.
- To figure out common patterns, we should grab everyting that comes before a "." and see if we can build them

In [31]:
import re
import random
import nltk.data
import spacy 
import string
from collections import Counter
from collections import defaultdict
import nltk
en_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
es_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")

In [32]:
base_path = "../input/USA/"
usa_paths = ["Federal Register, Volume 85 Issue 190 (Wednesday, September 30, 2020).htm", "Federal Register, Volume 86 Issue 28 (Friday, February 12, 2021).htm", "Federal Register, Volume 86 Issue 29 (Tuesday, February 16, 2021).htm"]
fname = usa_paths[2]
txt_path = base_path + fname

with open(txt_path, "r") as txt_file:
    txt = txt_file.read()

In [33]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    return re.sub(re.compile('<.*?>'), '', text)

def replace_links(text):
    text = re.sub(r'http\S+', '[URL]', text)
    return re.sub(r'www\S+', '[URL]', text)

def remove_multiple_spaces(text):
    return re.sub('\s+', ' ', text)

In [75]:
# Optional preprocessing
txt = replace_links(remove_html_tags(txt)).replace("\n", " ").replace("\t", " ").strip()
txt = remove_multiple_spaces(txt)
txt

'Federal Register, Volume 86 Issue 29 (Tuesday, February 16, 2021) [Federal Register Volume 86, Number 29 (Tuesday, February 16, 2021)] [Notices] [Page 9543] From the Federal Register Online via the Government Publishing Office [[URL] [FR Doc No: 2021-03002] ----------------------------------------------------------------------- NATIONAL ARCHIVES AND RECORDS ADMINISTRATION [NARA-2021-017] National Industrial Security Program Policy Advisory Committee (NISPPAC) Meeting AGENCY: Information Security Oversight Office (ISOO), National Archives and Records Administration (NARA). ACTION: Notice of Federal Advisory Committee meeting. ----------------------------------------------------------------------- SUMMARY: We are announcing an upcoming meeting of the National Industrial Security Program Policy Advisory Committee (NISPPAC). DATES: The meeting will be on April 14, 2021, from 10:00 a.m. to 1:00 p.m. EST. ADDRESSES: The April 14, 2021, meeting will be a virtual meeting. See supplementary pr

#### 1. Find what happens around periods

In [33]:
def get_surrounding_chars(txt, radius=1):
    surrounding_chars = []
    all_period_idx = [indices.start() for indices in re.finditer("\.", txt)]
    
    for period_idx in all_period_idx:
        start_idx = period_idx - radius
        end_idx = period_idx + radius + 1
        substring = txt[start_idx: end_idx]
        
        if substring:
            surrounding_chars.append(substring)
    
    return surrounding_chars

In [34]:
surrounding_chars_1 = get_surrounding_chars(txt)
surrounding_chars_2 = get_surrounding_chars(txt, radius=2)

print(f"For 1 character before and after a period, we have {len(set(surrounding_chars_1))} unique patterns")
print(f"For 2 characters before and after a period, we have {len(set(surrounding_chars_2))} unique patterns")

For 1 character before and after a period, we have 21 unique patterns
For 2 characters before and after a period, we have 25 unique patterns


In [35]:
def get_possible_chars(neighboring_chars):
    possible_chars = defaultdict(list)

    for pattern in neighboring_chars:
        if pattern[-1] == " ":
            possible_chars[" "].append(pattern)
        elif pattern[-1].isalpha():
            possible_chars["alpha"].append(pattern)
        elif pattern[-1].isnumeric():
            possible_chars["numeric"].append(pattern)
        elif not pattern[-1].isalnum():
            possible_chars["symbol"].append(pattern)
        else:
            possible_chars["other"].append(pattern)
    
    print(f"Total: {len(neighboring_chars)}")
    return possible_chars

def print_char_stats(possible_chars):
    print(f"Space: {len(possible_chars[' '])}"), 
    print(f"Alpha: {len(possible_chars['alpha'])}"), 
    print(f"Numeric: {len(possible_chars['numeric'])}"), 
    print(f"Symbol: {len(possible_chars['symbol'])}"), 
    print(f"Other: {len(possible_chars['other'])}")

Let's analyze the characters surrounding a period, in all instances of a period in the text

In [37]:
possible_chars = get_possible_chars(surrounding_chars_1)
print_char_stats(possible_chars)

Total: 27
Space: 18
Alpha: 7
Numeric: 2
Symbol: 0
Other: 0


Now, we will do the same analysis but for unique patterns

In [38]:
possible_chars = get_possible_chars(set(surrounding_chars_1))
print_char_stats(possible_chars)

Total: 21
Space: 14
Alpha: 5
Numeric: 2
Symbol: 0
Other: 0


#### 1.1 Conclusions from period analysis

- 25/55 unique patterns involve a period being followed by another non-space character. (45%)
- 110/339 instances of a period are followed by something that is a non-space character. (32%)

#### 1.2 Potential rules

For neighboring characters within a radius of 1:
   - If the character after a period is not a space, delete the period

In [34]:
# abreviations = {"No.", "Sec.", "Cong.", "Dist.", "Doc."}
# acronyms = {"W.D.", "U.S.", "H.R.", "U.S.C.", "p.m.", "a.m."}

def parse_emails(text):
    """ 
    Remove the periods from emails in text, except the last one
    """
    emails = [email if email[-1] != "." else email[:-1] for email in re.findall(r"\S*@\S*\s?", txt)]
    
    for email in emails:
        new_email = email.replace(".", "")
        text = text.replace(email, new_email)
        
    return text

def parse_acronyms(text):
    """ 
    Remove the periods from acronyms in the text (i.e "U.S." becomes "US") 
    """

    acronyms = re.findall(r"\b(?:[a-zA-Z]\.){2,}", text)
         
    for acronym in acronyms:
        new_acronym = acronym.replace(".", "")
        text = text.replace(acronym, new_acronym)
        
    return text

def english_preprocessing(txt):
    """
    Steps in the preprocessing of text:
        1. Remove HTML tags
        2. Replace URLS by a tag [URL]
        3. Replace new lines and tabs by normal spaces - sometimes sentences have new lines in the middle
        4. Remove excessive spaces (more than 1 occurrence)
        5. Parse acronyms
    """
    txt = replace_links(remove_html_tags(txt)).strip()#.replace("\n", " ").replace("\t", " ").strip()
    txt = remove_multiple_spaces(txt)
    txt = parse_emails(txt)
    txt = parse_acronyms(txt)
    
    new_txt = ""
    all_period_idx = set([indices.start() for indices in re.finditer("\.", txt)])
    
    for i, char in enumerate(txt):
        if i in all_period_idx:
            # Any char following a period that is NOT a space means that we should not add that period
            if i + 1 < len(txt) and txt[i + 1] != " ":
                continue
            
            # Any char that is a number following a period will not count. 
            # For enumerations, we're counting on docs being enumerated as "(a)" or "(ii)", and if not, they will be separated by the . after the number ("3. Something" will just be "Something" as a sentence)
            if i + 2 < len(txt) and txt[i + 2].isnumeric(): 
                continue
            
            # If we wanted to have all numbered lists together, uncomment this, and comment out the previous condition
#             if i + 2 < len(txt) and not txt[i + 2].isalpha(): 
#                 continue
            
        new_txt += char

    return new_txt

def english_postprocessing(sents, min_num_words=4):
    """
    Remove sentences that are made of less than a given number of words. Default is 4
    """
    
    return [sent for sent in sents if len(sent.split()) >= min_num_words]

def get_nltk_sents(txt, tokenizer, extra_abbreviations=None):
    if extra_abbreviations:
        tokenizer._params.abbrev_types.update(extra_abbreviations)
        
    sents = tokenizer.tokenize(txt)
    return sents

In [35]:
preprocessed = usa_preprocessing(txt)
surrounding_chars_1 = get_surrounding_chars(preprocessed)
possible_chars = get_possible_chars(surrounding_chars_1)
print_char_stats(possible_chars)

NameError: name 'usa_preprocessing' is not defined

### Final USA preprocessing

In [38]:
base_path = "../input/USA/"
usa_paths = ["Federal Register, Volume 85 Issue 190 (Wednesday, September 30, 2020).htm", "Federal Register, Volume 86 Issue 28 (Friday, February 12, 2021).htm", "Federal Register, Volume 86 Issue 29 (Tuesday, February 16, 2021).htm"]
fname = usa_paths[0]
txt_path = base_path + fname

usa_abrevs = {"no", "sec", "cong", "dist", "doc"}

with open(txt_path, "r") as txt_file:
    txt = txt_file.read()

preprocessed = english_preprocessing(txt)
sents = get_nltk_sents(preprocessed, en_tokenizer, usa_abrevs)
post_processed_sents = english_postprocessing(sents, min_num_words=5)

In [39]:
preprocessed



In [40]:
sents

 'ACTION: Final rule.',
 '----------------------------------------------------------------------- SUMMARY: We, the US Fish and Wildlife Service (Service), adopt a rule under section 4(d) of the Endangered Species Act of 1973 (Act), as amended, for the trispot darter (Etheostoma trisella), a fish from Alabama, Georgia, and Tennessee.',
 'This rule provides measures that are necessary and advisable to conserve the species.',
 'DATES: This rule is effective October 30, 2020.',
 'ADDRESSES: This final rule is available on the internet at [URL] under Docket No. FWS-R4-ES-2018-0074 and at [URL] Comments and materials we received, as well as supporting documentation we used in preparing this rule, are available for public inspection at [URL] under Docket No. FWS-R4-ES-2018-0074.',
 'FOR FURTHER INFORMATION CONTACT: William Pearson, Field Supervisor, US Fish and Wildlife Service, Alabama Ecological Services Field Office, 1208-B Main Street, Daphne, AL 36526; telephone 251-441-5870.',
 'Persons

In [41]:
post_processed_sents

 '----------------------------------------------------------------------- SUMMARY: We, the US Fish and Wildlife Service (Service), adopt a rule under section 4(d) of the Endangered Species Act of 1973 (Act), as amended, for the trispot darter (Etheostoma trisella), a fish from Alabama, Georgia, and Tennessee.',
 'This rule provides measures that are necessary and advisable to conserve the species.',
 'DATES: This rule is effective October 30, 2020.',
 'ADDRESSES: This final rule is available on the internet at [URL] under Docket No. FWS-R4-ES-2018-0074 and at [URL] Comments and materials we received, as well as supporting documentation we used in preparing this rule, are available for public inspection at [URL] under Docket No. FWS-R4-ES-2018-0074.',
 'FOR FURTHER INFORMATION CONTACT: William Pearson, Field Supervisor, US Fish and Wildlife Service, Alabama Ecological Services Field Office, 1208-B Main Street, Daphne, AL 36526; telephone 251-441-5870.',
 'Persons who use a telecommunica

### Final India preprocessing

In [4]:
base_path = "../input/India/"
india_paths = ["India1.txt", "India2.txt", "India_image1.txt", "India_image2.txt"]
fname = india_paths[3]
txt_path = base_path + fname

with open(txt_path, "r") as txt_file:
    txt = txt_file.read()

india_abrevs = {"sub", "subs", "ins", "govt", "dy", "dept", "deptt", "ptg"}

preprocessed = english_preprocessing(txt)
sents = get_nltk_sents(preprocessed, en_tokenizer, india_abrevs)
post_processed_sents = english_postprocessing(sents, min_num_words=5)

NameError: name 'english_preprocessing' is not defined

In [365]:
txt

'No. V-(V)04/0007/2003 16 REGISTERED NO,\nannit\nchi\nThe Gazette of India\nEXTRAORDINARY\n-\nPART II - Section I\nit\nPUBLISHED BY AUTHORITY\n27. 45|\nTS Procfl 3, 12, 1938\nNo. 45| NEW DELIII, WEDNESDAY, AUGUST 3, 2016/SHIRAVANA 12, 1938 (SAKA)\n=) A yod of und of For 216 stancity an 354 #) VII\nI\nSeparate paging is given to this Part in order that it may be filed as a separate compilation.\nMINISTRY OF LAW AND JUSTICE\n(Legislative Department)\nNew Delhi, the 3rd August, 2016/Shravana 12, 1938 (Saka)\nThe following Act of Parliament received the assent of the President on the 3rd\nAugust, 2016, and is hereby published for general information:-\nTHE COMPENSATORY AFFORESTATION FUND ACT, 2016\nNo. 38 OF 2016\n[3rd August, 2016.]\nAn Act to provide for the establishment of funds under the public accounts of\nIndia and the public accounts of each State and crediting thereto the monies\nreceived from the user agencies towards compensatory afforestation,\nadditional compensatory afforesta

In [366]:
preprocessed

'No. V-(V)04/0007/2003 16 REGISTERED NO, annit chi The Gazette of India EXTRAORDINARY - PART II - Section I it PUBLISHED BY AUTHORITY 27 45| TS Procfl 3, 12, 1938 No 45| NEW DELIII, WEDNESDAY, AUGUST 3, 2016/SHIRAVANA 12, 1938 (SAKA) =) A yod of und of For 216 stancity an 354 #) VII I Separate paging is given to this Part in order that it may be filed as a separate compilation. MINISTRY OF LAW AND JUSTICE (Legislative Department) New Delhi, the 3rd August, 2016/Shravana 12, 1938 (Saka) The following Act of Parliament received the assent of the President on the 3rd August, 2016, and is hereby published for general information:- THE COMPENSATORY AFFORESTATION FUND ACT, 2016 No 38 OF 2016 [3rd August, 2016] An Act to provide for the establishment of funds under the public accounts of India and the public accounts of each State and crediting thereto the monies received from the user agencies towards compensatory afforestation, additional compensatory afforestation, penal compensatory affor

In [367]:
sents

['No. V-(V)04/0007/2003 16 REGISTERED NO, annit chi The Gazette of India EXTRAORDINARY - PART II - Section I it PUBLISHED BY AUTHORITY 27 45| TS Procfl 3, 12, 1938 No 45| NEW DELIII, WEDNESDAY, AUGUST 3, 2016/SHIRAVANA 12, 1938 (SAKA) =) A yod of und of For 216 stancity an 354 #) VII I Separate paging is given to this Part in order that it may be filed as a separate compilation.',
 'MINISTRY OF LAW AND JUSTICE (Legislative Department) New Delhi, the 3rd August, 2016/Shravana 12, 1938 (Saka) The following Act of Parliament received the assent of the President on the 3rd August, 2016, and is hereby published for general information:- THE COMPENSATORY AFFORESTATION FUND ACT, 2016 No 38 OF 2016 [3rd August, 2016] An Act to provide for the establishment of funds under the public accounts of India and the public accounts of each State and crediting thereto the monies received from the user agencies towards compensatory afforestation, additional compensatory afforestation, penal compensatory 

In [368]:
post_processed_sents

['No. V-(V)04/0007/2003 16 REGISTERED NO, annit chi The Gazette of India EXTRAORDINARY - PART II - Section I it PUBLISHED BY AUTHORITY 27 45| TS Procfl 3, 12, 1938 No 45| NEW DELIII, WEDNESDAY, AUGUST 3, 2016/SHIRAVANA 12, 1938 (SAKA) =) A yod of und of For 216 stancity an 354 #) VII I Separate paging is given to this Part in order that it may be filed as a separate compilation.',
 'MINISTRY OF LAW AND JUSTICE (Legislative Department) New Delhi, the 3rd August, 2016/Shravana 12, 1938 (Saka) The following Act of Parliament received the assent of the President on the 3rd August, 2016, and is hereby published for general information:- THE COMPENSATORY AFFORESTATION FUND ACT, 2016 No 38 OF 2016 [3rd August, 2016] An Act to provide for the establishment of funds under the public accounts of India and the public accounts of each State and crediting thereto the monies received from the user agencies towards compensatory afforestation, additional compensatory afforestation, penal compensatory 

In [369]:
surrounding_chars_1 = get_surrounding_chars(preprocessed)
surrounding_chars_2 = get_surrounding_chars(preprocessed, radius=2)

print(f"For 1 character before and after a period, we have {len(set(surrounding_chars_1))} unique patterns")
print(f"For 2 characters before and after a period, we have {len(set(surrounding_chars_2))} unique patterns")

possible_chars = get_possible_chars(surrounding_chars_1)
print_char_stats(possible_chars)

For 1 character before and after a period, we have 27 unique patterns
For 2 characters before and after a period, we have 120 unique patterns
Total: 168
Space: 168
Alpha: 0
Numeric: 0
Symbol: 0
Other: 0


### Known issues

#### In India docs:
- For images that have been translated from PDFs, not all the text gets captured properly, and sometimes only excerps of sentences get captured. Example:
    - `"18. No suit, prosecution or legal proceeding shall lie\nagainst\nagainst any authority, officer or person for anything which is\nauthority,\nofficers or\nin good faith done or intended to be done under this Act.\npersons\nacting in\ngood faith.\nOfficer and\n"`. Here, we see that after "done under this Act", there are 2 excerps of other sentences: "persons acting in good faith" and "Officer and" that are incomplete.

### Pipeline to split sentences and store them in S3 bucket

In [67]:
import re
import random
import nltk.data
import spacy 
import string
from collections import Counter
from collections import defaultdict
import nltk
import uuid
import json
import boto3
import csv
import s3fs
import codecs

en_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
es_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")

In [None]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    return re.sub(re.compile('<.*?>'), '', text)

def replace_links(text):
    text = re.sub(r'http\S+', '[URL]', text)
    return re.sub(r'www\S+', '[URL]', text)

def remove_multiple_spaces(text):
    return re.sub('\s+', ' ', text)

# abreviations = {"No.", "Sec.", "Cong.", "Dist.", "Doc."}
# acronyms = {"W.D.", "U.S.", "H.R.", "U.S.C.", "p.m.", "a.m."}

def parse_emails(text):
    """ 
    Remove the periods from emails in text, except the last one
    """
    emails = [email if email[-1] != "." else email[:-1] for email in re.findall(r"\S*@\S*\s?", txt)]
    
    for email in emails:
        new_email = email.replace(".", "")
        text = text.replace(email, new_email)
        
    return text

def parse_acronyms(text):
    """ 
    Remove the periods from acronyms in the text (i.e "U.S." becomes "US") 
    """

    acronyms = re.findall(r"\b(?:[a-zA-Z]\.){2,}", text)
         
    for acronym in acronyms:
        new_acronym = acronym.replace(".", "")
        text = text.replace(acronym, new_acronym)
        
    return text

def english_preprocessing(txt):
    """
    Steps in the preprocessing of text:
        1. Remove HTML tags
        2. Replace URLS by a tag [URL]
        3. Replace new lines and tabs by normal spaces - sometimes sentences have new lines in the middle
        4. Remove excessive spaces (more than 1 occurrence)
        5. Parse abreviations and acronyms
    """
    txt = replace_links(remove_html_tags(txt)).strip()#.replace("\n", " ").replace("\t", " ").strip()
    txt = remove_multiple_spaces(txt)
    txt = parse_emails(txt)
    txt = parse_acronyms(txt)
    
    new_txt = ""
    all_period_idx = set([indices.start() for indices in re.finditer("\.", txt)])
    
    for i, char in enumerate(txt):
        if i in all_period_idx:
            # Any char following a period that is NOT a space means that we should not add that period
            if i + 1 < len(txt) and txt[i + 1] != " ":
                continue
            
            # Any char that is a number following a period will not count. 
            # For enumerations, we're counting on docs being enumerated as "(a)" or "(ii)", and if not, they will be separated by the . after the number ("3. Something" will just be "Something" as a sentence)
            if i + 2 < len(txt) and txt[i + 2].isnumeric(): 
                continue
            
            # If we wanted to have all numbered lists together, uncomment this, and comment out the previous condition
#             if i + 2 < len(txt) and not txt[i + 2].isalpha(): 
#                 continue
            
        new_txt += char

    return new_txt

def english_postprocessing(sents, min_num_words=4):
    """
    Remove sentences that are made of less than a given number of words. Default is 4
    """
    
    return [sent for sent in sents if len(sent.split()) >= min_num_words]

def get_nltk_sents(txt, tokenizer, extra_abbreviations=None):
    if extra_abbreviations:
        tokenizer._params.abbrev_types.update(extra_abbreviations)
        
    sents = tokenizer.tokenize(txt)
    return sents

In [9]:
def aws_credentials_from_file(f_name):
    with open(f_name, "r") as f:
        creds = json.load(f)
    
    return creds["aws"]["id"], creds["aws"]["secret"]

In [62]:
def format_sents_for_output(sents, doc_id):
    formatted_sents = {}

    for i, sent in enumerate(sents):
        formatted_sents.update({f"{doc_id}_sent_{i}": {"text": sent, "label": []}})

    return formatted_sents


def output_sents(sents, f_name, f_uuid, country, bucket, output_dir):

    sents_json = {}
    fformat = f_name.split(".")[-1]
    sents_json[f_uuid] = {"metadata":
                              {"n_sentences": len(sents),
                               "file_name": f_name,
                               "file_format": fformat,
                               "country": country},
                          "sentences": format_sents_for_output(sents, f_uuid)}
    
    s3.Object(bucket, f"{output_dir}/{f_uuid}_sents.json").put(Body=(json.dumps(sents_json, indent=4)))
    
    
def filenames_for_country(country, s3, bucket):
    metadata_fname = f"metadata/{country}_metadata.csv"
    obj = s3.Object(bucket_name = bucket, key = metadata_fname)
    
    filenames = []
    i = 0
    for row in csv.reader(codecs.getreader("utf-8")(obj.get()['Body'])):
        # Add original file ID without the file format
        filenames.append(row[3][:-4])
    
    return filenames

### Actual pipeline

Reads from the S3 Bucket english documents, outputs to test_sentences

Format of credentials JSON file:
```
{
    "aws": {
        "id": "AWS ID",
        "secret": "AWS SECRET"
    }
}
```

In [63]:
credentials_file = '/Users/dafirebanks/Documents/credentials.json'
aws_id, aws_secret = aws_credentials_from_file(credentials_file)
region = 'us-east-1'

s3 = boto3.resource(
    service_name = 's3',
    region_name = region,
    aws_access_key_id = aws_id,
    aws_secret_access_key = aws_secret
)

bucket = 'wri-nlp-policy'
countries = ['India', 'USA']

This pipeline is ready to be ran on the India documents only

In [148]:
usa_abrevs = {"no", "sec", "cong", "dist", "doc"}
country = "India"
out_dir = "english_documents/test_sentences"
# usa_filenames = filenames_for_country(country, s3, bucket)
india_filenames = filenames_for_country("India", s3, bucket)

i = 0
for obj in s3.Bucket(bucket).objects.all().filter(Prefix="english_documents/text_files/"):
    
    # Don't get the directory itself
    if not obj.key.endswith("/"):
        file_id = obj.key.replace("english_documents/text_files/", "").replace(".txt", "")
        if file_id in india_filenames:
            text = obj.get()['Body'].read().decode('utf-8')
            preprocessed = english_preprocessing(text)
            sents = get_nltk_sents(preprocessed, en_tokenizer, usa_abrevs)
            post_processed_sents = english_postprocessing(sents, min_num_words=5)
            output_sents(post_processed_sents, obj.key, file_id, country, bucket, out_dir)

    i += 1
    if i % 100 == 0:
        print(f"Processed {i} documents")
    

16412it [59:52,  4.57it/s]


['dd0ccf82ae888eca4c9926f3dcb393b85c0d6c76',
 'c69fd24b9d42100fe63a17f1e67ad35c6d1b4011',
 'cc1e1f9a78f07a43e7c4e80e15b4411cc4022794',
 '3a17da0f2708a1003d1f82bd77fdbc6a94b57b7e',
 'dff9b9e3d8fe809eb197c4d9ae96fd18a7d66810',
 '229d94d7e95e99d4eb391dbe56fe02c3c963d294',
 '31890e39e01b5618bd09616d4f6588274307b536',
 'd62b75756ba6e1081b63782a5ae0b548abf3c42b',
 '2ee3a6fd9b0f30fe175a4612605b89bfc891140b',
 '6129ba23c12e048e1be307f80468fd0d870e1350',
 'fbc201235d1a0decc7711b54c6a65d1db9675642',
 '2e24c3cc453ec936819538ca2ec9875e46cbe0fc',
 '9eb7592fc5410c4b88d75de05ad96e9e710a66df',
 '6920b58518783a76d4896b718d529f1fb6c49f9d',
 '1864e2a805a63aa1bbddb61d630039a1a7c80fd7',
 '4f678a715d42509c7f88b008256af4979d19d29b',
 'd04327c575b3beda074764785bbdbe88a1c606db',
 '16bc31186165c97d824d9b1a65e5a52429b59b0a',
 '991fbe57e7da81d85c3c9efe274beb187a00eacf',
 '33c0d2c622e37ef5ddef7470cbd197307ecc67d9',
 '71515cbc6a80b843911f9a5b48dd420917d68bb7',
 '0d3e8bb0d6dca0ca78d78adc6028cf1ee55a77f3',
 '34be557b