# Pipeline to split documents into sentences and store them in an S3 bucket

- Methods for sentence splitting

In [1]:
import re
import random
import nltk.data
import spacy 
import string
from collections import Counter
from collections import defaultdict
import nltk
import uuid
import json
import boto3
import csv
import s3fs
import codecs

en_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
es_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")

In [7]:
def remove_html_tags(text):
    """Return *text* with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. ``.`` does not match a newline here, so a tag
    that spans several lines is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

def replace_links(text):
    """Replace link-like tokens in *text* with the literal tag ``[URL]``.

    Two passes are made, in this order: tokens starting at ``http`` and then
    tokens starting at ``www``. Each token extends up to the next whitespace
    character, so trailing punctuation glued to a link is swallowed as well.
    """
    for link_pattern in (r'http\S+', r'www\S+'):
        text = re.sub(link_pattern, '[URL]', text)
    return text

def remove_multiple_spaces(text):
    """Collapse every run of whitespace in *text* into a single space.

    ``\\s`` also matches new lines and tabs, so those are turned into plain
    spaces too. Leading/trailing whitespace is reduced to one space, not
    stripped.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text)

# abbreviations = {"No.", "Sec.", "Cong.", "Dist.", "Doc."}
# acronyms = {"W.D.", "U.S.", "H.R.", "U.S.C.", "p.m.", "a.m."}

def parse_emails(text):
    """
    Remove the periods from emails in text, except the last one

    An "email" is any whitespace-delimited token containing ``@``. Every
    period inside the token is deleted so it cannot be mistaken for a sentence
    boundary. A period at the very end of the token is kept, because it is
    the punctuation that closes the sentence rather than part of the address
    (i.e. "Write to a.b@c.org. Thanks" becomes "Write to ab@corg. Thanks").

    Returns the rewritten text.
    """

    def _strip_inner_periods(match):
        token = match.group(0)
        if token.endswith("."):
            # Keep the sentence-ending period, drop the ones inside the address.
            return token[:-1].replace(".", "") + "."
        return token.replace(".", "")

    # The pattern deliberately excludes trailing whitespace: including it hid
    # the final period from the check above whenever the email was followed by
    # a space, so the sentence boundary was lost.
    return re.sub(r"\S*@\S*", _strip_inner_periods, text)

def parse_acronyms(text):
    """
    Remove the periods from acronyms in the text (i.e "U.S." becomes "US")

    An acronym is two or more consecutive "letter + period" pairs starting at
    a word boundary. Each match is rewritten where it occurs, so a short
    acronym that is a prefix of a longer one ("U.S." vs "U.S.C.") cannot
    corrupt the longer one and leave a stray period behind.

    Returns the rewritten text.
    """
    return re.sub(
        r"\b(?:[a-zA-Z]\.){2,}",
        lambda match: match.group(0).replace(".", ""),
        text,
    )

def english_preprocessing(txt):
    """
    Clean raw document text before it is handed to the sentence tokenizer.

    Steps in the preprocessing of text:
        1. Remove HTML tags
        2. Replace URLs by a tag [URL]
        3. Collapse every whitespace run into one space (this also covers new lines
           and tabs - sometimes sentences have new lines in the middle)
        4. Remove the periods from emails and acronyms
        5. Drop the periods that should not act as sentence boundaries (see loop below)

    Returns the cleaned text as a single string.
    """
    txt = replace_links(remove_html_tags(txt)).strip()
    txt = remove_multiple_spaces(txt)
    txt = parse_emails(txt)
    txt = parse_acronyms(txt)

    kept_chars = []
    txt_len = len(txt)

    for i, char in enumerate(txt):
        if char == ".":
            # Any char following a period that is NOT a space means that we should not add that period
            if i + 1 < txt_len and txt[i + 1] != " ":
                continue

            # A period whose second-next character is a number is dropped as well.
            # For enumerations, we're counting on docs being enumerated as "(a)" or "(ii)", and if not,
            # they will be separated by the . after the number ("3. Something" will just be "Something"
            # as a sentence)
            if i + 2 < txt_len and txt[i + 2].isnumeric():
                continue

            # NOTE: to keep all numbered lists together, replace the previous condition with
            # `not txt[i + 2].isalpha()`.

        kept_chars.append(char)

    # Join once at the end instead of growing a string character by character.
    return "".join(kept_chars)

def english_postprocessing(sents, min_num_words=4):
    """
    Keep only the sentences that have at least `min_num_words` whitespace-separated
    words (4 by default). The order of the surviving sentences is unchanged.
    """
    kept = []
    for sentence in sents:
        word_count = len(sentence.split())
        if word_count < min_num_words:
            continue
        kept.append(sentence)
    return kept

def get_nltk_sents(txt, tokenizer, extra_abbreviations=None):
    """
    Split `txt` into sentences with `tokenizer.tokenize`.

    When `extra_abbreviations` is given (and not empty) it is first merged into
    `tokenizer._params.abbrev_types`. This modifies the tokenizer object that was
    passed in, so the added abbreviations stay registered for later calls.
    # NOTE(review): presumably a Punkt tokenizer expecting lowercase abbreviations
    # without the trailing period - confirm against the NLTK docs.
    """
    if not extra_abbreviations:
        return tokenizer.tokenize(txt)

    known_abbreviations = tokenizer._params.abbrev_types
    known_abbreviations.update(extra_abbreviations)
    return tokenizer.tokenize(txt)

In [3]:
def aws_credentials_from_file(f_name):
    """
    Read the JSON file at `f_name` and return the pair (id, secret) stored under
    its "aws" key.

    Raises KeyError when "aws", "id" or "secret" is missing from the file.
    """
    with open(f_name, "r") as handle:
        aws_section = json.loads(handle.read())["aws"]

    return aws_section["id"], aws_section["secret"]

In [4]:
def format_sents_for_output(sents, doc_id):
    """
    Map every sentence to an entry keyed "<doc_id>_sent_<position>".

    Each entry holds the sentence under "text" and a new empty list under "label".
    """
    return {
        f"{doc_id}_sent_{position}": {"text": sentence, "label": []}
        for position, sentence in enumerate(sents)
    }


def output_sents(sents, f_name, f_uuid, country, bucket, output_dir):
    """
    Serialize the sentences of one document to JSON and upload it to S3.

    The JSON has a single top-level key, `f_uuid`, holding a "metadata" section
    (sentence count, file name, file format, country) and a "sentences" section
    built by `format_sents_for_output`. The file format is whatever follows the
    last "." in `f_name`.

    The object is written to "<output_dir>/<f_uuid>_sents.json" in `bucket`.
    NOTE: this uses the module-level `s3` resource; it is not passed in.
    """
    metadata = {
        "n_sentences": len(sents),
        "file_name": f_name,
        "file_format": f_name.split(".")[-1],
        "country": country,
    }
    document = {
        f_uuid: {
            "metadata": metadata,
            "sentences": format_sents_for_output(sents, f_uuid),
        }
    }

    destination_key = f"{output_dir}/{f_uuid}_sents.json"
    payload = json.dumps(document, indent=4)
    s3.Object(bucket, destination_key).put(Body=payload)
    
    
def filenames_for_country(country, s3, bucket):
    """
    Return the file IDs listed in "metadata/<country>_metadata.csv" of `bucket`.

    The CSV is streamed from S3 and decoded as UTF-8. For every row, the value
    of the fourth column with its last four characters cut off is collected
    (the original file ID without the file format). Every row is treated the
    same way, so a header row, if the file has one, is included too.
    """
    metadata_key = f"metadata/{country}_metadata.csv"
    metadata_obj = s3.Object(bucket_name = bucket, key = metadata_key)

    utf8_stream = codecs.getreader("utf-8")(metadata_obj.get()['Body'])
    return [record[3][:-4] for record in csv.reader(utf8_stream)]

- Actual pipeline

Reads the English documents from the S3 bucket (`english_documents/text_files/`) and writes the split sentences to `english_documents/test_sentences`.


Format of credentials JSON file:
```
{
    "aws": {
        "id": "AWS ID",
        "secret": "AWS SECRET"
    }
}
```

In [5]:
# Static settings for this run.
region = 'us-east-1'
bucket = 'wri-nlp-policy'
countries = ['India', 'USA']

# Load the AWS key pair from the local credentials JSON file.
credentials_file = '/Users/dafirebanks/Documents/credentials.json'
aws_id, aws_secret = aws_credentials_from_file(credentials_file)

# S3 resource handle used by the rest of the notebook to read and write objects.
s3 = boto3.resource(
    service_name='s3',
    region_name=region,
    aws_access_key_id=aws_id,
    aws_secret_access_key=aws_secret,
)

This pipeline is ready to be run on the India documents only.

In [8]:
# Define abbreviations per country - this can work for all english documents
usa_abrevs = {"no", "sec", "cong", "dist", "doc"}
india_abrevs = {"sub", "subs", "ins", "govt", "dy", "dept", "deptt", "ptg"}

# Define country and output directory in S3 bucket
country = "India"
out_dir = "english_documents/test_sentences"
select_filenames = filenames_for_country(country, s3, bucket)
# Set copy so the membership test inside the loop does not scan the whole list per object.
selected_ids = set(select_filenames)

# Prefix under which the plain-text documents live.
text_prefix = "english_documents/text_files/"

# Stop after this many listed objects (trial-run cap). Set to None to go through the whole prefix.
max_objects = 2

# `i` counts every object listed under the prefix, whether or not it was selected and processed.
i = 0
for obj in s3.Bucket(bucket).objects.all().filter(Prefix=text_prefix):

    # Don't get the directory itself
    if not obj.key.endswith("/"):
        file_id = obj.key.replace(text_prefix, "").replace(".txt", "")
        if file_id in selected_ids:
            text = obj.get()['Body'].read().decode('utf-8')
            preprocessed = english_preprocessing(text)
            # NOTE: Change abbreviations in the future specific to country
            sents = get_nltk_sents(preprocessed, en_tokenizer, india_abrevs)
            post_processed_sents = english_postprocessing(sents, min_num_words=5)
            output_sents(post_processed_sents, obj.key, file_id, country, bucket, out_dir)

    i += 1
    if max_objects is not None and i == max_objects:
        break
    if i % 100 == 0:
        print(f"Processed {i} documents")
    