# Pipeline to split sentences and store them in S3 bucket

- Methods for sentence splitting

In [1]:
import re
import random
import nltk.data
import spacy 
import string
from collections import Counter
from collections import defaultdict
import nltk
import uuid
import json
import boto3
import csv
import s3fs
import codecs
import fasttext

path_to_pretrained_model = "/Users/dafirebanks/Projects/models/fasttext/lid.176.bin"
fmodel = fasttext.load_model(path_to_pretrained_model)

en_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
es_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")



In [2]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    return re.sub(re.compile('<.*?>'), '', text)

def replace_links(text):
    text = re.sub(r'http\S+', '[URL]', text)
    return re.sub(r'www\S+', '[URL]', text)

def remove_multiple_spaces(text):
    return re.sub('\s+', ' ', text)

def parse_emails(text):
    """ 
    Remove the periods from emails in text, except the last one
    """
    emails = [email if email[-1] != "." else email[:-1] for email in re.findall(r"\S*@\S*\s?", text)]
    
    for email in emails:
        new_email = email.replace(".", "")
        text = text.replace(email, new_email)
        
    return text

def parse_acronyms(text):
    """ 
    Remove the periods from acronyms in the text (i.e "U.S." becomes "US") 
    """

    acronyms = re.findall(r"\b(?:[a-zA-Z]\.){2,}", text)
         
    for acronym in acronyms:
        new_acronym = acronym.replace(".", "")
        text = text.replace(acronym, new_acronym)
        
    return text

def english_preprocessing(txt):
    """
    Steps in the preprocessing of text:
        1. Remove HTML tags
        2. Replace URLS by a tag [URL]
        3. Replace new lines and tabs by normal spaces - sometimes sentences have new lines in the middle
        4. Remove excessive spaces (more than 1 occurrence)
        5. Parse abreviations and acronyms
    """
    txt = replace_links(remove_html_tags(txt)).strip()#.replace("\n", " ").replace("\t", " ").strip()
    txt = remove_multiple_spaces(txt)
    txt = parse_emails(txt)
    txt = parse_acronyms(txt)
    
    new_txt = ""
    all_period_idx = set([indices.start() for indices in re.finditer("\.", txt)])
    
    for i, char in enumerate(txt):
        if i in all_period_idx:
            # Any char following a period that is NOT a space means that we should not add that period
            if i + 1 < len(txt) and txt[i + 1] != " ":
                continue
            
            # Any char that is a number following a period will not count. 
            # For enumerations, we're counting on docs being enumerated as "(a)" or "(ii)", and if not, they will be separated by the . after the number ("3. Something" will just be "Something" as a sentence)
            if i + 2 < len(txt) and txt[i + 2].isnumeric(): 
                continue
            
            # If we wanted to have all numbered lists together, uncomment this, and comment out the previous condition
#             if i + 2 < len(txt) and not txt[i + 2].isalpha(): 
#                 continue
            
        new_txt += char

    return new_txt

def english_postprocessing(sents, min_num_words=4):
    """
    Remove sentences that are made of less than a given number of words. Default is 4
    """
    
    return [sent for sent in sents if len(sent.split()) >= min_num_words]

def get_nltk_sents(txt, tokenizer, extra_abbreviations=None):
    if extra_abbreviations:
        tokenizer._params.abbrev_types.update(extra_abbreviations)
        
    sents = tokenizer.tokenize(txt)
    return sents

def aws_credentials_from_file(f_name):
    with open(f_name, "r") as f:
        creds = json.load(f)
    
    return creds["aws"]["id"], creds["aws"]["secret"]

def format_sents_for_output(sents, doc_id):
    formatted_sents = {}

    for i, sent in enumerate(sents):
        formatted_sents.update({f"{doc_id}_sent_{i}": {"text": sent, "label": []}})

    return formatted_sents


def output_sents(sents, f_name, f_uuid, country, bucket, output_dir):

    sents_json = {}
    fformat = f_name.split(".")[-1]
    sents_json[f_uuid] = {"metadata":
                              {"n_sentences": len(sents),
                               "file_name": f_name,
                               "file_format": fformat,
                               "country": country},
                          "sentences": format_sents_for_output(sents, f_uuid)}
    
    s3.Object(bucket, f"{output_dir}/{f_uuid}_sents.json").put(Body=(json.dumps(sents_json, indent=4)))
    
    
def filenames_for_country(country, s3, bucket):
    metadata_fname = f"metadata/{country}_metadata.csv"
    obj = s3.Object(bucket_name = bucket, key = metadata_fname)
    
    filenames = []
    i = 0
    for row in csv.reader(codecs.getreader("utf-8")(obj.get()['Body'])):
        # Add original file ID without the file format
        filenames.append(row[3][:-4])
    
    return filenames

- Actual pipeline

Reads from the S3 Bucket english documents, outputs to test_sentences


Format of credentials JSON file:
```
{
    "aws": {
        "id": "AWS ID",
        "secret": "AWS SECRET"
    }
}
```

In [None]:
# Connection to AWS Jordi's way
folder = '/home/propietari/Documents/claus/' # TODO: change to your local path
file_name = 'AWS_S3_keys_wri.json' # TODO: Change to your filename
file = folder + file_name

with open(file, 'r') as dict:
    credentials = json.load(dict)
                                      
aws_id = list(credentials)[0]
aws_secret = list(credentials.values())[0]
region = 'us-east-1'
bucket = "wri-nlp-policy"
countries = ['India', 'USA']
s3 = boto3.resource(
    service_name = 's3',
    region_name = region,
    aws_access_key_id = aws_id,
    aws_secret_access_key = aws_secret
)

In [3]:
credentials_file = '/Users/dafirebanks/Documents/credentials.json'
aws_id, aws_secret = aws_credentials_from_file(credentials_file)
region = 'us-east-1'

s3 = boto3.resource(
    service_name = 's3',
    region_name = region,
    aws_access_key_id = aws_id,
    aws_secret_access_key = aws_secret
)

bucket = 'wri-nlp-policy'
countries = ['India', 'USA']

This pipeline is ready to be ran on the India documents only

In [8]:
# Define abbreviations per country - this can work for all english documents
usa_abrevs = {"no", "sec", "cong", "dist", "doc"}
india_abrevs = {"sub", "subs", "ins", "govt", "dy", "dept", "deptt", "ptg"} 

# Define country and output directory in S3 bucket
country = "India"
out_dir = "english_documents/test_sentences"
select_filenames = filenames_for_country(country, s3, bucket)

i = 0
for obj in s3.Bucket(bucket).objects.all().filter(Prefix="english_documents/text_files/"):
    
    # Don't get the directory itself
    if not obj.key.endswith("/"):
        file_id = obj.key.replace("english_documents/text_files/", "").replace(".txt", "")
        if file_id in select_filenames:
            text = obj.get()['Body'].read().decode('utf-8')
            preprocessed = english_preprocessing(text)
            # NOTE: Change abreviations in the future specific to country
            sents = get_nltk_sents(preprocessed, en_tokenizer, india_abrevs)
            post_processed_sents = english_postprocessing(sents, min_num_words=5)
            output_sents(post_processed_sents, obj.key, file_id, country, bucket, out_dir)

    i += 1
    if i == 2:
        break
    if i % 100 == 0:
        print(f"Processed {i} documents")
    

#### This cell incorporates the language detection

In [24]:
all_abrevs = {"no", "sec", "cong", "dist", "doc", "sub", "subs", "ins", "govt", "dy", "dept", "deptt", "ptg"} 

# Define country and output directory in S3 bucket
country = "India"
out_dir = "english_documents/test_sentences/processed"
select_filenames = filenames_for_country(country, s3, bucket)

i = 0
for obj in s3.Bucket(bucket).objects.all().filter(Prefix = "english_documents/text_files/new/"):
    
    # Don't get the directory itself
    if not obj.key.endswith("/"):
        file_id = obj.key.replace("english_documents/text_files/new/", "").replace(".txt", "")
        if file_id in select_filenames:
#             # Uncomment to do language detection before sentence splitting
#             text = ''
#             line_i = 0
#             for line in obj.get()['Body'].read().splitlines():
#                 line = line.decode('utf-8')
#                 try:
#                     result = fmodel.predict([line])
#                     print("--------------")
#                     print("Line:", line, " || Language:", result[0][0][-2:])
#                     print("--------------")
#                     if result[0][0][0][-2:] == 'en' and result[1][0][0] > 0.7:
#                         text = text + line + "\n"
#                         print(">>>>>>>>>>> English sentence:", line)
#                     line_i += 1
#                     if line_i == 100:
#                         break
#                 except:
#                     print("*** ", line)
#                     continue
#             text = obj.get()['Body'].read().decode('utf-8')
#             preprocessed = english_preprocessing(text)
#             # NOTE: Change abreviations in the future specific to country
#             sents = get_nltk_sents(preprocessed, en_tokenizer, all_abrevs)
#             post_processed_sents = english_postprocessing(sents, min_num_words=5)
#             ---------------------------------------------------------------------------
#             # Uncomment if you want to apply language detection after sentence splitting
#             sent_i = 0
#             for sent in post_processed_sents:
#                 try:
#                     result = fmodel.predict(sent)
#                     print("--------------")
#                     print("Sentence:", sent, " || Language:", result[0][0][-2:])
#                     print("--------------")
#                     if result[0][0][0][-2:] == 'en' and result[1][0][0] > 0.7:
#                         print(">>>>> English sentence:", sent)
#                     sent_i += 1
#                     if sent_i == 100:
#                         break
#                 except:
#                     print("*** ", sent)
#                     continue

#             output_sents(post_processed_sents, obj.key, file_id, country, bucket, out_dir)

    i += 1
    if i == 2:
        break
    if i % 100 == 0:
        print(f"Processed {i} documents")

--------------
Sentence: 137 (72) PTSTENT 2101-97, TORER 21, 2002 4 (TT) 4 (") (2) The words and expressions used but not defined in CAD & WU DEPARTMENT thece urles have the same meaning as are respectively assigned NOTIFICATION to them in the Act.  || Language: en
--------------
--------------
Sentence: THE RAJASTHAN FARMERS' PARTICIPATION IN CHAPTER-II MANAGEMENT OF IRRICATION SYSTEMS FORMATION OF TERRITORIAL CONSTITUENCIES RULES, 2002 3.  || Language: en
--------------
--------------
Sentence: Formation of territorial constituencies-(1) Every Jaipur, October 22, 2002.  || Language: en
--------------
--------------
Sentence: G. S. R 80--In exercise of the powers conferred by in water users area shall divided by the Project Authority in to section 47 of the Rajasthan Farmers' Participation such number of territorial constituencies as given below- 2000), the State Government hereby makes the following Management of Irrigation Systems Act, 2000 (Act No 21 of Area of Water Users Number o

Extra functions to read the abbreviations file from S3, and move files between S3 bucket folders

In [22]:
abbreviations_fname = f"abbreviations/english_abbreviations.txt"
obj = s3.Object(bucket_name = bucket, key = abbreviations_fname)

In [23]:
obj.get()['Body'].read().decode('utf-8').split("\n")

['sub',
 'subs',
 'ins',
 'govt',
 'dy',
 'dept',
 'deptt',
 'ptg',
 'no',
 'sec',
 'cong',
 'dist',
 'doc']

In [36]:
def move_s3_object(obj_name, obj_old_folder, obj_new_folder, s3, bucket_name):
    """
    Moves an object from a given S3 folder to another by copying it to the new folder it and then deleting it from the old one
    """
    try:
        s3.Object(bucket_name, f"{obj_new_folder}/{obj_name}").copy_from(CopySource=f"{bucket_name}/{obj_old_folder}/{obj_name}")
        response = s3.Object(bucket_name, f"{obj_old_folder}/{obj_name}").delete()
    except Exception as e:
        print(f"Exception while moving {obj_name} from {obj_old_folder} to {obj_new_folder}:")
        print(e)

In [38]:
move_s3_object("english_abbreviations.txt", "english_documents", "abbreviations", s3, "wri-nlp-policy")

## Testing language detection on sample file

In [4]:
with open("/Users/dafirebanks/Downloads/011e041eced3f90558b955e5f65c7682608e35cc.txt", "r") as f:
    text = f.read()

### 1. Apply language detection before sentence splitting 

In [19]:
for line in text.splitlines():
    try:
        result = fmodel.predict([line])
        print("--------------")
        print("Line:", line, " || Language:", result[0][0][-2:])
        print("lang_detect:", detect_langs(line))
        print("Probability:", result[1][0])
        print("--------------")
        if result[0][0][0][-2:] == 'en' and result[1][0][0] > 0.7:
            text = text + line + "\n"
            print(">>>>>>>>>>> English sentence:", line)
    except:
        print("*** ", line)
        continue

--------------
Line: 12456  || Language: ['__label__gu']
lang_detect:
***  12456
--------------
Line: THE GAZETTE OF INDIA, NOVEMBER 10, 2012 (KARTIKA 19, 1934)  || Language: ['__label__en']
lang_detect:
[en:0.999994980952714]
Probability: [0.3980843]
--------------
--------------
Line: [PART III-SEC. 4  || Language: ['__label__en']
lang_detect:
[id:0.9999939202976486]
Probability: [0.5178548]
--------------
--------------
Line: University Grants Commission  || Language: ['__label__en']
lang_detect:
[en:0.9999950418111176]
Probability: [0.7218409]
--------------
>>>>>>>>>>> English sentence: University Grants Commission
--------------
Line: No. F. 15-3/2012 (ARC)  || Language: ['__label__en']
lang_detect:
[pt:0.9999956692468923]
Probability: [0.62202793]
--------------
--------------
Line: 8 October, 2012  || Language: ['__label__en']
lang_detect:
[en:0.9999956111997939]
Probability: [0.83474034]
--------------
>>>>>>>>>>> English sentence: 8 October, 2012
--------------
Line: In exerc

### 2. Apply language detection after sentence splitting

In [21]:
all_abrevs = {"no", "sec", "cong", "dist", "doc", "sub", "subs", "ins", "govt", "dy", "dept", "deptt", "ptg"} 
preprocessed = english_preprocessing(text)
# NOTE: Change abreviations in the future specific to country
sents = get_nltk_sents(preprocessed, en_tokenizer, all_abrevs)
post_processed_sents = english_postprocessing(sents, min_num_words=5)
for sent in post_processed_sents:
    try:
        result = fmodel.predict(sent)
        print("--------------")
        print("Sentence:", sent, " || Language:", result[0][0][-2:])
        print("lang_detect:", detect_langs(sent))
        print("Probability:", result[1][0])
        print("--------------")
        if result[0][0][0][-2:] == 'en' and result[1][0][0] > 0.7:
            print(">>>>> English sentence:", sent)
    except:
        print("*** ", sent)
        continue

--------------
Sentence: 12456 THE GAZETTE OF INDIA, NOVEMBER 10, 2012 (KARTIKA 19, 1934) [PART III-SEC 4 University Grants Commission No. F 15-3/2012 (ARC) 8 October, 2012 In exercise of powers conferred under clause (g) of sub-section (1) of section 26 of the University Grants Commission Act 1956 (3 of 1956) the, University Grants Commission here by makes the following regulations, namely:- (1) These regulations may be called the "curbing the Menace of Ragging in Higher Educational Institutions (Amendment) Regulations, 2012" (2) They shall come into force on the date of their publication in the Official Gazette 2.  || Language: en
lang_detect: [en:0.9999970044210891]
Probability: 0.8904610276222229
--------------
--------------
Sentence: In UGC Regulations on Curbing the Menace of Ragging in Higher Educational Institutions, 2009, (hereinafter referred to as the Principal regulations), in regulation 1, in sub-regulation 11, for the letters and words "ÜGC Regulations on curbing the Men

In [14]:
sample_text = " taties of (in fidlth fild (t hid up (titelye) reaft (tr (t tapite) OF ill depj Offly"

In [15]:
from langdetect import DetectorFactory, detect, detect_langs

In [18]:
# detect(sample_text) #  'cs'
detect_langs(sample_text)  # [cs:0.7142840957132709, pl:0.14285810606233737, sk:0.14285779665739756]

[en:0.9999959850517358]