# Spanish preprocessor/sentence splitting

In [1]:
import os
# Move three levels up from the notebook's folder so that the project
# packages imported in the following cells can be resolved.
# The starting directory is remembered on first execution: a bare relative
# chdir is not idempotent, so re-running this cell would otherwise climb
# three more directories every time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../../../"))
os.getcwd()

'/home/propietari/Documents/GitHub/policy-data-analyzer'

In [4]:
from tasks.text_preprocessing import *

In [5]:
# 1. Setup amazon client
language = "spanish"
bucket_name = "wri-nlp-policy"
# Path of the credentials file handed to S3Client. It is machine specific, so
# it can be overridden with the S3_CREDS_FILEPATH environment variable
# instead of editing this cell; the fallback is the path that was previously
# hard-coded here (the earlier, immediately overwritten assignment was dead).
creds_filepath = os.environ.get(
    "S3_CREDS_FILEPATH",
    "/home/propietari/Documents/claus/AWS_S3_keys_wri_sentence_splitting.json",
)

s3_client = S3Client(creds_filepath=creds_filepath, bucket_name=bucket_name, language=language)

In [6]:
# 2. Make sure we're getting the right files
# Only peek at the first two documents: print the id and the first 100
# characters of each, then stop iterating.
i = 0
for i, (file_id, text) in enumerate(s3_client.load_text_files(language), start=1):
    print("File_id:", file_id)
    print("Text:", text[:100])
    print("=======================================")
    if i == 2:
        break


TypeError: can only concatenate str (not "dict") to str

In [14]:
# 3. Start preprocessing
tokenizer = nltk.data.load(f"tokenizers/punkt/{language}.pickle")
abbrevs = {"ord", "num", "sra", "no", "corp", "art", "ltda", "ud"}
min_num_words = 5

new_text_files_folder = f"{language}_documents/text_files/new"
processed_text_files_folder = f"{language}_documents/text_files/processed"

print_every = 100
# One {file_id: exception} entry per document that failed anywhere in the pipeline
error_files = []

# `i` counts the documents seen so far (it stays 0 when there are none)
i = 0
for i, (file_id, text) in enumerate(s3_client.load_text_files(language), start=1):
    try:
        file_id = file_id.replace("/", "")
        # preprocess -> split into sentences -> drop short ones -> format
        preprocessed_text = preprocess_spanish_text(text)
        sents = get_nltk_sents(preprocessed_text, tokenizer, abbrevs)
        postprocessed_sents = format_sents_for_output(
            remove_short_sents(sents, min_num_words),
            file_id,
        )
        # Store the result first; the source file is only moved out of the
        # "new" folder once storing succeeded.
        s3_client.store_sentences(postprocessed_sents, file_id, language)
        s3_client.move_object(file_id + ".txt", new_text_files_folder, processed_text_files_folder)
    except Exception as e:
        # Best effort: remember the failure and carry on with the next document
        error_files.append({file_id: e})

    # For testing and early stopping, uncomment this
#     if i == 2:
#         break

    if i % print_every == 0:
        print("----------------------------------------------")
        print(f"Processing {i} documents...")
        print(f"Number of errors so far: {len(error_files)}")
        print("----------------------------------------------")

In [None]:
# If you want to store the errors
# `error_files` holds {file_id: exception} entries and exception objects are
# not JSON serializable, so anything json cannot encode is written as its repr().
with open(f"../output/{language}_sentence_splitting_errors.json", "w") as f:
    json.dump(error_files, f, default=repr)

# Main code

From this section until the end, the code is mainly experimental.

In [1]:
import re
import random
import nltk.data
import spacy 
import string
from collections import Counter
from collections import defaultdict
import nltk
import unidecode

In [2]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span (shortest match, single line) deleted."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

def replace_links(text):
    """
    Replace link-like tokens with the placeholder ``[URL]``.

    A token is everything from ``http`` (first pass) or ``www`` (second pass)
    up to the next whitespace character.
    """
    for link_pattern in (r'http\S+', r'www\S+'):
        text = re.sub(link_pattern, '[URL]', text)
    return text

def remove_multiple_spaces(text):
    """
    Collapse every run of whitespace (spaces, tabs, new lines, ...) into a
    single space. Leading/trailing whitespace is collapsed too, not stripped.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence
    return re.sub(r'\s+', ' ', text)

def parse_emails(text):
    """
    Remove the periods from emails in text, except the last one.

    An "email" is any whitespace-delimited token containing ``@``. Periods
    inside the token are dropped so the sentence tokenizer does not split on
    them; a single period at the very end of the token is treated as sentence
    punctuation and kept.

    Args:
        text: the text to process.

    Returns:
        The text with the inner periods of every email-like token removed.
    """
    def _strip_inner_periods(match):
        token = match.group(0)
        # Keep one trailing period (end of sentence), drop all the others
        if token.endswith("."):
            return token[:-1].replace(".", "") + "."
        return token.replace(".", "")

    # The pattern deliberately excludes the whitespace after the token:
    # capturing it hid the trailing period from the check above, so an email
    # that closed a sentence in the middle of the text lost its final period.
    return re.sub(r"\S*@\S*", _strip_inner_periods, text)

def parse_acronyms(text):
    """
    Strip the periods out of dotted acronyms (i.e "U.S." becomes "US").

    An acronym is two or more consecutive "letter + period" pairs starting at
    a word boundary. All acronyms are collected from the input first, then
    each one is replaced everywhere in the text, in order of appearance.
    """
    acronym_pattern = re.compile(r"\b(?:[a-zA-Z]\.){2,}")

    for dotted in acronym_pattern.findall(text):
        text = text.replace(dotted, dotted.replace(".", ""))

    return text

def spanish_preprocessing(txt, remove_new_lines=False):
    """
    Prepare raw text for sentence splitting.

    Steps in the preprocessing of text:
        1. Remove HTML tags
        2. Replace URLS by a tag [URL]
        3. Replace new lines and tabs by normal spaces - sometimes sentences have new lines in the middle
        4. Remove excessive spaces (more than 1 occurrence)
        5. Parse abbreviations and acronyms (periods inside emails and acronyms are removed)
        6. Drop periods that do not look like sentence boundaries (see the loop below)
        7. Pass the result through unidecode

    Args:
        txt: raw document text.
        remove_new_lines: replace new lines and tabs by spaces before step 4.
            NOTE: step 4 already collapses every whitespace run (new lines and
            tabs included) into one space, so this flag looks redundant.

    Returns:
        The preprocessed text as returned by ``unidecode.unidecode``.
    """
    txt = replace_links(remove_html_tags(txt)).strip()
    if remove_new_lines:
        txt = txt.replace("\n", " ").replace("\t", " ").strip()
    txt = remove_multiple_spaces(txt)
    txt = parse_emails(txt)
    txt = parse_acronyms(txt)

    # Collect the kept characters in a list and join once at the end; growing
    # a string with += copies it on every step.
    kept_chars = []
    txt_len = len(txt)

    for i, char in enumerate(txt):
        if char == ".":
            # Any char following a period that is NOT a space means that we should not add that period
            if i + 1 < txt_len and txt[i + 1] != " ":
                continue

            # Any char that is a number following a period will not count.
            # For enumerations, we're counting on docs being enumerated as "(a)" or "(ii)", and if not,
            # they will be separated by the . after the number ("3. Something" will just be "Something" as a sentence)
            if i + 2 < txt_len and txt[i + 2].isnumeric():
                continue

            # If we wanted to have all numbered lists together, uncomment this, and comment out the previous condition
#             if i + 2 < txt_len and not txt[i + 2].isalpha():
#                 continue

        kept_chars.append(char)

    return unidecode.unidecode("".join(kept_chars))

def get_nltk_sents(txt, tokenizer, extra_abbreviations=None):
    """
    Split ``txt`` into sentences with the given tokenizer.

    When ``extra_abbreviations`` is non-empty, its items are first added to
    the tokenizer's ``_params.abbrev_types`` (the tokenizer is modified in place).

    Returns:
        Whatever ``tokenizer.tokenize(txt)`` returns.
    """
    if extra_abbreviations:
        known_abbrevs = tokenizer._params.abbrev_types
        known_abbrevs.update(extra_abbreviations)

    return tokenizer.tokenize(txt)

def spanish_postprocessing(sents, min_num_words=4):
    """
    Keep only the sentences that have at least ``min_num_words``
    whitespace-separated words (4 by default); order is preserved.
    """
    long_enough = []
    for sentence in sents:
        word_count = len(sentence.split())
        if word_count >= min_num_words:
            long_enough.append(sentence)
    return long_enough

def format_sents_for_output(sents, doc_id):
    """
    Build the output mapping for one document.

    Each sentence gets the key ``"<doc_id>_sent_<position>"`` and the value
    ``{"text": <sentence>, "label": []}`` (a fresh empty list per sentence).
    """
    return {
        f"{doc_id}_sent_{position}": {"text": sentence, "label": []}
        for position, sentence in enumerate(sents)
    }

# Experiments

In [None]:
import os
import pandas as pd
import pickle
import re


USC_re = re.compile('[Uu]\.*[Ss]\.*[Cc]\.]+')
PAREN_re = re.compile('\([^(]+\ [^\(]+\)')
BAD_PUNCT_RE = re.compile(r'([%s])' % re.escape('"#%&\*\+/<=>@[\]^{|}~_'), re.UNICODE)
BULLET_RE = re.compile('\n[\ \t]*`*\([a-zA-Z0-9]*\)')
DASH_RE = re.compile('--+')
WHITESPACE_RE = re.compile('\s+')
EMPTY_SENT_RE = re.compile('[,\.]\ *[\.,]')
FIX_START_RE = re.compile('^[^A-Za-z]*')
FIX_PERIOD = re.compile('\.([A-Za-z])')
SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

FIX_PERIOD = re.compile('\.([A-Za-z])')

SECTION_HEADER_RE = re.compile('SECTION [0-9]{1,2}\.|\nSEC\.* [0-9]{1,2}\.|Sec\.* [0-9]{1,2}\.')

def clean_text(text):
    """
    Apply a fixed sequence of regex/string clean-ups to ``text``.

    Borrowed from the FNDS text processing with additional logic added in.
    Note: we do not take care of token breaking - assume SPACY's tokenizer
    will handle this for us.

    The order of the substitutions matters: section headers are first swapped
    for a punctuation-free placeholder so they survive the punctuation
    stripping, and the placeholder is turned into ``<SECTION-HEADER>`` at the
    very end.

    Args:
        text: the raw text to clean.

    Returns:
        The cleaned text.
    """

    # Indicate section headers, we need them for features
    text = SECTION_HEADER_RE.sub('SECTION-HEADER', text)
    # For simplicity later, remove '.' from most common acronym
    text = text.replace("U.S.", "US")
    text = text.replace('SEC.', 'Section')
    text = text.replace('Sec.', 'Section')
    text = USC_re.sub('USC', text)

    # Remove parentheticals because they are almost always references to laws
    # We could add a special tag, but we just remove for now
    # Note we dont get rid of nested parens because that is a complex re
    #text = PAREN_re.sub('LAWREF', text)
    text = PAREN_re.sub('', text)

    # Get rid of enums as bullets or ` as bullets
    text = BULLET_RE.sub(' ', text)

    # Clean html
    text = text.replace('&lt;all&gt;', '')

    # Remove annoying punctuation, that's not relevant
    text = BAD_PUNCT_RE.sub('', text)

    # Get rid of long sequences of dashes - these are formatting
    text = DASH_RE.sub(' ', text)

    # removing newlines, tabs, and extra spaces.
    text = WHITESPACE_RE.sub(' ', text)

    # If we ended up with "empty" sentences - get rid of them.
    text = EMPTY_SENT_RE.sub('.', text)

    # Attempt to create sentences from bullets
#    text = replace_semicolon(text)

    # Fix weird period issues + start of text weirdness
    #text = re.sub('\.(?=[A-Z])', '  . ', text)
    # Get rid of anything thats not a word from the start of the text
    text = FIX_START_RE.sub('', text)
    # Sometimes periods get formatted weird, make sure there is a space between periods and start of sent
    # (raw string: "\g" in a normal literal is an invalid escape sequence)
    text = FIX_PERIOD.sub(r". \g<1>", text)

    # Fix quotes
    text = text.replace('``', '"')
    text = text.replace('\'\'', '"')

    # Add special punct back in
    text = text.replace('SECTION-HEADER', '<SECTION-HEADER>')

    return text

In [None]:
# Sample documents available for the experiments, grouped by country
base_path = "../input/Mexico/"
chile_paths = ["Chile1.txt", "Chile2.txt", "Chile3.txt"]
elsalvador_paths = ["ElSalvador1.txt", "ElSalvador2.txt", "ElSalvador3.txt"]
mexico_paths = ["Mexico1.txt", "Mexico2.txt", "Mexico3.txt", "Mexico4.txt", "Mexico5.txt", "Mexico6.txt"]

# Document used in the cells below: the second Mexican file
fname = mexico_paths[1]
txt_path = f"{base_path}{fname}"

# Read the whole document into memory as a single string
with open(txt_path) as txt_file:
    txt = txt_file.read()
    
# txt

In [None]:
# Initial out of the box result
es_tokenizer = nltk.data.load("tokenizers/punkt/spanish.pickle")
spa_abrevs = {"ord", "num", "sra", "no", "corp", "art", "ltda", "ud"}
preprocessed1 = spanish_preprocessing(clean_text(txt))
preprocessed2 = spanish_preprocessing(txt)
s1 = get_nltk_sents(preprocessed1, es_tokenizer, spa_abrevs)
s2 = get_nltk_sents(preprocessed2, es_tokenizer, spa_abrevs)
len(s1), len(s2)

In [None]:
# Inspect the sentences obtained without the clean_text() step
s2

# Conclusions
- We can add the `clean_text()` function, but it may be more appropriate for English.
- With the exception of some weird characters and some acronyms, sentences are parsed properly.
- The only concern is that bullet points/nested enumerations are sometimes captured together and sometimes apart, but that may be left for future work.

# Appendix

In [None]:
def get_surrounding_chars(txt, radius=1):
    """
    Collect the context around every period in ``txt``.

    Args:
        txt: the text to scan.
        radius: how many characters to keep on each side of the period.

    Returns:
        A list with one substring per period, in order of appearance. Each
        substring holds up to ``radius`` characters before the period, the
        period itself and up to ``radius`` characters after it; it is shorter
        when the period is close to the start or the end of the text.
    """
    surrounding_chars = []

    for match in re.finditer(r"\.", txt):
        period_idx = match.start()
        # Clamp at 0: a negative start would be read by slicing as an offset
        # from the END of the text, which dropped (or mis-sliced) periods
        # located less than `radius` characters from the beginning.
        start_idx = max(0, period_idx - radius)
        end_idx = period_idx + radius + 1
        surrounding_chars.append(txt[start_idx:end_idx])

    return surrounding_chars

In [None]:
# Gather the context of every period in the sample document, first with one
# and then with two characters on each side, and report how many distinct
# contexts each setting produces.
surrounding_chars_1 = get_surrounding_chars(txt)
surrounding_chars_2 = get_surrounding_chars(txt, radius=2)

print(f"For 1 character before and after a period, we have {len(set(surrounding_chars_1))} unique patterns")
print(f"For 2 characters before and after a period, we have {len(set(surrounding_chars_2))} unique patterns")

In [None]:
def get_possible_chars(neighboring_chars):
    """
    Group the given patterns by the kind of their LAST character.

    Buckets (keys of the returned mapping): ``" "`` for a space, ``"alpha"``,
    ``"numeric"``, ``"symbol"`` (anything not alphanumeric) and ``"other"``.
    Also prints the total number of patterns received.

    Returns:
        A ``defaultdict(list)`` mapping each bucket that occurred to its patterns.
    """
    def _bucket_for(last_char):
        # Checks are evaluated in this exact order; the first hit wins
        if last_char == " ":
            return " "
        if last_char.isalpha():
            return "alpha"
        if last_char.isnumeric():
            return "numeric"
        if not last_char.isalnum():
            return "symbol"
        return "other"

    possible_chars = defaultdict(list)
    for pattern in neighboring_chars:
        possible_chars[_bucket_for(pattern[-1])].append(pattern)

    print(f"Total: {len(neighboring_chars)}")
    return possible_chars

def print_char_stats(possible_chars):
    """
    Print one "<Label>: <count>" line per bucket, in the order Space, Alpha,
    Numeric, Symbol, Other. Every bucket is looked up with ``[]``, so the
    mapping is expected to supply a default for missing keys.
    """
    labelled_buckets = (
        ("Space", " "),
        ("Alpha", "alpha"),
        ("Numeric", "numeric"),
        ("Symbol", "symbol"),
        ("Other", "other"),
    )
    for label, bucket in labelled_buckets:
        print(f"{label}: {len(possible_chars[bucket])}")

In [None]:
# Bucket every 1-character context (duplicates included) by its last
# character and print how many fall in each bucket.
possible_chars = get_possible_chars(surrounding_chars_1)
print_char_stats(possible_chars)

In [None]:
# Same statistics, but counting each distinct context only once.
possible_chars = get_possible_chars(set(surrounding_chars_1))
print_char_stats(possible_chars)