# Step 3. Manually coded categorization of surface forms and errors

Each correction made by the LLM in the previous step is categorized here, using manually written rules, as a surface form, an orthographic error, or neither.

In [2]:
import json
import unicodedata
import os
import re
import pandas as pd
from string import punctuation
import difflib
# Extend the locally imported punctuation string with a space. This rebinds the
# name in this notebook only; string.punctuation itself is left unchanged.
punctuation += " "

In [21]:
def normalize(c):
    """Return `c` reduced to its plain-ASCII form.

    The text is NFKD-decomposed, encoded to ASCII while ignoring every code
    point that has no ASCII representation (e.g. combining accents), and then
    decoded back to `str`.
    """
    decomposed = unicodedata.normalize('NFKD', c)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode()

def useful_chars(string):
    """Return `string` with every character outside a-z, A-Z and À-ÿ removed."""
    unwanted = re.compile(r'[^a-zA-ZÀ-ÿ]')
    return unwanted.sub('', string)

def count_accent_changes(str1, str2):
    """Count the positions where two equal-length strings differ only by accent.

    A position counts when the two characters are different but become equal
    after `normalize`.

    Returns:
        The number of such positions, or -1 when the strings have different
        lengths (no position-by-position comparison is attempted).
    """
    if len(str1) != len(str2):
        return -1

    # `left != right` is checked first so identical characters are never normalized.
    return sum(
        1
        for left, right in zip(str1, str2)
        if left != right and normalize(left) == normalize(right)
    )


*Re-run the notebook from this point whenever you need to reset all applied categorizations and restart the debugging process.*

In [4]:
# File locations used by this notebook.
CORRECTIONS_FILE = "./correctionsLatam.json"
SURFACE_FORMS_FILE = "../data/surfaceForms.json"
SURFACE_FORMS_FILE_NONACC = "../data/surfaceFormsNonAccents.json"
ORTHOGRAPHIC_ERRORS_FILE = "../data/orthographicErrors.json"

COLOR_PRINTING = True # for large corpus, set it to False
current_step = 0  # number of the last step applied with JUST_PRINT = False

# Load the corpus and the list of corrections to categorize.
df = pd.read_parquet("../data/pre-corrected-latam-xix.parquet")

with open(CORRECTIONS_FILE, 'r') as infile:
    fixes = json.load(infile)

# Make sure every output file exists, seeding a missing one with an empty
# JSON container of the matching kind (object for the surface-form files,
# array for the orthographic errors).
for _path, _empty in (
    (SURFACE_FORMS_FILE, {}),
    (SURFACE_FORMS_FILE_NONACC, {}),
    (ORTHOGRAPHIC_ERRORS_FILE, []),
):
    if not os.path.exists(_path):
        with open(_path, 'w') as outfile:
            json.dump(_empty, outfile)

# In-memory accumulators; they always start empty, whatever the files contain.
surface_forms = {}
surface_forms_nacc = {}
orthographic_errors = {} # will be changed to list before saving

In [5]:
# Display the "text" value of the row labelled 0 as a quick look at the loaded corpus.
df.loc[0, "text"]

'La publicacion del Oso se harà dos veces cada semana, y constará de un pliego en cuarto ; ofreciendo à mas sus redactores, dar los gravados oportunos, siempre que lo exija el asunto de que trate. Redactado por un Num. 8. TEMA del Periodico. POLITICA MILITAR. OCTAVA SESION. Abierta la sesion á las dore y un minuto de la noche , 25 de Febrero de 1845 , con asistencia de todos los Señores Representantes, se leyó y aprobó la acta de la Asamblea anterior , ménos en lo tocante à la torre del Convento de Santo Domingo, punto que quedó para ventilarse en mejor ocasion. Enseguida se dió cuenta de una nota del Ejecutivo , referente à que urjía la necesidad de organizar un Ejército ; pues decia el Excmo. Decano: - "Un poder sin bayonetas vale tanto como un cero puesto á la izquierda."'

In [6]:
def add_to_surface(wrong, good, freq):
    """Record `wrong` as a surface form of `good`, weighted by `freq`.

    Both texts are lower-cased first. When they split into the same number of
    `\\w+` tokens, the tokens are paired up positionally and every differing
    pair is recorded on its own; otherwise the two whole lower-cased texts
    are recorded as a single pair (if they differ).

    Each recorded pair adds `freq` to `surface_forms[good][wrong]`. It is also
    added to `surface_forms_nacc` when `count_accent_changes` returns less
    than 1 for the pair, i.e. the lengths differ or no position differs only
    by accent.
    """
    wrong_text, good_text = wrong.lower(), good.lower()
    wrong_tokens = re.findall(r'\w+', wrong_text)
    good_tokens = re.findall(r'\w+', good_text)

    if len(wrong_tokens) == len(good_tokens):
        candidates = zip(wrong_tokens, good_tokens)
    else:
        candidates = [(wrong_text, good_text)]

    for variant, canonical in candidates:
        if variant == canonical:
            continue

        accent_changes = count_accent_changes(canonical, variant)

        variants = surface_forms.setdefault(canonical, dict())
        variants[variant] = variants.get(variant, 0) + freq

        if accent_changes < 1:
            variants_nacc = surface_forms_nacc.setdefault(canonical, dict())
            variants_nacc[variant] = variants_nacc.get(variant, 0) + freq

def add_to_errors(fix):
    """Register every usage of `fix` as an orthographic error.

    Each entry of `fix['usages']` is unpacked into four values; the first
    three form the key in `orthographic_errors` and the fourth is stored as
    the context, next to the two elements of `fix['change']`.
    """
    for usage in fix['usages']:
        first, second, third, context = usage
        orthographic_errors[(first, second, third)] = dict(
            prv=fix['change'][0],
            mod=fix['change'][1],
            ctx=context,
        )

def print_fix(wrong, good, freq, category):
    """Print one categorized fix as `[category] (freq) wrong - good`.

    With `COLOR_PRINTING` enabled, ANSI escape codes are wrapped around each
    field; the code used for the category tag depends on whether it is
    "SFRM", "ERRR" or anything else.
    """
    if not COLOR_PRINTING:
        print(f"[{category}] ({freq}) {wrong} - {good}")
        return

    # Opening bracket plus the escape code chosen for this category.
    if category == "SFRM":
        opener = "[\033[93m"
    elif category == "ERRR":
        opener = "[\033[95m"
    else:
        opener = "[\033[90m"

    print(f"{opener}{category}\033[0m] (\033[92m{freq}\033[0m) \033[91m{wrong}\033[0m - \033[94m{good}\033[0m")

def execute_prev_steps(step):
    """Make sure the step right before `step` has been applied.

    Compares `step` with the global `current_step`:
      * at most one ahead: nothing to do;
      * more than two ahead: raises, asking for step `step - 2` first;
      * exactly two ahead: looks up the function named `step<step - 1>` in
        the module globals, calls it with `False`, and prints the status.

    Raises:
        Exception: when `step` is more than two ahead of `current_step`.
    """
    gap = step - current_step

    if gap <= 1:
        return
    if gap > 2:
        raise Exception(f"Execute step {step-2} first")

    if step >= 2:
        previous = step - 1
        print(f"Executing step {previous}...")
        runner = globals()[f"step{previous}"]
        runner(False)
        status()

def status():
    """Print the sizes of `fixes`, `surface_forms` and `orthographic_errors`."""
    print(f"{len(fixes)} fixes to check\n{len(surface_forms)} surface forms found\n{len(orthographic_errors)} orthographic errors found")
# Report the counts right after loading, before any categorization step is run.
status()

114411 fixes to check
0 surface forms found
0 orthographic errors found


## Manually-written steps for categorization:

**NOTES:**

- Each correction made by the LLM may be categorized as:
  * **surface form**: with the method `add_to_surface`
  * **orthographic error**: with the method `add_to_errors`
  * **none**: just `pass` (skip) the correction

- Each step reads the `JUST_PRINT` flag; when it is True, the step only prints how each correction would be categorized and leaves the result variables untouched. To debug a particular step:
  * First, **ALWAYS run all the previous steps with the `JUST_PRINT` variable set to False**
  * Then, set the `JUST_PRINT` variable of the step you want to debug to True and run the step
  * If you run a step before the one immediately preceding it has been run with `JUST_PRINT` = False, that preceding step is run automatically first.

In [7]:
# Substring replacements, written as (text in the source, text in the
# correction), that step 1 accepts as surface-form variation: a fix is
# categorized as a surface form when every 'replace' pair in its lower-cased
# diff appears in this list. Trailing comments give an example word.
SF_CHANGES = [
    ('á','a'), ('a','á'),
    ('é','e'), ('e','é'),
    ('í','i'), ('i','í'),
    ('ó','o'), ('o','ó'),
    ('ú','u'), ('u','ú'),
    ('i','y'), ('y','i'),
    ('j','g'), ('g','j'),
    ('v','b'), ('b','v'),
    ('s','x'), ('x','s'),
    ('j','x'), ('x','j'),
    ('c','s'), ('s','c'),
    ('s','z'), ('z','s'),
    ('z','c'),
    ('q','c'), # quatro
    ('n','ñ'), # senor
    ('ni','ñ'), # senior
    ('k','qu'), # nikel
    ('k','c'), # kiosko
    ('ou','u'), # boulevares
    ('s','bs'), ('bs','s'), # suscriciones, obscuro
    ('c','pc'), # suscriciones
    ('s','ns'), # trasportar
    ('t','pt'), # Setiembre
    ('rt','r'), # libertar
    ('rr','r'),
]
# Another common surface form is '...lo' -> 'lo ...' (e.g. cambiólo -> lo cambió);
# it is not expressible as a replacement pair, so it is not listed above.

# Same pair format as SF_CHANGES; a fix whose 'replace' pairs are all listed
# here (and not all in SF_CHANGES) is categorized by step 1 as an
# orthographic error.
ERR_CHANGES = [
    ('6','ó'), ('6','o'), ('1','y'), ('0','o'), ('4','a')
]

# NOTE(review): the three lists below are not referenced by the code visible
# here; presumably they are whole-form overrides consumed by later steps
# (force surface form / force error / ignore) — confirm against their users.
SF_EXCEPTIONS = [
    "presidenta", "sr.", "q'", "ud.", "d.", "usté", "apuntaciones", "comprofesores",
    "diez y seis", "bien que", "de el", "costarrica", "hispano-america"
]

ERR_EXCEPTIONS = [
    
]

SKIP = [
    "sugestiones", "suerte", "camonel"
]

### Step 1.

In [8]:
def diff(text1, text2):
    """Split the differences between two sequences by kind of edit.

    The edit script comes from `difflib.SequenceMatcher(None, text1, text2)`.

    Returns:
        A tuple `(added, removed, modified)` where
          * `added` holds the pieces of `text2` that were inserted,
          * `removed` holds the pieces of `text1` that were deleted,
          * `modified` holds `(old_piece, new_piece)` pairs for replacements.
        Unchanged stretches are not reported.
    """
    matcher = difflib.SequenceMatcher(None, text1, text2)
    added, removed, modified = [], [], []

    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        old_piece = matcher.a[a_start:a_end]
        new_piece = matcher.b[b_start:b_end]
        if tag == 'insert':
            added.append(new_piece)
        elif tag == 'delete':
            removed.append(old_piece)
        elif tag == 'replace':
            modified.append((old_piece, new_piece))
        # 'equal' stretches carry no change and are skipped.

    return added, removed, modified

In [16]:
# Example of the '...lo' -> 'lo ...' pattern: once accents are stripped, the
# diff shows up as an insertion plus a deletion rather than as a replacement.
diff(normalize('viólo'), normalize('lo vio'))

(['lo '], ['lo'], [])

In [13]:
# Global flag read by sfrm/errr/none below: True means "only print the
# categorization", False means "apply it to the result variables".
JUST_PRINT = False
def sfrm(fix):
    """Categorize `fix` as a surface form.

    Applies it through `add_to_surface`, or only prints it with the "SFRM"
    tag when `JUST_PRINT` is set.
    """
    if not JUST_PRINT:
        add_to_surface(*fix['change'], fix['freq'])
        return
    print_fix(*fix['change'], fix['freq'], "SFRM")

def errr(fix):
    """Categorize `fix` as an orthographic error.

    Applies it through `add_to_errors`, or only prints it with the "ERRR"
    tag when `JUST_PRINT` is set.
    """
    if not JUST_PRINT:
        add_to_errors(fix)
        return
    print_fix(*fix['change'], fix['freq'], "ERRR")

def none(fix):
    """Leave `fix` uncategorized.

    Nothing is recorded; when `JUST_PRINT` is set the fix is printed with the
    "NONE" tag so it still shows up in the listing.
    """
    if not JUST_PRINT:
        return
    print_fix(*fix['change'], fix['freq'], "NONE")

In [22]:
# Dry run for this cell: step 1 only prints its categorizations.
JUST_PRINT = True

def step1(just_print=None):
    """Step 1: categorize fixes whose diff consists only of replacements.

    Each fix's lower-cased original and corrected texts are diffed. When the
    diff has no insertions and no deletions:
      * all replacement pairs in `SF_CHANGES`  -> surface form (`sfrm`);
      * all replacement pairs in `ERR_CHANGES` -> orthographic error (`errr`);
      * anything else is left uncategorized (`none`).

    Args:
        just_print: optional override of the global `JUST_PRINT` for this run
            only. `execute_prev_steps` calls steps as `stepN(False)`, so the
            parameter is needed for that automatic run to work; the previous
            flag value is restored afterwards. With the default `None` the
            global flag is used as is.
    """
    # The original declared and assigned `curent_step` (typo), so the real
    # `current_step` was never advanced and step tracking could not work.
    global current_step, JUST_PRINT

    execute_prev_steps(1)

    saved_flag = JUST_PRINT
    if just_print is not None:
        JUST_PRINT = just_print
    try:
        if not JUST_PRINT:
            current_step = 1

        for fix in fixes:
            added, removed, modified = diff(fix['change'][0].lower(), fix['change'][1].lower())
            if len(added) == 0 and len(removed) == 0:
                if all(pair in SF_CHANGES for pair in modified):
                    sfrm(fix)
                elif all(pair in ERR_CHANGES for pair in modified):
                    errr(fix)
                else:
                    # TODO...
                    none(fix)
            else:
                # TODO... lo -lo
                none(fix)
                # NOTE: work in progress — the loop stops at the first fix whose
                # diff contains an insertion or deletion, after dumping that
                # diff for inspection. Later fixes are not processed yet.
                print(added, removed, modified)
                break
    finally:
        # Never leak the temporary override into the cell that triggered us.
        JUST_PRINT = saved_flag

# Run step 1 with the JUST_PRINT value set at the top of this cell, then
# report the resulting counts.
step1()
status()

[[93mSFRM[0m] ([92m31606[0m) [91má[0m - [94ma[0m
[[93mSFRM[0m] ([92m3254[0m) [91mó[0m - [94mo[0m
[[93mSFRM[0m] ([92m2790[0m) [91mmas[0m - [94mmás[0m
[[93mSFRM[0m] ([92m2731[0m) [91mfué[0m - [94mfue[0m
[[93mSFRM[0m] ([92m2666[0m) [91mi[0m - [94my[0m
[[95mERRR[0m] ([92m2355[0m) [91m6[0m - [94mo[0m
[[93mSFRM[0m] ([92m1387[0m) [91mé[0m - [94me[0m
[[90mNONE[0m] ([92m1237[0m) [91mdo[0m - [94mde[0m
[[90mNONE[0m] ([92m868[0m) [91m&[0m - [94ma[0m
[[90mNONE[0m] ([92m823[0m) [91mà[0m - [94ma[0m
[[93mSFRM[0m] ([92m812[0m) [91mdia[0m - [94mdía[0m
[[93mSFRM[0m] ([92m756[0m) [91mque[0m - [94mqué[0m
[[93mSFRM[0m] ([92m750[0m) [91mel[0m - [94mél[0m
[[93mSFRM[0m] ([92m636[0m) [91mtambien[0m - [94mtambién[0m
[[93mSFRM[0m] ([92m599[0m) [91mhabia[0m - [94mhabía[0m
[[93mSFRM[0m] ([92m547[0m) [91mdió[0m - [94mdio[0m
[[90mNONE[0m] ([92m537[0m) [91mon[0m - [94men[0m
[[93mSFRM