# In [2]:
# Developed by Carlos Gonçalves, with funding from the São Paulo State Foundation - FAPESP, Grant 2021/01363-6
# See README for more information
#
# This is the second of four scripts.
#
# It produces a file called normalized_names+run_date_out+.txt from the file generated by the 01_parsing script (the file is called
# "parsing"+run_date_in+".txt", where run_date_in is the date when parsing was run). 
# There are two differences between the two files:
# - the personal names appear in transliterated form in parsing.txt, while they appear in normalized form
# in normalized_names.txt. For instance, nu-ur2-utu is replaced by Nūr-Šamaš.
# - besides, if the file identities.txt is not empty, this script replaces the identification as written in 
# the cuneiform tablets by the one specified in identities.txt. This is used to resolve ambiguities in 
# the original documentation, and it does involve a degree of interpretation, so it must be used with caution.
#
# Input files:
# ../Processing_Input/names+names_date+.txt
# ../Processing_Output/parsing+run_date_in+.txt
# ../Processing_Input/identities+identities_date+.txt
# Output file:
# ../Processing_Output/normalized_names+run_date_out+.txt



# Date suffixes selecting which dated files this run reads and writes.
run_date_in = "_2023_12_05"       # suffix of the parsing file that is read
run_date_out = "_2023_12_05"      # suffix of the normalized_names file that is written
names_date = "_2023_03_13"        # suffix of the names file
identities_date = "_2023_03_14"   # suffix of the identities file

# Folder that contains Processing_Input and Processing_Output.
base_folder = ".."


import io
# Open the handles in the same order as always: names table (read),
# parsing output (read), then the normalised output (write, truncating).
names_path = base_folder+"/Processing_Input/names"+names_date+".txt"
f = io.open(names_path, 'r', encoding="utf-8")

parsing_path = base_folder+"/Processing_Output/parsing"+run_date_in+".txt"
g = io.open(parsing_path, 'r', encoding="utf-8")

normalized_path = base_folder+"/Processing_Output/normalized_names"+run_date_out+".txt"
h = io.open(normalized_path, "w", encoding="utf-8")

# Maps a transliterated name to its normalised form; filled from the names file.
glossario = dict()

# Here I load the rules for replacing names in order to solve homonyms and aliases
# This is far from being the most elegant solution, but it seemed to me that (under the excruciating pressure 
# of time), it could be easily implemented. So,
# - any resolution of homonyms or aliases must be manually included in the file identities.txt
# - the file does not distinguish whether the original problem derives from homonyms or from aliases
# - all solutions have the same format:
# -- in each line, we identify a transliteration, a doc number and a face.line and we state how
#    this should be normalized
# 
# All in all, this script does two different although associated things:
# - it replaces a given transliteration with its correct normalisation
# - it complements the normalisation in order to solve homonyms and aliases
# For instance:
# - it normalises J-o-h-n and I-o-h-n as John
# - it may (if this makes sense to the human operating the system) change John to John, 
#   the carpenter, son of Joseph, reuniting all the occurrences of the same person under only one of their aliases
# - it may also indicate that the John in certain tablets is in reality John 2, because there are two 
#   people with this name in the community



# Load the manual identity overrides (homonym / alias resolutions).
#
# Every data line of the identities file is split on '|' and must provide at
# least seven fields:
#   fields 0-2  form the lookup key, later matched against the transliterated
#               name and two position fields of each parsing line;
#   fields 3-6  are the replacement values for, in order, the name, the
#               kinship, the relative and the profession.
# Blank lines and lines whose first non-blank character is '#' are ignored.
replacements = {}
# The `with` block closes the identities file even if a line is malformed.
with io.open(base_folder+"/Processing_Input/identities"+identities_date+".txt","r", encoding = "utf-8") as i:
    for line_number, line in enumerate(i, 1):
        # Strip first, so that blank lines and indented comments are skipped
        # instead of failing further down with an IndexError.
        line = line.strip()
        if line == '' or line[0] == "#":
            continue
        linha_quebrada = line.split('|')
        if len(linha_quebrada) < 7:
            # The file is maintained by hand: point at the offending line.
            raise ValueError(
                "identities file, line {}: expected at least 7 '|'-separated "
                "fields, found {}: {!r}".format(line_number, len(linha_quebrada), line)
            )
        replacements[tuple(linha_quebrada[:3])] = linha_quebrada[3:7]
#print(replacements)

#for chave in replacements:
#    print(chave, replacements[chave])


# Fill the glossary from the tab-separated names file: the third column is
# used as the key and the second column is the value stored under it.
for record in f:
    columns = record.split('\t')
    key, value = columns[2], columns[1]
    glossario[key] = value
f.close()
def _apply_normalisation(original, normalised, unknown_counter):
    """Return the text to store in a name field and the updated counter.

    original        -- the text currently held by the field
    normalised      -- the candidate replacement (glossary or identities value)
    unknown_counter -- running number used to label 'Unknown' names

    An empty candidate leaves the field as it is.  The literal candidate
    'Unknown' is expanded to 'Unknown_<n>(<original>)', so that every unknown
    occurrence gets its own label, and the counter is advanced by one.
    """
    if normalised == '':
        return original, unknown_counter
    if normalised == 'Unknown':
        labelled = normalised + '_' + str(unknown_counter) + '(' + original + ')'
        return labelled, unknown_counter + 1
    return normalised, unknown_counter


# Running number shared by all 'Unknown' names (both names and relatives).
count_unknown = 1

# Using the already-open handles as context managers guarantees that both are
# closed -- and the output written so far is flushed -- even when a line makes
# the loop fail (e.g. a KeyError for a name that is missing from the glossary).
with g, h:
    for line in g:
        # Remove the "{d}" and "{disz}" markers before any lookup is made.
        line = line.replace("{d}","").replace("{disz}","")
        fields = line.split("|&")

        # Is this a homonym or alias that must be replaced?  The key combines
        # the transliterated name with fields 5 and 6+7 (presumably the
        # document number and the face/line position -- confirm against the
        # parsing output format).
        key = (fields[0], fields[5], fields[6]+fields[7])
        override = replacements.get(key)
        replacement_case = override is not None

        # Name: taken from the identities override when there is one,
        # otherwise from the glossary.  Empty names are left alone.
        name = fields[0]
        if name != '':
            candidate = override[0] if replacement_case else glossario[name]
            fields[0], count_unknown = _apply_normalisation(name, candidate, count_unknown)

        if replacement_case:
            # replacing the kinship
            fields[1] = override[1]

        # Relative: same treatment as the name, except that an override is
        # applied even when the parsing line has no relative.
        relative = fields[2]
        if relative != '' or replacement_case:
            candidate = override[2] if replacement_case else glossario[relative]
            fields[2], count_unknown = _apply_normalisation(relative, candidate, count_unknown)

        if replacement_case:
            # replacing the profession
            fields[3] = override[3]

        # The input separator "|&" becomes "|" in the output.  The last field
        # still carries the line terminator read from the input, so no newline
        # is added here.
        h.write('|'.join(fields))