In [1]:
import pandas as pd
#from pandas import option_context
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from tqdm import tqdm
import re
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import spacy
import pypandoc

# Increase limit of characters processed
# Need to specify not to use specific models, to avoid overload
# nlp_es.max_length = 2000000


## Functions

In [2]:
def clean_input(txt_file, txt_dir=None):
    """Read a plain-text source file and return its non-blank lines.

    Parameters
    ----------
    txt_file : str
        File name without the ``.txt`` extension.
    txt_dir : str, optional
        Directory that holds the file. When omitted, the notebook-level
        ``txt_path`` is used, so existing ``clean_input(name)`` calls keep
        working.

    Returns
    -------
    list of str
        Every line that contains something other than whitespace, with
        leading/trailing newline characters removed (other whitespace is
        kept as is).
    """
    base_dir = txt_path if txt_dir is None else txt_dir
    # Build the path from the argument: the previous version ignored
    # `txt_file` and silently read whatever the global `file` pointed to.
    path = os.path.join(base_dir, txt_file + ".txt")
    # The context manager closes the handle even if reading fails.
    with open(path, "r") as handle:
        return [line.strip('\n') for line in handle if not line.isspace()]


def remove_html_tags(text):
    """Return `text` with every ``<...>`` span removed.

    Each span is matched on its own: the previous greedy pattern ``<.*>``
    ran from the first ``<`` to the last ``>`` of a line and therefore also
    deleted the ordinary text standing between two tags.

    As before, a span never crosses a line break, so a stray ``<`` cannot
    swallow the following lines.
    """
    return re.sub(r'<[^>\n]*>', '', text)

## Textual corpus


### Input files

| Acronym      | Author | Language     |Original format|Source|
| :---         |    :----:   |          ---: |---:|--:|
|TS_DIC  | Anderson dictionary | Tik-Spa |CSV||
|LLB_NEW |New Testament La Ligua Biblica |Tikuna|PDF||
|LLB_OLD |Old Testament La Ligua Biblica |Tikuna|PDF||
|T_Ebible|Ebible |Tikuna|HTML|https://ebible.org/bible/details.php?id=tcaNT|
|S_Ebible|EBible| Spanish|HTML|https://ebible.org/bible/details.php?id=spavbl|
|T_OHCR  |Universal Declaration Of Human Rights | Tikuna|TXT||
|S_OHCR  |Universal Declaration Of Human Rights  |Spanish|TXT||
|T_CRU  |Crubadan Synsets  |Tikuna|TXT||


In [3]:
# TODO: instead of the length of the text, report the number of tokens, and add the csv
# Dictionary that stores the cleaned content of each text file, keyed by file name
corpus = {}
# Names of the text files to load (without the .txt extension)
# TODO get from directory
name_files_txt = ['llb_new','llb_old','t_ohcr','s_ohcr','t_cru']
# Directory that holds the .txt files; clean_input() reads this global
# TODO clean name
txt_path = 'SOURCES/txt/'
# Fill the dictionary with the cleaned content.
# NOTE(review): the loop variable `file` stays defined as a module-level name
# after this loop; check clean_input() before renaming it.
for file in name_files_txt:
    corpus[file] = clean_input(file)

## Textual Sources
- PDF content : Adobe Acrobat to convert pdf to docx
- Docx content : pandoc docx to plaintext
- html content : pandoc html to plain

### Cleaning
For both the docx and html versions of the Bible, we keep as much metadata about the text structure as possible, to facilitate the alignment of Tikuna and Spanish.
Therefore one version is kept to carry a maximum of annotation, and a second version of the content merges all the files into one.
The cleaning focuses on the elimination of the copyright notice, trademark, page navigation options, etc.

#### Option B: create text files (the html structure is no longer accessible afterwards)
1. Convert with `pypandoc.convert_file('chapters/*.md', 'docx', outputfile="somefile.docx")` and create the text files.
2. Match the names of the texts in Tikuna and Spanish.
3. Create a graphical map in the notebook.

### Storing
All Bible content is stored in a dictionary.

**TODO:** clean the file structure and the path names, then test them again.

In [4]:
def _load_txt_chapters(directory, encoding=None, strip_newlines=False):
    """Read the ``.txt`` chapter files found directly inside `directory`.

    Parameters
    ----------
    directory : str
        Folder to scan; sub-folders and non-``.txt`` files are ignored.
    encoding : str, optional
        Passed to ``open``. ``None`` keeps the platform default.
    strip_newlines : bool
        When true, every newline character is removed from the content.

    Returns
    -------
    (dict, list)
        The dict maps the chapter id taken from the file name to the file
        content and is sorted by id. The list holds the same contents in
        ``os.listdir`` order.

    Raises
    ------
    ValueError
        If a ``.txt`` file name contains no chapter id (the previous code
        failed with an ``AttributeError`` on ``None`` in that case).
    """
    chapters = {}
    contents = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if not (os.path.isfile(path) and filename.endswith(".txt")):
            continue
        # The chapter id starts at three consecutive digits and extends to
        # the last pair of digits in the file name.
        chapter_id = re.search(r'([0-9]{3,3}.*[0-9]{2,2})', filename)
        if chapter_id is None:
            raise ValueError(f"no chapter id found in file name {filename!r}")
        # The context manager closes the handle; the old code leaked it.
        with open(path, "rt", encoding=encoding) as handle:
            content = handle.read()
        if strip_newlines:
            content = content.replace('\n', '')
        contents.append(content)
        chapters[chapter_id.group(0)] = content
    return dict(sorted(chapters.items())), contents


def _convert_html_chapters(directory):
    """Convert the ``.htm`` files of `directory` to cleaned reST text.

    Each file is converted from html to reST with pandoc, then the remaining
    ``<...>`` spans are removed with ``remove_html_tags``.

    Returns
    -------
    dict
        Maps the file name without its extension to the cleaned text,
        sorted by key.
    """
    chapters = {}
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if os.path.isfile(path) and filename.endswith(".htm"):
            rst = pypandoc.convert_file(path, 'rst', format='html')
            # TODO: keep the html description tags as metadata
            # splitext drops only the trailing extension; the old
            # re.sub('\.htm', ...) removed ".htm" anywhere in the name.
            chapters[os.path.splitext(filename)[0]] = remove_html_tags(rst)
    return dict(sorted(chapters.items()))


# Plain-text chapters, keyed by chapter id.
# NOTE(review): the Spanish files are still read with the platform default
# encoding, exactly as before; confirm whether UTF-8 should be forced.
directory = 'SOURCES/txt/spa_ebible'
spa_ebible, master_f = _load_txt_chapters(directory)

# The Tikuna files may start with a BOM (hence UTF-8-sig) and are flattened
# to a single line. `master_f` is overwritten here, as in the original cell,
# so it ends up holding the Tikuna contents. This load used to be
# copy-pasted twice in the cell; running it once gives the same result.
directory = 'SOURCES/txt/tik_ebible'
tik_ebible, master_f = _load_txt_chapters(
    directory, encoding="UTF-8-sig", strip_newlines=True
)

# Full chapters converted from the original html, sorted by file name
tik_ebible_full = _convert_html_chapters('SOURCES/original/tcaNT_html')
spa_ebible_full = _convert_html_chapters('SOURCES/original/spavbl_html')

print(tik_ebible_full['1CO01'])

-  `1 CORĨ́TIUCÜ̱̃Ã̱X `__
-  ``__
-  `1 `__
-  `> `__

.. container:: main

   .. container:: mt

      NÜXĨRAÜ̃XÜ̃ GA POPERA GA CORĨ́TIUCÜ̱̃Ã̱X GA YAXÕGÜXÜ̃TANÜWA NAMUXÜ̃ GA
      PAURU

   .. container:: chapterlabel
      :name: V0

      1

   .. container:: s

      Pauru rü nüxü̃ narümoxẽ ga yema yaxõgüxü̃ ga Corĩ́tiuarü ĩãnewa
      yexmagüxü̃

   .. container:: p

      1-2 Pa Chaueneẽgü i Corĩ́tiucü̱̃ã̱xgüx, choma i Pauru nixĩ i Tupana
      yaxuxü̃ norü ngúchaxü̃maã na Ngechuchu ya Cristu norü puracüwa choxü̃
      namuxü̃cèx. Rü choma rü namaã i taeneẽ i Chótene nixĩ i pexü̃
      tarümõxẽgüxü̃ rü pexcèx tanaxümatüxü̃ i ñaã popera. Pa Chaueneẽgü ya
      Tupanaãrü Ixĩgüxex, pema rü marü Tupanapẽ́xewa pixüüne yerü
      Ngechuchu ya Cristu marü pexü̃ nixüünexẽẽ. Rü Tupana rü marü pexü̃
      nade na norü duü̃xü̃gü pixĩgüxü̃cèx wüxigu namaã i guxü̃ma i ngẽma
      duü̃xü̃gü i guxü̃wama tórü Cori ya Ngechuchu ya Cristuxü̃ icuèxüü̃güxü̃.
      Rü nüma ya Ngechuchu ya tórü Cori i

In [124]:

def clean_annotate_ebible(chapter_txt):
    """Flatten one reST chapter into a single line.

    Removes every literal ``.. container::`` marker and every newline
    character. The container names that followed the markers (seen so far:
    mt, main, m, q, p, s, chapterlabel) and the indentation stay in the text.

    Parameters
    ----------
    chapter_txt : str
        reST text of one chapter.

    Returns
    -------
    str
        The flattened text.
    """
    # TODO: turn the remaining container names into annotations.
    # Plain str.replace is used because both targets are literal strings;
    # the former regex patterns relied on invalid escape sequences
    # ('\.' and '\:' in non-raw strings).
    without_markers = chapter_txt.replace('.. container::', '')
    return without_markers.replace('\n', '')
# Preview the flattened text of the chapter stored under the key '1CO01'
print(clean_annotate_ebible(tik_ebible_full['1CO01']))

-  `1 CORĨ́TIUCÜ̱̃Ã̱X `__-  ``__-  `1 `__-  `> `__ main    mt      NÜXĨRAÜ̃XÜ̃ GA POPERA GA CORĨ́TIUCÜ̱̃Ã̱X GA YAXÕGÜXÜ̃TANÜWA NAMUXÜ̃ GA      PAURU    chapterlabel      :name: V0      1    s      Pauru rü nüxü̃ narümoxẽ ga yema yaxõgüxü̃ ga Corĩ́tiuarü ĩãnewa      yexmagüxü̃    p      1-2 Pa Chaueneẽgü i Corĩ́tiucü̱̃ã̱xgüx, choma i Pauru nixĩ i Tupana      yaxuxü̃ norü ngúchaxü̃maã na Ngechuchu ya Cristu norü puracüwa choxü̃      namuxü̃cèx. Rü choma rü namaã i taeneẽ i Chótene nixĩ i pexü̃      tarümõxẽgüxü̃ rü pexcèx tanaxümatüxü̃ i ñaã popera. Pa Chaueneẽgü ya      Tupanaãrü Ixĩgüxex, pema rü marü Tupanapẽ́xewa pixüüne yerü      Ngechuchu ya Cristu marü pexü̃ nixüünexẽẽ. Rü Tupana rü marü pexü̃      nade na norü duü̃xü̃gü pixĩgüxü̃cèx wüxigu namaã i guxü̃ma i ngẽma      duü̃xü̃gü i guxü̃wama tórü Cori ya Ngechuchu ya Cristuxü̃ icuèxüü̃güxü̃.      Rü nüma ya Ngechuchu ya tórü Cori ixĩcü, rü ngẽma duü̃xü̃güarü Cori      ta nixĩ. 3 Rü chanaxwèxe ya Tanatü ya Tupana rü tórü Cori ya    

-  `APOCARÍCHIU `
-  ``
-  `18 `
-  `> `

.. container:: main

   .. container:: chapterlabel
      :name: V0

      18

   .. container:: s

      Nagu nayarüchixe ga ĩãne ga Babiróniã

   .. container:: p

      1 Rü yemawena nüxü̃ chadau ga wüxi ga to ga orearü ngeruü̃ ga
      daxũguxü̃ ga naãnewa írüxĩxü̃ ga taxü̃ ga ãẽ̱xgacü ixĩxü̃. Rü yema
      orearü ngeruü̃ãrü y̱auracüüxü̃maã nangóone ga ñoma ga naãne. 2 Rü
      tagaãcü ñanagürü:

   .. container:: p

      “Marü nagu nayarüchixe ya yima ĩãne ya Babiróniã ya itaégacüxüne.
      Rü ñu̱xma rü ngoxogüchiü̃ nixĩ rü nagúxü̃raü̃xü̃ i naãẽ i chixexü̃güchiü̃
      nixĩ. Rü ñu̱xma rü ngẽxma naxãchiü̃ i nagúxü̃raü̃xü̃ i ngurucugü rü
      werigü i chixexü̃ i duü̃xü̃gü naxchi aiexü̃. 3 Yerü guxü̃ma ga
      nachiü̃ãnecü̱̃ã̱xgü, rü guma ĩãneãrü chixexü̃ nüxna naxüe. Rü norü
      ãẽ̱xgacügü, rü yéma poraãcü chixri namaxẽ. Rü guxü̃ma ga ñoma ga
      nachiü̃ãnecü̱̃ã̱xgü ga taxetanüxü̃gü, rü nügü namuãrü dĩẽruã̱xẽẽgü namaã
      ga yema ĩ

### Tikuna - Spanish dictionary


In [None]:
# Import the new output from the Anderson dictionary

ts_dic = pd.read_csv('SOURCES/csv/ts_dic.csv')
ts_dic.columns = ['tok_tikuna', 'token_prononciation', 'pos_tag_dict','token_spa_translation', 'full_dic_entry']
# Widen the column display so the full dictionary entries are readable.
# `option_context` is reached through `pd`: the bare name is not imported in
# this notebook (its import is commented out), so it raised a NameError.
with pd.option_context('display.max_colwidth', 400):
    display(ts_dic.head(5))


Unnamed: 0,tok_tikuna,token_prononciation,pos_tag_dict,token_spa_translation,full_dic_entry
0,a1,(a³⁵),interj.,pues,"a1 (a³⁵) interj. pues A, tamanüxü chacuax. Pues, yo no sé."
1,a2,(a⁴),conj.,,"a2 (a⁴) conj. Se usa para conectar sustantivos, pronombres y adjetivos en tiempo presente o futuro. Véanse i, ya, ga"
2,ã,(ã⁴),s.n.,zancudo,ã (ã⁴) s.n. zancudo Ngẽma ã rü choxü̃ napai. Ese zancudo me picó.
3,ããcuweü̃xü̃,(ã³ã³cu⁵we³ü̃x³ü²),s.n.,cargador,ããcuweü̃xü̃ (ã³ã³cu⁵we³ü̃x³ü²) s.n. cargador
4,ããcuxü̃,(ã³ã³cux³ü²),adj.,lleno,ããcuxü̃ (ã³ã³cux³ü²) adj. lleno Ngẽma diẽruchixü̃ i ããcuxü̃ rü choxǘ ̃̃ ínangu. Esa billetera llena se me cayó.


## Example
- Some dictionary entries provide examples that show a word used in context.
- As they mostly follow the structure [Tikuna sentence][.][space][Spanish sentence][.], we can build a simple regex to split the content between Spanish and Tikuna.
- However, due to conversion mistakes or inconsistencies in the data, some entries don't follow the same uppercase, punctuation and lowercase scheme.<br>
For these specific cases, it is harder to differentiate the Tikuna and Spanish text with regexes. Those cases will be handled manually.

In [None]:
# Not all entries have examples. Some of them have more specific information
# or a reference to another entry.
# Every example in Tikuna should begin with an uppercase letter, so the first
# step is to capture the text running from the first uppercase letter to the
# last period of the entry, for further cleaning.

example = []
# The loop variable is deliberately not called `str`: that name used to
# shadow the builtin for the rest of the notebook.
for entry in ts_dic['full_dic_entry']:
    candidate = re.search(r'([A-Z].*\.)', entry)
    # A capture that contains "(" is discarded
    # (presumably it starts at the headword/pronunciation rather than at an
    # example - TODO confirm).
    if candidate and '(' not in candidate.group(0):
        example.append(candidate.group(0))
    else:
        example.append('')
    # * cases, tikuna esp
    # mismatch an entry without pos tag
    # Spanish description of meaning
ts_dic['example'] = example

# In the content captured from the first uppercase letter, not everything is Tikuna.
# The cleaning focuses on removing the content beginning with "Var.", which references another entry.
# The [Tikuna sentence][.][space][Spanish sentence][.] pattern captures most of the content.

example_spanish = []
example_tikuna = []
exclude = [None,'',' ','Var.','\n','\t','\r','\f']
annotated_examples = 0
other_text = 0
to_review = 0
# Named groups are read by name below. The previous numbered access returned
# group 2, which was the inner punctuation group, not the Spanish sentence.
example_pair = re.compile(
    r'(?P<tikuna>[A-Z].*[.!?¿])\s*'
    r'(?P<spanish>[A-ZÈÉ].*[.!?¿])'
)
reviewed_example = []
for text in ts_dic['example']:
    updated = text
    pair = example_pair.search(text)
    if pair:
        example_tikuna.append(pair.group('tikuna'))
        example_spanish.append(pair.group('spanish'))
        annotated_examples += 1
    elif text not in exclude:
        if re.search(r'[A-Z].*\.?\s?[A-Z].*\.?', text):
            if re.search(r'(Indica|Se usa|\[])', text):
                # Usage notes rather than examples
                other_text += 1
            else:
                # Insert the missing period before each capitalised word.
                # The old replacement r'\.\1' wrote a literal backslash.
                updated = re.sub(r'(\s+([A-Z]))', r'.\1', text)
        else:
            to_review += 1
    reviewed_example.append(updated)
# One assignment of the whole column replaces the former chained assignment
# ts_dic['example'][i] = ..., which pandas does not guarantee to write back.
ts_dic['example'] = reviewed_example

# in the 220 examples, still good examples with Tikuna and Spanish
# Spanish begins with lower case: difficult to match and add the period.

total = ts_dic.shape[0]
# NOTE(review): this counts every entry without a full Tikuna/Spanish pair,
# so it also includes the `other` and `to_review` entries.
empty = total -  annotated_examples
feedback = pd.DataFrame([[annotated_examples,other_text,to_review,empty,total]],columns=['full_example','other','to_review','empty_examples','total_entries'])
feedback

# example content in lists
# Remaining examples need to be manually cleaned; most are due to "dios" in lowercase, which should normally be in uppercase?
# correct in a function or manually

NameError: name 'ts_dic' is not defined

In [None]:
# ebible text processing
# import files
# sum in one
# dictionary with file name numbers and content
# find pattern between spanish and ticuna in names

# Descriptive statistics

## Dictionary

In [None]:
# mono / poly
# Number of unique spanish words vs number of tokens
# Count : length of examples, number of missing spanish (def)
# Look for (missing) occurrence across all the dataset
# length of list of spa_tokens
# Length of the examples
# ts_dic['token_pos'].value_counts()
# Split list of pos for categories
# missing data
#ts_dic['token_spa'].isnull().sum()
# len(ts_dic[ts_dic['example'] != '']) 




## Texts

In [21]:
# Number of tokens in each corpus document
# One tokenizer is enough for the whole cell; r"\w+" keeps runs of word
# characters only, so punctuation is dropped.
word_tokenizer = nltk.RegexpTokenizer(r"\w+")

token_counts = {}
for key, value in corpus.items():
    # `value` is a list of lines without their trailing newline. Tokenizing
    # line by line keeps the last word of a line separate from the first word
    # of the next one (joining the lines with '' glued them together).
    token_counts[key] = sum(len(word_tokenizer.tokenize(line)) for line in value)
token_size_texts = pd.DataFrame(token_counts, index=[0])
# TODO: frequency table

# Word tokens of the full Tikuna ebible; tokens made only of the digits 0-9
# (verse and chapter numbers) are left out.
taille = 0  # not used in this cell; kept in case other cells read it
comptage = [
    token
    for value in tik_ebible_full.values()
    for token in word_tokenizer.tokenize(value)
    if re.search(r'[^0-9]', token)
]
len(comptage)

317009

# Master Content Database 

The data is stored in a dictionary. Here is a small overview of the data:

| Key    | Subkeys      |   Value   | Content   ||
| :---   |    :----:    |   ---:    |---:       |--:|
|TS_DIC  | Column names |   Tokens  | translations||
|LLB_NEW | Chapter Name |   Tikuna  |||
|LLB_OLD | Chapter Name |   Tikuna  |||
|T_Ebible| Chapter Name |   Tikuna  |||
|S_Ebible| Chapter Name |   Spanish |||
|T_OHCR  | Alinea       |   Tikuna  |||
|S_OHCR  | Alinea       |   Spanish |||
|T_CRU   | None         |   Tikuna  |||

*TODO: Draw the data structure tree. What has to be kept in hierarchical order, and what part as whole text?*
*Do both for modularity*





##  spacy

### Load models

In [None]:
# Load the Spanish spaCy pipeline `es_dep_news_trf`; tag_text() below calls
# it through the global name `nlp_es`
nlp_es = spacy.load("es_dep_news_trf")




### pos tagging texts in spanish

In [None]:
def tag_text(text_list, nlp=None):
    """Tag a text with spaCy and return one tuple per token.

    Parameters
    ----------
    text_list : iterable
        Lines of the text; each item is converted to a string.
    nlp : callable, optional
        Pipeline to run, called as ``nlp(text, disable=['ner'])``. Defaults
        to the notebook-level ``nlp_es``.

    Returns
    -------
    list of tuple
        ``(token.text, token.pos_, token.dep_)`` for every token of the
        processed text.
    """
    pipeline = nlp_es if nlp is None else nlp
    # Lines are joined with a space: joining with '' glued the last word of a
    # line to the first word of the next one. The f-string conversion keeps
    # working even if the name `str` has been rebound in the notebook
    # namespace.
    joined = ' '.join(f"{line}" for line in text_list)
    doc = pipeline(joined, disable=['ner'])
    return [(token.text, token.pos_, token.dep_) for token in doc]



In [None]:
# Tag texts vs token lists with annotation

In [None]:
# Annotate the Spanish and Tikuna html data so they can be aligned
# txt, paragraph, sentence, token
# Delta Crubadan & Dataset
# Match dictionary content with bible
# Create function for efficient matching