In [2]:
import pandas as pd
#from pandas import option_context
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from tqdm import tqdm
import re
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import spacy
import pypandoc

# Increase limit of characters processed
# Need to specify to not use spcifics models to avoid overload
# nlp_es.max_length = 2000000


## (fonctions)

In [75]:
# funct. import text, clean withite spaces, return cleaned text as list
def clean_input(txt_file):
    file_content = open(txt_path + file + ".txt", "r").readlines()
    file_cleaned = []
    for line in file_content:
        if not line.isspace():
            file_cleaned.append(line.strip('\n'))
    file_cleaned = '-'.join(file_cleaned)
    return(file_cleaned)


def remove_html_tags(text):
    clean = re.sub('<.*>','',text)
    return(clean)

## Textual corpus


### Input files

| Acronym      | Author | Langage     |Original format|Source|
| :---         |    :----:   |          ---: |---:|--:|
|TS_DIC  | Anderson dictionnary | Tik-Spa |CSV||
|LLB_NEW |New Testament La Ligua Biblica |Tikuna|PDF||
|LLB_OLD |Old Testament La Ligua Biblica |Tikuna|PDF||
|T_Ebible|Ebible |Tikuna|HTML|https://ebible.org/bible/details.php?id=tcaNT|
|S_Ebible|EBible| Spanish|HTML|https://ebible.org/bible/details.php?id=spavbl|
|T_OHCR  |Universal Declaration Of Human Rights | Tikuna|TXT||
|S_OHCR  |Universal Declaration Of Human Rights  |Spanish|TXT||
|T_CRU  |Crubadan Synsets  |Tikuna|TXT||


In [76]:
# TODO : instead of length os text, transform into number of tokens and add csv
# Create dictionary to store content
corpus = {}
# Define files names
# TODO get from directory
name_files_txt = ['llb_new','llb_old','t_ohcr','s_ohcr','t_cru']
# explicit path
# TODO clean name
txt_path = 'sources/txt/'
# Fill the doctionary with the clean content
for file in name_files_txt:
    corpus[file] = clean_input(file)

## Textual Sources
- PDF content : Adobe Acrobat to convert pdf to docx
- Docx content : pandoc docx to plaintext
- html content : pandoc html to plain

### Cleaning
For both docx and html version of the bible, keep a maximum of metadata related to the text structure to facilitate the alignement of tikuna and spanish.
Therefore one version is kept to add a maximum of annotation, and a second version of the context sums all the files in one.
The cleaning focus on the elminiation of the copyright trademark, page navigation options etc...

#### Option B with creating text files but no access to html structure after
pypandoc.convert_file('chapters/*.md', 'docx', outputfile="somefile.docx")
and create text files
then match names of texts in tikuna and spanish
create a graphical map in the notebook

### Storing
All bible content are stored in a dictionnary.

!!! clean file structures, clean path names and test them again !

In [77]:
directory = 'sources/txt/spa_ebible'
spa_ebible = {}
master_f = []

for filename in os.listdir(directory):
    f = os.path.join(directory,filename)
    if os.path.isfile(f):
        if filename.endswith(".txt"):
            rename = re.search(r'([0-9]{3,3}.*[0-9]{2,2})', filename)
            content = open(os.path.join(directory, filename), "rt",encoding="UTF-8-sig").read()
            content = re.sub('\n','',content)  
            master_f.append(content)          
            spa_ebible[rename.group(0)] = content

spa_ebible =  dict(sorted(spa_ebible.items()))

directory = 'sources/txt/tik_ebible'
tik_ebible = {}
master_f = []

for filename in os.listdir(directory):
    f = os.path.join(directory,filename)
    if os.path.isfile(f):
        if filename.endswith(".txt"):
            rename = re.search(r'([0-9]{3,3}.*[0-9]{2,2})', filename)
            content = open(os.path.join(directory, filename), "rt",encoding="UTF-8-sig").read()
            content = re.sub('\n','',content)
            master_f.append(content)          
            tik_ebible[rename.group(0)] = content

tik_ebible =  dict(sorted(tik_ebible.items()))

# directory = 'sources/original/tcaNT_html'
# tik_ebible_full = {}

# for filename in os.listdir(directory):
#     f = os.path.join(directory,filename)
#     if os.path.isfile(f):
#         if filename.endswith(".htm"):
#             output = pypandoc.convert_file(f, 'rst',format = 'html')
#             filename = re.sub('\.htm','',filename)
#             tik_ebible_full[filename] = output

# directory = 'sources/original/spavbl_html'
# spa_ebible_full = {}

# for filename in os.listdir(directory):
#     f = os.path.join(directory,filename)
#     if os.path.isfile(f):
#         if filename.endswith(".htm"):
#             output = pypandoc.convert_file(f, 'rst',format = 'html')
#             filename = re.sub('\.htm','',filename)
#             spa_ebible_full[filename] = output

# directory = 'sources/txt/tik_ebible'
# tik_ebible = {}
# master_f = []

# for filename in os.listdir(directory):
#     f = os.path.join(directory,filename)
#     if os.path.isfile(f):
#         if filename.endswith(".txt"):
#             rename = re.search(r'([0-9]{3,3}.*[0-9]{2,2})', filename)
#             content = open(os.path.join(directory, filename), "rt",encoding="UTF-8-sig").read()
#             content = re.sub('\n','',content)
#             master_f.append(content)          
#             tik_ebible[rename.group(0)] = content

# tik_ebible =  dict(sorted(tik_ebible.items()))

# #spa_ebible_full
# for key, value in spa_ebible_full.items():
#     # Clean content by removing all tags and content from inside.
#     # TODO : Keep hetml descritpion tags as meta data

#     spa_ebible_full[key] = remove_html_tags(value)
# # Sort dictionary to sort by apostle

# spa_ebible_full =  dict(sorted(spa_ebible_full.items()))

# for key, value in tik_ebible_full.items():
#     tik_ebible_full[key] = remove_html_tags(value)
#     tik_ebible_full =  dict(sorted(tik_ebible_full.items()))


# def clean_annotate_ebible(chapter_txt):
#     #char_to_remove = ['\-','\`','``','\.','\s*\>\s*','\s*<\s*','copyright','© 2008 WBT','_']
#     container_type = ['mt',',main','m','q','p','s','chapterlabel']
#     # from container :: [a-z]* to .., container annotation.
#     c_chapter = re.sub('\.\. container\:\:','',chapter_txt)
#     c_chapter = re.sub('\n','',c_chapter)
#     #c_chapter = re.search('^(.*?)[a-z]*','',chapter_txt)
#     #c_chapter = re.search('(\w+)',chapter_txt)
#     #print(c_chapter.group(0))
#     return(c_chapter)


In [78]:
def align_content_bible(dic1,dic2):
    bil_dict = {}
    common_chapters = [i for i in dic1.keys() if i in dic2.keys()]
    for element in common_chapters:
        bil_dict[element] = {'spanish':'','tikuna':''}
        bil_dict[element]['tikuna'] = dic1[element]
        bil_dict[element]['spanish'] = dic2[element]
    return(bil_dict)
bil_dict = align_content_bible(tik_ebible,spa_ebible)
# Remove first entry describing the corpus
del bil_dict['000_000_000']

In [240]:
bil_dict[1] = {'spanish':'','tikuna':''}
bil_dict[1]['tikuna'] = corpus['t_ohcr']
bil_dict[1]['spanish'] = corpus['s_ohcr']


In [215]:
ts_dic = pd.read_csv(r'../dic_preprocessing/output.csv')
ts_dic = ts_dic[ts_dic['example_spanish'].notna()].reset_index(drop=True)
ex_t = ts_dic['example_tikuna']
ex_s = ts_dic['example_spanish']
# Keep the index of the source df ?
def align_examples(tikuna,spanish):
    examplen = {}
    for i in range(len(tikuna)):
        examplen[i] = {'spanish':'','tikuna':''}
        examplen[i]['tikuna'] = tikuna[i]
        examplen[i]['spanish'] = spanish[i]
    return(examplen)
dic_examples = align_examples(ex_t,ex_s)


In [247]:
master_output_training = {'ebible':'','ohcr':'','examples':''}

In [248]:
master_output_training['ebible'] = bil_dict
master_output_training['ohcr'] = bil_dict['ohcr']
master_output_training['examples'] = dic_examples

In [312]:
master_output_training['examples']

{0: {'spanish': 'Pues, yo no sé.', 'tikuna': 'A, tamanüxü chacuax.'},
 1: {'spanish': 'Ese zancudo me picó.', 'tikuna': 'Ngẽma ã rü choxü̃ napai.'},
 2: {'spanish': 'Esa billetera llena se me cayó.',
  'tikuna': 'Ngẽma diẽruchixü̃ i ããcuxü̃ rü choxǘ ̃̃  ínangu.'},
 3: {'spanish': 'No había  planta hojosa porque las curuhuinses  habían comido las hojas.',
  'tikuna': 'Nataxuma ga nanetü ga ãátüxü̃ yerü  naiyüxü nanangõ̱ xatü.'},
 4: {'spanish': 'Mezclamos la avena con  leche para tomarla.',
  'tikuna': 'Abéna rü  lechimaxã tanaxüéxü̃ nax ngẽmaãcü yaxaxüxü̃cax̱ .'},
 5: {'spanish': 'Ese avión se cayó.', 'tikuna': 'Yima abióũ rü narüngu.'},
 6: {'spanish': 'Pintaron la cola de ese avión.',
  'tikuna': 'Nanachaxu i ngẽma  abióũcüra.'},
 7: {'spanish': 'El hangar es donde el guardián cuida el avión.',
  'tikuna': 'Yima abióũpatawa  nixĩ i nüxnanadauxü̃ ya abióũ i ngẽma  norü dauruxü̃.'},
 8: {'spanish': 'El ala del avión lo sostiene  para que no caiga.',
  'tikuna': 'Ngẽma  abióũpexatü rü 

In [292]:
from nltk.tokenize import RegexpTokenizer
# tokenizer = RegexpTokenizer(r'\w+')
# tokenizer.tokenize(txt)
def get_size_corpus(txt):
    #n_token = {'spanish':'','tikuna':''}
    tokenizer = RegexpTokenizer(r'\w+')
    n_token = len(tokenizer.tokenize(txt))
    return(n_token)

ss = 0
st = 0
for k,v in master_output_training['ebible'].items():
   ss += get_size_corpus(v['spanish'])
   st += get_size_corpus(v['tikuna'])

print(str(ss) + " " + str(st))

183697 291217


In [265]:
import json
jason = json.dumps(master_output_training)

# Descriptive statistics

## Dictionary

In [None]:
# mono / poly
# Number of unique spanish words vs number of tokens
# Count : length of examples, number of missing spanish (def)
# Look for (missing) occurwnce across all the dataset
# length of list of spa_tokens
# Longueur des exemmpes
# ts_dic['token_pos'].value_counts()
# Split list of pos for categories
# missing data
#ts_dic['token_spa'].isnull().sum()
# len(ts_dic[ts_dic['example'] != '']) 




## Texts

In [21]:
# number of tokens in documents
token_size_texts = {}
for key,value in corpus.items():
    text = ''.join(value)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    tok_words = tokenizer.tokenize(text)
    n_tokens = len(tok_words)
    token_size_texts[key] = n_tokens
token_size_texts = pd.DataFrame(token_size_texts, index=[0])
#print(token_size_texts)
# Remove punctuation to have only words
# frequency table

taille = 0
comptage = []
for key, value in tik_ebible_full.items():
    text = ''.join(value)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    tok_words = tokenizer.tokenize(text)
    for tokene in tok_words:
        match = re.search(r'[^0-9]',tokene)
        if match:
            comptage.append(tokene)
len(comptage)

317009

# Master Content Database 

The data is store in a dictionnary here is an small overview of the data

| Key    | Subkeys      |   Value   | Content   ||
| :---   |    :----:    |   ---:    |---:       |--:|
|TS_DIC  | Column names |   Tokens  | translations||
|LLB_NEW | Chapter Name |   Tikuna  |||
|LLB_OLD | Chapter Name |   Tikuna  |||
|T_Ebible| Chapter Name |   Tikuna  |||
|S_Ebible| Chapter Name |   Spanish |||
|T_OHCR  | Alinea       |   Tikuna  |||
|S_OHCR  | Alinea       |   Spanish |||
|T_CRU   | None         |   Tikuna  |||

*TODO : Draw the data structure tree. What has to be kept in hierachical order, what part pas whole text ?*
*Do both for modularity*





##  spacy

### Load models

In [None]:
# load large spacy models
nlp_es = spacy.load("es_dep_news_trf")




### pos tagging texts in spanish

In [None]:
# spacy tag function. list as input, list of tupple as output.
def tag_text(text_list):
    tag_list = []
    s = ''.join([str(line) for line in text_list])
    doc = nlp_es(s,disable=['ner'])
    for token in doc:
        token_tupple = (token.text, token.pos_, token.dep_)
        tag_list.append(token_tupple)
    return(tag_list)



In [None]:
# Tag texts vs token lists with annotation

In [None]:
# Annoter les données en html espagnol - tikuna pour aligner
# txt, paragraphe, phrase, token
# Delta Crubadan & Dataset
# Match dictionnary content with bible
# Create function for matching efficient