# Entradas y salidas del pipeline

Cuaderno para generar las entradas y salidas del pipeline del glosador automático del otomí de Toluca.

In [1]:
# Add the parent folder to sys.path so that local modules (e.g. utils) can be imported
import sys
sys.path.insert(0, '../')
import pickle
import os
from utils import (get_corpus, extractLabels, extractFeatures, sent2features,
                   sent2labels, WordsToLetter)

In [2]:
def get_preinput_data(corpora_dir='../corpora/'):
    """Load both Otomi corpora and build the preprocessed pipeline input.

    Parameters
    ----------
    corpora_dir : str, optional
        Location passed to ``get_corpus`` as its second argument. Defaults to
        ``'../corpora/'``, the value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(corpus_mod, corpus_hard, corpora, pre_data)`` where ``corpus_mod``
        and ``corpus_hard`` are the results of ``get_corpus`` for
        ``'corpus_otomi_mod'`` and ``'corpus_otomi_hard'``, ``corpora`` is
        ``corpus_mod + corpus_hard`` and ``pre_data`` is
        ``WordsToLetter(corpora)``.
    """
    corpus_mod = get_corpus('corpus_otomi_mod', corpora_dir)
    corpus_hard = get_corpus('corpus_otomi_hard', corpora_dir)
    # Joined with `+`; presumably both corpora are lists -- TODO confirm
    # against utils.get_corpus.
    corpora = corpus_mod + corpus_hard
    pre_data = WordsToLetter(corpora)
    return corpus_mod, corpus_hard, corpora, pre_data

def write_data(data, file_name, objects_dir="pickle_objects/"):
    """Pickle ``data`` into ``objects_dir`` under the name ``file_name``.

    Parameters
    ----------
    data : object
        Any picklable object.
    file_name : str
        Name of the file to create (or overwrite) inside ``objects_dir``.
    objects_dir : str, optional
        Destination folder. Defaults to ``"pickle_objects/"``, the value that
        used to be hard-coded here.

    Returns
    -------
    int
        Always ``0``; kept so existing callers see the same return value.
    """
    # Create the destination folder on demand; without this, open() raises
    # FileNotFoundError when the folder does not exist yet.
    os.makedirs(objects_dir, exist_ok=True)
    path = os.path.join(objects_dir, file_name)
    with open(path, 'wb') as f:
        pickle.dump(data, f)
    return 0

In [3]:
# Load the base corpus, the challenge ("hard") corpus, their concatenation
# (corpora) and the preprocessed data derived from that concatenation
base, hard, corpora, predata = get_preinput_data()
# Build the feature functions from the preprocessed data
X_inputs = sent2features(predata)
# Build the BIO labels from the same preprocessed data
y_inputs = sent2labels(predata)

In [4]:
# Sanity check: every feature sequence needs a matching label sequence.
# zip() would silently drop the extras on a mismatch, so fail loudly here.
assert len(X_inputs) == len(y_inputs), "X_inputs and y_inputs differ in length"
print(len(X_inputs), len(y_inputs))

1769 1769


In [5]:
# Show each (feature function, BIO label) pair of the first example,
# one pair per block, with a row of asterisks after every pair
separator = "*" * 30
for feature, label in zip(X_inputs[0], y_inputs[0]):
    print("Feature: ", feature)
    print("BIO Label:", label)
    print(separator)

Feature:  [b'bias', b"letterLowercase='", b'EOS', b'BOW', b'nxtletter=<\xc3\xa1', b'nxt2letters=<\xc3\xa1b', b'nxt3letters=<\xc3\xa1bi', b'nxt4letters=<\xc3\xa1bim']
BIO Label: b'B-como'
******************************
Feature:  [b'bias', b'letterLowercase=\xc3\xa1', b'EOS', b'letterposition=-7', b"prevletter='>", b'nxtletter=<b', b'nxt2letters=<bi', b'nxt3letters=<bim', b'nxt4letters=<bim\xc3\xa1']
BIO Label: b'I-como'
******************************
Feature:  [b'bias', b'letterLowercase=b', b'EOS', b'letterposition=-6', b"prev2letters='\xc3\xa1>", b'prevletter=\xc3\xa1>', b'nxtletter=<i', b'nxt2letters=<im', b'nxt3letters=<im\xc3\xa1', b'nxt4letters=<im\xc3\xa1k']
BIO Label: b'B-3.cpl'
******************************
Feature:  [b'bias', b'letterLowercase=i', b'EOS', b'letterposition=-5', b"prev3letters='\xc3\xa1b>", b'prev2letters=\xc3\xa1b>', b'prevletter=b>', b'nxtletter=<m', b'nxt2letters=<m\xc3\xa1', b'nxt3letters=<m\xc3\xa1k', b'nxt4letters=<m\xc3\xa1kh']
BIO Label: b'I-3.cpl'
****

Para entrenar un modelo es necesario ejecutar las siguientes líneas:

```python
# Requires `import pycrfsuite`; `verbose`, `hyper` and `output_path`
# must be defined by the caller.
X_input = sent2features(predata)
y_input = sent2labels(predata)

trainer = pycrfsuite.Trainer(verbose=verbose)

# Register one (feature sequence, label sequence) pair per training example
for xseq, yseq in zip(X_input, y_input):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': hyper['L1'],  # coefficient for L1 penalty
    'c2': hyper['L2'],  # coefficient for L2 penalty
    'max_iterations': hyper['max-iter']  # upper bound on training iterations
})
# The method name is lowercase: `Trainer.train`, not `Trainer.Train`
trainer.train(output_path)
```

## Escribiendo los objetos de entrada en disco

In [6]:
# Base Otomi corpus
write_data(base, 'corpus_base')
# Challenge ("hard") Otomi corpus
write_data(hard, 'corpus_hard')
# Corpora = base corpus + challenge corpus
write_data(corpora, 'corpora')
# Preprocessed data as nested lists shaped like
# [[[[[letter, POS, BIO-label],...], words], sentences]],
# ready to be turned into feature functions
write_data(predata, 'predata')
# Feature functions used as input to the CRFs
write_data(X_inputs, 'X_input')
# BIO labels associated with each feature function
write_data(y_inputs, 'y_input')

0

## Cargando modelos preentrenados para generar glosa (TODO)