# Entradas y salidas del pipeline

## Obteniendo las entradas

In [1]:
# Up one folder for local modules issues
import sys
sys.path.insert(0, '../')
import pickle
from utils import (get_corpus, extractLabels, extractFeatures, sent2features,
                   sent2labels, WordsToLetter)

In [2]:
def get_preinput_data():
    """Load both corpora and derive the pre-processed data from their union.

    Reads 'corpus_otomi_mod' and 'corpus_hard' through ``get_corpus`` from
    the '../corpora/' directory, concatenates them with ``+`` and passes the
    result to ``WordsToLetter``.

    Returns:
        A 4-tuple ``(corpus_mod, corpus_hard, corpora, pre_data)``:
        the two individual corpora, their concatenation, and the value
        returned by ``WordsToLetter`` for that concatenation.
    """
    corpora_dir = '../corpora/'
    base_corpus = get_corpus('corpus_otomi_mod', corpora_dir)
    hard_corpus = get_corpus('corpus_hard', corpora_dir)
    merged = base_corpus + hard_corpus
    # presumably splits each word into per-letter entries — confirm in utils
    letter_level = WordsToLetter(merged)
    return base_corpus, hard_corpus, merged, letter_level

def write_data(data, file_name, base_path="pickle_objects/"):
    """Pickle ``data`` into ``base_path`` under the name ``file_name``.

    The target directory is created when it does not exist yet, so the
    function also works on a fresh checkout where the output folder has
    never been created.

    Args:
        data: Any picklable object.
        file_name: Name of the file to create inside ``base_path``.
        base_path: Output directory. Defaults to "pickle_objects/", the
            location used before this parameter existed.

    Returns:
        0 on success (kept so existing callers see the same return value).

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
        pickle.PicklingError: If ``data`` cannot be pickled.
    """
    # Local import: the notebook's import cell does not bring ``os`` in.
    import os

    # An empty base_path means "current directory"; makedirs('') would raise.
    if base_path:
        os.makedirs(base_path, exist_ok=True)
    with open(os.path.join(base_path, file_name), 'wb') as f:
        pickle.dump(data, f)
    return 0

In [3]:
# Load the two corpora, their concatenation, and the WordsToLetter output
base, hard, corpora, predata = get_preinput_data()
# Build the model inputs: one feature sequence per example in predata
X_inputs = sent2features(predata)
# Build the targets: one BIO label sequence per example in predata
y_inputs = sent2labels(predata)

In [4]:
# Sanity check: features and labels are zipped pairwise later on, so both
# collections are expected to have the same number of examples.
print(len(X_inputs), len(y_inputs))

1786 1786


In [5]:
# Walk through the first example and show, position by position, the
# feature list next to the BIO label assigned to it.
separator = "*" * 30
first_features = X_inputs[0]
first_labels = y_inputs[0]
for feature, label in zip(first_features, first_labels):
    print("Feature: ", feature)
    print("BIO Label:", label)
    print(separator)

Feature:  [b'bias', b'letterLowercase=n', b'BOS', b'BOW', b'nxtletter=<d', b'nxt2letters=<d\xc3\xb3', b'nxt3letters=<d\xc3\xb3p', b'nxt4letters=<d\xc3\xb3ph']
BIO Label: b'B-psd'
******************************
Feature:  [b'bias', b'letterLowercase=d', b'BOS', b'letterposition=-6', b'prevletter=n>', b'nxtletter=<\xc3\xb3', b'nxt2letters=<\xc3\xb3p', b'nxt3letters=<\xc3\xb3ph', b'nxt4letters=<\xc3\xb3ph\xce\xbc']
BIO Label: b'B-1.cpl'
******************************
Feature:  [b'bias', b'letterLowercase=\xc3\xb3', b'BOS', b'letterposition=-5', b'prev2letters=nd>', b'prevletter=d>', b'nxtletter=<p', b'nxt2letters=<ph', b'nxt3letters=<ph\xce\xbc', b'nxt4letters=<ph\xce\xbcd']
BIO Label: b'I-1.cpl'
******************************
Feature:  [b'bias', b'letterLowercase=p', b'BOS', b'letterposition=-4', b'prev3letters=nd\xc3\xb3>', b'prev2letters=d\xc3\xb3>', b'prevletter=\xc3\xb3>', b'nxtletter=<h', b'nxt2letters=<h\xce\xbc', b'nxt3letters=<h\xce\xbcd', b'nxt4letters=<h\xce\xbcdi']
BIO Label: b

Para entrenar un modelo es necesario ejecutar las siguientes líneas:

```python
# Assumes `pycrfsuite` is imported and that `predata`, `verbose`, `hyper`
# and `output_path` are already defined by the caller.
X_input = sent2features(predata)
y_input = sent2labels(predata)

trainer = pycrfsuite.Trainer(verbose=verbose)

# Register every (feature sequence, label sequence) pair as a training item
for xseq, yseq in zip(X_input, y_input):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': hyper['L1'],  # coefficient for L1 penalty
    'c2': hyper['L2'],  # coefficient for L2 penalty
    'max_iterations': hyper['max-iter']  # upper bound on training iterations
})
# The method is lowercase `train`; `Trainer` has no `Train` attribute.
# It fits the model and writes it to `output_path`.
trainer.train(output_path)
```

In [7]:
# Persist every intermediate object with write_data, which pickles each one
# under "pickle_objects/" using the given file name.
# NOTE: the features/labels are stored as 'X_input' / 'y_input' (singular),
# unlike the X_inputs / y_inputs variable names used in this notebook.
# The last call is the cell's final expression, so its return value (0) is
# what the notebook displays as the cell output.
write_data(base, 'corpus_base')
write_data(hard, 'corpus_hard')
write_data(corpora, 'corpora')
write_data(predata, 'predata')
write_data(X_inputs, 'X_input')
write_data(y_inputs, 'y_input')

0