# Ejemplos del modelo `linearCRF_reg_k_3.crf`

### Parametros

* l1 = 0.1
* l2 = 0.001
* Max_Iter = 50
* Accuracy promedio = `0.9604`

### Ejemplo de Feature List

*Feature list* para a letra 'n' de la palabra "díníphé" de la frase "díníphé nι rι 'yó'"

```python
[
    'bias',
    'letterLowercase=n',
    'postag=v',
    'BOS',
    'BOS',
    'nxtpostag=det',
    'letterposition=-4',
    'prev2letters=dí>',
    'prevletter=í>',
    'nxtletter=<í',
    'nxt2letters=<íp',
    'nxt3letters=<íph',
    'nxt4letters=<íphé'
]
```

In [35]:
import os
import pycrfsuite
import sys  
import random
from utils import get_corpus, WordsToLetter
from corpus_utils import (oto_glosser, words_report, gloss_to_csv, obtener_frase)

In [36]:
env_name = "linearCRF"
model_name = "linearCRF_reg_k_3.crf"
model_path = os.path.join("models", env_name, model_name)
print("Full path", model_path)
tagger = pycrfsuite.Tagger()
# Cargando modelos preentrenados
tagger.open(model_path)

Full path models/linearCRF/linearCRF_reg_k_3.crf


<contextlib.closing at 0x7fc0cb10ae20>

In [37]:
# Obteniendo corpus para pruebas
corpus = get_corpus('corpus_otomi_mod', '../corpora/') + \
         get_corpus('corpus_otomi_hard', '../corpora/')
letter_corpus = WordsToLetter(corpus)

In [38]:
# funciones auxiliares    
def obtener_palabras(frases):
    palabras = []
    for frase in frases:
        chunks = [palabra[0] for palabra in frase[:-1]]
        palabras.append("".join(chunks))
    return palabras

def accuracy_score(y_test, y_pred):
    right, total = 0, 0
    for real, prediction in zip(y_test, y_pred):
        if real == prediction:
            right += 1
        else:
            breakpoint()
    return right / len(y_test)


def get_labels(sent, flag=0):
    labels = []
    for word in sent:
        for letter in word:
            labels.append(letter[2])
    return labels


def sent2features(data):
    return [get_feature_lists(sent) for sent in data]


def sent2labels(data):
    return [get_labels(sent) for sent in data]


def get_feature_lists(sent):
    ''' Reglas que configuran las feature lists para entrenamiento

    :param sent: Data as `[[[[[letter, POS, BIO-label],...],words],sents]]`
    :type: list
    :return: list of words with characters as features list:
        [[[[[letterfeatures],POS,BIO-label],letters],words]]
    :rtype: list
    '''

    featurelist = []
    senlen = len(sent)
    # each word in a sentence
    for i in range(senlen):
        word = sent[i]
        wordlen = len(word)
        lettersequence = ''
        # each letter in a word
        for j in range(wordlen):
            letter = word[j][0]
            # gathering previous letters
            lettersequence += letter
            # ignore digits
            if not letter.isdigit():
                features = [
                    'bias',
                    'letterLowercase=' + letter.lower(),
                    'postag=' + word[j][1],
                ]
                # Position of word in sentence
                if i == senlen -1:
                    features.append("EOS")
                else:
                    features.append("BOS")

                # Pos tag sequence (Don't get pos tag if sentence is 1 word long)
                if i > 0 and senlen > 1:
                    features.append('prevpostag=' + sent[i-1][0][1])
                    if i != senlen-1:
                        features.append('nxtpostag=' + sent[i+1][0][1])
                    else:
                        features.append('EOS')
                else:
                    features.append('BOS')
                    #Don't get pos tag if sentence is 1 word long
                    if i != senlen-1:
                        features.append('nxtpostag=' + sent[i+1][0][1])

                # Position of letter in word
                if j == 0:
                    features.append('BOW')
                elif j == wordlen-1:
                    features.append('EOW')
                else:
                    features.append('letterposition=-%s' % str(wordlen-1-j))

                # Letter sequences before letter
                if j >= 4:
                    features.append('prev4letters=' + lettersequence[j-4:j].lower() + '>')
                if j >= 3:
                    features.append('prev3letters=' + lettersequence[j-3:j].lower() + '>')
                if j >= 2:
                    features.append('prev2letters=' + lettersequence[j-2:j].lower() + '>')
                if j >= 1:
                    features.append('prevletter=' + lettersequence[j-1:j].lower() + '>')

                # letter sequences after letter
                if j <= wordlen-2:
                    nxtlets = word[j+1][0]
                    features.append('nxtletter=<' + nxtlets.lower())
                if j <= wordlen-3:
                    nxtlets += word[j+2][0]
                    features.append('nxt2letters=<' + nxtlets.lower())
                if j <= wordlen-4:
                    nxtlets += word[j+3][0]
                    features.append('nxt3letters=<' + nxtlets.lower())
                if j <= wordlen-5:
                    nxtlets += word[j+4][0]
                    features.append('nxt4letters=<' + nxtlets.lower())
            featurelist.append(features)
    return featurelist

## Los peores ejemplos

In [None]:
peores_frases = []
data = dict()
for i, example in enumerate(letter_corpus):
    # Obtenemos las palabras de la frase en forma de lista
    palabras = obtener_palabras(corpus[i])
    # Creación de feature functions
    feature_lists = get_feature_lists(example)    
    # Etiquetas predichas por el modelo
    predicted_tags = tagger.tag(feature_lists)  
    # Etiquetas reales para las partes de las palabras
    real_tags = get_labels(example)
    accuracy = accuracy_score(real_tags, prediction_tags)
    # Accuracy score albitrario para las peores
    if accuracy <= 0.8:
        # Glosador con las etiquetas predichas
        predicted_gloss = oto_glosser(palabras, predicted_tags, corpus[i])
        # Estuctura para ser escrita en csv
        data["frase"] = " ".join(palabras)
        data["pred-tags"] = predicted_tags
        data["real-tags"] = real_tags
        data["index"] = i
        data["accuracy"] = accuracy
        peores_frases.append(data)
        data = {}

> <ipython-input-38-cf9a5f83fc2d>(27)accuracy_score()
-> wrong += 1


(Pdb)  l


 22  	        for t, p in zip(tests, predictions):
 23  	            if t == p:
 24  	                right += 1
 25  	            elif t != p:
 26  	                breakpoint()
 27  ->	                wrong += 1
 28  	
 29  	    return right / total
 30  	
 31  	
 32  	def get_labels(sent, flag=0):


(Pdb)  p t


'c'


(Pdb)  p p


'1'


(Pdb)  p tests


'B-como'


(Pdb)  p predictions


'B-1.icp'


In [30]:
peores_frases = sorted(peores_frases, key=lambda t: t['accuracy'])

In [31]:
peores_frases[0]["pred-tags"]

['B-como',
 'I-como',
 'B-3.cpl',
 'I-3.cpl',
 'B-ctrf',
 'I-ctrf',
 'B-stem',
 'I-stem',
 'I-stem']

In [32]:
peores_frases[0]["real-tags"]

['B-como',
 'I-como',
 'B-3.cpl',
 'I-3.cpl',
 'B-ctrf',
 'I-ctrf',
 'B-stem',
 'I-stem',
 'I-stem']

### Imprimir reporte

In [13]:
corpus[i]

[[['dí', '1.icp'], ['ní', 'stem'], ['p', 'lig'], ['hé', 'pl.exc'], 'v'],
 [['nι', 'det'], 'det'],
 [['rι', 'det'], 'det'],
 [["'yó'", 'stem'], 'n']]

In [14]:
predicted_gloss

[[['dí', '1.icp'], ['níphé', 'stem'], 'v'],
 [['nι', 'det'], 'det'],
 [['rι', 'det'], 'det'],
 [["'yó'", 'stem'], 'n']]

In [15]:
real_tags

['B-1.icp',
 'I-1.icp',
 'B-stem',
 'I-stem',
 'B-lig',
 'B-pl.exc',
 'I-pl.exc',
 'B-det',
 'I-det',
 'B-det',
 'I-det',
 'B-stem',
 'I-stem',
 'I-stem',
 'I-stem']

In [17]:
prediction_tags

['B-1.icp',
 'I-1.icp',
 'B-stem',
 'I-stem',
 'I-stem',
 'I-stem',
 'I-stem',
 'B-det',
 'I-det',
 'B-det',
 'I-det',
 'B-stem',
 'I-stem',
 'I-stem',
 'I-stem']

In [6]:
# Tomamos los 5 peores
words_report(peores_frases, letter_corpus, get_feature_lists, tagger, 5)

--------------------------------------------------
Ejemplo 1 de 5 | Frase: "padú" | Accuracy: 0.25
Letra | Predicción | Real | eq?
p | B-stem | b'B-3.prf' | False
a | I-stem | b'I-3.prf' | False
d | I-stem | b'B-stem' | False
ú | I-stem | b'I-stem' | False
--------------------------------------------------
Ejemplo 2 de 5 | Frase: "doné" | Accuracy: 0.25
Letra | Predicción | Real | eq?
d | B-stem | b'B-1.pot' | False
o | I-stem | b'I-1.pot' | False
n | I-stem | b'B-stem' | False
é | I-stem | b'I-stem' | False
--------------------------------------------------
Ejemplo 3 de 5 | Frase: "bεhthó bεhthó" | Accuracy: 0.3333333333333333
Letra | Predicción | Real | eq?
b | B-stem | b'B-stem' | False
ε | I-stem | b'I-stem' | False
h | I-stem | b'B-lig' | False
t | B-ila | b'B-neg' | False
h | I-ila | b'I-neg' | False
ó | I-ila | b'I-neg' | False
b | B-stem | b'B-stem' | False
ε | I-stem | b'I-stem' | False
h | I-stem | b'B-lig' | False
t | B-ila | b'B-neg' | False
h | I-ila | b'I-neg' | False
ó |

In [7]:
# Escribirmos resultados en formato csv
gloss_to_csv(peores_frases, corpus, "peores-frases-linearCRF_reg")

### Los mejores

Como de estos hay muchos vamos a tomar 3 al azar

In [None]:
mejores_frases = []
for i, example in enumerate(letter_corpus):
    # Obtenemos las palabras de la frase en forma de lista
    palabras = obtener_palabras(corpus[i])
    # Creación de feature functions
    feature_lists = get_feature_lists(example)    
    # Etiquetas predichas por el modelo
    prediction_tags = tagger.tag(feature_lists)      
    # Etiquetas reales para las partes de las palabras
    real_tags = get_labels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    # Accuracy score albitrario para las peores
    if accuracy == 1.0:
        # Glosador con las etiquetas predichas
        predicted_gloss = oto_glosser(palabras, prediction_tags, corpus[i])
        # Estuctura para ser escrita en csv
        mejores_frases.append((palabras, predicted_gloss, i, accuracy))

In [None]:
# Tomamos los 5 peores
words_report(mejores_frases, letter_corpus, get_feature_lists, tagger, 5)

In [None]:
# Escribirmos resultados en formato csv
gloss_to_csv(mejores_frases, corpus, "mejores-frases-linearCRF")