## Ejemplos del modelo Linear Chain CRF

### Nombre

* linearCRF_l2_zero_50_0.1_0_k_10.crfsuite

### Parámetros

* l1 = 0.1
* l2 = 0
* Precisión general = 0.9516
* Max_Iter = 50

In [1]:
import os
import pycrfsuite
import sys  
import random
sys.path.insert(0, '../')
from utils import get_corpus, WordsToLetter, extractLabels, extractTokens
from corpus_utils import (oto_glosser, words_report, gloss_to_csv)

In [2]:
# File name of the pretrained model; the file on disk carries a "tsu_" prefix.
model = "linearCRF_l2_zero_50_0.1_0_k_10.crfsuite"
model_path = os.path.join("../models/linearChainCRFs", f"tsu_{model}")
print(model_path)

# Load the pretrained model into a fresh tagger.
tagger = pycrfsuite.Tagger()
tagger.open(model_path)

../models/linearChainCRFs/tsu_linearCRF_l2_zero_50_0.1_0_k_10.crfsuite


<contextlib.closing at 0x7f5e4bfd0610>

In [3]:
# Test corpus: both corpora concatenated, in this order.
corpus = (
    get_corpus('corpus_otomi_mod', '../corpora/')
    + get_corpus('corpus_hard', '../corpora/')
)
# Same corpus after WordsToLetter (presumably a per-letter view -- confirm in utils).
letter_corpus = WordsToLetter(corpus)

In [4]:
# funciones auxiliares    
def obtener_palabras(frases):
    """Rebuild one string per entry of ``frases``.

    For every entry, the last element is dropped and the first item of each
    remaining element is concatenated into a single string.
    (Presumably the dropped element is the word-level tag and the remaining
    ones are ``[chunk, label]`` pairs -- confirm against the corpus format.)

    :param frases: iterable of sliceable sequences
    :return: list with one joined string per entry, in the same order
    """
    return ["".join(parte[0] for parte in frase[:-1]) for frase in frases]

def accuracy_score(y_test, y_pred):
    """Fraction of positions where the prediction equals the reference label.

    :param y_test: sequence of reference labels (must support ``len``)
    :param y_pred: iterable of predicted labels, compared position by position
    :return: number of matching positions divided by ``len(y_test)``;
        ``0.0`` when ``y_test`` is empty
    :rtype: float

    Pairs are formed with ``zip``, so positions beyond the shorter input are
    never counted as hits, while the denominator is always ``len(y_test)``.
    """
    total = len(y_test)
    # An empty reference used to raise ZeroDivisionError; report 0.0 instead.
    if total == 0:
        return 0.0
    right = sum(1 for real, prediction in zip(y_test, y_pred) if real == prediction)
    return right / total


def feature_functions_maker(sent):
    '''Build the per-letter feature lists for one sentence.

    :param sent: Sentence as a list of words; every word is a list of letter
        entries where ``entry[0]`` is the letter and ``entry[1]`` the POS tag
        (e.g. ``[letter, POS, BIO-label]``).
    :type sent: list
    :return: One list of UTF-8 encoded feature strings per letter, flattened
        over all the words of the sentence, in reading order.
    :rtype: list

    Digits do not get features of their own: the feature list of the most
    recent non-digit letter is emitted again, so the output keeps exactly one
    entry per letter. If a digit shows up before any non-digit letter, a
    bias-only list is emitted (this case used to raise UnboundLocalError).
    '''

    featurelist = []
    senlen = len(sent)
    # Carried over between letters and re-emitted for digits. The initial
    # value is only ever used when a digit precedes every non-digit letter.
    features = ['bias']
    # each word in a sentence
    for i in range(senlen):
        word = sent[i]
        wordlen = len(word)
        lettersequence = ''
        # each letter in a word
        for j in range(wordlen):
            letter = word[j][0]
            # gathering previous letters (digits included)
            lettersequence += letter
            # digits keep the previous feature list (see docstring)
            if not letter.isdigit():
                features = [
                    'bias',
                    'letterLowercase=' + letter.lower(),
                ]
                # Position of word in sentence.
                # NOTE(review): every non-final word is marked "BOS", not only
                # the first one. Left untouched because the feature strings
                # presumably have to match what the saved model was trained
                # with -- confirm before changing.
                features.append("EOS" if i == senlen - 1 else "BOS")

                # POS tags of the neighbouring words, read from their first
                # letter entry. `i > 0` already implies more than one word.
                # NOTE(review): 'nxtpostag' is never produced for the first
                # word because it is nested under `i > 0` -- confirm intent.
                if i > 0:
                    features.append('prevpostag=' + sent[i-1][0][1])
                    if i != senlen-1:
                        features.append('nxtpostag=' + sent[i+1][0][1])

                # Position of letter in word
                if j == 0:
                    features.append('BOW')
                elif j == wordlen-1:
                    features.append('EOW')
                else:
                    features.append('letterposition=-%s' % str(wordlen-1-j))

                # Letter sequences before letter (longest context first)
                for n in (4, 3, 2):
                    if j >= n:
                        features.append('prev%dletters=' % n + lettersequence[j-n:j].lower() + '>')
                if j >= 1:
                    features.append('prevletter=' + lettersequence[j-1:j].lower() + '>')

                # Letter sequences after letter (shortest context first);
                # stop as soon as the word has no more letters to offer.
                nxtlets = ''
                names = ('nxtletter', 'nxt2letters', 'nxt3letters', 'nxt4letters')
                for n, name in enumerate(names, start=1):
                    if j + n > wordlen - 1:
                        break
                    nxtlets += word[j+n][0]
                    features.append(name + '=<' + nxtlets.lower())

            # Add encoding for pycrfsuite
            featurelist.append([f.encode('utf-8') for f in features])
    return featurelist

## Los peores ejemplos

In [5]:
# Collect the sentences the model tags worst: per-letter accuracy of at most
# 0.8 (an arbitrary cut-off).
peores_frases = []
for i, example in enumerate(letter_corpus):
    frase = corpus[i]
    # Words of the sentence as a list of strings
    palabras = obtener_palabras(frase)
    # Labels predicted by the model from the sentence's feature lists
    prediction_tags = tagger.tag(feature_functions_maker(example))
    # Reference labels for the same sentence
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    if accuracy <= 0.8:
        # Gloss built from the predicted labels
        predicted_gloss = oto_glosser(palabras, prediction_tags, frase)
        # Record kept for the report and the CSV export
        registro = (palabras, predicted_gloss, i, accuracy)
        peores_frases.append(registro)

In [6]:
# Report on 5 of the worst sentences collected above
words_report(peores_frases, letter_corpus, feature_functions_maker, tagger, 5)

--------------------------------------------------
Ejemplo 1 de 5 | Frase: "nuní" | Accuracy: 0.0
Letra | Predicción | Real | eq?
n | B-stem | B-det.dem | False
u | I-stem | I-det.dem | False
n | I-stem | I-det.dem | False
í | I-stem | I-det.dem | False
--------------------------------------------------
Ejemplo 2 de 5 | Frase: "padú" | Accuracy: 0.25
Letra | Predicción | Real | eq?
p | B-stem | B-3.prf | False
a | I-stem | I-3.prf | False
d | I-stem | B-stem | False
ú | I-stem | I-stem | True
--------------------------------------------------
Ejemplo 3 de 5 | Frase: "ximo" | Accuracy: 0.25
Letra | Predicción | Real | eq?
x | B-stem | B-3.prf | False
i | I-stem | I-3.prf | False
m | I-stem | B-stem | False
o | I-stem | I-stem | True
--------------------------------------------------
Ejemplo 4 de 5 | Frase: "xinú" | Accuracy: 0.25
Letra | Predicción | Real | eq?
x | B-stem | B-3.prf | False
i | I-stem | I-3.prf | False
n | I-stem | B-stem | False
ú | I-stem | I-stem | True
--------------

In [7]:
# Export the worst sentences through gloss_to_csv (CSV output, going by the helper's name)
gloss_to_csv(peores_frases, corpus, "peores-frases-linearCRF")

### Los mejores

Como de estos hay muchos, vamos a tomar 5 al azar

In [8]:
# Collect the sentences the model tags perfectly (accuracy exactly 1.0).
mejores_frases = []
for i, example in enumerate(letter_corpus):
    frase = corpus[i]
    # Words of the sentence as a list of strings
    palabras = obtener_palabras(frase)
    # Labels predicted by the model from the sentence's feature lists
    prediction_tags = tagger.tag(feature_functions_maker(example))
    # Reference labels for the same sentence
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    # Keep only the sentences where every label was predicted correctly
    if accuracy == 1.0:
        # Gloss built from the predicted labels
        predicted_gloss = oto_glosser(palabras, prediction_tags, frase)
        # Record kept for the report and the CSV export
        registro = (palabras, predicted_gloss, i, accuracy)
        mejores_frases.append(registro)

In [9]:
# Report on 5 of the best sentences (those tagged with accuracy 1.0)
words_report(mejores_frases, letter_corpus, feature_functions_maker, tagger, 5)

--------------------------------------------------
Ejemplo 1 de 5 | Frase: "ndóphμdi dópεphí bit'μngí bimähtratágí ko chíkóhté" | Accuracy: 1.0
Letra | Predicción | Real | eq?
n | B-psd | B-psd | True
d | B-1.cpl | B-1.cpl | True
ó | I-1.cpl | I-1.cpl | True
p | B-stem | B-stem | True
h | I-stem | I-stem | True
μ | I-stem | I-stem | True
d | I-stem | I-stem | True
i | I-stem | I-stem | True
d | B-1.cpl | B-1.cpl | True
ó | I-1.cpl | I-1.cpl | True
p | B-stem | B-stem | True
ε | I-stem | I-stem | True
p | I-stem | I-stem | True
h | I-stem | I-stem | True
í | I-stem | I-stem | True
b | B-3.cpl | B-3.cpl | True
i | I-3.cpl | I-3.cpl | True
t | B-lig | B-lig | True
' | B-stem | B-stem | True
μ | I-stem | I-stem | True
n | I-stem | I-stem | True
g | B-1.obj | B-1.obj | True
í | I-1.obj | I-1.obj | True
b | B-3.cpl | B-3.cpl | True
i | I-3.cpl | I-3.cpl | True
m | B-stem | B-stem | True
ä | I-stem | I-stem | True
h | I-stem | I-stem | True
t | I-stem | I-stem | True
r | I-stem | I-stem | Tru

In [11]:
# Export the best sentences through gloss_to_csv (CSV output, going by the helper's name)
gloss_to_csv(mejores_frases, corpus, "mejores-frases-linearCRF")