## Ejemplos del POS Less

### Nombre

* POS_Less_50_0.1_0.001_k_10.crfsuite

### Parámetros

* l1 = 0.1
* l2 = 0.001
* Precisión = 0.9499
* Max_Iter = 50

In [None]:
import os
import pycrfsuite
import sys  
import random
sys.path.insert(0, '../')
from utils import get_corpus, WordsToLetter, extractLabels, extractTokens
# File name of the pre-trained CRF model that this notebook loads and evaluates
model = "POS_Less_50_0.1_0.001_k_10.crfsuite"

In [None]:
# funciones auxiliares
def obtener_palabras(frases):
    """Join the chunks of every item in ``frases`` into a single string.

    For each item, the last element is dropped and the first component of
    every remaining element is concatenated, in order.

    :param frases: iterable of sequences of chunks
    :return: list with one concatenated string per item of ``frases``
    """
    return [
        "".join(trozo[0] for trozo in frase[:-1])
        for frase in frases
    ]

def reporte(prediction_tags, real_tags, example):
    """Print a table comparing predicted and real tags, one row per token.

    :param prediction_tags: tags produced by the tagger
    :param real_tags: gold tags for the same example
    :param example: example passed to ``extractTokens`` to obtain the tokens
        shown in the first column
    """
    print("Letra | Predicción | Real | Es correcto?")
    tokens = extractTokens(example)
    # zip stops at the shortest of the three sequences
    for prediction, real, letter in zip(prediction_tags, real_tags, tokens):
        acierto = bool(prediction == real)
        print(f"{letter} | {prediction} | {real} | {acierto}")
        
def accuracy_score(y_test, y_pred):
    """Return the fraction of gold labels that the prediction got right.

    Labels are compared position by position. The denominator is the number
    of gold labels, so gold positions without a matching prediction count
    as errors.

    :param y_test: sequence of gold labels
    :param y_pred: sequence of predicted labels
    :return: value in ``[0, 1]``; ``0.0`` when ``y_test`` is empty (instead
        of raising ``ZeroDivisionError``)
    :rtype: float
    """
    total = len(y_test)
    if total == 0:
        return 0.0
    right = sum(1 for real, prediction in zip(y_test, y_pred) if real == prediction)
    return right / total

In [None]:
# Load the two test corpora and combine them with `+`
corpus = get_corpus('corpus_otomi_mod', '../corpora/') + get_corpus('corpus_hard', '../corpora/')
# Version of the corpus produced by WordsToLetter; indexed in parallel with
# `corpus` by the cells below
letter_corpus = WordsToLetter(corpus)

In [None]:
# Location of the model file inside the POSLess models directory
model_path = os.path.join("../models/POSLess", model)
print(model_path)
tagger = pycrfsuite.Tagger()
# Load the pre-trained model into the tagger
tagger.open(model_path)

In [None]:
def feature_functions_maker(sent):
    """Build the CRF feature list for every character of a sentence.

    One feature list is emitted per character, in reading order, so the
    result has exactly as many items as there are characters in ``sent``.

    :param sent: sentence as a list of words, each word being a list of
        entries such as ``[letter, POS, BIO-label]``; only index 0 (the
        letter) of each entry is read here
    :type sent: list
    :return: flat list with one list of UTF-8 encoded feature strings per
        character
    :rtype: list
    """
    featurelist = []
    senlen = len(sent)
    # Digits get no features of their own: the most recently built feature
    # list is emitted again, which keeps the output aligned with the
    # characters. Seeding the variable here prevents an UnboundLocalError
    # when the very first character of the sentence is a digit.
    features = ['bias']

    for i, word in enumerate(sent):
        wordlen = len(word)
        lettersequence = ''
        for j, entry in enumerate(word):
            letter = entry[0]
            # Running prefix of the word, including the current letter
            lettersequence += letter

            if not letter.isdigit():
                features = [
                    'bias',
                    'letterLowercase=' + letter.lower(),
                ]

                # Position of the word in the sentence.
                # NOTE(review): every word except the last one is marked
                # "BOS"; left untouched because the pre-trained model
                # presumably relies on exactly these features - confirm.
                features.append("EOS" if i == senlen - 1 else "BOS")

                # Position of the letter in the word
                if j == 0:
                    features.append('BOW')
                elif j == wordlen - 1:
                    features.append('EOW')
                else:
                    features.append(f'letterposition=-{wordlen - 1 - j}')

                # Up to four letters preceding the current one, longest first
                for size in (4, 3, 2):
                    if j >= size:
                        previous = lettersequence[j - size:j].lower()
                        features.append(f'prev{size}letters={previous}>')
                if j >= 1:
                    features.append('prevletter=' + lettersequence[j - 1:j].lower() + '>')

                # Up to four letters following the current one, shortest first
                nxtlets = ''
                for size in range(1, 5):
                    if j + size > wordlen - 1:
                        break
                    nxtlets += word[j + size][0]
                    name = 'nxtletter' if size == 1 else f'nxt{size}letters'
                    features.append(f'{name}=<{nxtlets.lower()}')

            # Encode the features as UTF-8 bytes for pycrfsuite
            featurelist.append([f.encode('utf-8') for f in features])
    return featurelist

# Los peores ejemplos

In [None]:
# Collect every sentence whose accuracy is at most 0.8
peores = []
prediction_tags = []
for i, example in enumerate(letter_corpus):
    features = feature_functions_maker(example)
    try:
        prediction_tags = tagger.tag(features)
    except UnicodeDecodeError as e:
        # TODO: decide how to handle sentences that fail with a decode
        # error; for now they are skipped silently
        continue     
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    if accuracy <= 0.8:
        # Keep the corpus index so the sentence can be looked up again later
        peores.append((i, accuracy))
# Lowest accuracy first
peores = sorted(peores, key=lambda t: t[1])

In [None]:
# Show a per-letter report for the (up to) five lowest-accuracy sentences
top = peores[:5]
for i, data in enumerate(top):
    print("-"*50)
    # Each entry of `peores` is an (index, accuracy) pair
    index = data[0]
    palabras = obtener_palabras(corpus[index])
    example = letter_corpus[index]
    # The sentence is tagged again because only index and accuracy were stored
    features = feature_functions_maker(example)
    try:
        prediction_tags = tagger.tag(features)
    except UnicodeDecodeError as e:
        # Print the object that could not be decoded and move on
        print(e.object)
        continue
    real_tags = extractLabels(example, 1)
    print(f"Ejemplo {i+1} de 5 | Frase: \"{' '.join(palabras)}\" | Precisión: {data[1]}")
    reporte(prediction_tags, real_tags, example)

# Los mejores ejemplos

Como de estos hay muchos, tomaremos 3 al azar.

In [None]:
# Print up to three distinct, randomly chosen sentences with accuracy >= 0.99.
flag = 0
# Walking a random permutation of the valid indices (instead of calling
# random.randint, whose upper bound is inclusive) avoids an out-of-range
# index, never shows the same sentence twice, and ends even when fewer
# than three sentences qualify.
for index in random.sample(range(len(letter_corpus)), len(letter_corpus)):
    if flag == 3:
        break
    palabras = obtener_palabras(corpus[index])
    example = letter_corpus[index]
    features = feature_functions_maker(example)
    try:
        prediction_tags = tagger.tag(features)
    except UnicodeDecodeError:
        # Same policy as the worst-examples cells: skip sentences that
        # fail with a decode error
        continue
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    if accuracy >= 0.99:
        print(f"Ejemplo {flag + 1} de 3 | Frase: \"{' '.join(palabras)}\" | Precisión: {accuracy}")
        reporte(prediction_tags, real_tags, example)
        flag += 1
        print("-"*50)