## Ejemplos del HMM Like

### Nombre

* baseline_HMMLike_zero_50_0_0_k_10.crfsuite

### Parámetros

* l1 = 0
* l2 = 0
* Precisión = 0.8762
* Max_Iter = 50

In [2]:
import os
import pycrfsuite
import sys  
import random
sys.path.insert(0, '../')
from utils import get_corpus, WordsToLetter, extractLabels, extractTokens

# File name of the CRF model used by this notebook; the loading cell joins it
# under ../models/baseline with a "tsu_" prefix.
model = "baseline_HMMLike_zero_50_0_0_k_10.crfsuite"

In [3]:
# Path of the model file: "tsu_" + model name inside ../models/baseline
model_path = os.path.join("../models/baseline", "tsu_" + model)
print(model_path)
tagger = pycrfsuite.Tagger()
# Load the pretrained model into the tagger
tagger.open(model_path)

../models/baseline/tsu_baseline_HMMLike_zero_50_0_0_k_10.crfsuite


<contextlib.closing at 0x7f9bac6866d0>

In [4]:
# Build the test corpus by concatenating 'corpus_otomi_mod' and 'corpus_hard',
# both read from ../corpora/
corpus = get_corpus('corpus_otomi_mod', '../corpora/') + get_corpus('corpus_hard', '../corpora/')
# Letter-level version of the corpus; the `letter_corpus[1]` cell further down
# displays entries of the form [letter, POS, BIO-label] grouped by word.
letter_corpus = WordsToLetter(corpus)

In [5]:
# funciones auxiliares
def obtener_palabras(frases):
    """Rebuild one string per word by concatenating its chunks.

    :param frases: iterable of words; every item of a word except the last
        one is indexed at position 0 to obtain a text chunk. The last item
        of each word is skipped.
    :type: list
    :return: list with one concatenated string per word, in input order
    :rtype: list
    """
    return [
        "".join(segmento[0] for segmento in frase[:-1])
        for frase in frases
    ]

def reporte(prediction_tags, real_tags, example):
    """Print a table comparing predicted and real tags, one row per letter.

    The header row is printed first; then one row is printed for each
    (prediction, real tag, letter) triple. Rows stop at the shortest of the
    three sequences because they are combined with ``zip``.

    :param prediction_tags: tags produced by the tagger
    :param real_tags: reference tags
    :param example: example passed to ``extractTokens`` to obtain the letters
    """
    print("Letra | Predicción | Real | Es correcto?")
    letters = extractTokens(example)
    for predicted, expected, letter in zip(prediction_tags, real_tags, letters):
        is_match = bool(predicted == expected)
        print(f"{letter} | {predicted} | {expected} | {is_match}")
        
def accuracy_score(y_test, y_pred):
    """Return the fraction of reference tags that were predicted correctly.

    Tags are compared position by position. The denominator is always
    ``len(y_test)``, so positions of ``y_test`` without a matching
    prediction (``y_pred`` shorter) count as errors, and surplus
    predictions are ignored.

    :param y_test: reference tags
    :type: list
    :param y_pred: predicted tags
    :type: list
    :return: value in [0, 1]; 0.0 when ``y_test`` is empty
    :rtype: float
    """
    total = len(y_test)
    if total == 0:
        # Nothing to score: avoid ZeroDivisionError on empty examples
        return 0.0
    right = sum(1 for real, prediction in zip(y_test, y_pred) if real == prediction)
    return right / total

In [7]:
def feature_functions_maker(sent):
    '''Build the per-letter feature lists that are fed to the CRF tagger.

    For every non-digit letter the features are ``bias``, the lowercased
    letter and, from the second position of a word onwards, the lowercased
    slice ``[j-1:j]`` of the letters gathered so far in that word.

    NOTE(review): when a letter is a digit no new feature list is built, but
    an entry is still appended, so the most recently built features are
    emitted again (and the name is unbound if the very first letter is a
    digit). Confirm whether this is intended before changing it; the
    pretrained model may rely on the same extraction.

    :param sent: sequence of words; each word is a sequence of items whose
        element 0 is the letter
    :type: list
    :return: flat list with one entry per letter, each entry a list of
        UTF-8 encoded feature strings
    :rtype: list
    '''
    encoded_features = []
    for word in sent:
        # letters of the current word seen so far
        seen = ''
        for position, entry in enumerate(word):
            letter = entry[0]
            seen += letter
            if not letter.isdigit():
                features = ['bias', 'letterLowercase=' + letter.lower()]
                if position >= 1:
                    previous = seen[position - 1:position]
                    features.append('prevletter=' + previous.lower() + '>')
            # pycrfsuite receives the features as encoded byte strings
            encoded_features.append([feature.encode('utf-8') for feature in features])
    return encoded_features

# Los peores ejemplos

In [6]:
# Collect (corpus index, accuracy) for every example tagged with accuracy <= 0.8
peores = []
for i, example in enumerate(letter_corpus):
    features = feature_functions_maker(example)
    try:
        prediction_tags = tagger.tag(features)
    except UnicodeDecodeError as e:
        # TODO: decide how these examples should be handled; skipped for now.
        # Show the offending bytes and the decoder's reason.
        print(f"Error generando {e.object!r} > {e.reason}")
        continue
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    if accuracy <= 0.8:
        peores.append((i, accuracy))
# Lowest accuracy first
peores.sort(key=lambda t: t[1])
print(len(peores), len(letter_corpus))

420 1786


In [7]:
# Show, letter by letter, the (up to) five examples with the lowest accuracy
top = peores[:5]
# Number actually shown, so the header stays correct with fewer than five
total_top = len(top)
for i, data in enumerate(top):
    print("-"*50)
    index = data[0]
    palabras = obtener_palabras(corpus[index])
    example = letter_corpus[index]
    features = feature_functions_maker(example)
    try:
        prediction_tags = tagger.tag(features)
    except UnicodeDecodeError as e:
        # Skip examples the tagger cannot decode, showing the offending bytes
        print(e.object)
        continue
    real_tags = extractLabels(example, 1)
    print(f"Ejemplo {i+1} de {total_top} | Frase: \"{' '.join(palabras)}\" | Precisión: {data[1]}")
    reporte(prediction_tags, real_tags, example)

--------------------------------------------------
Ejemplo 1 de 5 | Frase: "nuní" | Precisión: 0.0
Letra | Predicción | Real | Es correcto?
n | B-stem | B-det.dem | False
u | I-stem | I-det.dem | False
n | I-stem | I-det.dem | False
í | I-stem | I-det.dem | False
--------------------------------------------------
Ejemplo 2 de 5 | Frase: "ibé" | Precisión: 0.0
Letra | Predicción | Real | Es correcto?
i | I-it | B-3.icp | False
b | B-dual.exc | B-stem | False
é | I-dual.exc | I-stem | False
--------------------------------------------------
Ejemplo 3 de 5 | Frase: "xirembik'íhí" | Precisión: 0.08333333333333333
Letra | Predicción | Real | Es correcto?
x | B-stem | B-3.prf | False
i | I-stem | I-3.prf | False
r | I-stem | B-stem | False
e | I-stem | I-stem | True
m | B-med | I-stem | False
b | B-3.cpl | I-stem | False
i | I-3.cpl | I-stem | False
k | B-stem | B-3.obj | False
' | I-stem | I-3.obj | False
í | I-stem | I-3.obj | False
h | I-stem | B-pl | False
í | I-stem | I-pl | False
-----

# Mejores ejemplos

Como de esos hay muchos, tomaremos 3 al azar.

In [8]:
# Show three randomly chosen examples tagged with accuracy >= 0.99.
# Indices are visited in a random order without repetition, so every index is
# valid (random.randint's upper bound is inclusive and could hit
# len(letter_corpus)), no example is shown twice, and the loop ends even when
# fewer than three examples qualify.
wanted = 3
flag = 0
for index in random.sample(range(len(letter_corpus)), len(letter_corpus)):
    if flag == wanted:
        break
    palabras = obtener_palabras(corpus[index])
    example = letter_corpus[index]
    features = feature_functions_maker(example)
    try:
        prediction_tags = tagger.tag(features)
    except UnicodeDecodeError:
        # Same policy as the worst-examples cells: skip undecodable examples
        continue
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    if accuracy >= 0.99:
        print(f"Ejemplo {flag + 1} de {wanted} | Frase: \"{' '.join(palabras)}\" | Precisión: {accuracy}")
        reporte(prediction_tags, real_tags, example)
        flag += 1
        print("-"*50)

Ejemplo 1 de 3 | Frase: "xi'μngi" | Precisión: 1.0
Letra | Predicción | Real | Es correcto?
x | B-3.prf | B-3.prf | True
i | I-3.prf | I-3.prf | True
' | B-stem | B-stem | True
μ | I-stem | I-stem | True
n | I-stem | I-stem | True
g | I-stem | I-stem | True
i | I-stem | I-stem | True
--------------------------------------------------
Ejemplo 2 de 3 | Frase: "ya xathó ikháyι'" | Precisión: 1.0
Letra | Predicción | Real | Es correcto?
y | B-stem | B-stem | True
a | I-stem | I-stem | True
x | B-stem | B-stem | True
a | I-stem | I-stem | True
t | I-stem | I-stem | True
h | I-stem | I-stem | True
ó | I-stem | I-stem | True
i | B-3.icp | B-3.icp | True
k | B-stem | B-stem | True
h | I-stem | I-stem | True
á | I-stem | I-stem | True
y | B-lim | B-lim | True
ι | I-lim | I-lim | True
' | I-lim | I-lim | True
--------------------------------------------------
Ejemplo 3 de 3 | Frase: "bidú bimot'í" | Precisión: 1.0
Letra | Predicción | Real | Es correcto?
b | B-3.cpl | B-3.cpl | True
i | I-3.cpl 

In [14]:
# Obteniendo las frases mas largas y mejor etiquetadas
min_len = 30
phrases = []
for i, example in enumerate(letter_corpus):
    features = feature_functions_maker(example)
    palabras = obtener_palabras(corpus[i])
    prediction_tags = tagger.tag(features)
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    if accuracy >= 0.99:
        phrase = " ".join(palabras)
        if len(phrase) >= min_len:
            phrases.append((palabras, accuracy, i))
print(phrases[:3])

[(["bi'μngí", 'yι', 'mbμhí', 'nge', 'hín', 'dímáné', 'gwaporá', 'nge', 'dímádáhní'], 1.0, 1), (["kamánk'ι", "t'μ", 'tsusiathó', 'yι', 'dáhtúhí'], 1.0, 52), (['komo', 'beinte', 'o', 'treinta', 'pero', 'despwés', 'xo', 'binkhμtsí'], 1.0, 56)]


In [10]:
# Inspect the letter-level form of corpus entry 1
letter_corpus[1]

[[['b', 'v', 'B-3.cpl'],
  ['i', 'v', 'I-3.cpl'],
  ["'", 'v', 'B-stem'],
  ['μ', 'v', 'I-stem'],
  ['n', 'v', 'I-stem'],
  ['g', 'v', 'B-1.obj'],
  ['í', 'v', 'I-1.obj']],
 [['y', 'det', 'B-det.pl'], ['ι', 'det', 'I-det.pl']],
 [['m', 'obl', 'B-stem'],
  ['b', 'obl', 'I-stem'],
  ['μ', 'obl', 'I-stem'],
  ['h', 'obl', 'I-stem'],
  ['í', 'obl', 'I-stem']],
 [['n', 'cnj', 'B-stem'], ['g', 'cnj', 'I-stem'], ['e', 'cnj', 'I-stem']],
 [['h', 'neg', 'B-stem'], ['í', 'neg', 'I-stem'], ['n', 'neg', 'I-stem']],
 [['d', 'v', 'B-1.icp'],
  ['í', 'v', 'I-1.icp'],
  ['m', 'v', 'B-ctrf'],
  ['á', 'v', 'I-ctrf'],
  ['n', 'v', 'B-stem'],
  ['é', 'v', 'I-stem']],
 [['g', 'v', 'B-1.icp.irr'],
  ['w', 'v', 'I-1.icp.irr'],
  ['a', 'v', 'I-1.icp.irr'],
  ['p', 'v', 'B-stem'],
  ['o', 'v', 'I-stem'],
  ['r', 'v', 'I-stem'],
  ['á', 'v', 'I-stem']],
 [['n', 'cnj', 'B-stem'], ['g', 'cnj', 'I-stem'], ['e', 'cnj', 'I-stem']],
 [['d', 'v', 'B-1.icp'],
  ['í', 'v', 'I-1.icp'],
  ['m', 'v', 'B-ctrf'],
  ['á', 'v'

In [12]:
# Print each of the first ten collected phrases followed by its corpus entry
for words, _score, corpus_index in phrases[:10]:
    print("Frase: ", " ".join(words))
    print(corpus[corpus_index])

Frase:  bi'μngí yι mbμhí nge hín dímáné gwaporá nge dímádáhní
[[['bi', '3.cpl'], ["'μn", 'stem'], ['gí', '1.obj'], 'v'], [['yι', 'det.pl'], 'det'], [['mbμhí', 'stem'], 'obl'], [['nge', 'stem'], 'cnj'], [['hín', 'stem'], 'neg'], [['dí', '1.icp'], ['má', 'ctrf'], ['né', 'stem'], 'v'], [['gwa', '1.icp.irr'], ['porá', 'stem'], 'v'], [['nge', 'stem'], 'cnj'], [['dí', '1.icp'], ['má', 'ctrf'], ['dáhní', 'stem'], 'v']]
Frase:  kamánk'ι t'μ tsusiathó yι dáhtúhí
[[['ka', '3.pls'], ['má', 'ctrf'], ["nk'ι", 'stem'], 'v'], [["t'μ", 'stem'], 'obl'], [['tsusia', 'stem'], ['thó', 'ila'], 'v'], [['yι', 'det.pl'], 'det'], [['dáhtú', 'stem'], ['hí', 'pl'], 'unkwn']]
Frase:  komo beinte o treinta pero despwés xo binkhμtsí
[[['komo', 'stem'], 'obl'], [['beinte', 'stem'], 'obl'], [['o', 'stem'], 'obl'], [['treinta', 'stem'], 'obl'], [['pero', 'stem'], 'obl'], [['despwés', 'stem'], 'obl'], [['xo', 'stem'], 'cnj'], [['bi', '3.cpl'], ['nkhμtsí', 'stem'], 'v']]
Frase:  xo mitahá rι ntsoyι mitá sínko pesos rι b

In [15]:
import csv

# Export the collected phrases: joined phrase, corpus entry and accuracy.
# newline="" lets the csv module control line endings (no blank rows on
# Windows); an explicit utf-8 encoding keeps the non-ASCII letters intact
# regardless of the platform default.
with open("mejores-frases-HMMLike.csv", "w", newline="", encoding="utf-8") as csvfile:
    header = ["frase", "glosa", 'accuracy-score']
    writer = csv.writer(csvfile)
    writer.writerow(header)
    for phrase in phrases:
        # phrase is (words, accuracy, corpus index)
        writer.writerow([" ".join(phrase[0]), corpus[phrase[2]], str(phrase[1])])

In [12]:
# Obteniendo las frases mas largas y peor etiquetadas
min_len = 30
phrases = []
for i, example in enumerate(letter_corpus):
    features = feature_functions_maker(example)
    palabras = obtener_palabras(corpus[i])
    prediction_tags = tagger.tag(features)
    real_tags = extractLabels(example, 1)
    accuracy = accuracy_score(real_tags, prediction_tags)
    if accuracy <= 0.50:
        phrase = " ".join(palabras)
        if len(phrase) >= min_len:
            phrases.append((palabras, accuracy, i))
print(phrases[:3])

[(['nι', "k'ι", 'té', 'gítsí', 'gímá', 'tantsμhní'], 0.4, 414), (['gótú', 'riáste', 'ripetrólio', 'rixábo', 'rinιni', 'i', 'gothó', 'tantsμhní'], 0.44680851063829785, 416), (['así', 'ke', 'xo', 'gítónthóhnι', 'rι', 'dó', 'ya'], 0.4583333333333333, 944)]


In [13]:
import csv

# Export the collected phrases: joined phrase, corpus entry and accuracy.
# newline="" lets the csv module control line endings (no blank rows on
# Windows); an explicit utf-8 encoding keeps the non-ASCII letters intact
# regardless of the platform default.
with open("peores-frases-HMMLike.csv", "w", newline="", encoding="utf-8") as csvfile:
    header = ["frase", "glosa", 'accuracy-score']
    writer = csv.writer(csvfile)
    writer.writerow(header)
    for phrase in phrases:
        # phrase is (words, accuracy, corpus index)
        writer.writerow([" ".join(phrase[0]), corpus[phrase[2]], str(phrase[1])])