In [1]:
import os
import json
import copy
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
import sys
sys.dont_write_bytecode = True
sys.path.append('../')

In [3]:
from datatools.analyzer import *

In [4]:
path = "../eval_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
error_types = ['Unclear intention', 'Wrong information',
 'Ignore question', 'Topic transition error', 
 'Lack of information', 'Repetition', 
 'Contradiction', 'Self-contradiction',
  'Lack of common sense', 'Semantic error',
   'Grammatical error', 'Ignore proposal', 
   'Ignore offer', 'Lack of sociality', 
   'Uninterpretable', 'Ignore greeting', 
   'No-Err']

In [5]:
convs = read_conv(path, datalist)

In [6]:
# 文章ごとに n-gram を考えてみる
def get_ngram_set(doc, N=3):
    if isinstance(doc, str):
        doc = nlp(doc)
    surfaces = [token.text for token in doc]
    ngram_set = set()
    filled = ["FOS", *surfaces, "EOS"]
    # print(filled)
    for i in range(len(filled)-N+1):
        f = "_".join(filled[i:i+N])
        ngram_set.add(f)
    return ngram_set
    

In [7]:
def check_repeat_rate(target:set, history:list, border=0.7):
    t_list = list(target)
    for prev_set in history:
        size = len(prev_set)
        hit = 0
        for t in t_list:
            if t in prev_set:
                hit+=1
        if hit/size >= border:
            return True
    
    return False

In [23]:

y_pred = []
for conv in convs:
    ngram_sets = []
    for ut in conv:
        if not ut.is_system():
            continue
        utt = ut.utt
        doc = nlp(utt)
        y_pred.append(0)
        for sent in doc.sents:
            if len(sent) <= 3:
                # print(sent)
                continue
            ngram_set = get_ngram_set(sent, N=3)
            # これまでのセットで重複が大きいものがあるかチェック！
            if check_repeat_rate(target=ngram_set, history=ngram_sets, border=0.8):
                # print(ut, ut.errors)
                y_pred[-1] = 1
            ngram_sets.append(ngram_set)
        # break
    # break
        

In [24]:
error = "Repetition"
y = []
for conv in convs:
    for ut in conv:
        if not ut.is_system():
            continue
        # 
        if ut.is_error_included(error):
            # print(ut.errors)
            y.append(1)
        else:
            y.append(0)

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics
print('confusion matrix = \n', confusion_matrix(y_true=y, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y, y_pred=y_pred))
print('precision = ', precision_score(y_true=y, y_pred=y_pred))
print('recall = ', recall_score(y_true=y, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y, y_pred=y_pred))

confusion matrix = 
 [[2129   15]
 [  26   30]]
accuracy =  0.9813636363636363
precision =  0.6666666666666666
recall =  0.5357142857142857
f1 score =  0.594059405940594


In [22]:
from tqdm import tqdm
epoch = 5

border = 0.7
f_score = 0
gamma = 0.05
results = []
for _ in tqdm(range(epoch)):
    y_pred = []
    for conv in convs:
        ngram_sets = []
        for ut in conv:
            if not ut.is_system():
                continue
            utt = ut.utt
            doc = nlp(utt)
            y_pred.append(0)
            for sent in doc.sents:
                if len(sent) <= 3:
                    # print(sent)
                    continue
                ngram_set = get_ngram_set(sent, N=3)
                # これまでのセットで重複が大きいものがあるかチェック！
                if check_repeat_rate(target=ngram_set, history=ngram_sets, border=border):
                    # print(ut, ut.errors)
                    y_pred[-1] = 1
                ngram_sets.append(ngram_set)
            # break
    this_f1 = metrics.f1_score(y, y_pred)
    results.append([this_f1, border])
    if this_f1 > f_score:
        f_score = this_f1
        border += gamma
        print(f_score, border)
    else:
        border -= gamma

# print(f_score, border)



 20%|██        | 1/5 [00:25<01:43, 25.94s/it]

0.6086956521739131 0.75


 40%|████      | 2/5 [00:51<01:17, 25.94s/it]

0.6126126126126126 0.8


100%|██████████| 5/5 [02:09<00:00, 25.89s/it]


In [18]:
i = 0
for conv in convs:
    conv_list = []
    for ut in conv:
        conv_list.append(ut.utt)
        if not ut.is_system():
            continue
        # 本来エラーではないけどエラー扱い
        # if y[i]==0 and y_pred[i]==1:
        #     print("本来エラーではないけどエラー扱い", ut.did)
        #     print(conv_list)
        #     print()
        if y[i]==1 and y_pred[i]==0:
            print("本来エラーなのに非エラー扱い", ut.did)
            print(conv_list)
            print()
        # elif y[i]==1 and y_pred[i]==1:
        #     print("よく検出した！えらいぞ", ut.did)
        #     print(conv_list)
        #     print()
        
        i+=1

本来エラーなのに非エラー扱い 1502868266
['こんにちは。ちょうど退屈してたんだ。', 'こんにちは。元気ですか？', '元気ですかは元気です', '元気そうでよかった。', '元気ですかは元気ですね', 'いまなにしてたの？', 'もちろんですってのが元気ですかにいいですね', 'そうなんだ。どうして？', 'もちろんですってのが元気ですかにいいらしいですよ']

本来エラーなのに非エラー扱い 1470624414
['こんにちは！熱中症に気をつけて。', 'こんにちは。ありがとう、水分とるように気を付けてます。', '気をとってるんですね。熱中症に気をつけないんですか？', '大丈夫です。お仕事がお休みの日は何してるんですか？', '熱中症に気をつけたいのです', '水分とって、涼しくしてください。', '熱中症を引き起こします', '病院には\u3000行きたくないです。', '病院は治療を受けましょう', '薬は\u3000好きじゃありません。', '病院を終えるんですよねぇ', '車で帰りますよ。', '病院は有名ですね', '公園は好きですか？', '行くのがいいですね', '帽子を忘れないでくださいね', '帽子がほしいですね', '麦わら帽子はどうですか？', '帽子は欲しいですね']

本来エラーなのに非エラー扱い 1502868964
['こんにちは。声かけてくれるのを待ってたんだ。', 'こんにちは。調子はどうですか？', '何か普段の生活で気をつけていることはありますか。', '健康には特に気を付けています。', '和食、洋食、中華どれが食べたいですか？', 'そうですねぇ。和食ですかね。', '和食ですか。和食でヘルシーですね', 'そうですね。あなたはどれが食べたいですか？', '和食でヘルシーです']

本来エラーなのに非エラー扱い 1470624033
['こんにちは。声かけてくれるのを待ってたんだ。', 'こんにちは。最近興味のあることってありますか？', '興味が湧いてないかもです', 'そうですかー。映画とか気になるのありません？', '映画が楽しみです', '今いろんな映画やっていますもんね', '映画は大丈夫です', '映画でポップコーン食べるのが好きです。', '映画は元気ですね', '私も元気な映画を見るのは好

In [None]:
import Levenshtein
def leven_prev_sim(target, history, border=0.7):
    for text in history:
        if border <  Levenshtein.ratio(text, target):
            return True
    return False

In [None]:

y_pred2 = []
for conv in convs:
    # ngram_sets = []
    history = []
    for ut in conv:
        if not ut.is_system():
            continue
        utt = ut.utt
        doc = nlp(utt)
        y_pred2.append(0)
        for sent in doc.sents:
            if len(sent) <= 3:
                # print(sent)
                continue
            # ngram_set = get_ngram_set(sent, N=3)
            target = sent.text
            # これまでのセットで重複が大きいものがあるかチェック！
            if leven_prev_sim(target, history, border=0.75):
                # print(ut, ut.errors)
                y_pred2[-1] = 1
            history.append(target)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics
print('confusion matrix = \n', confusion_matrix(y_true=y, y_pred=y_pred2))
print('accuracy = ', accuracy_score(y_true=y, y_pred=y_pred2))
print('F-measure: ', metrics.f1_score(y, y_pred2))

confusion matrix = 
 [[2081   71]
 [  17   31]]
accuracy =  0.96
F-measure:  0.41333333333333333


In [None]:
y.count(1)

48

In [None]:
error = "Repetition"
import csv
import Levenshtein
import random
def make_X_y_csv(filename="repetition.csv"):
    X = []
    y = []
    all_data = []
    with open(filename, "r") as f:
        data_ = csv.reader(f)
        for d in data_:
          all_data.append(d)
    
    for d in all_data:
        y.append(int(d[0]))
        hit = 0
        leven_rate = Levenshtein.ratio(d[1], d[2])
        ngram_set = get_ngram_set(d[2], N=3)
        for ngram in get_ngram_set(d[1], N=3):
            if ngram in ngram_set:
                hit += 1
        ngram_rate = hit/len(ngram_set)
        X.append([ngram_rate, leven_rate])

    u1_l = [d[1]  for d in all_data]
    u2_l = [d[2]  for d in all_data]
    for u1, u2 in zip( random.choices(u1_l, k=len(u1_l)), random.choices(u2_l, k=len(u2_l)) ):
        X.append(make_feature(u1, u2))
        y.append(0)
    return np.asarray(X), np.asarray(y)

In [None]:
X, y_ = make_X_y_csv()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_, test_size=0.30, random_state=5)

In [None]:
# 2つを組み合わせてもいいかもしれないな！
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='sag', max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000, solver='sag')

In [None]:
y_pred_ = lr.predict(X_test)

In [None]:
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred_))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred_))
print('precision = ', precision_score(y_true=y_test, y_pred=y_pred_))
print('recall = ', recall_score(y_true=y_test, y_pred=y_pred_))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred_))

confusion matrix = 
 [[12  0]
 [ 1 15]]
accuracy =  0.9642857142857143
precision =  1.0
recall =  0.9375
f1 score =  0.967741935483871


In [None]:
def make_feature(u1, u2):
    leven_rate = Levenshtein.ratio(u1, u2)
    ngram_set = get_ngram_set(u2, N=3)
    hit = 0
    for ngram in get_ngram_set(u1, N=3):
        if ngram in ngram_set:
            hit += 1
    ngram_rate = hit/len(ngram_set)
    return np.asarray([ngram_rate, leven_rate])

In [None]:
y_pred3 = []
for conv in convs:
    ngram_sets = []
    history = []
    for ut in conv:
        if not ut.is_system():
            continue
        utt = ut.utt
        doc = nlp(utt)
        y_pred3.append(0)
        for sent in doc.sents:
            if len(sent) <= 3:
                # print(sent)
                continue
            ngram_set = get_ngram_set(sent, N=3)
            target = sent.text
            # これまでのセットで重複が大きいものがあるかチェック！
            for ngram, text in zip(ngram_sets, history):
                leven_rate = Levenshtein.ratio(target, text)
                hit = 0
                for s in ngram_set:
                    if s in ngram:
                        hit += 1
                ngram_rate = hit/len(ngram)
                x = np.asarray([ngram_rate, leven_rate])
                # if lr.predict(x.reshape(1, -1))[0] == 1:
                if ngram_rate>=0.8 or leven_rate>=0.9:
                    y_pred3[-1] = 1
                    break
            history.append(target)
            ngram_sets.append(ngram_set)

In [None]:
print('confusion matrix = \n', confusion_matrix(y_true=y, y_pred=y_pred3))
print('accuracy = ', accuracy_score(y_true=y, y_pred=y_pred3))
print('precision = ', precision_score(y_true=y, y_pred=y_pred3))
print('recall = ', recall_score(y_true=y, y_pred=y_pred3))
print('f1 score = ', f1_score(y_true=y, y_pred=y_pred3))

confusion matrix = 
 [[2127   25]
 [  24   24]]
accuracy =  0.9777272727272728
precision =  0.4897959183673469
recall =  0.5
f1 score =  0.4948453608247423
