In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [2]:
def clean_text_plain(text):
    """Normalize an utterance before tokenization.

    Steps, in order:
      1. ``neologdn.normalize`` is applied to the text.
      2. Every parenthesised span ``(...)`` is removed, including nested ones.
      3. Every run of digits is collapsed to the single character ``"0"``.

    Args:
        text: raw utterance string.

    Returns:
        The cleaned string.
    """
    text_ = neologdn.normalize(text)
    # Remove innermost "(...)" groups repeatedly. A greedy r'\(.*\)' would
    # delete everything between the first "(" and the last ")", swallowing
    # ordinary text that sits between two separate parenthetical remarks.
    previous = None
    while previous != text_:
        previous = text_
        text_ = re.sub(r'\([^()]*\)', "", text_)
    text_ = re.sub(r'\d+', "0", text_)
    return text_

In [3]:
class Vocabulary:
    """Bidirectional mapping between vocabulary items and integer ids.

    Ids are assigned in insertion order starting at 0, so with the default
    special tokens ``"[PAD]"`` is 0 and ``"[UNK]"`` is 4.

    Attributes:
        index2item: list where position ``i`` holds the item with id ``i``.
        item2index: dict mapping each item to its id.
    """

    def __init__(self, TOKENS=("[PAD]", "FOS", "EOS", "[SEP]", "[UNK]", "[NONE]")):
        """Create a vocabulary pre-populated with the special tokens.

        Args:
            TOKENS: iterable of special tokens registered first, in order.
                The default is an immutable tuple so that it cannot be
                shared and mutated across instances.
        """
        self.index2item = []
        self.item2index = {}

        for sp_token in TOKENS:
            self.add_item(sp_token)

    def __len__(self):
        """Return the number of registered items."""
        return len(self.item2index)

    def __contains__(self, item):
        """Return True if ``item`` is registered."""
        return item in self.item2index

    def __str__(self) -> str:
        """Return the string form of the item -> id dictionary."""
        return str(self.item2index)

    def add_item(self, item):
        """Register ``item`` with the next free id; no-op if already present."""
        if item in self.item2index:
            return
        index = len(self.item2index)
        self.index2item.append(item)
        self.item2index[item] = index

    def add_items(self, items: list):
        """Register every element of ``items`` in order."""
        for item in items:
            self.add_item(item)

    def get_item(self, index):
        """Return the item stored under id ``index``.

        Any id outside ``0 <= index < len(self)`` yields ``"[UNK]"``. Negative
        ids are rejected as well, because Python's negative indexing would
        otherwise silently return an unrelated item from the end of the list.
        """
        if not 0 <= index < len(self.index2item):
            return "[UNK]"
        return self.index2item[index]

    def get_index(self, item):
        """Return the id of ``item``, or the id of ``"[UNK]"`` if it is unknown.

        Raises:
            KeyError: if ``item`` is unknown and ``"[UNK]"`` was never registered.
        """
        if item not in self.item2index:
            return self.item2index["[UNK]"]
        return self.item2index[item]

    # TODO: def save_vocab(self, )

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

class LMN5(nn.Module):
    """Embedding -> bidirectional LSTM -> tied output projection.

    The output projection shares its weight matrix with the embedding table
    (weight tying), so its input size is ``embedding_dim``. It is applied to
    the LSTM output, whose feature size is ``2 * (hidden_dim // 2)``. The two
    therefore have to be equal.

    NOTE(review): the LSTM is bidirectional, so the output at position ``i``
    also depends on tokens after ``i``. If the logits are used to predict the
    token at ``i + 1`` (as the scoring functions in this notebook appear to
    do), the target is visible to the model - confirm this is intended.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table and number of
                output logits per position.
            embedding_dim: embedding size.
            hidden_dim: total LSTM output size; each direction gets
                ``hidden_dim // 2`` units.

        Raises:
            ValueError: if ``2 * (hidden_dim // 2) != embedding_dim``. Such a
                model could be constructed before but failed with a shape
                mismatch on the first forward pass.
        """
        # Parent-class constructor (boilerplate).
        super(LMN5, self).__init__()

        lstm_out_dim = 2 * (hidden_dim // 2)
        if lstm_out_dim != embedding_dim:
            raise ValueError(
                "LSTM output size 2 * (hidden_dim // 2) = {} must equal "
                "embedding_dim = {} because the output layer is tied to the "
                "embedding weights".format(lstm_out_dim, embedding_dim)
            )

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # Id 0 is the padding index.
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim // 2, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        # Plain softmax (not log-softmax). It is not used by forward(); the
        # explicit dim avoids PyTorch's implicit-dimension behaviour.
        self.softmax = nn.Softmax(dim=-1)

        nn.init.normal_(self.embed.weight, std=0.01)
        # Weight tying: the projection reuses the embedding matrix.
        self.linear.weight = self.embed.weight
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        """Return unnormalised logits for every position.

        Args:
            x: integer token ids; batch-first, since the LSTM is built with
                ``batch_first=True``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a trailing
            dimension of ``vocab_size``. No softmax is applied.
        """
        emb_out = self.embed(x)
        out, _ = self.lstm1(emb_out)
        return self.linear(out)

In [5]:
# Load the stored language model and its vocabulary from ../models/CF/.
# NOTE(review): the files are named *.pickle, so DataManager.load_data
# presumably unpickles them - only load files from a trusted source, and the
# LMN5 / Vocabulary classes must already be defined in this kernel. Confirm.
model_path = "../models/CF/"
model_name = "ffn_CF1.pickle"
vocab_name = "vocab_CF1.pickle"
modelM = DataManager(model_path)
model = modelM.load_data(model_name)
vocab = modelM.load_data(vocab_name)

success load : ../models/CF/ffn_CF1.pickle
success load : ../models/CF/vocab_CF1.pickle


In [6]:
# Shared analyzer instance used by sentence2normalize_noun_mecab.
# NOTE(review): Analyzer and Tokenizer come from the star imports at the top
# of the notebook; presumably MeCab-backed given the name - confirm.
mecab_analyzer = Analyzer(Tokenizer())



def sentence2normalize_noun_mecab(sentences):
    """Tokenize sentences and normalise proper nouns and numerals.

    For every token produced by ``mecab_analyzer.analyze_with_dataframe``:
      * a noun ("名詞") whose sub-category is "固有名詞" is replaced by its
        full part-of-speech tag,
      * a noun whose sub-category is "数詞" is replaced by ``"0"``,
      * every other token keeps its surface form.

    Args:
        sentences: a single string or an iterable of strings.

    Returns:
        A list with one token list per analysed sentence. Sentences for which
        the analyzer returns ``None`` are skipped, so the result can be
        shorter than the input.
    """
    normalize_sen = []
    if isinstance(sentences, str):
        sentences = [sentences]
    for sen in sentences:
        df = mecab_analyzer.analyze_with_dataframe(sen)
        if df is None:
            continue
        words = []
        for txt, pos in zip(df.surface, df.part_of_speech):
            pos_split = pos.split("-")
            # A tag may have no sub-category (no "-"); indexing pos_split[1]
            # unconditionally raised IndexError for such nouns.
            sub_pos = pos_split[1] if len(pos_split) > 1 else ""
            is_noun = pos_split[0] == "名詞"
            if is_noun and sub_pos == "固有名詞":
                words.append(pos)
            elif is_noun and sub_pos == "数詞":
                words.append("0")
            else:
                words.append(txt)
        normalize_sen.append(words)
    return normalize_sen

def sentence2gram(sentence, vocab, N=5, is_id=True):
    """Split one sentence into sliding windows of ``N`` tokens.

    The sentence is tokenized with ``sentence2normalize_noun_mecab`` and only
    the first returned token list is used. It is wrapped in ``"FOS"`` and
    ``"EOS"`` and right-padded with ``"[PAD]"`` up to length ``N``.

    Args:
        sentence: input passed to ``sentence2normalize_noun_mecab``.
        vocab: object providing ``get_index(token)``; used when ``is_id``.
        N: window length.
        is_id: when true, tokens are converted to ids before windowing.

    Returns:
        A list of windows (each a list of ``N`` tokens or ids), or ``[]``
        when tokenization produced nothing.
    """
    tokenized = sentence2normalize_noun_mecab(sentence)
    if len(tokenized) == 0:
        print("sentence was empty")
        return []

    sequence = ["FOS", *tokenized[0], "EOS"]

    # Right-pad sequences that are shorter than one window.
    shortfall = N - len(sequence)
    if shortfall > 0:
        sequence.extend(["[PAD]"] * shortfall)

    # Optionally map tokens to vocabulary ids in the same pass.
    if is_id:
        sequence = list(map(vocab.get_index, sequence))

    window_count = len(sequence) - N + 1
    return [sequence[start:start + N] for start in range(window_count)]


In [7]:
def make_X_y_seq(A, vocab, N=5, is_id=True):
    """Build (input window, next window) training pairs from utterances.

    Each utterance is windowed with ``sentence2gram``; every window except
    the last becomes an input and the window that follows it becomes the
    corresponding target.

    Args:
        A: iterable of utterances.
        vocab: vocabulary forwarded to ``sentence2gram``.
        N: window length forwarded to ``sentence2gram``.
        is_id: forwarded to ``sentence2gram``.

    Returns:
        Tuple ``(X, Y)`` of two equally long lists of windows.
    """
    inputs, targets = [], []
    for utterance in tqdm(A):
        # Windows of the current utterance, in order.
        windows = sentence2gram(utterance, vocab, N, is_id)
        inputs += windows[:-1]
        targets += windows[1:]
    return inputs, targets

In [8]:
# Read the labelled conversations of the three data sets.
path = "../eval_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs = read_conv(path, datalist)

# Error type treated as the positive class.
error = "Semantic error"

# Keep only system utterances that carry an error annotation; the label is 1
# when that annotation includes `error` and 0 otherwise.
sys_utt = []
y = []
for conv in convs:
    for ut in conv:
        if not (ut.is_system() and ut.is_exist_error()):
            continue
        sys_utt.append(clean_text(ut.utt))
        y.append(1 if ut.is_error_included(error) else 0)

In [9]:
# Window (n-gram) length used for building and scoring n-grams.
N = 5

In [10]:
# Build token-level (is_id=False) input/target window pairs for every system utterance.
X, Y = make_X_y_seq(sys_utt, vocab, N=N, is_id=False)

100%|██████████| 1386/1386 [00:03<00:00, 410.61it/s]


In [11]:
softmax = nn.Softmax(dim=1)
def sentence2score(sentence, vocab, N):
    """Return the highest n-gram score of a sentence under the global ``model``.

    For every window produced by ``sentence2gram`` the probability the model
    assigns to each following token inside the window is collected and the
    window score is ``2 ** (sum(log2 p) / N)``. The maximum over all windows
    is returned.

    NOTE(review): although the original code calls this value "ppl", it is a
    likelihood-style score in [0, 1] (higher = more predictable), not a
    perplexity, and the N - 1 log-probabilities are divided by N. Both are
    kept unchanged so that existing scores stay comparable. Taking the
    maximum therefore selects the most predictable window - confirm that
    this is the intended aggregation.

    Args:
        sentence: utterance to score.
        vocab: vocabulary used to convert tokens to ids.
        N: window length.

    Returns:
        The maximum window score as a float, or ``0`` when the sentence
        yields no windows (previously this case crashed in the model call).
    """
    ngram_text = sentence2gram(sentence, vocab, N=N, is_id=True)
    if len(ngram_text) == 0:
        return 0
    X = torch.tensor(ngram_text)
    max_ppl = 0
    # Inference only: run the forward pass inside no_grad as well, so no
    # autograd graph is built.
    with torch.no_grad():
        y_pred = model(X.to(torch.int))
        for x, yp in zip(X, y_pred):
            probs = softmax(yp)
            # Probability given at position i to the token at position i + 1.
            positions = torch.arange(len(x) - 1)
            log_prob = float(torch.log2(probs[positions, x[1:]]).sum())
            ppl = float(2.0 ** (log_prob / N))
            if ppl > max_ppl:
                max_ppl = ppl
    return max_ppl

In [12]:
# Move the model to the CPU, then score every system utterance with
# sentence2score; ppl_list[i] is the score of sys_utt[i].
model.cpu()

ppl_list = []
N=5
for utt in tqdm(sys_utt):
    # ngram_text = sentence2gram(utt, vocab, N=3, is_id=True)
        
    ppl = sentence2score(utt, vocab, N=N)
    ppl_list.append(ppl)
        # break

100%|██████████| 1386/1386 [00:14<00:00, 97.76it/s] 


In [13]:
# Make sure every score is a plain Python float.
ppl_list = list(map(float, ppl_list))

In [14]:
# Scores in descending order (largest first).
ppl_list_sort = sorted(ppl_list, reverse=True)

In [15]:
# Score found one fifth of the way into the descending-sorted list.
ppl_list_sort[len(ppl_list_sort)//5]

0.9999960184033294

In [25]:
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import numpy as np

# Decision threshold: the score in the middle of the descending-sorted list.
# (A hard-coded value that was immediately overwritten has been removed.)
border = ppl_list_sort[len(ppl_list_sort)//2]

# An utterance is predicted as an error (1.0) when its score is strictly
# above the border, otherwise 0.0.
# NOTE(review): the original comment said scores *below* the border are
# errors, which contradicts the comparison - confirm the intended direction.
y_pred = (np.asarray(ppl_list, dtype=float) > border).astype(float)

# Show the utterances whose gold label is 1.
for utt_text, label in zip(sys_utt, y):
    if label == 1:
        print(utt_text)

print('confusion matrix = \n', confusion_matrix(y_true=y, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y, y_pred=y_pred))
print('precision = ', precision_score(y_true=y, y_pred=y_pred))
print('recall = ', recall_score(y_true=y, y_pred=y_pred))
print('f1 score = ', f1_score(y_true=y, y_pred=y_pred))

元気ですかは元気です
好きだを見ますよねー
病院は治療を受けましょう
好きだは好きですか。お寿司は縁側が好きですね
時期から資格を取りますねぇ
手を貯金に出しますねぇ
所で、テレビでテレビあるって言ってましたが、テレビは民主党支持が多いですね
旬ですねぇ。自分もオリンピック書いたし。
confusion matrix = 
 [[686 692]
 [  7   1]]
accuracy =  0.49567099567099565
precision =  0.001443001443001443
recall =  0.125
f1 score =  0.0028530670470756064


In [17]:
softmax = nn.Softmax(dim=1)
def sentence2score_list(sentence, vocab, N):
    """Return one score per n-gram window of a sentence under the global ``model``.

    For every window produced by ``sentence2gram`` the probability the model
    assigns to each following token inside the window is collected and the
    window score is ``2 ** (sum(log2 p) / N)``.

    NOTE(review): although the original code calls this value "ppl", it is a
    likelihood-style score in [0, 1] (higher = more predictable), not a
    perplexity, and the N - 1 log-probabilities are divided by N. Both are
    kept unchanged so that existing scores stay comparable.

    Args:
        sentence: utterance to score.
        vocab: vocabulary used to convert tokens to ids.
        N: window length.

    Returns:
        List of floats, one per window in window order; ``[]`` when the
        sentence yields no windows (previously this case crashed in the
        model call).
    """
    ngram_text = sentence2gram(sentence, vocab, N=N, is_id=True)
    if len(ngram_text) == 0:
        return []
    X = torch.tensor(ngram_text)
    # Named differently from the notebook-level ppl_list to avoid confusion.
    window_scores = []
    # Inference only: run the forward pass inside no_grad as well, so no
    # autograd graph is built.
    with torch.no_grad():
        y_pred = model(X.to(torch.int))
        for x, yp in zip(X, y_pred):
            probs = softmax(yp)
            # Probability given at position i to the token at position i + 1.
            positions = torch.arange(len(x) - 1)
            log_prob = float(torch.log2(probs[positions, x[1:]]).sum())
            window_scores.append(float(2.0 ** (log_prob / N)))
    return window_scores

In [26]:
# Print every 4-gram of one sentence next to its score.
sentence = "時期から資格を取りますねぇ"
# sentence = "お金は大切です"
n = 4
ngrams = sentence2gram(sentence, vocab, n, is_id=False)
scores = sentence2score_list(sentence, vocab, n)

# Distinct loop names: reusing `n` here replaced the window length with a list.
for gram, score in zip(ngrams, scores):
    print(gram, score)

['FOS', '時期', 'から', '資格'] 0.9994229054231399
['時期', 'から', '資格', 'を'] 0.9997054971739678
['から', '資格', 'を', '取り'] 0.9998664142064736
['資格', 'を', '取り', 'ます'] 0.9999749057296895
['を', '取り', 'ます', 'ねぇ'] 0.9984511960257818
['取り', 'ます', 'ねぇ', 'EOS'] 0.525261368575887


In [19]:
# Print every N-gram of one sentence next to its score.
sentence = "同じ趣味の白土人と出会えるのも楽しみに1つです"
# sentence = "お金は大切です"
ngrams = sentence2gram(sentence, vocab, N, is_id=False)
scores = sentence2score_list(sentence, vocab, N)
# Distinct loop names so that the notebook-level `n` is not overwritten with a list.
for gram, score in zip(ngrams, scores):
    print(gram, score)

['FOS', '同じ', '趣味', 'の', '白土'] 0.02145287021279901
['同じ', '趣味', 'の', '白土', '人'] 0.015255695587417241
['趣味', 'の', '白土', '人', 'と'] 0.02595353271797157
['の', '白土', '人', 'と', '出会える'] 0.04081718965134213
['白土', '人', 'と', '出会える', 'の'] 0.9992572020508157
['人', 'と', '出会える', 'の', 'も'] 0.9999455066192333
['と', '出会える', 'の', 'も', '楽しみ'] 0.9999031523085676
['出会える', 'の', 'も', '楽しみ', 'に'] 0.9999811408291085
['の', 'も', '楽しみ', 'に', '0'] 0.9999762769695517
['も', '楽しみ', 'に', '0', 'つ'] 0.9999684792676979
['楽しみ', 'に', '0', 'つ', 'です'] 0.9999605166236292
['に', '0', 'つ', 'です', 'EOS'] 0.4227626730114587


- 熱中症に気をつけか?
    - 8.845499791513074e-06
- お金は大きとか
    - 7.319023919936928e-08
- はんばんこ
    - 7.965650687703318e-09
- 4.300948350625364e-07

In [20]:
# Peek at ten vocabulary entries (positions 1960-1969 in insertion order).
list(vocab.item2index.keys())[1960:1970]

['集中', '熱', '演劇', '芝居', '疼く', 'エンターテインメント', '歌', 'わかっ', '大き', 'ビックリ']

- LSTM ベースの手法
    ボーダー設定なし
    ```
    confusion matrix = 
    [[1243  136]
    [   5    2]]
    accuracy =  0.8982683982683982
    precision =  0.014492753623188406
    recall =  0.2857142857142857
    f1 score =  0.02758620689655172
    ```
    