In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

In [3]:
class LSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM classifier.

    The final hidden states of the forward and backward directions are
    concatenated, projected to ``tagset_size`` scores by a linear layer and
    turned into log-probabilities with ``log_softmax``.

    Args:
        embedding_dim: Size of each input vector (last dimension of ``x``).
        hidden_dim: Size of the concatenated (forward + backward) hidden
            state. Each direction gets ``hidden_dim // 2`` units, so the
            value must be even.
        tagset_size: Number of output classes.
        batch_size: Not used; accepted so existing callers keep working.

    Raises:
        ValueError: If ``hidden_dim`` is odd. The two directions would then
            produce ``hidden_dim - 1`` features, which does not match the
            input width of the linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, tagset_size, batch_size):
        super(LSTMClassifier, self).__init__()
        if hidden_dim % 2 != 0:
            raise ValueError(
                "hidden_dim must be even for a bidirectional LSTM, got {0}".format(hidden_dim)
            )
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # Each direction has hidden_dim // 2 units; together they give hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, batch_first=True, bidirectional=True)
        # Maps the concatenated final hidden states to class scores.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, tagset_size)``.

        Args:
            x: Float tensor of shape ``(batch, seq_len, embedding_dim)``
                (the LSTM is built with ``batch_first=True``).
        """
        # nn.LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        _, (h_n, _) = self.lstm(x)
        # h_n[0] is the forward direction, h_n[1] the backward direction.
        bilstm_out = torch.cat([h_n[0], h_n[1]], dim=1)
        y = self.hidden2tag(bilstm_out)
        return F.log_softmax(y, dim=1)

In [4]:
import os
import pickle


class DataManager:
    """Save and load pickled objects inside one directory.

    Args:
        data_path: Directory that holds the pickle files. It is created if it
            does not exist. A trailing path separator is optional.
    """

    def __init__(self, data_path) -> None:
        self.data_path = data_path
        os.makedirs(data_path, exist_ok=True)
        # Listing taken at construction time; save_data keeps it up to date.
        self.dir = os.listdir(data_path)

    def _full_path(self, name):
        """Return the path of ``name`` inside the managed directory."""
        return os.path.join(self.data_path, name)

    def is_exist(self, name):
        """Return True if ``name`` currently exists in the managed directory."""
        # Ask the file system rather than the listing cached in __init__, so
        # files written after construction are found as well.
        return os.path.exists(self._full_path(name))

    def save_data(self, name, obj):
        """Pickle ``obj`` to the file ``name`` in the managed directory."""
        full_path = self._full_path(name)
        with open(full_path, "wb") as f:
            pickle.dump(obj, f)
        if name not in self.dir:
            self.dir.append(name)
        print("success save : {0}".format(full_path))

    def load_data(self, name):
        """Unpickle and return the object stored in the file ``name``.

        NOTE: ``pickle.load`` can execute arbitrary code; only load files
        that come from a trusted source.
        """
        full_path = self._full_path(name)
        with open(full_path, "rb") as f:
            obj = pickle.load(f)
        print("success load : {0}".format(full_path))
        return obj

In [5]:
from pyknp import Juman
from sentence_transformers import SentenceTransformer
import scipy.spatial
# Directory of the SentenceTransformer model. Set the SBERT_MODEL_PATH
# environment variable to use another location without editing the notebook;
# the fallback is the path that was hard-coded here originally.
Nmodel_path = os.environ.get("SBERT_MODEL_PATH", "/home/yamada/Downloads/training_bert_japanese")
Nmodel = SentenceTransformer(Nmodel_path, show_progress_bar=False)
# Embedding width, measured by encoding one sample sentence.
emb_dim = Nmodel.encode(["お辞儀をしている男性会社員"])[0].shape[0]



In [6]:
def make_X(convs, max_len, encoder=None):
    """Encode every conversation into an array of sentence vectors.

    Args:
        convs: Iterable of conversations, each a list of utterance strings.
        max_len: Not used; kept so existing calls keep working.
        encoder: Object with an ``encode(list_of_str)`` method that returns
            one vector per string. Defaults to the module-level ``Nmodel``.

    Returns:
        ``np.ndarray`` built from the per-conversation encodings. When every
        conversation has the same number of utterances, the shape is
        ``(len(convs), utterances, emb_dim)``.
    """
    if encoder is None:
        encoder = Nmodel
    return np.array([encoder.encode(conv) for conv in convs])

In [7]:
# Root folder; each entry of `datalist` names a sub-directory of JSON files in it.
path = "../hand_labeled/"
datalist = ["DCM", "DIT", "IRS"]

# Output location (current directory).
output = "./"

In [8]:
def read_json_with_NoErr(path:str, datalist:list) -> pd.DataFrame:
    """Collect every system turn of the dialogue JSON files into a DataFrame.

    Each ``*.json`` file under ``path/<name>`` (for every ``name`` in
    ``datalist``) is read. Turn 0 is skipped. A user ("U") turn is remembered,
    and every system ("S") turn produces one row that pairs it with the most
    recent user utterance of the same dialogue.

    Args:
        path: Root directory; a trailing separator is optional.
        datalist: Names of the sub-directories of ``path`` to read.

    Returns:
        DataFrame with columns ``did``, ``tid``, ``usr``, ``sys`` and ``ec``
        and a fresh 0..n-1 index. ``ec`` is the turn's ``error_category``
        list, or ``["No-Err"]`` when that field is empty.
    """
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    # Rows are gathered in a list and the frame is built once at the end:
    # DataFrame.append in a loop is quadratic and no longer exists in pandas 2.
    records = []

    for p in datalist:
        datapath = Path(path) / p
        # Sorted so the row order does not depend on the file system.
        for file in sorted(datapath.glob("*.json")):
            with open(file, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            did = json_data["did"]
            # Reset per dialogue so a system turn is never paired with a user
            # utterance left over from the previous file.
            usr = ""
            for t in json_data["turns"]:
                if t["turn-index"] == 0:
                    continue
                if t["speaker"] == "U":
                    usr = t["utterance"]
                elif t["speaker"] == "S":
                    records.append({
                        'did': did,
                        'tid': t["turn-index"],
                        'usr': usr,
                        'sys': t["utterance"],
                        'ec': t["error_category"] or ["No-Err"],
                    })

    # dtype=object matches the column dtypes the append-based version produced.
    return pd.DataFrame(records, columns=cols, dtype=object)

In [9]:
# Load the system turns of the selected corpora into one DataFrame.
df = read_json_with_NoErr(path, datalist)

In [10]:
def extract_continue_convs_with_error(df, length, errors):
    """Return the utterance context of every system turn that has a target error.

    Rows are read in order. User and system utterances are accumulated per
    dialogue (the history restarts whenever ``did`` changes). For each row
    whose ``ec`` contains at least one entry of ``errors``, the last
    ``length`` utterances are emitted, provided the dialogue already has that
    many.

    Args:
        df: Object with aligned ``did``, ``usr``, ``sys`` and ``ec`` columns.
        length: Number of utterances in each extracted context.
        errors: Error category names to look for in ``ec``.

    Returns:
        List of contexts, each a list of ``length`` utterances. A turn yields
        at most one context even when it matches several entries of ``errors``.
    """
    new_convs = []
    history = []
    # Unique sentinel so that no real dialogue id can be taken for "no dialogue yet".
    current_did = object()
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        # A change of did marks the start of another dialogue.
        if d != current_did:
            history = []
            current_did = d
        history.append(u)
        history.append(s)
        if len(history) >= length and any(err in e for err in errors):
            new_convs.append(history[-length:])

    return new_convs

In [41]:
# Error categories treated as the positive class. Keep exactly one selection
# active; the alternatives are left here for switching experiments.
# errors = ["Topic transition error", "Unclear intention", "Lack of information"]
# errors = ["Lack of information"]
# errors = ["Unclear intention"]
errors = ["Topic transition error"]
# Context length; used in the model file name and passed to make_X.
length = 3

In [42]:
model_path = "../models/context/"
model_name = "topic4-{0}.pickle".format(length)
modelM = DataManager(model_path)
print(model_name)
# Fail here with a clear message rather than later with a NameError (or,
# worse, by silently reusing a stale `model` left in the kernel).
if not modelM.is_exist(model_name):
    raise FileNotFoundError(
        "trained model not found: {0}{1}".format(model_path, model_name)
    )
# NOTE: the model is restored with pickle, which can execute arbitrary code;
# only load files produced by a trusted training run.
model = modelM.load_data(model_name)
# Run on CPU and in evaluation mode for inference.
model = model.to("cpu").eval()

topic4-3.pickle
success load : ../models/context/topic4-3.pickle


In [43]:
# real test
# Default number of utterances kept as context for each sample.
leng_c = 2
def make_X_str_y(df, errors, context_len=None):
    """Build one utterance context and one binary label per row of ``df``.

    Args:
        df: DataFrame with aligned ``did``, ``usr``, ``sys`` and ``ec`` columns.
        errors: Error category names that make a row a positive sample.
        context_len: Number of utterances per context. Defaults to the
            module-level ``leng_c``.

    Returns:
        ``(X_str, y)``. ``X_str[i]`` holds the last ``context_len`` utterances
        up to and including the system utterance of row ``i``, left-padded
        with ``""`` at the start of a dialogue. ``y`` is a float array with
        ``y[i] == 1`` when ``ec`` of row ``i`` contains any entry of
        ``errors``, and 0 otherwise.
    """
    if context_len is None:
        context_len = leng_c
    X_str = []
    y = np.zeros(len(df))
    history = []
    # Unique sentinel so the first row always starts (and pads) a new dialogue.
    current_did = object()
    for i, (d, u, s, e) in enumerate(zip(df.did, df.usr, df.sys, df.ec)):
        if d != current_did:
            # Padding guarantees a full-length context from the first turn on.
            history = [""] * context_len
            current_did = d
        history.append(u)
        history.append(s)
        X_str.append(history[-context_len:])
        if any(err in e for err in errors):
            y[i] = 1

    return X_str, y

In [44]:
# Build the per-turn utterance contexts and the 0/1 labels for `errors`.
X_str, y = make_X_str_y(df, errors)

In [45]:
# Encode every context into sentence vectors (make_X does not use its max_len argument).
X = make_X(X_str, length)

In [46]:
# Inference on CPU; gradients are not needed.
with torch.no_grad():
    X_tensor = torch.tensor(X, dtype=torch.float32)
    log_probs = model(X_tensor)
    # Predicted class = index of the largest log-probability in each row.
    y_pred = log_probs.numpy().argmax(axis=1)

In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Confusion matrix, accuracy and F1 of the predictions against the labels.
cm = confusion_matrix(y_true=y, y_pred=y_pred)
print('confusion matrix = \n', cm)
acc = accuracy_score(y_true=y, y_pred=y_pred)
print('accuracy = ', acc)
f_measure = metrics.f1_score(y, y_pred)
print('F-measure: ', f_measure)

confusion matrix = 
 [[1675  133]
 [ 176   16]]
accuracy =  0.8455
F-measure:  0.093841642228739


In [48]:
# Number of predictions.
len(y_pred)

2000

In [49]:
# Baseline: accuracy of always predicting class 0. The all-zero vector is
# sized from `y` rather than a hard-coded sample count.
metrics.accuracy_score(y, np.zeros_like(y))

0.904

In [51]:
# Manual error analysis. Prints every context predicted as positive and
# counts the outcomes.
correct_n = 0   # true positives
bad_n = 0       # false negatives + false positives
all_tp_one = 0  # samples whose true label is 1
for x, t, n in zip(X_str, y, y_pred):
    # t == 1 : actual breakdown
    # n == 1 : predicted breakdown
    if t == 1:
        all_tp_one += 1
        if n == 1:
            # t == n == 1 : correctly detected
            correct_n += 1
            print(x)
        else:
            # breakdown that was not detected
            bad_n += 1
    elif n == 1:
        # not a breakdown, but predicted as one
        bad_n += 1
        print(x)
print("corrent_n:", correct_n)
print("all_tp_one:", all_tp_one)
print("bad_n:", bad_n)
# Recall of the positive class; NaN instead of ZeroDivisionError when the
# data contains no positive sample.
print("rate c:", correct_n/all_tp_one if all_tp_one else float("nan"))

['そうですか', 'あはい']
['こんにちは。お名前は？', 'お名前はco.jpに添えます']
['どこにいったの？', '旅行を検討しちゃった']
['一緒に遊ぼう！', 'しりとりでもしましょうか']
['海外旅行されたことありますか？', '最近興味のあることは何ですか？']
['こんにちは', 'こんにちは']
['海外旅行の経験はありますか？', '海外旅行も楽しいかな。']
['興味があるスポーツは何ですか？', '今は、してないですね。']
['学生時代に何かスポーツしていましたか？', '今は、してないですね。']
['こんにちは！元気ですか？', '元気ですかは元気です']
['そうですね。', 'うん']
['インターネットはいまではだれでも使っていますよね。', '日本の国花は桜だけど、法律で決められたものじゃないんだよ。']
['こんばんは', 'こんばんわ']
['そうですね', 'うむ']
['お笑いの番組はよく見るよ。', 'お笑いはくだらないですね']
['嬉しいですね', 'なでなで']
['物知りなんですね', 'そのとおり']
['こんにちは。', 'こんにちはー']
['こんにちはー', 'こんにちはですー']
['それはゲイということですか？', 'お金がかかるのです']
['だれがですか？', 'お金にルーズなようですよ']
['だれがルーズなんですか？', '銀行からお金を借りるらしいですよ']
['競泳で金メダルでましたね', '金メダルは無理です']
['こんにちは。', 'こんにちは']
['そうですね。', 'うむ']
['次はいつ登山に行く予定なんですか？', 'インターネットで調べたところ、一位は、GWです。']
['あら、教えてもらえなくて残念です。', 'インターネットで調べたところ、一位は、レンズ豆です。']
['むしろ「じょうおう」って読み方を知らなかったよ。', '針のむしろのむしろが藁や竹で編んだマットのようなものと知っている人少ないですね。']
['そうだね。エントリーっていうのがちょっとよくわからないけど。', '防水はいいそうです']
['仲良しな感じがするね。', '仲良しの人同士なら、すぐに時間がすぎちゃってびっくりするよね。']
['こんにちは', 'こんにちは']
['こんにち