In [3]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

In [102]:
path = "../hand_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs = read_conv(path, datalist)

In [103]:
def make_Xy_4test(convs, N=4):
    errors = ["Topic transition error", 'Lack of information', 'Unclear intention']
    errors = errors[:1]
    
    X = []
    y = []
    for conv in convs:
        dialogue = [""]*N
        for i, ut in enumerate( conv ) :
            # ユーザ発話駆動
            dialogue.append(clean_text( ut.utt) )
            if ut.is_exist_error():
                X.append( dialogue[-N:] )
                    # X.append(dialogue[-N:])
                if ut.is_error_included(errors) :
                    y.append(1)
                else:
                    y.append(0)
        
    return X, y

In [104]:
errors = ["Topic transition error", 'Lack of information', 'Unclear intention']
errors[:1]

['Topic transition error']

In [105]:
N = 3
X_str, y = make_Xy_4test(convs, N=N)
y.count(1)

192

In [106]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, vocab_dict):
        # 親クラスのコンストラクタ。決まり文句
        super(LSTMClassifier, self).__init__()
        # 隠れ層の次元数。これは好きな値に設定しても行列計算の過程で出力には出てこないので。    
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim,  padding_idx=0)
        # モデルを2つ定義
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim//2, batch_first=True,  bidirectional=True )
        # self.lstm2 = nn.LSTM(embedding_dim, hidden_dim//2, batch_first=True,  bidirectional=True )
        self.hidden2tag = nn.Linear(hidden_dim , tagset_size)
        # softmaxのLog版。dim=0で列、dim=1で行方向を確率変換。
        self.softmax = nn.LogSoftmax()
    
        self.vocab_dict = vocab_dict
    
    def forward(self, x):
        emb1 = self.word_embeddings(x)
        _, lstm1_out = self.lstm1(emb1)
        tag_space = self.hidden2tag(torch.cat([ lstm1_out[0][0], lstm1_out[0][1]], dim=1 ))
        y =self.softmax(tag_space)
        return y
    
    def last_context(self, x):
        emb1 = self.word_embeddings(x)
        # print(emb1.shape)
        _, lstm1_out = self.lstm1(emb1)
        context = torch.cat([ lstm1_out[0][0], lstm1_out[0][1]], dim=1 )
        return context
    
    def text2context(self, text):
        if isinstance(text, str):
            utt_id = self._sentence2ids(text, self.vocab_dict)
            utt_id_tensor = torch.tensor( [utt_id] , device='cuda:0', dtype=torch.int)
            # utt_id_tensor = torch.tensor( [utt_id] , device='cpu', dtype=torch.int)
            return self.last_context(utt_id_tensor)
        if isinstance(text, list):
            X = self._make_X(text, self.vocab_dict)
            utt_id_tensor = X.to(torch.int).cuda()
            # utt_id_tensor = X.to(torch.int)
            return self.last_context(utt_id_tensor)
        else:
            return 0

        
    def _sentence2ids(self, sentence:str, vocab_dict:dict):
        doc = self._sentence2formated(sentence)
        ids = np.zeros(len(doc))
        for i, key in enumerate(doc):
            # key = token.orth_
            if key in vocab_dict:
                ids[i] = vocab_dict[key]
            else:
                ids[i] = vocab_dict["[UNK]"]
        return ids
    
    def _sentence2formated(self, sen):
        return sum( fill_SYMBOL_ONE( sentence2normalize_noun(sen) ), [] )
    
    def _padding_vector(self, Xseq):
        Xseq = [ torch.tensor( xseq[:, None] ) for xseq in Xseq]
        Xseq = rnn.pad_sequence(Xseq, batch_first=True)
        Xseq = [ torch.flatten(xseq) for xseq in Xseq ] 
        return Xseq


    def _make_X(self, utt_list:list, vocab_dict:dict):
        utt_id_list = []
        for utt in tqdm( utt_list) :
            utt_id = self._sentence2ids(utt, vocab_dict)
            utt_id_list.append(utt_id)

        utt_id_pad = self._padding_vector(utt_id_list)
        upl = len(utt_id_pad[0])
        # X =   [ torch.Tensor([u, s]) for u, s in zip(usr_id_pad, sys_id_pad) ] 
        # print(usr_pad_len, sys_pad_len)
        X = torch.zeros( (len(utt_list), upl) )
        for i, u in enumerate(utt_id_pad):
            X[i, :upl] = u
        return X

In [107]:
model_path = "../models/response2/"
model_name = "forward_v2.pickle"
modelM = DataManager(model_path)
fmodel = modelM.load_data(model_name)

success load : ../models/response2/forward_v2.pickle


In [108]:
forward_xy_name = "../X_y_data/context_topic/X_forward_topic_ERROR_N={0}".format(N)

In [109]:
X_forward_all_str = sum(X_str, [])

if os.path.exists(forward_xy_name+".npy"):
    # X_forward_ids  = np.load(forward_xy_name+".npy")
    X_forward  = np.load(forward_xy_name+".npy")
    print("success load {0}.npy".format(forward_xy_name))
else:
    with torch.no_grad():
        fmodel.cpu()
        # X_forward_l =  fmodel.text2context(X_str)
        # 手で書くしかない
        x_length = len(X_forward_all_str)//N
        X_forward_ids = fmodel._make_X(X_forward_all_str, fmodel.vocab_dict).to(torch.int)
        X_forward_ids = X_forward_ids.reshape(x_length, N, -1)
        X_forward = np.array( [fmodel.last_context(Xfi).numpy() for Xfi in X_forward_ids] ) 
        fmodel.cuda()
        np.save(forward_xy_name, X_forward)
    

success load ../X_y_data/context_topic/X_forward_topic_ERROR_N=3.npy


In [110]:
from sentence_transformers import SentenceTransformer
# from sentence_transformers import models

bert_path = "../../corpus/pretrained/sbert_unclear1"
sbert = SentenceTransformer(bert_path)

[526] 2022-01-12 20:12:58,388 Info sentence_transformers.SentenceTransformer :Load pretrained SentenceTransformer: ../../corpus/pretrained/sbert_unclear1
[526] 2022-01-12 20:13:00,659 Info sentence_transformers.SentenceTransformer :Use pytorch device: cuda


In [111]:
x_length = len(X_forward_all_str)//N
# X_topic_vec = smodel.encode(X_forward_all_str).reshape(x_length, N, -1)
X_topic_vec = sbert.encode(X_forward_all_str).reshape(x_length, N, -1)

Batches:   0%|          | 0/127 [00:00<?, ?it/s]

In [112]:
def vec2feature(vector):
    diff = np.abs( vector[0] - vector[1] )
    return np.concatenate([vector.flatten(), diff])

In [113]:
emb_dim = 768
def sentence2formated(vectors):
    features = []
    prev_vector = np.zeros(emb_dim)
    for i, vector in enumerate(vectors):
        feature = vec2feature( np.array([prev_vector, vector]) ) 
        features.append(feature)
        prev_vector = vector
    return np.array(features)

In [114]:
X_topic = np.array([ sentence2formated(vec) for vec in X_topic_vec ])

In [115]:
X = np.concatenate([X_topic, X_forward], axis=2)
X = torch.from_numpy(X)
y = torch.Tensor(y)

In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5, stratify=y)

In [117]:
class Datasets(torch.utils.data.Dataset):
    def __init__(self, X_data, y_data):
        # self.transform = transform

        self.X_data = X_data
        self.y_data = y_data

        self.datanum = len(X_data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        out_X = self.X_data[idx]
        out_y = self.y_data[idx]

        return out_X, out_y

In [118]:
import copy
class TopicClassifier(nn.Module):
    def __init__(self, topic_dim, forward_dim, topic_hid, for_hid, tagset_size):
        # 親クラスのコンストラクタ。決まり文句
        super(TopicClassifier, self).__init__()
        # 隠れ層の次元数。これは好きな値に設定しても行列計算の過程で出力には出てこないので。    
        self.tlen = topic_dim
        self.flen = forward_dim
        # self.hidden = hidden_dim
        # 768->256
        self.tlstm = nn.LSTM(topic_dim, topic_hid, batch_first=True)
        # self.lay2_lstm = nn.LSTM(hidden_dim+forward_dim//2, hidden_dim2, batch_first=True)
        self.flstm = nn.LSTM(forward_dim, for_hid, batch_first=True)
        # self.for2hid = nn.Linear(forward_dim , forward_dim//2)
        self.hid2out = nn.Linear(topic_hid+for_hid , tagset_size)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax()

    def forward(self, x):
        x_topic = x[:, :, :self.tlen].to(torch.float)
        x_forward = x[:, :, self.tlen:].to(torch.float)
        # x_for_hid = self.for2hid(x_forward)
        # print(x_topic.shape)

        # forward_c = torch.stack( [ self.fmodel.last_context(xfid) for xfid in x_forward_id])
        # topic_out, _ = self.tlstm(x_topic)
        _, tout = self.tlstm(x_topic)
        _, fout = self.flstm(x_forward)

        # print("topic_out: ", topic_out.shape)
        # topic_out = self.relu(topic_out)
        # x_lay2 = torch.cat([topic_out, x_for_hid)], dim=2)

        # _, hc = self.lay2_lstm(x_lay2)
        # out = self.hid2out(hc[0][0])
        out = self.hid2out(torch.cat([tout[0][0], fout[0][0]], dim=1) )
        y = self.softmax(out)
        
        return y

In [119]:
BATCH_SIZE = 64
epoch_ = 150
trainset = Datasets(X_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

In [120]:
TOPIC_DIM = emb_dim*3
FORWARD_DIM = 256
TOPIC_HID_DIM = emb_dim
FOR_HID_DIM = FORWARD_DIM//2
OUTPUT_DIM = 2

In [121]:
model = TopicClassifier(TOPIC_DIM, FORWARD_DIM, TOPIC_HID_DIM, FOR_HID_DIM, OUTPUT_DIM)
if torch.cuda.is_available():
   model.cuda()
loss_function = nn.NLLLoss()
# loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [122]:
losses = []
for epoch in tqdm( range(epoch_)  ):  # again, normally you would NOT do 300 epochs, it is toy data
    all_loss = 0
    for data in trainloader:
        # X_t_tensor = torch.tensor(data[0], device='cuda:0', dtype=torch.int16)
        X_t_tensor = data[0].cuda()
        y_t_tensor = data[1].to(torch.long).cuda()
        optimizer.zero_grad()
        model.zero_grad()
        # print(X_t_tensor.shape , y_t_tensor.view(-1,1).shape)

        score_ = model(X_t_tensor)
        # print(X_t_tensor.shape, score.view(-1,5).shape, y_t_tensor.view(-1,1).shape)
        loss_ = loss_function(score_,  y_t_tensor)
        loss_.backward()
        all_loss += loss_.item()
        optimizer.step()
        del score_
        del loss_
    losses.append(all_loss)
    if (epoch+1) % 20 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
print("done")

 13%|█▎        | 20/150 [00:10<01:16,  1.70it/s]

epoch 20 	 loss 0.010506305901799351


 27%|██▋       | 40/150 [00:22<01:00,  1.83it/s]

epoch 40 	 loss 0.001729319577862043


 40%|████      | 60/150 [00:32<00:49,  1.83it/s]

epoch 60 	 loss 0.0006559003741131164


 53%|█████▎    | 80/150 [00:43<00:38,  1.82it/s]

epoch 80 	 loss 0.0003204873501090333


 67%|██████▋   | 100/150 [00:53<00:25,  1.99it/s]

epoch 100 	 loss 0.00018419328171148663


 80%|████████  | 120/150 [01:04<00:14,  2.04it/s]

epoch 120 	 loss 0.0001184726106657763


 93%|█████████▎| 140/150 [01:19<00:08,  1.14it/s]

epoch 140 	 loss 7.998909495654516e-05


100%|██████████| 150/150 [01:24<00:00,  1.77it/s]

done





In [123]:
with torch.no_grad():
    X_tensor = torch.tensor(X_test, device='cuda:0').float()
            # 推論
    y_pred= np.array(model(X_tensor).cpu()).argmax(axis=1)

  


In [124]:
# modelM = DataManager("../models/context_topic/")
# model_name = "sbert_context_unclear_LSTM.pickle"
# modelM.save_data(model_name, model)

In [125]:
score(y_test, y_pred)

confusion matrix = 
 [[330  17]
 [ 45  13]]
accuracy =  0.8469135802469135
precision =  0.43333333333333335
recall =  0.22413793103448276
f1 score =  0.29545454545454547


- epoch 150 

        confusion matrix = 
        [[146  44]
        [ 57 158]]
        accuracy =  0.7506172839506173
        precision =  0.7821782178217822
        recall =  0.7348837209302326
        f1 score =  0.7577937649880097

- epoch 250

        confusion matrix = 
        [[140  50]
        [ 53 162]]
        accuracy =  0.745679012345679
        precision =  0.7641509433962265
        recall =  0.7534883720930232
        f1 score =  0.7587822014051522

In [126]:
path = "../eval_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs_ = read_conv(path, datalist)

In [127]:
N = 3
X_str, y = make_Xy_4test(convs_, N=N)

In [128]:
forward_xy_eval_name = "../X_y_data/context_topic/X_forward_topic_ERROR_eval_N={0}".format(N)

In [129]:
X_forward_all_str = sum(X_str, [])

if os.path.exists(forward_xy_eval_name+".npy"):
    # X_forward_ids  = np.load(forward_xy_name+".npy")
    X_forward  = np.load(forward_xy_eval_name+".npy")
    print("success load {0}.npy".format(forward_xy_eval_name))
else:
    with torch.no_grad():
        fmodel.cpu()
        # X_forward_l =  fmodel.text2context(X_str)
        # 手で書くしかない
        x_length = len(X_forward_all_str)//N
        X_forward_ids = fmodel._make_X(X_forward_all_str, fmodel.vocab_dict).to(torch.int)
        X_forward_ids = X_forward_ids.reshape(x_length, N, -1)
        X_forward = np.array( [fmodel.last_context(Xfi).numpy() for Xfi in X_forward_ids] ) 
        # X_forward = X_forward.reshape(-1, 4, 256)
        fmodel.cuda()
        # X_forward_l = np.array( fmodel.text2context(X_forward_all_str).cpu() ) 
        # np.save(forward_xy_name, X_forward_ids)
        np.save(forward_xy_eval_name, X_forward)

success load ../X_y_data/context_topic/X_forward_topic_ERROR_eval_N=3.npy


In [130]:
x_length = len(X_forward_all_str)//N
# X_topic_vec = smodel.encode(X_forward_all_str).reshape(x_length, N, -1)
X_topic_vec = sbert.encode(X_forward_all_str).reshape(x_length, N, -1)
X_topic = np.array([ sentence2formated(vec) for vec in X_topic_vec ])

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

In [131]:
X = np.concatenate([X_topic, X_forward], axis=2)
X = torch.from_numpy(X)
y = torch.Tensor(y)

In [132]:
with torch.no_grad():
    X_tensor = torch.tensor(X, device='cuda:0').float()
    y_pred= np.array(model(X_tensor).cpu()).argmax(axis=1)

  


In [133]:
score(y, y_pred)

confusion matrix = 
 [[1152   61]
 [ 133   40]]
accuracy =  0.86002886002886
precision =  0.39603960396039606
recall =  0.23121387283236994
f1 score =  0.29197080291970806


- 2つのLSTM の結果を統合(話題遷移エラー，発話意図不明確，情報不足すべて)

        confusion matrix = 
        [[471 211]
        [193 511]]
        accuracy =  0.7085137085137085
        precision =  0.7077562326869806
        recall =  0.7258522727272727
        f1 score =  0.7166900420757363

        TOPIC_DIM = emb_dim*3
        FORWARD_DIM = 256
        TOPIC_HID_DIM = emb_dim
        FOR_HID_DIM = FORWARD_DIM//2
        OUTPUT_DIM = 2
            
    - baseline よりも精度は高い

    - epoch 250
    
            confusion matrix = 
            [[433 249]
            [171 533]]
            accuracy =  0.696969696969697
            precision =  0.6815856777493606
            recall =  0.7571022727272727
            f1 score =  0.7173620457604307

- ハイパーパラメータを少なくしてみた
