In [3]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
from numpy.lib.function_base import select
import spacy
import torch
import re

# from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaConfig
# from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

from pyknp import Juman
from sentence_transformers import SentenceTransformer

import pickle




In [4]:
class preprocessor:
    """Loads DBDC-style dialogue JSON and turns it into fixed-length vector sequences.

    The class owns the embedding models, reads labelled dialogues into a
    DataFrame (one row per system turn) and converts that frame into
    ``(X, y)`` arrays for the error-category classifiers.
    """

    def __init__(self,
                 model_path="/home/yamada/Downloads/training_bert_japanese",
                 spacy_model='ja_ginza') -> None:
        """Load the embedding models and measure the sentence-vector size.

        Args:
            model_path: Directory of the sentence-transformers model handed to
                ``SentenceTransformer``. The default is the location this
                notebook was developed with; pass your own path elsewhere.
            spacy_model: Name of the spaCy pipeline used for sentence vectors.
        """
        self.nlp = spacy.load(spacy_model)
        self.model_path = model_path
        self.sen_model = SentenceTransformer(self.model_path, show_progress_bar=False)

        # Symbols, punctuation, newlines and full-width spaces that are stripped
        # from an utterance before it is embedded.
        self.DELETE_PATTERN_2 = re.compile(
            r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+')

        # Probe the pipeline once so downstream code knows the vector length.
        self.emb_size = self.get_sentence_vec("emb").shape[0]
        print(self.emb_size)

    def get_sentence_vec(self, sen) -> np.ndarray:
        """Return the spaCy document vector of ``sen`` after symbol removal."""
        cleaned = self.DELETE_PATTERN_2.sub("", sen)
        return self.nlp(cleaned).vector

    def read_json_with_NoErr(self, path:str, datalist:list) -> pd.DataFrame:
        """Read every ``*.json`` dialogue under ``path + name + '/'`` for each name.

        One row is produced per system turn (turn index 0 is skipped), paired
        with the most recent user utterance of the same dialogue. System turns
        whose ``error_category`` is empty get the label ``["No-Err"]``.

        Args:
            path: Directory prefix; it is concatenated with each entry of
                ``datalist``, so it is expected to end with a path separator.
            datalist: Sub-directory names to read.

        Returns:
            DataFrame with columns ``did, tid, usr, sys, ec`` and a fresh
            0..n-1 index. It is empty (but has the columns) if no file matched.
        """
        cols = ['did', 'tid', 'usr', 'sys', 'ec']
        # Collect plain tuples and build the frame once: appending to a
        # DataFrame per turn is quadratic and DataFrame.append no longer
        # exists in pandas 2.x.
        records = []

        for p in datalist:
            datapath = Path(path + p + '/')
            # Sorted so the row order (and therefore any seeded split made
            # from it) does not depend on filesystem enumeration order.
            for file in sorted(datapath.glob("*.json")):
                with open(file, "r", encoding="utf-8") as f:
                    json_data = json.load(f)
                did = json_data["dialogue-id"]
                # Reset per dialogue so a user utterance never leaks in from
                # the previously read file.
                usr = ""
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                    elif t["speaker"] == "S":
                        ec = t["error_category"] or ["No-Err"]
                        records.append((did, t["turn-index"], usr, t["utterance"], ec))

        return pd.DataFrame(records, columns=cols)

    def make_error_dict(self, error_types):
        """Map each error label to its position in ``error_types``.

        The position is used as the column index of the label vector, so it
        must line up with ``len(error_types)`` columns.
        """
        return {label: index for index, label in enumerate(error_types)}

    def extract_X_y(self, df:pd.DataFrame, error_types, prev_num) -> "tuple[np.ndarray, np.ndarray]":
        """Build one training example per erroneous system turn.

        Each turn is encoded as the concatenation of the user and system
        sentence vectors. For every turn whose first label is not ``"No-Err"``
        the last ``prev_num`` encoded turns of the same dialogue form one
        sample; dialogues are left-padded with ``prev_num - 1`` zero vectors
        so early turns still yield a full window.

        Args:
            df: Frame with ``did``, ``usr``, ``sys`` and ``ec`` columns, rows
                grouped by dialogue in turn order.
            error_types: Ordered label names; defines the columns of ``y``.
            prev_num: Window length (number of turns per sample), >= 1.

        Returns:
            ``(X, y)``: ``X`` stacks the windows, ``y`` the multi-hot label
            vectors. Both are empty arrays when no erroneous turn exists.

        Raises:
            ValueError: If ``prev_num`` is smaller than 1.
            KeyError: If a turn carries a label missing from ``error_types``.
        """
        n = prev_num
        if n < 1:
            # list[-0:] would return the whole history instead of a window.
            raise ValueError("prev_num must be >= 1, got {0}".format(n))

        error_dict = self.make_error_dict(error_types)
        zero_turn = np.zeros(2 * self.emb_size)

        X_data = []
        y_data = []
        sequence_did = []
        # Sentinel that equals no real dialogue id, so the first row always
        # starts a fresh, padded history (also works for an empty frame).
        current_did = object()

        for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
            if d != current_did:
                current_did = d
                sequence_did = [zero_turn] * (n - 1)

            sequence_did.append(
                np.concatenate([self.get_sentence_vec(u), self.get_sentence_vec(s)])
            )
            if e[0] == "No-Err":
                continue

            y_each_error_label = np.zeros(len(error_types))
            for e_ in e:
                y_each_error_label[error_dict[e_]] = 1
            X_data.append(sequence_did[-n:])
            y_data.append(y_each_error_label)

        return np.array(X_data), np.array(y_data)

In [5]:
def predict_at_least_oneClass(clf, X) -> np.ndarray:
    """Predict multi-label outputs, forcing at least one label per sample.

    Samples for which ``clf.predict`` returns no positive label get the single
    label with the highest positive-class probability instead.

    Args:
        clf: Multi-output classifier whose ``predict`` returns an
            ``(n_samples, n_labels)`` indicator matrix and whose
            ``predict_proba`` returns one ``(n_samples, n_classes)`` array per
            label.
        X: Feature matrix accepted by ``clf``.

    Returns:
        Integer-compatible indicator matrix of the same shape as
        ``clf.predict(X)`` in which every row has at least one ``1``.
    """
    y_pred = np.asarray(clf.predict(X))
    per_label_proba = clf.predict_proba(X)
    # The label count comes from the prediction itself rather than from a
    # notebook-level global, so the function works for any fitted classifier.
    n_samples, n_labels = y_pred.shape

    positive_proba = np.zeros((n_samples, n_labels))
    for label in range(n_labels):
        label_proba = np.asarray(per_label_proba[label])
        # A label that saw only one class during fitting has a single
        # probability column; it is treated as probability 0.
        if label_proba.shape[1] != 1:
            positive_proba[:, label] = label_proba[:, 1]

    # astype() copies, so the classifier's own output is never modified.
    y_fixed = y_pred.astype(np.result_type(int, y_pred.dtype))
    empty_rows = np.flatnonzero(y_pred.sum(axis=1) == 0)
    if empty_rows.size and n_labels:
        y_fixed[empty_rows] = 0
        y_fixed[empty_rows, positive_proba[empty_rows].argmax(axis=1)] = 1
    return y_fixed

In [38]:
class LSTMClassifier(nn.Module):
    """Sequence classifier: one LSTM layer, a linear projection and log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, tagset_size, batch_size):
        """Create the layers.

        Args:
            embedding_dim: Size of each input step vector.
            hidden_dim: Size of the LSTM hidden state.
            tagset_size: Number of output classes.
            batch_size: Accepted for interface compatibility; not used here.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Projects the final hidden state onto the class scores.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        """Return per-class log-probabilities, one row per input sequence."""
        n_sequences, _seq_len = x.shape[0], x.shape[1]
        _outputs, (final_hidden, _final_cell) = self.lstm(x)
        # final_hidden carries a leading layer axis; flatten it to one row per sequence.
        logits = self.hidden2tag(final_hidden.view(n_sequences, -1))
        return F.log_softmax(logits, dim=1)

In [7]:
class Datasets(torch.utils.data.Dataset):
    """Map-style dataset that pairs ``X_data[i]`` with ``y_data[i]``."""

    def __init__(self, X_data, y_data):
        """Keep references to the samples and labels; no copy is made."""
        self.X_data, self.y_data = X_data, y_data
        # The length is captured once, from the samples, at construction time.
        self.datanum = len(X_data)

    def __len__(self):
        """Number of samples recorded at construction."""
        return self.datanum

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair at position ``idx``."""
        return self.X_data[idx], self.y_data[idx]

In [31]:
class MultiLSTMClassifier:
    """One-vs-rest ensemble: an independent binary ``LSTMClassifier`` per label.

    Model ``i`` predicts whether label column ``i`` is set. Tensors are always
    created on the device the models actually live on, so the class works on
    CPU-only machines as well as with CUDA.
    """

    def __init__(self, embedding_dim, hidden_dim, target_size, batch_size, CUDA=True):
        """Create ``target_size`` binary classifiers.

        Args:
            embedding_dim: Size of each input step vector.
            hidden_dim: LSTM hidden size of every sub-model.
            target_size: Number of labels, i.e. number of sub-models.
            batch_size: Mini-batch size used by ``train``.
            CUDA: Move the models to the GPU when one is available.
        """
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.batch_size = batch_size

        self.multi_models = self._make_model()

        if CUDA:
            self._model_toCUDA()

        self.is_set_target_names = False

    def _make_model(self):
        """Build one two-class LSTM classifier per target label."""
        return [
            LSTMClassifier(self.embedding_dim, self.hidden_dim, 2, self.batch_size)
            for _ in range(self.target_size)
        ]

    def _model_toCUDA(self):
        """Move every sub-model to the GPU; a no-op when CUDA is unavailable."""
        if torch.cuda.is_available():
            for model in self.multi_models:
                model.cuda()

    @staticmethod
    def _device_of(model):
        """Return the device holding ``model``'s parameters (CPU if it has none)."""
        first_param = next(model.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def set_optimizer(self, lr_=0.01):
        """Create one Adam optimizer per sub-model with learning rate ``lr_``."""
        self.optimizers = [optim.Adam(model.parameters(), lr=lr_) for model in self.multi_models]

    def set_loss_func(self):
        """Create one negative-log-likelihood loss per sub-model."""
        self.loss_funcs = [nn.NLLLoss() for _ in self.multi_models]

    def set_target_names(self, target_names):
        """Register label names used in the training progress messages."""
        self.target_names = target_names
        self.is_set_target_names = True

    def train(self, X, y, epoch=100, loss_border=0):
        """Train every sub-model on its own label column.

        Args:
            X: Samples, indexable along the first axis.
            y: 2-D label matrix; column ``i`` trains sub-model ``i``.
            epoch: Maximum number of epochs per sub-model.
            loss_border: Training of a sub-model stops early once the summed
                epoch loss is at or below this value.
        """
        # Fall back to the default optimizer / loss when the setters were not
        # called, instead of failing with an AttributeError mid-way.
        if not hasattr(self, "optimizers"):
            self.set_optimizer()
        if not hasattr(self, "loss_funcs"):
            self.set_loss_func()

        # Each error type is trained separately.
        for i, model in enumerate(self.multi_models):
            device = self._device_of(model)
            dataset = Datasets(X, y[:, i])
            trainloader = torch.utils.data.DataLoader(dataset, batch_size = self.batch_size, shuffle = True, num_workers = 2)
            optimizer = self.optimizers[i]
            loss_function = self.loss_funcs[i]
            if self.is_set_target_names:
                print("error :", self.target_names[i], "\tstart")
            else:
                print("error :", i, "\tstart")
            model.train()
            for ep in range(epoch):
                all_loss = 0
                for X_t, y_t in trainloader:
                    # The loader already yields tensors; convert dtype and
                    # device in place of re-wrapping them with torch.tensor().
                    X_tensor = X_t.to(device=device, dtype=torch.float32)
                    y_tensor = y_t.to(device=device, dtype=torch.long)
                    optimizer.zero_grad()

                    loss_ = loss_function(model(X_tensor), y_tensor)
                    loss_.backward()
                    optimizer.step()
                    all_loss += loss_.item()
                if (ep+1) % 50 == 0:
                    print("model[{0}] epoch :{1} \t loss :{2}".format(i, ep+1, all_loss))
                if all_loss <= loss_border:
                    print("loss was under border(={0}) : train end".format(loss_border))
                    break
            print("")

    def predict(self, X, y=None, at_least_oneClass=False):
        """Predict a 0/1 indicator matrix with one column per sub-model.

        Args:
            X: Samples convertible to a float tensor.
            y: Unused; kept (now optional) for backward compatibility.
            at_least_oneClass: When true, a sample for which no sub-model
                fires receives the label whose positive-class score is
                highest, so every row contains at least one ``1``.

        Returns:
            Integer array of shape ``(n_samples, n_models)``.
        """
        device = self._device_of(self.multi_models[0])
        labels = []
        positive_scores = []
        with torch.no_grad():
            X_tensor = torch.as_tensor(X, dtype=torch.float32, device=device)
            for model in self.multi_models:
                model.eval()
                log_probs = model(X_tensor).detach().cpu().numpy()
                # argmax over the two classes gives the 0/1 decision.
                labels.append(log_probs.argmax(axis=1))
                positive_scores.append(log_probs[:, 1])

        y_pred = np.stack(labels, axis=1)

        if at_least_oneClass:
            scores = np.stack(positive_scores, axis=1)
            unlabeled = np.flatnonzero(y_pred.sum(axis=1) == 0)
            if unlabeled.size:
                y_pred[unlabeled, scores[unlabeled].argmax(axis=1)] = 1

        return y_pred
                

In [9]:
class DataManager:
    """Pickle-backed store for objects kept as files inside one directory."""

    def __init__(self, data_path) -> None:
        """Create ``data_path`` if needed and record the names it contains.

        Args:
            data_path: Directory that holds the pickle files. A trailing
                separator is optional.
        """
        self.data_path = data_path
        Path(data_path).mkdir(parents=True, exist_ok=True)
        # Names currently in the directory; refreshed after every save.
        self.dir = self._list_names()

    def _list_names(self):
        """Return the entry names currently present in the directory."""
        return [entry.name for entry in Path(self.data_path).iterdir()]

    def _full_path(self, name):
        """Join the directory and ``name`` without relying on a trailing slash."""
        return Path(self.data_path) / name

    def is_exist(self, name):
        """Report whether ``name`` exists in the directory right now.

        The filesystem is checked on every call, so files written after
        construction (for example by ``save_data``) are found.
        """
        return self._full_path(name).exists()

    def save_data(self, name, obj):
        """Pickle ``obj`` into the directory under ``name``."""
        with open(self._full_path(name), "wb") as f:
            pickle.dump(obj, f)
        self.dir = self._list_names()
        print("success save : {0}{1}".format(self.data_path, name))

    def load_data(self, name):
        """Unpickle and return the object stored under ``name``.

        SECURITY: ``pickle.load`` can execute arbitrary code. Only load files
        that this notebook (or another trusted source) wrote.
        """
        with open(self._full_path(name), "rb") as f:
            obj = pickle.load(f)
        print("success load : {0}{1}".format(self.data_path, name))
        return obj

In [10]:
# Build the preprocessor. Its constructor loads the spaCy and
# sentence-transformer models and prints the sentence-vector size.
pre = preprocessor()


300


In [11]:


# Root directory of the labelled dialogues and the sub-directories to read.
path = './error_category_classification/dbdc5_ja_dev_labeled/'
datalist = ['DCM', 'DIT', 'IRS']
# Error labels. The position of a label in this list is its column index in
# the label matrix built by extract_X_y, so the order matters: the models
# below are trained on the first OUTPUT_DIM columns only.
# 'No-Err' is the placeholder label assigned to system turns without an error.
error_types = ['Unclear intention', 'Wrong information',
 'Ignore question', 'Topic transition error', 
 'Lack of information', 'Repetition', 
 'Contradiction', 'Self-contradiction',
  'Lack of common sense', 'Semantic error',
   'Grammatical error', 'Ignore proposal', 
   'Ignore offer', 'Lack of sociality', 
   'Uninterpretable', 'Ignore greeting', 
   'No-Err']
# One row per system turn: dialogue id, turn index, user / system utterance, labels.
df = pre.read_json_with_NoErr(path, datalist)
print(df.shape)



(2000, 5)


In [12]:
# Every sequence step is the user vector concatenated with the system vector,
# hence twice the sentence-vector size.
EMBEDDING_DIM = pre.emb_size*2
HIDDEN_DIM = pre.emb_size*2
# Number of label columns (taken from the front of error_types) that get a model.
# Alternatives tried: len(error_types)-1 and 5.
OUTPUT_DIM = 8
# Number of consecutive turns per sample (window length passed to extract_X_y).
seq_len = 3
# Tag naming the embedding back end in the cache file names ("ginza" or "senBERT").
mode = "ginza"

# Cache directory for the extracted features, keyed by window length.
data_path = "./X_y_data/seq{0}/".format(seq_len)

# Directory for trained models, keyed by window length.
model_path = "./models/seq{0}/".format(seq_len)

# File names encode the embedding mode and the data sub-directories used.
files = "_".join(datalist)
data_name = "data_{0}_{1}.pickle".format(mode, files)
model_name = "model_{0}.pickle".format(mode)
print(data_name)
print(model_name)

# One manager per directory; each constructor creates its directory if missing.
modelM = DataManager(model_path)
dataM = DataManager(data_path)

data_ginza_DCM_DIT_IRS.pickle
model_ginza.pickle


In [39]:
# Reuse the cached features when present; otherwise extract them from df
# (which embeds every utterance) and cache the result for the next run.
if dataM.is_exist(data_name):
    # The cache file stores the pair [X_data, y_data].
    DATA_Xy = dataM.load_data(data_name)
    X_data = DATA_Xy[0]
    y_data = DATA_Xy[1]
else:
    X_data, y_data = pre.extract_X_y(df, error_types, seq_len)
    dataM.save_data(data_name, [X_data, y_data])

success load : ./X_y_data/seq3/data_ginza_DCM_DIT_IRS.pickle


In [12]:
# X_data, y_data = pre.extract_X_y(df, error_types, seq_len)

In [40]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.30, random_state=5)

In [41]:
# Print the training-set size followed by every divisor of it
# (presumably to pick a batch size that splits the data evenly).
leng = len(y_train)
print(leng)
for divisor in range(1, leng + 1):
    if leng % divisor == 0:
        print(divisor, end=", ")

943
1, 23, 41, 943, 

In [42]:
# 41 is one of the divisors of the training-set size printed above (943 = 23 * 41).
BATCH_SIZE = 41
# Show (n_train_samples, n_labels).
y_train.shape

(943, 17)

In [43]:
# One binary LSTM classifier per label column, for the first OUTPUT_DIM labels.
multi_model = MultiLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, BATCH_SIZE)
# The optimizers and loss functions must exist before train() is called.
multi_model.set_optimizer(lr_=0.01)
multi_model.set_loss_func()
# Label names are only used in the training progress messages.
multi_model.set_target_names(error_types)


In [44]:
# Upper bound on epochs per sub-model; a sub-model stops earlier once its
# summed epoch loss is at or below loss_border.
epoch = 700
loss_border = 0.0001
multi_model.train(X_train, y_train, epoch, loss_border)

error : Unclear intention 	start




model[0] epoch :50 	 loss :0.0030111046235106187
model[0] epoch :100 	 loss :0.00041632228339949506
model[0] epoch :150 	 loss :0.00012157770254361822
loss was under border(=0.0001) : train end

error : Wrong information 	start
model[1] epoch :50 	 loss :0.010360683882026933
model[1] epoch :100 	 loss :0.0015127691585803404
model[1] epoch :150 	 loss :0.0004271040334060672
model[1] epoch :200 	 loss :0.00017512895749405288
loss was under border(=0.0001) : train end

error : Ignore question 	start
model[2] epoch :50 	 loss :0.003688056831379072
model[2] epoch :100 	 loss :0.0005925267759039343
model[2] epoch :150 	 loss :0.000187842361242474
loss was under border(=0.0001) : train end

error : Topic transition error 	start
model[3] epoch :50 	 loss :0.0035956454848928843
model[3] epoch :100 	 loss :0.00038503663472511107
model[3] epoch :150 	 loss :0.0001102094017824129
loss was under border(=0.0001) : train end

error : Lack of information 	start
model[4] epoch :50 	 loss :0.00053099301

In [45]:
# Predict the held-out samples; y_pred has one column per trained sub-model.
y_pred = multi_model.predict(X_test, y_test)
print(y_pred.shape)
print(y_test.shape)
# y_test has a column for every entry of error_types, so it is cut to the
# first OUTPUT_DIM columns before comparing. On 2-D label arrays
# accuracy_score counts a sample as correct only if all its columns match.
print('EM:', metrics.accuracy_score(y_test[:,:OUTPUT_DIM], y_pred[:, :OUTPUT_DIM]) )

(405, 8)
(405, 17)
EM: 0.38271604938271603


In [21]:
# Preview the first ten predicted label rows. This reuses
# MultiLSTMClassifier.predict instead of repeating its per-model inference
# loop (and its device handling) inline.
pred_y = multi_model.predict(X_test, y_test)
print(pred_y[:10,:])

[[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [46]:

# Shape of the test labels restricted to the label columns that have a model.
y_test[:,:OUTPUT_DIM].shape

(405, 8)

In [47]:
# Per-label accuracy: compare column i of the truth with column i of the prediction.
for i in range(OUTPUT_DIM):
    print("error[{0}]  accuracy: {1}".format(error_types[i],metrics.accuracy_score(y_test[:, i], y_pred[:, i])))

error[Unclear intention]  accuracy: 0.8666666666666667
error[Wrong information]  accuracy: 0.6049382716049383
error[Ignore question]  accuracy: 0.8641975308641975
error[Topic transition error]  accuracy: 0.8518518518518519
error[Lack of information]  accuracy: 0.9580246913580247
error[Repetition]  accuracy: 0.9580246913580247
error[Contradiction]  accuracy: 0.9876543209876543
error[Self-contradiction]  accuracy: 0.9901234567901235


In [24]:
# Pickle the whole ensemble object (all sub-models, plus optimizers and losses) to model_path.
modelM.save_data(model_name, multi_model)

success save : ./models/seq3/model_ginza.pickle
