In [1]:
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# `select` is part of NumPy's public API; the private numpy.lib.function_base
# path is not importable on every NumPy version.
from numpy import select
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Fixed typo: the module is `loss` (`losss` raises ImportError).
from torch.nn.modules import loss

# from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaConfig
# from transformers import BertTokenizer, BertModel
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.ensemble import AdaBoostClassifier




In [72]:
class preprocessor:
    """Load labeled dialogue JSON files and build features / labels from them.

    Each JSON file is expected to provide ``"dialogue-id"`` and a ``"turns"``
    list whose items carry ``"turn-index"``, ``"speaker"`` (``"U"`` or ``"S"``),
    ``"utterance"`` and, for system turns, ``"error_category"``.

    Sentence vectors come from ``self.nlp(text).vector``.
    """

    # Symbols, punctuation, newlines and full-width spaces that are removed
    # from an utterance before it is embedded by ``get_sentence_vec``.
    DELETE_PATTERN_2 = re.compile(
        r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+')

    def __init__(self, nlp=None, model_name: str = 'ja_ginza') -> None:
        """Create the preprocessor.

        Args:
            nlp: Optional callable mapping a string to an object with a
                ``.vector`` attribute. When omitted, ``spacy.load(model_name)``
                is used (the original behavior).
            model_name: spaCy pipeline name loaded when ``nlp`` is not given.
        """
        self.nlp = spacy.load(model_name) if nlp is None else nlp
        # Probe the pipeline once to learn the sentence-vector size.
        self.emb_size = self.get_sentence_vec("emb").shape[0]
        print(self.emb_size)

    def get_sentence_vec(self, sen) -> np.ndarray:
        """Return the sentence vector of ``sen`` after stripping symbols."""
        cleaned = self.DELETE_PATTERN_2.sub("", sen)
        return self.nlp(cleaned).vector

    def _load_turns(self, path, datalist, keep_no_err: bool, verbose: bool = False) -> pd.DataFrame:
        """Shared reader behind ``read_json`` and ``read_json_with_NoErr``."""
        cols = ['did', 'tid', 'usr', 'sys', 'ec']
        rows = []
        for subset in datalist:
            datapath = Path(path) / subset
            if verbose:
                print(datapath)
            # Sorted so that the row order does not depend on the file system.
            for file in sorted(datapath.glob("*.json")):
                # Explicit encoding: the platform default is not always UTF-8.
                with open(file, "r", encoding="utf-8") as f:
                    json_data = json.load(f)
                did = json_data["dialogue-id"]
                # Reset per dialogue so a system turn never pairs with a user
                # utterance left over from the previous file.
                user_utt = ""
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        user_utt = t["utterance"]
                        continue
                    if t["speaker"] != "S":
                        continue
                    ec = t.get("error_category")
                    if not ec:
                        # Missing / null / empty label list means "no error".
                        if not keep_no_err:
                            continue
                        ec = ["No-Err"]
                    rows.append((did, t["turn-index"], user_utt, t["utterance"], ec))
        # Build the frame once; DataFrame.append was removed in pandas 2.0 and
        # appending row by row is quadratic.
        return pd.DataFrame(rows, columns=cols)

    def read_json(self, path: str, datalist: list) -> pd.DataFrame:
        """Read only the system turns that carry at least one error label.

        Args:
            path: Root directory that contains one sub-directory per entry of
                ``datalist``.
            datalist: Sub-directory names to scan for ``*.json`` files.

        Returns:
            DataFrame with columns ``did``, ``tid``, ``usr``, ``sys``, ``ec``
            (``ec`` holds the list of error labels) and a fresh 0..n-1 index.
        """
        return self._load_turns(path, datalist, keep_no_err=False, verbose=True)

    def read_json_with_NoErr(self, path: str, datalist: list) -> pd.DataFrame:
        """Read every system turn; turns without labels get ``["No-Err"]``.

        Arguments and returned columns are the same as for ``read_json``.
        """
        return self._load_turns(path, datalist, keep_no_err=True)

    def feature_extraction(self, df: pd.DataFrame) -> np.ndarray:
        """Return one ``[user_vec, system_vec]`` row per row of ``df``."""
        return np.array([
            np.concatenate([self.get_sentence_vec(u), self.get_sentence_vec(s)])
            for u, s in zip(df.usr, df.sys)
        ])

    def feature_extraction_context2(self, df: pd.DataFrame) -> np.ndarray:
        """Return ``[u, s, u_prev, s_prev]`` for every error-labeled row.

        ``u_prev`` / ``s_prev`` are the vectors of the preceding row of the
        same dialogue, or zeros for the first row of a dialogue. Rows whose
        first label is ``"No-Err"`` produce no feature but still become the
        "previous" context of the following row.

        NOTE(review): first-row features used to be laid out as
        ``[0, 0, u, s]``, which disagreed with every other row; they now use
        the same slot order. Confirm this is the intended convention.
        """
        features = []
        unset = object()  # cannot collide with any real dialogue id
        current_did = unset
        u_prev = s_prev = None
        for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
            u_vec = self.get_sentence_vec(u)
            s_vec = self.get_sentence_vec(s)
            if current_did is unset or current_did != d:
                current_did = d
                u_prev = np.zeros(self.emb_size)
                s_prev = np.zeros(self.emb_size)
            if e[0] != "No-Err":
                features.append(np.concatenate([u_vec, s_vec, u_prev, s_prev]))
            u_prev, s_prev = u_vec, s_vec
        return np.array(features)

    def extract_y(self, df: pd.DataFrame, error_types) -> np.ndarray:
        """Return a multi-hot label row for every error-labeled row of ``df``.

        Rows whose first label is ``"No-Err"`` are skipped. Column ``i`` is 1
        when ``error_types[i]`` is among the row's labels.
        """
        y = [
            [1.0 if err in ec else 0.0 for err in error_types]
            for ec in df.ec
            if ec[0] != "No-Err"
        ]
        if not y:
            return np.zeros((0, len(error_types)))
        return np.array(y)

    def make_error_dict(self, error_types):
        """Map each label to its position in ``error_types``."""
        # enumerate keeps indices unique even if a label is listed twice.
        return {label: index for index, label in enumerate(error_types)}

    def _pair_vector(self, u, s) -> np.ndarray:
        """Concatenate the raw (unstripped) vectors of a user/system pair."""
        # NOTE(review): unlike get_sentence_vec this does not strip symbols;
        # kept as in the original sequence features.
        return np.concatenate([self.nlp(u).vector, self.nlp(s).vector])

    @staticmethod
    def _multi_hot(labels, error_dict, size) -> np.ndarray:
        """Return a length-``size`` vector with 1 at each label's index."""
        row = np.zeros(size)
        for label in labels:
            row[error_dict[label]] = 1  # KeyError on a label not in error_types
        return row

    @staticmethod
    def _as_array(items) -> np.ndarray:
        """Stack equally shaped arrays, else return a 1-D object array."""
        if len({np.shape(item) for item in items}) <= 1:
            return np.array(items)
        # Ragged input: np.array(...) raises on recent NumPy versions.
        out = np.empty(len(items), dtype=object)
        for i, item in enumerate(items):
            out[i] = item
        return out

    def div_did_error(self, df: pd.DataFrame, error_types) -> tuple:
        """Group rows by dialogue into per-dialogue feature / label arrays.

        Returns:
            ``(X, y)`` with one entry per run of consecutive rows sharing a
            ``did``. ``X[i]`` has shape ``(turns, 2 * vector_size)`` and
            ``y[i]`` has shape ``(turns, len(error_types))``. When dialogues
            differ in length both are 1-D object arrays.
        """
        error_dict = self.make_error_dict(error_types)
        n_labels = len(error_types)
        X_data, y_data = [], []
        sequence, labels = [], []
        unset = object()
        current_did = unset
        for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
            if current_did is unset or current_did != d:
                if sequence:
                    X_data.append(np.array(sequence))
                    y_data.append(np.array(labels))
                sequence, labels = [], []
                current_did = d
            sequence.append(self._pair_vector(u, s))
            labels.append(self._multi_hot(e, error_dict, n_labels))
        if sequence:
            X_data.append(np.array(sequence))
            y_data.append(np.array(labels))
        return self._as_array(X_data), self._as_array(y_data)

    def extract_X_y(self, df: pd.DataFrame, error_types, device=None, context_size: int = 5) -> tuple:
        """Build one sample per error-labeled turn with its dialogue context.

        For every row whose first label is not ``"No-Err"`` the sample is the
        list of the last ``context_size`` ``[user_vec, system_vec]`` vectors
        of that dialogue, ending with the erroneous turn itself.

        Args:
            df: Frame with ``did``, ``usr``, ``sys`` and ``ec`` columns.
            error_types: Label names; position defines the label column.
            device: Torch device for the label tensor. Defaults to
                ``'cuda:0'`` when CUDA is available, otherwise ``'cpu'``.
            context_size: Maximum number of turns kept per sample.

        Returns:
            ``(X, y)``: ``X`` is a list of lists of 1-D arrays, ``y`` is a
            ``torch.long`` tensor of shape ``(len(X), len(error_types))``.

        Raises:
            ValueError: If ``context_size`` is smaller than 1.
        """
        if context_size < 1:
            # history[-0:] would silently return the whole dialogue.
            raise ValueError("context_size must be >= 1")
        if device is None:
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

        error_dict = self.make_error_dict(error_types)
        n_labels = len(error_types)
        X_data, y_rows = [], []
        history = []
        unset = object()
        current_did = unset
        for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
            if current_did is unset or current_did != d:
                current_did = d
                history = []
            history.append(self._pair_vector(u, s))
            if e[0] == "No-Err":
                continue
            X_data.append(history[-context_size:])
            y_rows.append(self._multi_hot(e, error_dict, n_labels))
        # Go through one ndarray: building a tensor from a list of arrays is
        # slow, and the reshape keeps a 2-D result when nothing was collected.
        y = np.asarray(y_rows, dtype=np.float64).reshape(len(y_rows), n_labels)
        return X_data, torch.tensor(y, device=device, dtype=torch.long)
        


In [3]:
def predict_at_least_oneClass(clf, X) -> np.ndarray:
    """Predict multi-label outputs, forcing at least one label per sample.

    Args:
        clf: Fitted multi-output classifier. ``clf.predict(X)`` must return an
            ``(n_samples, n_labels)`` indicator array and
            ``clf.predict_proba(X)`` a sequence with one
            ``(n_samples, n_classes)`` array per label.
        X: Feature matrix passed through to ``clf``.

    Returns:
        A copy of ``clf.predict(X)`` in which every all-zero row has the label
        with the highest positive-class probability set to 1.
    """
    y_pred = np.asarray(clf.predict(X))
    per_label = [np.asarray(p) for p in clf.predict_proba(X)]

    # The label count comes from predict_proba itself, so the function no
    # longer depends on a notebook-global `error_types`.
    positive = np.zeros((y_pred.shape[0], len(per_label)))
    for label_idx, proba in enumerate(per_label):
        # A single-column result has no positive-class column; its
        # probability stays 0.
        if proba.shape[1] != 1:
            positive[:, label_idx] = proba[:, 1]

    result = y_pred.copy()
    unlabeled = np.flatnonzero(result.sum(axis=1) == 0)
    if unlabeled.size:
        result[unlabeled, positive[unlabeled].argmax(axis=1)] = 1
    return result

In [4]:
class LSTMClassifier(nn.Module):
    """Sequence classifier: LSTM -> linear layer -> log-softmax.

    ``forward`` reshapes its input to ``(len(x), 1, -1)``, i.e. one sequence
    with a batch size of 1, and classifies it from the LSTM's final hidden
    state.
    """

    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        """Build the layers.

        Args:
            embedding_dim: Size of each input step vector.
            hidden_dim: Size of the LSTM hidden state.
            tagset_size: Number of output classes.
        """
        super().__init__()
        # Stored because forward() uses it to flatten the hidden state.
        self.hidden_dim = hidden_dim
        # Input steps are already vectors, so no embedding layer is used.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Projects the final hidden state onto the class scores.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # Log-probabilities over the class dimension (dim=1).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities of shape ``(1, tagset_size)`` for ``x``."""
        steps = x.view(len(x), 1, -1)
        _per_step_output, (final_hidden, _final_cell) = self.lstm(steps)
        scores = self.hidden2tag(final_hidden.view(-1, self.hidden_dim))
        return self.softmax(scores)

In [73]:
pre = preprocessor()

# Each LSTM input step is a user vector concatenated with a system vector
# (see preprocessor.extract_X_y), so the input size is twice the sentence
# vector size instead of a hard-coded 600.
EMBEDDING_DIM = 2 * pre.emb_size
HIDDEN_DIM = 600

path = './error_category_classification/dbdc5_ja_dev_labeled/'
datalist = ['DCM']

# Error labels; the position in this list is the label's column index.
# 'No-Err' is the label given to system turns without any error category.
error_types = [
    'Ignore question', 'Unclear intention',
    'Wrong information', 'Topic transition error',
    'Lack of information', 'Repetition',
    'Semantic error', 'Self-contradiction',
    'Contradiction', 'Grammatical error',
    'Ignore offer', 'Ignore proposal',
    'Lack of sociality', 'Lack of common sense',
    'Uninterpretable', 'Ignore greeting',
    'No-Err',
]

# All system turns, including error-free ones. Use pre.read_json(path, datalist)
# instead to load only the turns that carry an error label.
df = pre.read_json_with_NoErr(path, datalist)
print(df.shape)

300
(670, 5)


In [6]:
# Pick the device first and move the model onto it before the optimizer is
# created, so the optimizer is built from the parameters it will update.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, 2).to(device)
# NLLLoss pairs with the LogSoftmax output of LSTMClassifier.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [74]:
# One sample per error-labeled system turn: X_data holds the context windows,
# y_data the multi-hot error labels (one column per entry of error_types).
X_data, y_data = pre.extract_X_y(df, error_types)

In [76]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.20, random_state=5)

In [77]:
# Keep only label column 0 -- 'Ignore question', the first entry of
# error_types -- as a binary target for the 2-class model.
y_train_ = y_train[:, 0]
y_test_ = y_test[:, 0]
# Display the training targets.
y_train_

tensor([0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,

In [79]:
# Train one sample at a time and record the summed loss of every epoch.
losses = []
for epoch in range(1500):
    all_loss = 0
    for X_t, y_t in zip(X_train, y_train_):
        # Stack the window's turn vectors into one (turns, features) array
        # first; building a tensor from a list of arrays is slow. `device`
        # comes from the model-setup cell, so this also runs without CUDA.
        X_t_tensor = torch.as_tensor(np.stack(X_t), dtype=torch.float32, device=device)
        y_t_tensor = torch.as_tensor(y_t, dtype=torch.long, device=device).reshape(1)
        # The optimizer holds all model parameters, so a single zero_grad
        # call is enough.
        optimizer.zero_grad()
        score = model(X_t_tensor)
        loss_ = loss_function(score, y_t_tensor)
        loss_.backward()
        optimizer.step()
        all_loss += loss_.item()
    losses.append(all_loss)
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
print("done")

epoch 50 	 loss 2083.577615232178
epoch 100 	 loss 2036.9021016129323
epoch 150 	 loss 1693.3125752017486
epoch 200 	 loss 1877.6715104081877
epoch 250 	 loss 2061.6088132960635
epoch 300 	 loss 2283.664155610385
epoch 350 	 loss 1926.3038743625834
epoch 400 	 loss 2297.4894335748195
epoch 450 	 loss 1759.7459834949573
epoch 500 	 loss 2016.7090647841178
epoch 550 	 loss 1950.9299698612458
epoch 600 	 loss 2044.207030028284
epoch 650 	 loss 2172.046194552974
epoch 700 	 loss 1865.262272161953
epoch 750 	 loss 1681.8945366197627
epoch 800 	 loss 1946.7060477366686
epoch 850 	 loss 2024.0394287732865
epoch 900 	 loss 2206.128927666453
epoch 950 	 loss 2148.0223680699164
epoch 1000 	 loss 2152.413977678666
epoch 1050 	 loss 2148.1493997904436
epoch 1100 	 loss 2183.3519965110727
epoch 1150 	 loss 2104.244491664486
epoch 1200 	 loss 1718.215649185662
epoch 1250 	 loss 2043.7107574853198
epoch 1300 	 loss 1745.0383913763249
epoch 1350 	 loss 1737.0150554496095
epoch 1400 	 loss 2149.1884760

In [8]:
# Number of samples (context windows) currently held in X_data.
len(X_data)

6

In [50]:
# Shape of the first turn vector in the first sample's context window.
X_data[0][0].shape

(600,)

[101.25090551376343,
 41.07545018196103,
 15.189394047018084,
 10.21215343437143,
 4.580461127799907,
 0.32040360709549986,
 9.82208001560138,
 0.05771276634180822,
 0.49403877534132334,
 0.05411070529589779,
 0.0573089870149488,
 0.10201472786684462,
 0.011147599252126383,
 0.00490452447820644,
 0.002775311950244941,
 0.0019663393031805754,
 0.001685113791609183,
 0.0010825724493770394,
 0.0007133898452593712,
 0.0006413141563825775,
 0.0005916316404181998,
 0.0005594628164544702,
 0.0005364675635064486,
 0.0005193101678742096,
 0.0005060845141997561,
 0.0004954800606356002,
 0.0004865435548708774,
 0.0004787984653376043,
 0.0004718873678939417,
 0.00046509514868375845,
 0.00045854107884224504,
 0.0004515099717536941,
 0.0004434058973856736,
 0.0004320832958910614,
 0.00041611161577748135,
 0.000398113377741538,
 0.00038404881706810556,
 0.00037403714668471366,
 0.0003661710652522743,
 0.00035997376835439354,
 0.000354610885551665,
 0.00034996308750123717,
 0.00034567284819786437,
 0.