In [5]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
from numpy.lib.function_base import select
import spacy
import torch
import re

# from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaConfig
# from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

# from pyknp import Juman
# from sentence_transformers import SentenceTransformer

import pickle

In [6]:
class preprocessor:

    def __init__(self) -> None:
        self.nlp = spacy.load('ja_ginza')
        # self.model_path = "/home/yamada/Downloads/training_bert_japanese"
        # self.sen_model = SentenceTransformer(self.model_path, show_progress_bar=False)

        # 半角全角英数字
        # self.DELETE_PATTERN_1 = re.compile(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+')
        # 記号
        # self.DELETE_PATTERN_2 = re.compile(
        #     r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+')
        
        self.emb_size = self.get_sentence_vec("emb").shape[0]
        print(self.emb_size)

    def get_sentence_vec(self, sen) -> np.array:
        # sen_ = self.DELETE_PATTERN_1.sub(sen)
        # sen_ = self.DELETE_PATTERN_2.sub("", sen)
        sentence_vec = self.nlp(sen).vector
        # sentence_vec = self.sen_model.encode(sen)[0]
        return sentence_vec
    
    def read_json_with_NoErr(self, path:str, datalist:list) -> pd.DataFrame:
        cols = ['did', 'tid', 'usr', 'sys', 'ec']
        df = pd.DataFrame(index=[], columns=cols)

        for p in datalist:
            datapath = Path(path + p + '/')
            for file in datapath.glob("*.json"):
                # print(file)
                with open(file, "r") as f:
                    json_data = json.load(f)
                    did = json_data["dialogue-id"]
                    for t in json_data["turns"]:
                        if t["turn-index"] == 0:
                            continue
                        if t["speaker"] == "U":
                            usr = t["utterance"]
                            continue
                        if t["speaker"] == "S" :
                            tid = t["turn-index"]
                            sys = t["utterance"]
                            if t["error_category"]:
                                ec = t["error_category"]
                            else:
                                ec = ["No-Err"]
                            df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
        df.reset_index(inplace=True, drop=True)
        return df
    
    def make_error_dict(self, error_types):
        error_dict = {}
        for e in error_types:
            error_dict[e] = len(error_dict)
        return error_dict
    
    def extract_X_y(self, df:pd.DataFrame, error_types, prev_num) -> np.array:
        # nlp = spacy.load('ja_ginza')
        
        did = df.did[0]
        n = prev_num
        # print(did)
        # 全体
        X_data = []
        y_data = []
        # 各 did 
        sequence_did = []
        y_did = []
        # エラーの辞書定義
        error_dict = self.make_error_dict(error_types)

        # 初期の調整 padding
        for i in range(n-1):
            sequence_did.append(
                np.concatenate( [np.zeros(self.emb_size), np.zeros(self.emb_size)])
            )

        # didごとに返却する？
        # エラーが発生したら、開始からエラーまでの文脈を入力とする(N=5の固定長でも可能)
        # 先にこのベクトル列を作成し，Tensorに変換して， List に保持
        for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
            if did != d:
                did = d
                sequence_did = []
                y_did = []
                # 初期の調整 padding
                for i in range(n-1):
                    sequence_did.append(
                            np.concatenate( [np.zeros(self.emb_size), np.zeros(self.emb_size)])
                        )
                # break

            # sequence_did.append([u, s])
            sequence_did.append(
                    np.concatenate(
                        [self.get_sentence_vec(u), self.get_sentence_vec(s)]
                    )
                # [u, s]
            )
            if e[0] == "No-Err":
                continue
            else:
                y_each_error_label = np.zeros(len(error_types))
                for e_ in e:
                    y_each_error_label[error_dict[e_]] = 1
                X_data.append(sequence_did[-n:])
                # y_did = np.array(y_each_error_label)
                y_data.append(y_each_error_label)
        return np.array(X_data), np.array(y_data)
    
    # 特定のエラーを取得
    def particular_error_usr(self, df:pd.DataFrame, error_set, prev_num) -> np.array:
        sequence_did = []
        n = prev_num
        X_data = []
        y_data = [] 
        for d, u, s, ec in zip(df.did, df.usr, df.sys, df.ec):
            for e in ec:
                if e in error_set:
                    X_data.append(self.get_sentence_vec(u))
                    y_data.append(1)
        # エラーの回数
        error_num = len(X_data)
        # df_sample = df.sample(n=errror_num+50)
        i = 0
        print("error num:{0}".format(error_num))
        while True:
            df_ = df.sample()
            u = str(df_.usr).split()[1]
            if "？" in u or "?" in u:
                continue    
            for e in ec:
                if e in error_set:
                    continue
            
            X_data.append(self.get_sentence_vec(u))
            y_data.append(0)
            # print(u)
            i += 1
            if i % 50 == 0:
                print("i is {0}".format(i))
            if i == error_num:
                break
        
        return np.array(X_data),np.array(y_data)
        


In [7]:
class Datasets(torch.utils.data.Dataset):
    def __init__(self, X_data, y_data):
        # self.transform = transform

        self.X_data = X_data
        self.y_data = y_data

        self.datanum = len(X_data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        out_X = self.X_data[idx]
        out_y = self.y_data[idx]

        return out_X, out_y

In [8]:
class DataManager:
    def __init__(self, data_path) -> None:
        import os
        import pickle
        self.data_path = data_path
        os.makedirs(data_path, exist_ok=True)
        self.dir = os.listdir(data_path)

    def is_exist(self, name):
        return (name in self.dir)
    
    def save_data(self, name, obj):
        with open(self.data_path+name, "wb") as f:
            pickle.dump(obj, f)
        print("success save : {0}{1}".format(self.data_path, name))

    def load_data(self, name):
        with open(self.data_path+name, "rb") as f:
            obj = pickle.load(f)
        print("success load : {0}{1}".format(self.data_path, name))
        return obj

In [9]:
pre = preprocessor()

300


In [10]:
path = './error_category_classification/dbdc5_ja_dev_labeled/'
datalist = ['DCM', 'DIT', 'IRS']
# datalist = ['DCM']
    # List of error types
error_types = ['Unclear intention', 'Wrong information',
    'Ignore question', 'Topic transition error', 
    'Lack of information', 'Repetition', 
    'Contradiction', 'Self-contradiction',
    'Lack of common sense', 'Semantic error',
    'Grammatical error', 'Ignore proposal', 
    'Ignore offer', 'Lack of sociality', 
    'Uninterpretable', 'Ignore greeting', 
    'No-Err']
df = pre.read_json_with_NoErr(path, datalist)
print(df.shape)


(2000, 5)


In [11]:
error_set = set(["Ignore question"])
error_set

{'Ignore question'}

In [12]:
# 質問しているかを判定する識別機
class Classifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, tagset_size, batch_size):
        super(Classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.f1 = nn.Linear(embedding_dim, hidden_dim)
        self.f2 = nn.Linear(hidden_dim, tagset_size)
    
    def forward(self, x):
        y = self.f1(x)
        y = torch.relu(y)
        y = self.f2(y)
        y = F.log_softmax(y)
        return y

In [13]:
EMBEDDING_DIM = pre.emb_size
HIDDEN_DIM = pre.emb_size*2
OUTPUT_DIM = 2
mode = "ginza"


In [16]:
particular_name = "particular"
particular_error= "ig_question"
model_path = "./models/{0}/".format(particular_name)
model_name = "model_{0}_{1}.pickle".format(particular_error, mode)
modelM = DataManager(model_path)
model_name
torch.cuda.is_available()

False

In [17]:
if modelM.is_exist(model_name):
    model = modelM.load_data(model_name)

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
X_data, y_data = pre.particular_error_usr(df, error_set, 1)


error num:305
i is 50
i is 100
i is 150
i is 200
i is 250
i is 300


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.30, random_state=5)

In [None]:
leng = len(y_train)
print(leng)
for i, v in enumerate(y_train):
    if leng %(i+1) == 0:
        print(i+1, end=", ")

427
1, 7, 61, 427, 

In [None]:
batch_size = 61
# epoch = 300

In [None]:
trainset = Datasets(X_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = batch_size, shuffle = True, num_workers = 2)

In [None]:
model = Classifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, batch_size)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
if torch.cuda.is_available():
   model.cuda()

In [None]:
losses = []

# print("error[{0}]".format(error_types[error_i]))
for epoch in range(1000):  # again, normally you would NOT do 300 epochs, it is toy data
    all_loss = 0
    for data in trainloader:
        X_t_tensor = torch.tensor(data[0], device='cuda:0').float()
        y_t_tensor = torch.tensor(data[1], device='cuda:0').long()
        optimizer.zero_grad()
        model.zero_grad()
        # print(X_t_tensor.shape)

        score = model(X_t_tensor)
        loss_ = loss_function(score, y_t_tensor)
        loss_.backward()
        all_loss += loss_.item()
        optimizer.step()
        del score
        del loss_
    losses.append(all_loss)
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
    # if all_loss <= loss_border:
    #     print("loss was under border(={0}) : train end".format(loss_border))
    #     break
print("done")



epoch 50 	 loss 1.5156225711107254
epoch 100 	 loss 0.8507134988903999
epoch 150 	 loss 0.5924378745257854
epoch 200 	 loss 0.43736477941274643
epoch 250 	 loss 0.3377938885241747
epoch 300 	 loss 0.2623869962990284
epoch 350 	 loss 0.2042178250849247
epoch 400 	 loss 0.1626150319352746
epoch 450 	 loss 0.12914054188877344
epoch 500 	 loss 0.1023526107892394
epoch 550 	 loss 0.0822045598179102
epoch 600 	 loss 0.0653474589344114
epoch 650 	 loss 0.05322202853858471
epoch 700 	 loss 0.0419496838003397
epoch 750 	 loss 0.034301294945180416
epoch 800 	 loss 0.027080659056082368
epoch 850 	 loss 0.022061792318709195
epoch 900 	 loss 0.017942906357347965
epoch 950 	 loss 0.014610492973588407
epoch 1000 	 loss 0.011934398906305432
done


In [None]:
with torch.no_grad():
    X_tensor = torch.tensor(X_test, device='cuda:0').float()
    y_tensor = torch.tensor(y_test, dtype=torch.long, device='cuda:0')
            # 推論
    y_pred = np.array(model(X_tensor).cpu()).argmax(axis=1)
    



In [None]:
metrics.accuracy_score(y_test, y_pred)

0.912568306010929

In [None]:
# query = ["銀閣寺は何故銀色ではないのですか？", "それは違うと思います"]
query = "ほんとうですか？"
vec = pre.get_sentence_vec(query)
vec_ = torch.tensor(vec, device='cuda:0').float()
score = model(vec_)
score_np = score.cpu().detach().numpy()
score_np.argmax()




1

In [None]:
y_test

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0])

In [None]:
y_pred

array([1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0])

In [None]:
if not modelM.is_exist(model_name):
    modelM.save_data(model_name, model)

success save : ./models/particular/model_ig_question_ginza.pickle


In [None]:
# 質問扱いのものがいくつ存在するか

user_list = []

for u in df.usr:
    user_list.append(pre.get_sentence_vec(u))
print(len(user_list))
# 推論
with torch.no_grad():
    X_tensor = torch.tensor(user_list, device='cuda:0').float()
    y_pred = np.array(model(X_tensor).cpu()).argmax(axis=1)


2000




In [None]:
print(len(np.nonzero(y_pred>0)[0]))
num = 0
error_part = 0
for d, u, s, ec, is_q in zip(df.did, df.usr, df.sys, df.ec, y_pred):
    if is_q and ("？" in u or "?" in u):
        # print(u)
        num += 1
        is_error = False
        for e in ec:
            if e in error_set:
                error_part += 1
                is_error = True
        if not is_error:
            print(u, s)
print("ある程度質問形式とみられる数", num)
print("質問と見られる中で．", error_part)

NameError: name 'np' is not defined

(2000,)