In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
from numpy.lib.function_base import select
import spacy
import torch
import re

# from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaConfig
# from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

# from pyknp import Juman
# from sentence_transformers import SentenceTransformer

import pickle
torch.cuda.is_available()



True

In [2]:
class preprocessor:

    def __init__(self) -> None:
        self.nlp = spacy.load('ja_ginza')
        # self.model_path = "/home/yamada/Downloads/training_bert_japanese"
        # self.sen_model = SentenceTransformer(self.model_path, show_progress_bar=False)

        # 半角全角英数字
        # self.DELETE_PATTERN_1 = re.compile(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+')
        # 記号
        # self.DELETE_PATTERN_2 = re.compile(
        #     r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+')
        
        self.emb_size = self.get_sentence_vec("emb").shape[0]
        print(self.emb_size)

    def get_sentence_vec(self, sen) -> np.array:
        # sen_ = self.DELETE_PATTERN_1.sub(sen)
        # sen_ = self.DELETE_PATTERN_2.sub("", sen)
        sentence_vec = self.nlp(sen).vector
        # sentence_vec = self.sen_model.encode(sen)[0]
        return sentence_vec
    
    def read_json_with_NoErr(self, path:str, datalist:list) -> pd.DataFrame:
        cols = ['did', 'tid', 'usr', 'sys', 'ec']
        df = pd.DataFrame(index=[], columns=cols)

        for p in datalist:
            datapath = Path(path + p + '/')
            for file in datapath.glob("*.json"):
                # print(file)
                with open(file, "r") as f:
                    json_data = json.load(f)
                    did = json_data["dialogue-id"]
                    for t in json_data["turns"]:
                        if t["turn-index"] == 0:
                            continue
                        if t["speaker"] == "U":
                            usr = t["utterance"]
                            continue
                        if t["speaker"] == "S" :
                            tid = t["turn-index"]
                            sys = t["utterance"]
                            if t["error_category"]:
                                ec = t["error_category"]
                            else:
                                ec = ["No-Err"]
                            df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
        df.reset_index(inplace=True, drop=True)
        return df
    
    def make_error_dict(self, error_types):
        error_dict = {}
        for e in error_types:
            error_dict[e] = len(error_dict)
        return error_dict
    
    def extract_X_y(self, df:pd.DataFrame, error_types, prev_num) -> np.array:
        # nlp = spacy.load('ja_ginza')
        
        did = df.did[0]
        n = prev_num
        # print(did)
        # 全体
        X_data = []
        y_data = []
        # 各 did 
        sequence_did = []
        y_did = []
        # エラーの辞書定義
        error_dict = self.make_error_dict(error_types)

        # 初期の調整 padding
        for i in range(n-1):
            sequence_did.append(
                np.concatenate( [np.zeros(self.emb_size), np.zeros(self.emb_size)])
            )

        # didごとに返却する？
        # エラーが発生したら、開始からエラーまでの文脈を入力とする(N=5の固定長でも可能)
        # 先にこのベクトル列を作成し，Tensorに変換して， List に保持
        for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
            if did != d:
                did = d
                sequence_did = []
                y_did = []
                # 初期の調整 padding
                for i in range(n-1):
                    sequence_did.append(
                            np.concatenate( [np.zeros(self.emb_size), np.zeros(self.emb_size)])
                        )
                # break

            # sequence_did.append([u, s])
            sequence_did.append(
                    np.concatenate(
                        [self.get_sentence_vec(u), self.get_sentence_vec(s)]
                    )
                # [u, s]
            )
            if e[0] == "No-Err":
                continue
            else:
                y_each_error_label = np.zeros(len(error_types))
                for e_ in e:
                    y_each_error_label[error_dict[e_]] = 1
                X_data.append(sequence_did[-n:])
                # y_did = np.array(y_each_error_label)
                y_data.append(y_each_error_label)
        return np.array(X_data), np.array(y_data)
    
    # 特定のエラーを取得
    def particular_error_usr(self, df:pd.DataFrame, error_set, prev_num) -> np.array:
        sequence_did = []
        n = prev_num
        X_data = []
        y_data = [] 
        for d, u, s, ec in zip(df.did, df.usr, df.sys, df.ec):
            for e in ec:
                if e in error_set:
                    X_data.append(self.get_sentence_vec(u))
                    y_data.append(1)
        # エラーの回数
        error_num = len(X_data)
        # df_sample = df.sample(n=errror_num+50)
        i = 0
        print("error num:{0}".format(error_num))
        while True:
            df_ = df.sample()
            u = str(df_.usr).split()[1]
            if "？" in u or "?" in u:
                continue    
            for e in ec:
                if e in error_set:
                    continue
            
            X_data.append(self.get_sentence_vec(u))
            y_data.append(0)
            # print(u)
            i += 1
            if i % 50 == 0:
                print("i is {0}".format(i))
            if i == error_num:
                break
        
        return np.array(X_data),np.array(y_data)
        


In [3]:
class Datasets(torch.utils.data.Dataset):
    def __init__(self, X_data, y_data):
        # self.transform = transform

        self.X_data = X_data
        self.y_data = y_data

        self.datanum = len(X_data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        out_X = self.X_data[idx]
        out_y = self.y_data[idx]

        return out_X, out_y

In [4]:
class DataManager:
    def __init__(self, data_path) -> None:
        import os
        import pickle
        self.data_path = data_path
        os.makedirs(data_path, exist_ok=True)
        self.dir = os.listdir(data_path)

    def is_exist(self, name):
        return (name in self.dir)
    
    def save_data(self, name, obj):
        with open(self.data_path+name, "wb") as f:
            pickle.dump(obj, f)
        print("success save : {0}{1}".format(self.data_path, name))

    def load_data(self, name):
        with open(self.data_path+name, "rb") as f:
            obj = pickle.load(f)
        print("success load : {0}{1}".format(self.data_path, name))
        return obj

In [5]:
pre = preprocessor()

300


In [6]:
path = './error_category_classification/dbdc5_ja_dev_labeled/'
datalist = ['DCM', 'DIT', 'IRS']
# datalist = ['DCM']
    # List of error types
error_types = ['Unclear intention', 'Wrong information',
    'Ignore question', 'Topic transition error', 
    'Lack of information', 'Repetition', 
    'Contradiction', 'Self-contradiction',
    'Lack of common sense', 'Semantic error',
    'Grammatical error', 'Ignore proposal', 
    'Ignore offer', 'Lack of sociality', 
    'Uninterpretable', 'Ignore greeting', 
    'No-Err']
df = pre.read_json_with_NoErr(path, datalist)
print(df.shape)


(2000, 5)


In [7]:
error_set = set(["Ignore question"])
error_set

{'Ignore question'}

In [16]:
# 質問しているかを判定する識別機
class Classifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, tagset_size, batch_size):
        super(Classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.f1 = nn.Linear(embedding_dim, hidden_dim)
        self.f2 = nn.Linear(hidden_dim, tagset_size)
    
    def forward(self, x):
        y = self.f1(x)
        y = torch.relu(y)
        y = self.f2(y)
        y = F.log_softmax(y)
        return y

In [None]:
# 

In [8]:
EMBEDDING_DIM = pre.emb_size
HIDDEN_DIM = pre.emb_size*2
OUTPUT_DIM = 2
mode = "ginza"


In [9]:
particular_name = "particular"
particular_error= "ig_question"
model_path = "./models/{0}/".format(particular_name)
model_name = "model_{0}_{1}.pickle".format(particular_error, mode)
modelM = DataManager(model_path)
model_name
torch.cuda.is_available()

True

In [17]:
if modelM.is_exist(model_name):
    model = modelM.load_data(model_name)

success load : ./models/particular/model_ig_question_ginza.pickle


In [12]:
X_data, y_data = pre.particular_error_usr(df, error_set, 1)


error num:305
i is 50
i is 100
i is 150
i is 200
i is 250
i is 300


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.30, random_state=5)

In [None]:
leng = len(y_train)
print(leng)
for i, v in enumerate(y_train):
    if leng %(i+1) == 0:
        print(i+1, end=", ")

427
1, 7, 61, 427, 

In [122]:
batch_size = 61
# epoch = 300

In [123]:
trainset = Datasets(X_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = batch_size, shuffle = True, num_workers = 2)

In [124]:
model = Classifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, batch_size)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
if torch.cuda.is_available():
   model.cuda()

In [125]:
losses = []


# print("error[{0}]".format(error_types[error_i]))
for epoch in range(1000):  # again, normally you would NOT do 300 epochs, it is toy data
    all_loss = 0
    for data in trainloader:
        X_t_tensor = torch.tensor(data[0], device='cuda:0').float()
        y_t_tensor = torch.tensor(data[1], device='cuda:0').long()
        optimizer.zero_grad()
        model.zero_grad()
        # print(X_t_tensor.shape)

        score = model(X_t_tensor)
        loss_ = loss_function(score, y_t_tensor)
        print(loss_.shape)
        loss_.backward()
        all_loss += loss_.item()
        optimizer.step()
        del score
        del loss_
    losses.append(all_loss)
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
    # if all_loss <= loss_border:
    #     print("loss was under border(={0}) : train end".format(loss_border))
    #     break
print("done")



torch.Size([])


In [18]:
with torch.no_grad():
    X_tensor = torch.tensor(X_test, device='cuda:0').float()
    y_tensor = torch.tensor(y_test, dtype=torch.long, device='cuda:0')
            # 推論
    y_pred = np.array(model(X_tensor).cpu()).argmax(axis=1)
    



In [19]:
metrics.accuracy_score(y_test, y_pred)

0.912568306010929

In [None]:
if not modelM.is_exist(model_name):
    modelM.save_data(model_name, model)

success save : ./models/particular/model_ig_question_ginza.pickle


In [35]:
# 質問扱いのものがいくつ存在するか

user_list = []

for u in df.usr:
    user_list.append(pre.get_sentence_vec(u))
print(len(user_list))
# 推論
with torch.no_grad():
    X_tensor = torch.tensor(user_list, device='cuda:0').float()
    y_pred = np.array(model(X_tensor).cpu()).argmax(axis=1)


2000




In [None]:
print(len(np.nonzero(y_pred>0)[0]))
num = 0
error_part = 0
for d, u, s, ec, is_q in zip(df.did, df.usr, df.sys, df.ec, y_pred):
    if is_q and ("？" in u or "?" in u):
        # print(u)
        num += 1
        is_error = False
        for e in ec:
            if e in error_set:
                error_part += 1
                is_error = True
        if not is_error:
            print(u, s)
print("ある程度質問形式とみられる数", num)
print("質問と見られる中で．", error_part)

NameError: name 'np' is not defined

In [None]:
if modelM.is_exist(model_name):
    model = modelM.load_data(model_name)

In [20]:
def get_base_feature(df:pd.DataFrame, error_set, is_act):
    X_pos_data = []
    y_pos_neg = []

    # positive
    for d, u, s, ec, is_a in zip(df.did, df.usr, df.sys, df.ec, is_act):
        if is_a and ("？" in u or "?" in u):

            if ec[0] == "No-Err":
                neg = str(df.sample().usr).split()[1]
                X_pos_data.append(pre.get_sentence_vec(u))
                y_pos_neg.append(
                    [pre.get_sentence_vec(s), pre.get_sentence_vec(neg)]
                )
    return np.array(X_pos_data), np.array(y_pos_neg)

In [75]:
# 教師なし学習における，新たな損失関数
def MarginLoss(f_pos, f_neg, eta=0.5):
    return torch.max(torch.tensor(0, device='cuda:0').float() ,  torch.sum(eta - f_pos + f_neg) )

In [79]:
def MarginLoss_(f_pos, f_neg, eta=0.5):
    return torch.sum(eta - f_pos + f_neg)

In [80]:
poss = torch.tensor([0.5, 0.6])
negg = torch.tensor([-0.9, -0.8])
MarginLoss_(poss, negg)

tensor(-1.8000)

In [97]:
# LCD モデルの学習
class LCD(nn.Module):

    def __init__(self, feature_dim, hidden_dim, target_dim):
        super(LCD, self).__init__()
        self.input_dim = feature_dim
        self.hidden_dim = hidden_dim
        self.target_dim = target_dim

        self.f1 = nn.Linear(feature_dim, hidden_dim)
        self.f2 = nn.Linear(hidden_dim, target_dim)
    
    def forward(self, x):
        y = self.f1(x)
        y = torch.relu(y)
        y = self.f2(y)
        y = torch.tanh(y)
        return y
    

In [34]:
FEATURE_DIM = pre.emb_size*4
HIDDEN_DIM = FEATURE_DIM*2
OUTPUT_DIM = 1
eta = 2

(183,)

In [24]:
lcd_model_name = "model_{2}_{0}_{1}.pickle".format(particular_error, mode, "LCD")
lcd_model_name
lcd_modelM = DataManager(model_path)

In [98]:
lcd_model = LCD(FEATURE_DIM, HIDDEN_DIM, OUTPUT_DIM)
lcd_opt = optim.Adam(lcd_model.parameters(), lr=0.01)
if torch.cuda.is_available():
    print("cuda available")
    lcd_model.cuda()


cuda available


In [36]:
X_pos_data, y_pos_neg = get_base_feature(df, error_set, y_pred)
print("len :",  len(X_pos_data))

len : 112


In [27]:
y_pos_neg[:, 0, :].shape

(2, 300)

In [37]:
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(X_pos_data, y_pos_neg, test_size=0.30, random_state=5)

In [38]:
leng = len(X_train_l)
print(leng)
for i  in range(leng):
    if leng %(i+1) == 0:
        print(i+1, end=", ")

78
1, 2, 3, 6, 13, 26, 39, 78, 

In [57]:
lcd_batch_size = 26
trainset_l = Datasets(X_train_l, y_train_l)
trainloader_l = torch.utils.data.DataLoader(trainset_l, batch_size = lcd_batch_size, shuffle = True, num_workers = 2)

In [63]:
def make_LCD_feature(S, T):
    # print(type(S))
    if isinstance(S, torch.Tensor) or isinstance(S, np.ndarray) or isinstance(S, list):
        feature = []
        # print(len(S))
        for s, t in zip(S, T):
            feature.append(np.concatenate([s, t, np.abs(s-t), s*t]))
        return np.array(feature)
    else:
        return np.concatenate([S, T, np.abs(S-T), S*T])


In [99]:
losses = []
for epoch in range(1000):  # again, normally you would NOT do 300 epochs, it is toy data
    all_loss = 0
    for data in trainloader_l:
        s = data[0]
        s_pos = data[1][:, 0, :]
        s_neg = data[1][:, 1, :]
        # print(s.shape)
        X_pos = torch.tensor(make_LCD_feature(s, s_pos) , device='cuda:0').float()
        X_neg = torch.tensor(make_LCD_feature(s, s_neg) , device='cuda:0').float()
        # print(X_pos.shape)
        lcd_opt.zero_grad()
        lcd_model.zero_grad()
        
        
        f_pos_score = lcd_model(X_pos)
        f_neg_score = lcd_model(X_neg)
        
        loss_ = MarginLoss(f_pos_score, f_neg_score, 1)
        print(f_pos_score[:5], f_neg_score[:5])
        # print(loss_)
        loss_.backward()
        all_loss += loss_.item()
        del f_pos_score
        del f_neg_score
        del loss_
        lcd_opt.step()
    losses.append(all_loss)
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
    # break
    # if all_loss <= loss_border:
    #     print("loss was under border(={0}) : train end".format(loss_border))
    #     break
print("done")

tensor([[0.0078],
        [0.0167],
        [0.0003],
        [0.0141],
        [0.0117]], device='cuda:0', grad_fn=<SliceBackward>) tensor([[0.0145],
        [0.0037],
        [0.0046],
        [0.0089],
        [0.0103]], device='cuda:0', grad_fn=<SliceBackward>)
tensor([[0.2069],
        [0.3927],
        [0.1245],
        [0.1622],
        [0.2133]], device='cuda:0', grad_fn=<SliceBackward>) tensor([[0.0565],
        [0.2762],
        [0.1995],
        [0.1348],
        [0.0326]], device='cuda:0', grad_fn=<SliceBackward>)
tensor([[-0.9381],
        [-0.8346],
        [-0.7089],
        [-0.5575],
        [-0.3835]], device='cuda:0', grad_fn=<SliceBackward>) tensor([[-0.8143],
        [-0.6688],
        [-0.8569],
        [-0.8340],
        [-0.9883]], device='cuda:0', grad_fn=<SliceBackward>)
tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000]], device='cuda:0', grad_fn=<SliceBackward>) tensor([[1.0000],
        [1.0000],
        [1.0000],
     

KeyboardInterrupt: 