In [2]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

import collections


import spacy

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F



In [3]:
path = './error_category_classification/dbdc5_ja_dev_labeled/'
# Names of the dialogue systems
datalist = ['DCM', 'DIT', 'IRS']
# datalist = ['DCM']
# List of error types
error_types = ['Ignore question', 'Unclear intention', 'Wrong information', 'Topic transition error', 'Lack of information', 
'Repetition', 'Semantic error', 'Self-contradiction', 'Contradiction', 'Grammatical error', 'Ignore offer', 
'Ignore proposal', 'Lack of sociality', 'Lack of common sense', 'Uninterpretable', 'Ignore greeting', "No-Err"]

In [4]:
def read_json(path:str, datalist:list) -> pd.DataFrame:
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    df = pd.DataFrame(index=[], columns=cols)

    for p in datalist:
        datapath = Path(path + p + '/')
        for file in datapath.glob("*.json"):
            with open(file, "r") as f:
                json_data = json.load(f)
                did = json_data["dialogue-id"]
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                        continue
                    if t["speaker"] == "S" and t["error_category"] != None:
                        tid = t["turn-index"]
                        sys = t["utterance"]
                        ec = t["error_category"]
                        df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [5]:
def read_json_with_NoErr(path:str, datalist:list) -> pd.DataFrame:
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    df = pd.DataFrame(index=[], columns=cols)

    for p in datalist:
        datapath = Path(path + p + '/')
        for file in datapath.glob("*.json"):
            with open(file, "r") as f:
                json_data = json.load(f)
                did = json_data["dialogue-id"]
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                        continue
                    if t["speaker"] == "S" :
                        tid = t["turn-index"]
                        sys = t["utterance"]
                        if t["error_category"]:
                            ec = t["error_category"]
                        else:
                            ec = ["No-Err"]
                        df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [6]:
df_all = read_json_with_NoErr(path, datalist)
df_all

Unnamed: 0,did,tid,usr,sys,ec
0,1470646494,2,そうですね。最近とても暑いですから。,海に行かないんですね？,[Contradiction]
1,1470646494,4,休日に行きたいと思います。,海は大丈夫ですね,[Lack of information]
2,1470646494,6,はい。あなたは海に行きますか？,海は危険です,"[Ignore question, Self-contradiction]"
3,1470646494,8,何故ですか？,海はいけないです,[Ignore question]
4,1470646494,10,そうですか。山に行くのはどうでしょうか？,わあ！いいですね！,[No-Err]
...,...,...,...,...,...
1995,1503290726,12,何を別売りしているの？,地下鉄で。名大前ができてすごく便利になりましたね、ここの大学。,[Repetition]
1996,1503290726,14,今は2回押してないよ,紅葉も遅いみたいですしね。私は、いつもベーッシクな色で変わりばえしないのですが。,[Topic transition error]
1997,1503290726,16,ベーシックだよね,何処にあるお店ですか？ベースコートもトップコートも塗ると持ちが違いますよ。,[Topic transition error]
1998,1503290726,18,君は女性なんだね,中間ですか？,[Lack of information]


In [7]:
df = read_json(path, datalist)
df

Unnamed: 0,did,tid,usr,sys,ec
0,1470646494,2,そうですね。最近とても暑いですから。,海に行かないんですね？,[Contradiction]
1,1470646494,4,休日に行きたいと思います。,海は大丈夫ですね,[Lack of information]
2,1470646494,6,はい。あなたは海に行きますか？,海は危険です,"[Ignore question, Self-contradiction]"
3,1470646494,8,何故ですか？,海はいけないです,[Ignore question]
4,1470646494,12,山はお好きなのですか？,わあ！いいですね！,"[Ignore question, Repetition]"
...,...,...,...,...,...
1343,1503290726,12,何を別売りしているの？,地下鉄で。名大前ができてすごく便利になりましたね、ここの大学。,[Repetition]
1344,1503290726,14,今は2回押してないよ,紅葉も遅いみたいですしね。私は、いつもベーッシクな色で変わりばえしないのですが。,[Topic transition error]
1345,1503290726,16,ベーシックだよね,何処にあるお店ですか？ベースコートもトップコートも塗ると持ちが違いますよ。,[Topic transition error]
1346,1503290726,18,君は女性なんだね,中間ですか？,[Lack of information]


In [8]:
y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df.ec])

# Display data statistics
print('Number of breakdowns: ', y.shape[0])
print('-- Frequency of labels --')
error_times_dict = {}
for e,c in zip(error_types, sum(y)):
  error_times_dict[e] = c
error_freq = sorted(error_times_dict.items(), key=lambda x:x[1], reverse=True)
for tup in error_freq:
  print("{0}\t\t\t{1}".format(tup[0], tup[1]))
error_types_2 = [ ef[0] for ef in error_freq]
print(error_types_2)

print('-- Frequency of sets of labels (sorted) --')
df_all['ec'].apply(tuple).value_counts()

Number of breakdowns:  1348
-- Frequency of labels --
Unclear intention			474
Wrong information			376
Ignore question			305
Topic transition error			192
Lack of information			54
Repetition			48
Contradiction			18
Self-contradiction			12
Lack of common sense			7
Semantic error			6
Grammatical error			4
Ignore proposal			3
Ignore offer			1
Lack of sociality			1
Uninterpretable			0
Ignore greeting			0
No-Err			0
['Unclear intention', 'Wrong information', 'Ignore question', 'Topic transition error', 'Lack of information', 'Repetition', 'Contradiction', 'Self-contradiction', 'Lack of common sense', 'Semantic error', 'Grammatical error', 'Ignore proposal', 'Ignore offer', 'Lack of sociality', 'Uninterpretable', 'Ignore greeting', 'No-Err']
-- Frequency of sets of labels (sorted) --


(No-Err,)                                      652
(Unclear intention,)                           389
(Wrong information,)                           376
(Ignore question,)                             158
(Topic transition error,)                      141
(Ignore question, Unclear intention)            80
(Lack of information,)                          46
(Ignore question, Topic transition error)       46
(Repetition,)                                   36
(Contradiction,)                                18
(Ignore question, Repetition)                   11
(Self-contradiction,)                           10
(Lack of information, Ignore question)           8
(Semantic error,)                                6
(Lack of common sense,)                          6
(Topic transition error, Unclear intention)      5
(Grammatical error,)                             4
(Ignore proposal,)                               3
(Ignore question, Self-contradiction)            2
(Lack of sociality,)           

In [9]:
def feature_extraction(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')

  # Make feature vector
    return np.array([np.concatenate([nlp(u).vector, nlp(s).vector]) for u,s in zip(df.usr, df.sys)])

In [10]:
def feature_extraction_context2(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')
    feature = []
    did = 0
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        if did != d:
            u_prev_vec = nlp(u).vector
            s_prev_vec = nlp(s).vector
            did = d
                  
            if e[0] != "No-Err":
                each = np.array(
                    [np.concatenate(
                        [np.zeros(300),
                        np.zeros(300), 
                        u_prev_vec, 
                        s_prev_vec]
                    )]
                ) 
                feature.append(each[0])

        else:     
            # エラーである
            if e[0] != "No-Err":
                u_vec = nlp(u).vector
                s_vec = nlp(s).vector
                each = np.array(
                    [np.concatenate(
                        [u_vec,
                        s_vec, 
                        u_prev_vec, 
                        s_prev_vec]
                    )]
                )
                feature.append(each[0])
                u_prev_vec = u_vec
                s_prev_vec = s_vec
            # エラーではない
            else:    
                u_prev_vec = nlp(u).vector
                s_prev_vec = nlp(s).vector
    return np.array(feature)


In [11]:
def predict_at_least_oneClass(clf, X) -> np.array:
    y_pred = clf.predict(X)
    p = clf.predict_proba(X)
    # print(y_pred)
    proba = np.array([[p[c][i][1] if (p[c][i].shape[0]!=1) else 0 
                     for c in range(len(error_types))] for i in range(len(X))])
    # print(proba)
  # replace [] to the highest probability label
    y_pred2 = np.empty((0, len(error_types)), int)
    for y, pr in zip(y_pred, proba):
        if  (sum(y) == 0):
            ans = np.zeros_like(y)
            ans[np.argmax(pr)] = 1
        else:
            ans = y
        y_pred2 = np.append(y_pred2, np.array([ans]), axis=0)
    return y_pred2

In [12]:
def extract_y(df:pd.DataFrame) -> np.array:
    y = []
    for ec in df.ec:
        if ec[0] == "No-Err":
            continue
        y_each_err = np.zeros(len(error_types))
        for i, err in enumerate( error_types ):
            if err in ec:
                y_each_err[i] = 1
        y.append(y_each_err)
    return np.array(y)


In [13]:
# df = read_json_with_NoErr(path, datalist)
# df_ = read_json(path, datalist)

# feature extraction
X = feature_extraction_context2(df_all)
print("success feature_extraction")
# print(df_.shape, X.shape)


# Make target (Multilabel)
# y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df.ec])
y = extract_y(df_all)
print("success extract y")
print("size | X:", X.shape, "y:", y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

# clf = MultiOutputClassifier(AdaBoostClassifier()).fit(X_train, y_train)
# y_pred = predict_at_least_oneClass(clf, X_test)

# print('EM:', metrics.accuracy_score(y_test, y_pred))
# print('F-measure: ', metrics.f1_score(y_test, y_pred, average='samples'))

success feature_extraction
success extract y
size | X: (1348, 1200) y: (1348, 17)


In [14]:
def show_error(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')
    feature = []
    did = 0
    # show_error_set = set([
    #     "Ignore question", "Ignore proposal",  
    #     "Ignore greeting", "Ignore offer"
    #     # ])
    show_error_set = set([
        "Ignore proposal",  
        "Ignore greeting", "Ignore offer"
        ])
    print(show_error_set)
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        # if did != d:
        #     print()
        for e_ in e:
            if e_ in show_error_set:
                print("error",e_,":", u, s)


In [15]:
show_error(df)

{'Ignore proposal', 'Ignore offer', 'Ignore greeting'}
error Ignore proposal : あなたも一緒にお祭りに行きませんか？ お祭りをみます
error Ignore proposal : 蔵王ハートランドのほうが楽しいと思いますよ。行ってみますか？ 私が行った集まりでは、参加した大人の分だけ大根を掘れたので、宮城県蔵王町の蔵王ハートランドでは大根を2本掘りました。
error Ignore offer : そうなんですね。では伊勢海老の美味しいところを簡潔に説明してください。 浅田選手が出ないのが残念ですけど、さすがに女子フィギュアはメダル期待できそうですよね。
error Ignore proposal : 楽しいです、あなたもやりませんか? そうなのですね。私は、ボードをやりますよ。


In [16]:
df.shape

(1348, 5)

In [17]:
def make_error_dict(error_types):
    error_dict = {}
    for e in error_types:
        error_dict[e] = len(error_dict)
    return error_dict

In [18]:
def div_did_error(df:pd.DataFrame, error_types) -> np.array:
    nlp = spacy.load('ja_ginza')
    
    did = df.did[0]
    # 全体
    X_data = []
    y_data = []
    # 各 did 
    sequence_did = []
    y_did = []
    
    # エラーの辞書定義
    error_dict = make_error_dict(error_types)
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        # did で学習データを分割してみる？
        y_one_conv = np.zeros(len(error_types))
        
        if did != d:
            did = d
            # 登録用データ修正
            sequence_did = np.array(sequence_did)
            y_did = np.array(y_did)
            X_data.append(sequence_did)
            y_data.append(y_did)

            sequence_did = []
            y_did = []

            # break

        for e_ in e:
            y_one_conv[error_dict[e_]] = 1

        sequence_did.append(
            np.concatenate(
                [nlp(u).vector,
                nlp(s).vector]
            )
        )
        # print(sequence_did[0].shape)
        y_did.append(y_one_conv)

    sequence_did = np.array(sequence_did)
    y_did = np.array(y_did)
    X_data.append(sequence_did)
    y_data.append(y_did)
    return X_data, y_data
        


In [19]:
df_all.shape

(2000, 5)

In [20]:
# y_data = np.array(y_data)

In [21]:
# 頑張って学習データを新たに分割
def extract_X_y(df:pd.DataFrame, error_types, prev_num) -> np.array:
    nlp = spacy.load('ja_ginza')
    n = prev_num
    did = df.did[0]
    # print(did)
    # 全体
    X_data = []
    y_data = []
    # 各 did 
    sequence_did = []
    y_did = []
    # エラーの辞書定義
    error_dict = make_error_dict(error_types)

    # 初期の調整 padding
    for i in range(n-1):
        sequence_did.append(
                np.concatenate( [np.zeros(300), np.zeros(300)])
            )

    # didごとに返却する？
    # エラーが発生したら、開始からエラーまでの文脈を入力とする(N=5の固定長でも可能)
    # 先にこのベクトル列を作成し，Tensorに変換して， List に保持
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        if did != d:
            did = d
            sequence_did = []
            y_did = []
            for i in range(n-1):
                sequence_did.append(
                        np.concatenate( [np.zeros(300), np.zeros(300)])
                    )
            # break

        # sequence_did.append([u, s])
        sequence_did.append(
                np.concatenate( [nlp(u).vector, nlp(s).vector])
            )
        if e[0] == "No-Err":
            continue
        else:
            y_each_error_label = np.zeros(len(error_types))
            for e_ in e:
                y_each_error_label[error_dict[e_]] = 1
            X_data.append(sequence_did[-n:])
            y_data.append(y_each_error_label)
    return np.array(X_data), np.array(y_data)
    


In [22]:
X_data, y_data = extract_X_y(df_all, error_types, 5)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.20, random_state=5)

In [24]:
X_data.shape



(1348, 5, 600)

In [25]:
y_data.shape

(1348, 17)

In [26]:
leng = 383
for i in range(leng):
    if i+1 % leng == 0:
        print(i)


In [27]:
class Mydatasets(torch.utils.data.Dataset):
    def __init__(self, X_data, y_data):
        # self.transform = transform

        self.X_data = X_data
        self.y_data = y_data

        self.datanum = len(X_data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        out_X = self.X_data[idx]
        out_y = self.y_data[idx]

        return out_X, out_y

In [28]:
trainset = Mydatasets(X_data[:380], y_data[:380, 0])
trainloader = torch.utils.data.DataLoader(trainset, batch_size = 95, shuffle = True, num_workers = 2)

In [29]:
for data in trainloader:
    print(data[0].shape)
    print(data[1])
    break

torch.Size([95, 5, 600])
tensor([0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1.,
        0., 0., 0., 1., 0.], dtype=torch.float64)


In [30]:
95*4


380

In [34]:
# ignore を表示しよう
# ignore_list = ['Ignore question', 'Ignore offer', 'Ignore proposal', 'Ignore greeting']
ignore_list = ['Ignore question']
ignore_set = set(ignore_list)

i = 0
for d, u, s, ec in zip(df.did, df.usr, df.sys, df.ec):
    for e in ec:
        if e in ignore_set:
            pass
print(i)


0


In [36]:
#エラーの種類を洗い出す
error_set = set()
for d, u, s, ec in zip(df.did, df.usr, df.sys, df.ec):
    for e in ec:
        if e not in error_set:
            error_set.add(e)

print(error_set)

{'Lack of sociality', 'Unclear intention', 'Grammatical error', 'Self-contradiction', 'Semantic error', 'Wrong information', 'Ignore proposal', 'Contradiction', 'Ignore offer', 'Repetition', 'Lack of common sense', 'Lack of information', 'Topic transition error', 'Ignore question'}


In [38]:
len(error_set)

14

In [41]:
error_types_set = set(error_types)
print(error_types_set - error_set)

{'Uninterpretable', 'No-Err', 'Ignore greeting'}
