In [2]:
from pathlib import Path
import json
import pandas as pd
import collections
import numpy as np

import spacy

In [3]:
# Root directory of the labelled data; read_json appends one sub-directory
# name per dialogue system to this prefix (hence the trailing slash).
path = './error_category_classification/dbdc5_ja_dev_labeled/'

# Dialogue systems to load (alternative selection: ['DCM', 'DIT', 'IRS'])
datalist = ['DCM']

# Error-type labels; the list order fixes the column order of the label matrix
error_types = [
    'Ignore question',
    'Unclear intention',
    'Wrong information',
    'Topic transition error',
    'Lack of information',
    'Repetition',
    'Semantic error',
    'Self-contradiction',
    'Contradiction',
    'Grammatical error',
    'Ignore offer',
    'Ignore proposal',
    'Lack of sociality',
    'Lack of common sense',
    'Uninterpretable',
    'Ignore greeting',
]

In [4]:
def read_json(path:str, datalist:list) -> pd.DataFrame:
    """Collect system turns that carry an error category from dialogue JSON files.

    For every name in ``datalist`` all ``*.json`` files in the directory
    ``path/<name>`` are read. Each file must provide ``"dialogue-id"`` and a
    ``"turns"`` list whose items have ``"turn-index"``, ``"speaker"`` and
    ``"utterance"``; system turns (speaker ``"S"``) must also have
    ``"error_category"``.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per entry of ``datalist``.
        A trailing slash is accepted but not required.
    datalist : list
        Sub-directory names to read.

    Returns
    -------
    pd.DataFrame
        One row per system turn whose ``"error_category"`` is not ``None``,
        with columns ``did`` (dialogue id), ``tid`` (turn index), ``usr``
        (most recent user utterance in the same dialogue, ``None`` if there
        was none yet), ``sys`` (system utterance) and ``ec`` (the
        ``"error_category"`` value). The index is 0..n-1. The frame is empty,
        with the same columns, when nothing matches.

    Raises
    ------
    KeyError
        If a file lacks one of the keys listed above.
    json.JSONDecodeError
        If a file is not valid JSON.
    """
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    # Rows are gathered in a plain list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []

    for p in datalist:
        datapath = Path(path) / p
        # sorted() makes the row order independent of the file system's listing order
        for file_ in sorted(datapath.glob("*.json")):
            # Explicit encoding: the platform default is not UTF-8 everywhere
            with open(file_, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            did = json_data["dialogue-id"]
            # Reset per dialogue so a user utterance never leaks in from the previous file
            usr = None
            for t in json_data["turns"]:
                # Turn 0 is skipped entirely
                if t["turn-index"] == 0:
                    continue
                if t["speaker"] == "U":
                    usr = t["utterance"]
                    continue
                if t["speaker"] == "S" and t["error_category"] is not None:
                    records.append({
                        'did': did,
                        'tid': t["turn-index"],
                        'usr': usr,
                        'sys': t["utterance"],
                        'ec': t["error_category"],
                    })
    return pd.DataFrame(records, columns=cols)

In [5]:
# Load every labelled breakdown turn for the configured dialogue systems
df = read_json(path=path, datalist=datalist)
df

Unnamed: 0,did,tid,usr,sys,ec
0,1470622805,6,今日は暑いですね,元気ですかは元気です,[Unclear intention]
1,1470622805,8,そうですか,ごめんね,[Unclear intention]
2,1470622805,10,謝らなくていいですよ,元気ですかは元気ですね,[Repetition]
3,1470622805,12,とても元気です,もちろんですってのが元気ですかにいいですね,[Unclear intention]
4,1470622805,14,何のことですか？,色々と,[Ignore question]
...,...,...,...,...,...
378,1550629275,8,あなたは1日に何時間働いていますか？,そうなんですか。　じゃあ、博物館に行かれたことはありますか？,"[Ignore question, Topic transition error]"
379,1550629275,10,博物館に行ったことはないんです。,博物館という研究施設がおすすめです,[Unclear intention]
380,1550629275,12,博物館面白そうですよね。今度行ってみたいと思います。,いま、美容系の企業がアンチエイジング商品の開発に力を入れているみたいだね。,[Topic transition error]
381,1550629275,14,アンチエイジング興味あります。,興味あることは良く覚えるのに、興味ないことは頭の中を素通りしてしまうよね。,[Unclear intention]


In [None]:
def read_json_with_NoErr(path:str, datalist:list) -> pd.DataFrame:
    """Collect all system turns from dialogue JSON files, labelling error-free ones.

    For every name in ``datalist`` all ``*.json`` files in the directory
    ``path/<name>`` are read. Each file must provide ``"dialogue-id"`` and a
    ``"turns"`` list whose items have ``"turn-index"``, ``"speaker"`` and
    ``"utterance"``; system turns (speaker ``"S"``) must also have
    ``"error_category"``.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per entry of ``datalist``.
        A trailing slash is accepted but not required.
    datalist : list
        Sub-directory names to read.

    Returns
    -------
    pd.DataFrame
        One row per system turn (turn 0 excluded) with columns ``did``
        (dialogue id), ``tid`` (turn index), ``usr`` (most recent user
        utterance in the same dialogue, ``None`` if there was none yet),
        ``sys`` (system utterance) and ``ec``. ``ec`` is the turn's
        ``"error_category"`` value when that value is truthy, otherwise
        ``["No-Err"]``. The index is 0..n-1. The frame is empty, with the
        same columns, when nothing matches.

    Raises
    ------
    KeyError
        If a file lacks one of the keys listed above.
    json.JSONDecodeError
        If a file is not valid JSON.
    """
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    # Rows are gathered in a plain list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []

    for p in datalist:
        datapath = Path(path) / p
        # sorted() makes the row order independent of the file system's listing order
        for file_ in sorted(datapath.glob("*.json")):
            # Explicit encoding: the platform default is not UTF-8 everywhere
            with open(file_, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            did = json_data["dialogue-id"]
            # Reset per dialogue so a user utterance never leaks in from the previous file
            usr = None
            for t in json_data["turns"]:
                # Turn 0 is skipped entirely
                if t["turn-index"] == 0:
                    continue
                if t["speaker"] == "U":
                    usr = t["utterance"]
                    continue
                if t["speaker"] == "S":
                    # None and an empty list both count as "no error"
                    ec = t["error_category"] if t["error_category"] else ["No-Err"]
                    records.append({
                        'did': did,
                        'tid': t["turn-index"],
                        'usr': usr,
                        'sys': t["utterance"],
                        'ec': ec,
                    })
    return pd.DataFrame(records, columns=cols)

In [6]:
# Multi-hot label matrix: one row per row of df, one column per entry of error_types
y = np.array([[int(label in labels) for label in error_types] for labels in df.ec])

# Display data statistics
print('Number of breakdowns: ', len(y))
print('-- Frequency of labels --')
# Summing the rows of y gives the per-label totals, in error_types order
for label, count in zip(error_types, sum(y)):
    print(label, count)
print('-- Frequency of sets of labels (sorted) --')
# Lists are unhashable, so each label list is turned into a tuple before counting
df.ec.apply(tuple).value_counts()

Number of breakdowns:  383
-- Frequency of labels --
Ignore question 129
Unclear intention 181
Wrong information 2
Topic transition error 51
Lack of information 22
Repetition 34
Semantic error 6
Self-contradiction 6
Contradiction 8
Grammatical error 4
Ignore offer 0
Ignore proposal 1
Lack of sociality 1
Lack of common sense 7
Uninterpretable 0
Ignore greeting 0
-- Frequency of sets of labels (sorted) --


(Unclear intention,)                           142
(Ignore question,)                              62
(Ignore question, Unclear intention)            38
(Topic transition error,)                       33
(Repetition,)                                   26
(Lack of information,)                          18
(Ignore question, Topic transition error)       17
(Contradiction,)                                 8
(Ignore question, Repetition)                    7
(Semantic error,)                                6
(Lack of common sense,)                          6
(Self-contradiction,)                            5
(Grammatical error,)                             4
(Lack of information, Ignore question)           4
(Wrong information,)                             2
(Ignore question, Self-contradiction)            1
(Topic transition error, Unclear intention)      1
(Lack of sociality,)                             1
(Ignore proposal,)                               1
(Lack of common sense, Repetiti

In [18]:
def continuous_error(df:pd.DataFrame) -> pd.DataFrame:
    """Select rows that directly continue the previous row's dialogue.

    A row is kept when the row immediately before it in ``df`` has the same
    ``did`` and a ``tid`` exactly 2 smaller. The first row of ``df`` and the
    first row of every dialogue are therefore never kept.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with columns ``did``, ``tid``, ``usr``, ``sys`` and ``ec``.
        Rows of one dialogue are assumed to be adjacent and ordered by
        ``tid`` -- TODO confirm for frames not produced by ``read_json``.

    Returns
    -------
    pd.DataFrame
        The kept rows, in their original order, with the same five columns
        and a fresh 0..n-1 index. Empty (with the same columns) when no row
        qualifies.
    """
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    # Collected in a list and converted once; DataFrame.append was removed in pandas 2.0
    kept = []
    # None never equals a real id/index, so the very first row cannot match
    prev_did, prev_tid = None, None
    for d, t, u, s, e in zip(df.did, df.tid, df.usr, df.sys, df.ec):
        # Comparing the dialogue id too stops the last row of one dialogue
        # from chaining into the first row of the next
        if d == prev_did and t - 2 == prev_tid:
            kept.append({'did': d, 'tid': t, 'usr': u, 'sys': s, 'ec': e})
        prev_did, prev_tid = d, t
    return pd.DataFrame(kept, columns=cols)


In [19]:
# Keep only the rows that continuous_error selects from the loaded frame
df_cont = continuous_error(df=df)
df_cont

Unnamed: 0,did,tid,usr,sys,ec
0,1470622805,8,そうですか,ごめんね,[Unclear intention]
0,1470622805,10,謝らなくていいですよ,元気ですかは元気ですね,[Repetition]
0,1470622805,12,とても元気です,もちろんですってのが元気ですかにいいですね,[Unclear intention]
0,1470622805,14,何のことですか？,色々と,[Ignore question]
0,1470622805,16,なるほど,せやろ,[Unclear intention]
...,...,...,...,...,...
0,1550629275,8,あなたは1日に何時間働いていますか？,そうなんですか。　じゃあ、博物館に行かれたことはありますか？,"[Ignore question, Topic transition error]"
0,1550629275,10,博物館に行ったことはないんです。,博物館という研究施設がおすすめです,[Unclear intention]
0,1550629275,12,博物館面白そうですよね。今度行ってみたいと思います。,いま、美容系の企業がアンチエイジング商品の開発に力を入れているみたいだね。,[Topic transition error]
0,1550629275,14,アンチエイジング興味あります。,興味あることは良く覚えるのに、興味ないことは頭の中を素通りしてしまうよね。,[Unclear intention]


In [20]:
print('-- Frequency of sets of labels (sorted) --')
# Lists are unhashable, so each label list is turned into a tuple before counting
df_cont.ec.apply(tuple).value_counts()

-- Frequency of sets of labels (sorted) --


(Unclear intention,)                           81
(Ignore question,)                             34
(Ignore question, Unclear intention)           21
(Topic transition error,)                      17
(Repetition,)                                  15
(Lack of information,)                          9
(Ignore question, Topic transition error)       8
(Ignore question, Repetition)                   6
(Grammatical error,)                            4
(Lack of common sense,)                         4
(Self-contradiction,)                           3
(Semantic error,)                               2
(Lack of information, Ignore question)          2
(Contradiction,)                                2
(Wrong information,)                            1
(Topic transition error, Unclear intention)     1
(Ignore question, Self-contradiction)           1
(Lack of common sense, Repetition)              1
Name: ec, dtype: int64