In [1]:
from pathlib import Path
import json
import pandas as pd
import collections
import numpy as np

import spacy

In [2]:
def read_json(path:str, datalist:list) -> pd.DataFrame:
    """Collect every labelled system turn from the dialogue JSON files.

    For each name in ``datalist`` the folder ``<path>/<name>`` is scanned for
    ``*.json`` files. Inside a dialogue, turn 0 is skipped, user turns
    (speaker ``"U"``) are remembered, and every system turn (speaker ``"S"``)
    whose ``error_category`` is not null produces one row that pairs it with
    the most recent user utterance of the same dialogue.

    Args:
        path: Directory that holds one sub-folder per dialogue system. A
            trailing slash is optional.
        datalist: Names of the sub-folders to read. ``None`` falls back to
            ``['DCM', 'DIT', 'IRS']``, the list that used to be hard-coded
            in this function.

    Returns:
        A DataFrame with the columns ``did``, ``tid``, ``usr``, ``sys`` and
        ``ec`` and a fresh 0..n-1 index. ``usr`` is ``None`` when no user
        turn preceded the system turn within its dialogue. The frame is
        empty (but keeps its columns) when nothing matched.
    """
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    if datalist is None:
        datalist = ['DCM', 'DIT', 'IRS']

    root = Path(path)
    records = []
    for system_name in datalist:
        # Sorted so the row order does not depend on the file system's listing order.
        for json_file in sorted((root / system_name).glob("*.json")):
            with open(json_file, "r", encoding="utf-8") as f:
                json_data = json.load(f)

            did = json_data["dialogue-id"]
            # Reset per dialogue so an utterance never leaks in from the previous file.
            usr = None
            for turn in json_data["turns"]:
                if turn["turn-index"] == 0:
                    continue
                speaker = turn["speaker"]
                if speaker == "U":
                    usr = turn["utterance"]
                elif speaker == "S" and turn.get("error_category") is not None:
                    records.append({
                        'did': did,
                        'tid': turn["turn-index"],
                        'usr': usr,
                        'sys': turn["utterance"],
                        'ec': turn["error_category"],
                    })

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=cols)

In [6]:
# Directory containing the development data; read_json looks for one
# sub-folder per dialogue system below it.
path = "./error_category_classification/dbdc5_ja_dev_labeled/"

# Sub-folder names, one per dialogue system.
datalist = ["DCM", "DIT", "IRS"]

# Error-type labels. Their order fixes the column order of the multilabel
# target matrix built from them.
error_types = [
    "Ignore question",
    "Unclear intention",
    "Wrong information",
    "Topic transition error",
    "Lack of information",
    "Repetition",
    "Semantic error",
    "Self-contradiction",
    "Contradiction",
    "Grammatical error",
    "Ignore offer",
    "Ignore proposal",
    "Lack of sociality",
    "Lack of common sense",
    "Uninterpretable",
    "Ignore greeting",
]
#print('Number of error types:', len(error_types))

In [7]:
# Load the system turns that carry an error_category into a single DataFrame.
df = read_json(path, datalist)
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,did,tid,usr,sys,ec
0,1470646494,2,そうですね。最近とても暑いですから。,海に行かないんですね？,[Contradiction]
1,1470646494,4,休日に行きたいと思います。,海は大丈夫ですね,[Lack of information]
2,1470646494,6,はい。あなたは海に行きますか？,海は危険です,"[Ignore question, Self-contradiction]"
3,1470646494,8,何故ですか？,海はいけないです,[Ignore question]
4,1470646494,12,山はお好きなのですか？,わあ！いいですね！,"[Ignore question, Repetition]"
...,...,...,...,...,...
1343,1503290726,12,何を別売りしているの？,地下鉄で。名大前ができてすごく便利になりましたね、ここの大学。,[Repetition]
1344,1503290726,14,今は2回押してないよ,紅葉も遅いみたいですしね。私は、いつもベーッシクな色で変わりばえしないのですが。,[Topic transition error]
1345,1503290726,16,ベーシックだよね,何処にあるお店ですか？ベースコートもトップコートも塗ると持ちが違いますよ。,[Topic transition error]
1346,1503290726,18,君は女性なんだね,中間ですか？,[Lack of information]


In [8]:
# Multilabel target: one row per sample, one 0/1 column per entry of error_types
y = np.array([[int(label in labels) for label in error_types] for labels in df.ec])

# Summary statistics of the labels
print('Number of breakdowns: ', len(y))
print('-- Frequency of labels --')
for e, c in zip(error_types, y.sum(axis=0)):
    print(e, c)
print('-- Frequency of sets of labels (sorted) --')
# Lists are unhashable, so each label list becomes a tuple before counting.
df.ec.apply(tuple).value_counts()

Number of breakdowns:  1348
-- Frequency of labels --
Ignore question 305
Unclear intention 474
Wrong information 376
Topic transition error 192
Lack of information 54
Repetition 48
Semantic error 6
Self-contradiction 12
Contradiction 18
Grammatical error 4
Ignore offer 1
Ignore proposal 3
Lack of sociality 1
Lack of common sense 7
Uninterpretable 0
Ignore greeting 0
-- Frequency of sets of labels (sorted) --


(Unclear intention,)                           389
(Wrong information,)                           376
(Ignore question,)                             158
(Topic transition error,)                      141
(Ignore question, Unclear intention)            80
(Lack of information,)                          46
(Ignore question, Topic transition error)       46
(Repetition,)                                   36
(Contradiction,)                                18
(Ignore question, Repetition)                   11
(Self-contradiction,)                           10
(Lack of information, Ignore question)           8
(Lack of common sense,)                          6
(Semantic error,)                                6
(Topic transition error, Unclear intention)      5
(Grammatical error,)                             4
(Ignore proposal,)                               3
(Ignore question, Self-contradiction)            2
(Lack of sociality,)                             1
(Lack of common sense, Repetiti

In [9]:
def feature_extraction(df:pd.DataFrame) -> np.array:
    """Turn each (user, system) utterance pair into one feature vector.

    Loads the 'ja_ginza' spaCy pipeline, takes the document vector of the
    ``usr`` text and of the ``sys`` text of every row, and joins the two
    end to end.

    Args:
        df: Frame with the text columns ``usr`` and ``sys``.

    Returns:
        Array with one row per row of ``df``; each row is the user vector
        followed by the system vector.
    """
    nlp = spacy.load('ja_ginza')

    features = []
    for user_text, system_text in zip(df.usr, df.sys):
        user_vec = nlp(user_text).vector
        system_vec = nlp(system_text).vector
        features.append(np.concatenate([user_vec, system_vec]))
    return np.array(features)

In [10]:
def predict_at_least_oneClass(clf, X) -> np.ndarray:
    """Predict multilabel outputs, guaranteeing at least one label per sample.

    Rows for which ``clf.predict`` assigns no label get exactly one label:
    the one with the highest positive-class probability.

    Args:
        clf: Fitted multi-output classifier. ``predict(X)`` must return a
            2-D 0/1 array and ``predict_proba(X)`` a sequence with one
            ``(n_samples, n_classes)`` array per label.
        X: Feature matrix to predict on.

    Returns:
        Array shaped like ``clf.predict(X)`` in which every row has at
        least one 1. The array returned by ``clf.predict`` is not modified.
    """
    y_pred = np.asarray(clf.predict(X))
    per_label_proba = clf.predict_proba(X)

    # The label count comes from the classifier output, not from a notebook
    # global, so the function works for any label set.
    n_labels = len(per_label_proba)
    proba = np.zeros((y_pred.shape[0], n_labels))
    for c, label_proba in enumerate(per_label_proba):
        label_proba = np.asarray(label_proba)
        # A single probability column means only one class was seen for this
        # label during fitting; such a label keeps probability 0 here.
        if label_proba.shape[1] != 1:
            proba[:, c] = label_proba[:, 1]

    # Replace empty predictions with the most probable label, all rows at once.
    y_fixed = y_pred.copy()
    empty_rows = np.flatnonzero(y_fixed.sum(axis=1) == 0)
    y_fixed[empty_rows, proba[empty_rows].argmax(axis=1)] = 1
    return y_fixed

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

# Read the development data (reloaded here so this cell does not rely on the
# earlier `df`).
df = read_json(path, datalist)

# Feature extraction: user vector followed by system vector, one row per sample.
X = feature_extraction(df)

# Make target (multilabel): one 0/1 column per entry of error_types.
y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df.ec])

# Hold out 20% of the samples for evaluation; the fixed random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

# One independent AdaBoost classifier per label.
# NOTE(review): AdaBoostClassifier is built without a random_state — consider
# passing one if run-to-run reproducibility of the scores matters.
clf = MultiOutputClassifier(AdaBoostClassifier()).fit(X_train, y_train)
# Samples that receive no label are given their single most probable one.
y_pred = predict_at_least_oneClass(clf, X_test)

# EM: for multilabel input accuracy_score is the exact-match (subset) accuracy.
print('EM:', metrics.accuracy_score(y_test, y_pred))
# F-measure: F1 computed per sample and then averaged over samples.
print('F-measure: ', metrics.f1_score(y_test, y_pred, average='samples'))

EM: 0.5111111111111111
F-measure:  0.6179012345679011


In [12]:
# Sanity check: dimensions of the feature matrix (samples, features).
X.shape

(1348, 600)

In [13]:
# Sanity check: size of the vector ja_ginza produces for one utterance.
nlp = spacy.load('ja_ginza')
nlp(df.usr[0]).vector.shape

(300,)

In [14]:
# Number of error-type labels, i.e. the number of columns in y.
len(error_types)

16

In [15]:
# Inspect the raw label lists, one list of error types per sample.
df.ec

0                             [Contradiction]
1                       [Lack of information]
2       [Ignore question, Self-contradiction]
3                           [Ignore question]
4               [Ignore question, Repetition]
                        ...                  
1343                             [Repetition]
1344                 [Topic transition error]
1345                 [Topic transition error]
1346                    [Lack of information]
1347                 [Topic transition error]
Name: ec, Length: 1348, dtype: object

In [16]:
# NOTE(review): `ec` is never bound at notebook level. It only appears as a
# comprehension variable and as a local inside read_json, so this cell raises
# NameError. `df.ec` was presumably intended — confirm, then fix or delete the cell.
ec

NameError: name 'ec' is not defined