In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

import collections


import spacy

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

In [39]:
path = './error_category_classification/dbdc5_ja_dev_labeled/'
# Names of the dialogue systems
datalist = ['DCM', 'DIT', 'IRS']
# datalist = ['DCM']
# List of error types
error_types = ['Ignore question', 'Unclear intention', 'Wrong information', 'Topic transition error', 'Lack of information', 
'Repetition', 'Semantic error', 'Self-contradiction', 'Contradiction', 'Grammatical error', 'Ignore offer', 
'Ignore proposal', 'Lack of sociality', 'Lack of common sense', 'Uninterpretable', 'Ignore greeting']

In [3]:
def read_json(path:str, datalist:list) -> pd.DataFrame:
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    df = pd.DataFrame(index=[], columns=cols)

    for p in datalist:
        datapath = Path(path + p + '/')
        for file in datapath.glob("*.json"):
            with open(file, "r") as f:
                json_data = json.load(f)
                did = json_data["dialogue-id"]
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                        continue
                    if t["speaker"] == "S" and t["error_category"] != None:
                        tid = t["turn-index"]
                        sys = t["utterance"]
                        ec = t["error_category"]
                        df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [41]:
def read_json_with_NoErr(path:str, datalist:list) -> pd.DataFrame:
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    df = pd.DataFrame(index=[], columns=cols)

    for p in datalist:
        datapath = Path(path + p + '/')
        for file in datapath.glob("*.json"):
            with open(file, "r") as f:
                json_data = json.load(f)
                did = json_data["dialogue-id"]
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                        continue
                    if t["speaker"] == "S" :
                        tid = t["turn-index"]
                        sys = t["utterance"]
                        if t["error_category"]:
                            ec = t["error_category"]
                        else:
                            ec = ["No-Err"]
                        df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [42]:
df_all = read_json_with_NoErr(path, datalist)
df_all

Unnamed: 0,did,tid,usr,sys,ec
0,1470622805,2,こんにちは,おはよ,[No-Err]
1,1470622805,4,元気ですか？,元気ですよ,[No-Err]
2,1470622805,6,今日は暑いですね,元気ですかは元気です,[Unclear intention]
3,1470622805,8,そうですか,ごめんね,[Unclear intention]
4,1470622805,10,謝らなくていいですよ,元気ですかは元気ですね,[Repetition]
...,...,...,...,...,...
1995,1550577973,12,そうですね。私は年間パスポート持っていますよ。,そうなのですか、なるほど、おすすめの香りあります？私、バニラっぽいのが好きです。,[Topic transition error]
1996,1550577973,14,私は柑橘系が好きですね。,男性はバニラ系好きな方あまりみかけないです。,[No-Err]
1997,1550577973,16,そうですね。私は食べる方のバニラは大好きです。,最近のミスチルも好きなんですが,[Topic transition error]
1998,1550577973,18,ミスチルは昔のイノセントワールドなんか好きですよ。,ほうほう。バンド系よりもデジタルな音楽は私もすきです。,[No-Err]


In [6]:
df = read_json(path, datalist)
df

Unnamed: 0,did,tid,usr,sys,ec
0,1470622805,6,今日は暑いですね,元気ですかは元気です,[Unclear intention]
1,1470622805,8,そうですか,ごめんね,[Unclear intention]
2,1470622805,10,謝らなくていいですよ,元気ですかは元気ですね,[Repetition]
3,1470622805,12,とても元気です,もちろんですってのが元気ですかにいいですね,[Unclear intention]
4,1470622805,14,何のことですか？,色々と,[Ignore question]
...,...,...,...,...,...
378,1550629275,8,あなたは1日に何時間働いていますか？,そうなんですか。　じゃあ、博物館に行かれたことはありますか？,"[Ignore question, Topic transition error]"
379,1550629275,10,博物館に行ったことはないんです。,博物館という研究施設がおすすめです,[Unclear intention]
380,1550629275,12,博物館面白そうですよね。今度行ってみたいと思います。,いま、美容系の企業がアンチエイジング商品の開発に力を入れているみたいだね。,[Topic transition error]
381,1550629275,14,アンチエイジング興味あります。,興味あることは良く覚えるのに、興味ないことは頭の中を素通りしてしまうよね。,[Unclear intention]


In [7]:
y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df_all.ec])

# Display data statistics
print('Number of breakdowns: ', y.shape[0])
print('-- Frequency of labels --')
for e,c in zip(error_types, sum(y)):
  print(e,c)
print('-- Frequency of sets of labels (sorted) --')
df_all['ec'].apply(tuple).value_counts()

Number of breakdowns:  670
-- Frequency of labels --
Ignore question 129
Unclear intention 181
Wrong information 2
Topic transition error 51
Lack of information 22
Repetition 34
Semantic error 6
Self-contradiction 6
Contradiction 8
Grammatical error 4
Ignore offer 0
Ignore proposal 1
Lack of sociality 1
Lack of common sense 7
Uninterpretable 0
Ignore greeting 0
-- Frequency of sets of labels (sorted) --


(No-Err,)                                      287
(Unclear intention,)                           142
(Ignore question,)                              62
(Ignore question, Unclear intention)            38
(Topic transition error,)                       33
(Repetition,)                                   26
(Lack of information,)                          18
(Ignore question, Topic transition error)       17
(Contradiction,)                                 8
(Ignore question, Repetition)                    7
(Semantic error,)                                6
(Lack of common sense,)                          6
(Self-contradiction,)                            5
(Grammatical error,)                             4
(Lack of information, Ignore question)           4
(Wrong information,)                             2
(Ignore question, Self-contradiction)            1
(Topic transition error, Unclear intention)      1
(Ignore proposal,)                               1
(Lack of sociality,)           

In [8]:
def feature_extraction(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')

  # Make feature vector
    return np.array([np.concatenate([nlp(u).vector, nlp(s).vector]) for u,s in zip(df.usr, df.sys)])

In [28]:
def feature_extraction_context2(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')
    feature = []
    did = 0
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        if did != d:
            u_prev_vec = nlp(u).vector
            s_prev_vec = nlp(s).vector
            did = d
                  
            if e[0] != "No-Err":
                each = np.array(
                    [np.concatenate(
                        [np.zeros(300),
                        np.zeros(300), 
                        u_prev_vec, 
                        s_prev_vec]
                    )]
                ) 
                feature.append(each[0])

        else:     
            # エラーである
            if e[0] != "No-Err":
                u_vec = nlp(u).vector
                s_vec = nlp(s).vector
                each = np.array(
                    [np.concatenate(
                        [u_vec,
                        s_vec, 
                        u_prev_vec, 
                        s_prev_vec]
                    )]
                )
                feature.append(each[0])
                u_prev_vec = u_vec
                s_prev_vec = s_vec
            # エラーではない
            else:    
                u_prev_vec = nlp(u).vector
                s_prev_vec = nlp(s).vector
    return np.array(feature)


In [10]:
def predict_at_least_oneClass(clf, X) -> np.array:
    y_pred = clf.predict(X)
    p = clf.predict_proba(X)
    # print(y_pred)
    proba = np.array([[p[c][i][1] if (p[c][i].shape[0]!=1) else 0 
                     for c in range(len(error_types))] for i in range(len(X))])
    # print(proba)
  # replace [] to the highest probability label
    y_pred2 = np.empty((0, len(error_types)), int)
    for y, pr in zip(y_pred, proba):
        if  (sum(y) == 0):
            ans = np.zeros_like(y)
            ans[np.argmax(pr)] = 1
        else:
            ans = y
        y_pred2 = np.append(y_pred2, np.array([ans]), axis=0)
    return y_pred2

In [37]:
def extract_y(df:pd.DataFrame) -> np.array:
    y = []
    for ec in df.ec:
        if ec[0] == "No-Err":
            continue
        y_each_err = np.zeros(len(error_types))
        for i, err in enumerate( error_types ):
            if err in ec:
                y_each_err[i] = 1
        y.append(y_each_err)
    return np.array(y)


In [43]:
# df = read_json_with_NoErr(path, datalist)
# df_ = read_json(path, datalist)

# feature extraction
X = feature_extraction_context2(df_all)
print("success feature_extraction")
# print(df_.shape, X.shape)


# Make target (Multilabel)
# y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df.ec])
y = extract_y(df_all)
print("success extract y")
print("size | X:", X.shape, "y:", y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

clf = MultiOutputClassifier(AdaBoostClassifier()).fit(X_train, y_train)
y_pred = predict_at_least_oneClass(clf, X_test)

print('EM:', metrics.accuracy_score(y_test, y_pred))
print('F-measure: ', metrics.f1_score(y_test, y_pred, average='samples'))

success feature_extraction
success extract y
size | X: (1348, 1200) y: (1348, 16)
EM: 0.43333333333333335
F-measure:  0.5413580246913579


In [36]:
y

array([['', '1', '', ..., '', '', ''],
       ['', '1', '', ..., '', '', ''],
       ['', '', '', ..., '', '', ''],
       ...,
       ['', '', '', ..., '', '', ''],
       ['', '1', '', ..., '', '', ''],
       ['', '1', '', ..., '', '', '']], dtype='<U22')

In [37]:
y_pred = predict_at_least_oneClass(clf, X_test)
y_pred

[[0 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]
[[4.04149857e-01 5.13217973e-01 1.39676410e-08 ... 3.22934213e-09
  0.00000000e+00 0.00000000e+00]
 [6.15453322e-01 5.17288253e-01 4.48846210e-12 ... 4.07817842e-08
  0.00000000e+00 0.00000000e+00]
 [3.86877696e-01 5.01891708e-01 5.72957238e-13 ... 4.24439375e-09
  0.00000000e+00 0.00000000e+00]
 ...
 [3.99552361e-01 4.92385825e-01 6.14181464e-11 ... 1.36689057e-12
  0.00000000e+00 0.00000000e+00]
 [1.90990886e-01 5.23530523e-01 5.99790437e-11 ... 2.04073900e-09
  0.00000000e+00 0.00000000e+00]
 [3.58078334e-01 5.13292726e-01 5.52278974e-12 ... 1.73177833e-09
  0.00000000e+00 0.00000000e+00]]


array([[0, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [19]:
print(torch.cuda.is_available())

True
