In [3]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

import collections


import spacy

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics



In [4]:
path = './error_category_classification/dbdc5_ja_dev_labeled/'
# Names of the dialogue systems
# datalist = ['DCM', 'DIT', 'IRS']
datalist = ['DCM']
# List of error types
error_types = ['Ignore question', 'Unclear intention', 'Wrong information', 'Topic transition error', 'Lack of information', 
'Repetition', 'Semantic error', 'Self-contradiction', 'Contradiction', 'Grammatical error', 'Ignore offer', 
'Ignore proposal', 'Lack of sociality', 'Lack of common sense', 'Uninterpretable', 'Ignore greeting', "No-Err"]

In [5]:
def read_json(path:str, datalist:list) -> pd.DataFrame:
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    df = pd.DataFrame(index=[], columns=cols)

    for p in datalist:
        datapath = Path(path + p + '/')
        for file in datapath.glob("*.json"):
            with open(file, "r") as f:
                json_data = json.load(f)
                did = json_data["dialogue-id"]
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                        continue
                    if t["speaker"] == "S" and t["error_category"] != None:
                        tid = t["turn-index"]
                        sys = t["utterance"]
                        ec = t["error_category"]
                        df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [6]:
def read_json_with_NoErr(path:str, datalist:list) -> pd.DataFrame:
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    df = pd.DataFrame(index=[], columns=cols)

    for p in datalist:
        datapath = Path(path + p + '/')
        for file in datapath.glob("*.json"):
            with open(file, "r") as f:
                json_data = json.load(f)
                did = json_data["dialogue-id"]
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                        continue
                    if t["speaker"] == "S" :
                        tid = t["turn-index"]
                        sys = t["utterance"]
                        if t["error_category"]:
                            ec = t["error_category"]
                        else:
                            ec = ["No-Err"]
                        df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [7]:
df_all = read_json_with_NoErr(path, datalist)
df_all

Unnamed: 0,did,tid,usr,sys,ec
0,1470646494,2,そうですね。最近とても暑いですから。,海に行かないんですね？,[Contradiction]
1,1470646494,4,休日に行きたいと思います。,海は大丈夫ですね,[Lack of information]
2,1470646494,6,はい。あなたは海に行きますか？,海は危険です,"[Ignore question, Self-contradiction]"
3,1470646494,8,何故ですか？,海はいけないです,[Ignore question]
4,1470646494,10,そうですか。山に行くのはどうでしょうか？,わあ！いいですね！,[No-Err]
...,...,...,...,...,...
665,1470623483,12,かっこいいね,ありがとう,[Unclear intention]
666,1470623483,14,どういたしまして,はい,[No-Err]
667,1470623483,16,真夏日が続いて夏バテしそうだよ,真夏日は続くんですね？真夏日まで気温が上がりますよねー,[Unclear intention]
668,1470623483,18,夜は寝苦しい,真夏日はいいですね,[No-Err]


In [8]:
df = read_json(path, datalist)
df

Unnamed: 0,did,tid,usr,sys,ec
0,1470646494,2,そうですね。最近とても暑いですから。,海に行かないんですね？,[Contradiction]
1,1470646494,4,休日に行きたいと思います。,海は大丈夫ですね,[Lack of information]
2,1470646494,6,はい。あなたは海に行きますか？,海は危険です,"[Ignore question, Self-contradiction]"
3,1470646494,8,何故ですか？,海はいけないです,[Ignore question]
4,1470646494,12,山はお好きなのですか？,わあ！いいですね！,"[Ignore question, Repetition]"
...,...,...,...,...,...
378,1470623483,8,固いといえばくるみかな,風早が好きですね,[Topic transition error]
379,1470623483,10,花火見に行ったよ,風早がかっこよいですね,[Topic transition error]
380,1470623483,12,かっこいいね,ありがとう,[Unclear intention]
381,1470623483,16,真夏日が続いて夏バテしそうだよ,真夏日は続くんですね？真夏日まで気温が上がりますよねー,[Unclear intention]


In [9]:
y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df_all.ec])

# Display data statistics
print('Number of breakdowns: ', y.shape[0])
print('-- Frequency of labels --')
for e,c in zip(error_types, sum(y)):
  print(e,c)
print('-- Frequency of sets of labels (sorted) --')
df_all['ec'].apply(tuple).value_counts()

Number of breakdowns:  670
-- Frequency of labels --
Ignore question 129
Unclear intention 181
Wrong information 2
Topic transition error 51
Lack of information 22
Repetition 34
Semantic error 6
Self-contradiction 6
Contradiction 8
Grammatical error 4
Ignore offer 0
Ignore proposal 1
Lack of sociality 1
Lack of common sense 7
Uninterpretable 0
Ignore greeting 0
No-Err 287
-- Frequency of sets of labels (sorted) --


(No-Err,)                                      287
(Unclear intention,)                           142
(Ignore question,)                              62
(Ignore question, Unclear intention)            38
(Topic transition error,)                       33
(Repetition,)                                   26
(Lack of information,)                          18
(Ignore question, Topic transition error)       17
(Contradiction,)                                 8
(Ignore question, Repetition)                    7
(Lack of common sense,)                          6
(Semantic error,)                                6
(Self-contradiction,)                            5
(Lack of information, Ignore question)           4
(Grammatical error,)                             4
(Wrong information,)                             2
(Topic transition error, Unclear intention)      1
(Lack of common sense, Repetition)               1
(Ignore question, Self-contradiction)            1
(Lack of sociality,)           

In [10]:
def feature_extraction(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')

  # Make feature vector
    return np.array([np.concatenate([nlp(u).vector, nlp(s).vector]) for u,s in zip(df.usr, df.sys)])

In [11]:
def feature_extraction_context2(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')
    feature = []
    did = 0
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        if did != d:
            u_prev_vec = nlp(u).vector
            s_prev_vec = nlp(s).vector
            did = d
                  
            if e[0] != "No-Err":
                each = np.array(
                    [np.concatenate(
                        [np.zeros(300),
                        np.zeros(300), 
                        u_prev_vec, 
                        s_prev_vec]
                    )]
                ) 
                feature.append(each[0])

        else:     
            # エラーである
            if e[0] != "No-Err":
                u_vec = nlp(u).vector
                s_vec = nlp(s).vector
                each = np.array(
                    [np.concatenate(
                        [u_vec,
                        s_vec, 
                        u_prev_vec, 
                        s_prev_vec]
                    )]
                )
                feature.append(each[0])
                u_prev_vec = u_vec
                s_prev_vec = s_vec
            # エラーではない
            else:    
                u_prev_vec = nlp(u).vector
                s_prev_vec = nlp(s).vector
    return np.array(feature)


In [12]:
def predict_at_least_oneClass(clf, X) -> np.array:
    y_pred = clf.predict(X)
    p = clf.predict_proba(X)
    # print(y_pred)
    proba = np.array([[p[c][i][1] if (p[c][i].shape[0]!=1) else 0 
                     for c in range(len(error_types))] for i in range(len(X))])
    # print(proba)
  # replace [] to the highest probability label
    y_pred2 = np.empty((0, len(error_types)), int)
    for y, pr in zip(y_pred, proba):
        if  (sum(y) == 0):
            ans = np.zeros_like(y)
            ans[np.argmax(pr)] = 1
        else:
            ans = y
        y_pred2 = np.append(y_pred2, np.array([ans]), axis=0)
    return y_pred2

In [13]:
def extract_y(df:pd.DataFrame) -> np.array:
    y = []
    for ec in df.ec:
        if ec[0] == "No-Err":
            continue
        y_each_err = np.zeros(len(error_types))
        for i, err in enumerate( error_types ):
            if err in ec:
                y_each_err[i] = 1
        y.append(y_each_err)
    return np.array(y)


In [14]:
# df = read_json_with_NoErr(path, datalist)
# df_ = read_json(path, datalist)

# feature extraction
X = feature_extraction_context2(df_all)
print("success feature_extraction")
# print(df_.shape, X.shape)


# Make target (Multilabel)
# y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df.ec])
y = extract_y(df_all)
print("success extract y")
print("size | X:", X.shape, "y:", y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

# clf = MultiOutputClassifier(AdaBoostClassifier()).fit(X_train, y_train)
# y_pred = predict_at_least_oneClass(clf, X_test)

# print('EM:', metrics.accuracy_score(y_test, y_pred))
# print('F-measure: ', metrics.f1_score(y_test, y_pred, average='samples'))

success feature_extraction
success extract y
size | X: (383, 1200) y: (383, 17)


In [15]:
def show_error(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')
    feature = []
    did = 0
    # show_error_set = set([
    #     "Ignore question", "Ignore proposal",  
    #     "Ignore greeting", "Ignore offer"
    #     # ])
    show_error_set = set([
        "Ignore proposal",  
        "Ignore greeting", "Ignore offer"
        ])
    print(show_error_set)
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        # if did != d:
        #     print()
        for e_ in e:
            if e_ in show_error_set:
                print("error",e_,":", u, s)


In [16]:
show_error(df)

{'Ignore proposal', 'Ignore offer', 'Ignore greeting'}
error Ignore proposal : あなたも一緒にお祭りに行きませんか？ お祭りをみます


In [17]:
df.shape

(383, 5)

In [20]:
def make_error_dict(error_types):
    error_dict = {}
    for e in error_types:
        error_dict[e] = len(error_dict)
    return error_dict

In [76]:
def div_did_error(df:pd.DataFrame, error_types) -> np.array:
    nlp = spacy.load('ja_ginza')
    
    did = df.did[0]
    # 全体
    X_data = []
    y_data = []
    # 各 did 
    sequence_did = []
    y_did = []
    
    # エラーの辞書定義
    error_dict = make_error_dict(error_types)
    for d, u, s, e in zip(df.did, df.usr, df.sys, df.ec):
        # did で学習データを分割してみる？
        y_one_conv = np.zeros(len(error_types))
        
        if did != d:
            did = d
            # 登録用データ修正
            sequence_did = np.array(sequence_did)
            y_did = np.array(y_did)
            X_data.append(sequence_did)
            y_data.append(y_did)

            sequence_did = []
            y_did = []

            # break

        for e_ in e:
            y_one_conv[error_dict[e_]] = 1

        sequence_did.append(
            np.concatenate(
                [nlp(u).vector,
                nlp(s).vector]
            )
        )
        # print(sequence_did[0].shape)
        y_did.append(y_one_conv)

    sequence_did = np.array(sequence_did)
    y_did = np.array(y_did)
    X_data.append(sequence_did)
    y_data.append(y_did)
    return X_data, y_data
        


In [77]:
X_data, y_data = div_did_error(df_all, error_types)



In [72]:
print(X_data[4].shape)
print(len(X_data))

(10, 600)
67


In [73]:
print(y_data[5].shape)
print(len(y_data))

(10, 17)
67


In [29]:
df_all.shape

(670, 5)

In [78]:
y_data = np.array(y_data)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.20, random_state=5)
        