In [1]:
def read_json(path:str, datalist:list) -> pd.DataFrame:
    cols = ['did', 'tid', 'usr', 'sys', 'ec']
    datalist = ['DCM', 'DIT', 'IRS']
    df = pd.DataFrame(index=[], columns=cols)

    for p in datalist:
        datapath = Path(path + p + '/')
        for file_ in datapath.glob("*.json"):
            # print(file_)
            with open(file_, "r") as f:
                json_data = json.load(f)
                did = json_data["dialogue-id"]
                for t in json_data["turns"]:
                    if t["turn-index"] == 0:
                        continue
                    if t["speaker"] == "U":
                        usr = t["utterance"]
                        continue
                    if t["speaker"] == "S" and t["error_category"] != None:
                        tid = t["turn-index"]
                        sys = t["utterance"]
                        ec = t["error_category"]
                        df = df.append(pd.DataFrame([did, tid, usr, sys, ec], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

NameError: name 'pd' is not defined

In [None]:
# Path to the development data
path = './error_category_classification/dbdc5_ja_dev_labeled/'
# Names of the dialogue systems
datalist = ['DCM', 'DIT', 'IRS']
# List of error types
error_types = ['Ignore question', 'Unclear intention', 'Wrong information', 'Topic transition error', 'Lack of information', 
'Repetition', 'Semantic error', 'Self-contradiction', 'Contradiction', 'Grammatical error', 'Ignore offer', 
'Ignore proposal', 'Lack of sociality', 'Lack of common sense', 'Uninterpretable', 'Ignore greeting']
#print('Number of error types:', len(error_types))

In [None]:
from pathlib import Path
import json
import pandas as pd
import collections
import numpy as np

import spacy

In [None]:
df = read_json(path, datalist)
df

In [None]:
# Make target (Multilabel)
y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df.ec])

# Display data statistics
print('Number of breakdowns: ', y.shape[0])
print('-- Frequency of labels --')
for e,c in zip(error_types, sum(y)):
  print(e,c)
print('-- Frequency of sets of labels (sorted) --')
df['ec'].apply(tuple).value_counts()

In [None]:
def feature_extraction(df:pd.DataFrame) -> np.array:
    nlp = spacy.load('ja_ginza')

  # Make feature vector
    return np.array([np.concatenate([nlp(u).vector, nlp(s).vector]) for u,s in zip(df.usr, df.sys)])

In [None]:
def predict_at_least_oneClass(clf, X) -> np.array:
  y_pred = clf.predict(X)
  p = clf.predict_proba(X)
  proba = np.array([[p[c][i][1] if (p[c][i].shape[0]!=1) else 0 
                     for c in range(len(error_types))] for i in range(len(X))])
  # replace [] to the highest probability label
  y_pred2 = np.empty((0, len(error_types)), int)
  for y, pr in zip(y_pred, proba):
    if  (sum(y) == 0):
      ans = np.zeros_like(y)
      ans[np.argmax(pr)] = 1
    else:
      ans = y
    y_pred2 = np.append(y_pred2, np.array([ans]), axis=0)
  return y_pred2

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

# read development data
df = read_json(path, datalist)

# feature extraction
X = feature_extraction(df)

# Make target (Multilabel)
y = np.array([[1 if (i in ec) else 0 for i in error_types] for ec in df.ec])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

clf = MultiOutputClassifier(AdaBoostClassifier()).fit(X_train, y_train)
y_pred = predict_at_least_oneClass(clf, X_test)

print('EM:', metrics.accuracy_score(y_test, y_pred))
print('F-measure: ', metrics.f1_score(y_test, y_pred, average='samples'))

In [None]:
X.shape

In [None]:
nlp = spacy.load('ja_ginza')
nlp(df.usr[0]).vector.shape

In [None]:
len(error_types)

In [None]:
df.ec

In [None]:
ec