In [10]:
from pathlib import Path
import os
import json
import copy
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [6]:
# from ..tools.preprocess import Preprocessor
from feature import Feature



In [35]:
def json2data(path):
    cols = ["text", "label", "subLabel"]
    df = pd.DataFrame(index=[], columns=cols)
    files = os.listdir(path)
    for cop in files:
        if "." not in cop:
            continue
        with open(path+cop, "r") as f:
            json_data = json.load(f)
            mode = cop.split(".")[0]
            for data in json_data[mode]:
                text = data["data"]
                label = data["label"][0]
                subLabel = ""
                df = df.append(pd.DataFrame([text, label, subLabel], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [20]:
label_list = "YN WH please plain".split()
label_dict = dict( zip(label_list, range(len(label_list))) )

def extract_X_y(df):
    X = []
    y = []
    for te, la in zip(df.text, df.label):
        X.append(te)
        y.append(label_dict[la])
    return X, y
    

In [31]:
corpus_root = "../corpus"
# name = "question/short"
name = "question"
data_path = "/".join([corpus_root, name]) + "/"
data_path

'../corpus/question/'

In [36]:
df = json2data(data_path)
df


Unnamed: 0,text,label,subLabel
0,メニューを見せていただけますか？,please,
1,おいでいただけますか？,please,
2,マッシュポテトをもらえますか？,please,
3,伝言を預かっていただけますか？,please,
4,ご一緒しませんか？,please,
...,...,...,...
619,あのロボットがどこから来たか、知っておるか？,YN,
620,君は彼女が優勝するチャンスがあると思う？,YN,
621,「その雑誌の最新号を見ましたか？あなたが夢中になっている有名人が表紙に載っていますよ！」,YN,
622,きみたち、おいしいパン屋さんを知らないかな？,YN,


In [37]:
X, y = extract_X_y(df)

In [38]:
X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [39]:
F = Feature()
F.make_features(X_train_str)

X_train = []
X_test = []
for i, x_t_str in enumerate( X_train_str ):
    x = F.featurization(x_t_str)
    X_train.append(x)
for i, x_t_str in enumerate( X_test_str ):
    x = F.featurization(x_t_str)
    X_test.append(x)
X_train = np.array(X_train)
X_test = np.array(X_test)

300


In [46]:
lr = LogisticRegression(solver='sag', max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000000, solver='sag')

In [47]:
y_pred = lr.predict(X_test)

In [48]:
print(label_dict)
for y_p, x_s in zip(y_pred[:10], X_test_str[:10]):
    print("{0} : {1}".format(y_p, x_s))

{'YN': 0, 'WH': 1, 'please': 2, 'plain': 3}
0 : ほかに手はあるかい？
0 : あなたはどう思った？
3 : 寝過ごしちゃったから。
1 : あのロボットがどこから来たか、知っておるか？
0 : そこで何か買ったのかい？
3 : 僕が知っているかぎり、水圧は変更できないよ。
0 : ブン君と見つけたカメを覚えていますか？
1 : そのデイジー、彼がくれたの？
2 : 何を作っていますか？
1 : 議題として挙がっている最初の項目は何かしら。


In [49]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
# print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
# print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
# print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[35  7  4  4]
 [ 8 26  5  3]
 [ 5  2 19  1]
 [ 0  5  0 64]]
accuracy =  0.7659574468085106
