In [12]:
from pathlib import Path
import os
import json
import copy
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [13]:
# from ..tools.preprocess import Preprocessor
import sys
sys.dont_write_bytecode = True
sys.path.append('../')
from tools.maneger import DataManager
# from utterance.feature import Feature
from feature import Feature

In [14]:
def json2data(path):
    cols = ["text", "label", "subLabel"]
    df = pd.DataFrame(index=[], columns=cols)
    files = os.listdir(path)
    for cop in files:
        if "." not in cop:
            continue
        with open(path+cop, "r") as f:
            json_data = json.load(f)
            mode = cop.split(".")[0]
            for data in json_data[mode]:
                text = data["data"]
                label = data["label"][0]
                subLabel = ""
                df = df.append(pd.DataFrame([text, label, subLabel], index = cols).T)
    df.reset_index(inplace=True, drop=True)
    return df

In [15]:
label_list = "YN WH please plain".split()
label_dict = dict( zip(label_list, range(len(label_list))) )

def extract_X_y(df):
    X = []
    y = []
    for te, la in zip(df.text, df.label):
        X.append(te)
        y.append(label_dict[la])
    return X, y
    

In [16]:
corpus_root = "../../corpus"
# name = "question/short"
name = "question"
data_path = "/".join([corpus_root, name]) + "/"
data_path

'../../corpus/question/'

In [17]:
df = json2data(data_path)
df


Unnamed: 0,text,label,subLabel
0,メニューを見せていただけますか？,please,
1,おいでいただけますか？,please,
2,マッシュポテトをもらえますか？,please,
3,伝言を預かっていただけますか？,please,
4,ご一緒しませんか？,please,
...,...,...,...
1606,前に会ったことがあるのですか？,YN,
1607,そこでは何か特別なことをしましたか？,YN,
1608,もう質問は終わったかい？,YN,
1609,よく眠れた？,YN,


In [18]:
X, y = extract_X_y(df)

In [19]:
X_train_str, X_test_str, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [20]:
F_path = "../X_y_data/response/"
F_name = "typeClassify_F.dill"
featureM = DataManager(F_path, format_="dill")

if featureM.is_exist(F_name):
    F = featureM.load_data(F_name)
else:
    F = Feature()
    F.make_features(X_train_str)
    featureM.save_data(F_name, F)

300
success save : ../X_y_data/response/typeClassify_F.dill


In [21]:


X_train = []
X_test = []
for i, x_t_str in enumerate( X_train_str ):
    x = F.featurization(x_t_str)
    X_train.append(x)
for i, x_t_str in enumerate( X_test_str ):
    x = F.featurization(x_t_str)
    X_test.append(x)
X_train = np.array(X_train)
X_test = np.array(X_test)

In [22]:
lr = LogisticRegression(solver='sag', max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000, solver='sag')

In [23]:
y_pred = lr.predict(X_test)

In [24]:
print(label_dict)
for y_p, x_s in zip(y_pred[:10], X_test_str[:10]):
    print("{0} : {1}".format(y_p, x_s))

{'YN': 0, 'WH': 1, 'please': 2, 'plain': 3}
0 : いっしょに行きたいんじゃない？
3 : あなたがネガティブ・フォース卿ですね。
0 : 君は、僕が正直だと思うかい？
3 : 締め切りに間に合わせることができました。
3 : ぼくたち、学校の図書館のことで校長先生にお話ししたいのですが。
3 : 好きだわ。
3 : あれはりんごの木？
3 : イケブクロ博士のところに持って帰ります。
0 : 前の携帯からうまくデータを移行できたの？
0 : 机の中にあるのかな？


In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_pred))
# print('precision = ', precision_score(y_true=y_test, y_pred=y_pred))
# print('recall = ', recall_score(y_true=y_test, y_pred=y_pred))
# print('f1 score = ', f1_score(y_true=y_test, y_pred=y_pred))

confusion matrix = 
 [[ 59   4  13  18]
 [  6  13   1   4]
 [ 11   0  70   6]
 [  1   1   3 274]]
accuracy =  0.859504132231405


In [26]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()


In [27]:
model_path = "../models/response/"
model_name = "typeClassify_M.pickle"
modelM = DataManager(model_path)
print(model_name)

typeClassify_M.pickle


In [28]:
modelM.save_data(model_name, lr)

success save : ../models/response/typeClassify_M.pickle


In [29]:
# accuracy =  0.7913907284768212
# accuracy =  0.8326446280991735
# accuracy =  0.8367768595041323

In [33]:
F2_path = "../X_y_data/response/"
F2_name = "typeClassify_F2.dill"
featureM2 = DataManager(F2_path, format_="dill")

F2 = Feature()
F2.make_features(X_train_str[:3])
featureM2.save_data(F2_name, F2)

300
success save : ../X_y_data/response/typeClassify_F2.dill


In [34]:
F2.featurization("そうですね．最近熱いですから")

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [35]:
F2.set_preprocessor()

TypeError: set_preprocessor() missing 1 required positional argument: 'pre'