In [9]:
import re
import sys
import json


import sys
sys.dont_write_bytecode = True
sys.path.append('../')
from datatools.analyzer import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

In [11]:
# Directory (relative to this notebook) that holds the labeled evaluation dialogues.
path = "../eval_labeled/"
# Names of the datasets passed to the reader below.
datalist = ['DCM', 'DIT', 'IRS']
# NOTE(review): read_conv is not defined in this notebook; presumably it comes
# from the `from datatools.analyzer import *` wildcard import - confirm.
# `convs` is iterated later as a collection of conversations, each of which is
# itself iterable over utterance objects.
convs = read_conv(path, datalist)

In [12]:

# Number of most recent utterances (the current system utterance included)
# that are kept as the context of one example.  Must be a positive integer.
CONTEXT_SIZE = 5
# Error labels that make an example positive (the [context-form] error group
# according to the original comment).
errors = ["Unclear intention", "Topic transition error", "Lack of information"]

# y[k] is the binary label of utt_list[k]; both lists grow in lockstep.
y = []
utt_list = []
for conv in convs:
    # Running history of every utterance seen so far in this conversation.
    utt_list_conv = []
    for ut in conv:
        utt_list_conv.append(ut.utt)
        # Only system utterances that carry some error annotation become examples.
        if ut.is_system() and ut.is_exist_error():
            # The slice copies the tail of the history, so later appends to
            # utt_list_conv do not alter examples that were already stored.
            utt_list.append(utt_list_conv[-CONTEXT_SIZE:])
            # Positive (1) when at least one of the target errors is annotated.
            y.append(1 if ut.is_error_included(errors) else 0)

In [17]:
from gensim.models import KeyedVectors

# Directory that holds the pretrained word-vector files.
w2v_path = "../../corpus/w2v/"
# fastText vectors; see https://qiita.com/Hironsan/items/513b9f93752ecee9e670
# Alternative vector file kept for reference (not loaded): "dep-ja-300dim"
w2v_name = "model.vec"
# Loads the vectors from a word2vec-format file; this reads the whole file.
w2v_model = KeyedVectors.load_word2vec_format(w2v_path+w2v_name)

In [13]:
# Location of the stored vectors for special symbols.
symbol_path = "../models/base/"
symbol_name = "context_symbol.pickle"
symbolM = DataManager(symbol_path)
# SYMBOL_w2v is used later as a mapping from token to vector and must contain
# the key "[UNK]" (see w2v()).
# NOTE(review): the file extension suggests load_data unpickles the file;
# unpickling can execute arbitrary code, so only load files from a trusted source.
SYMBOL_w2v = symbolM.load_data(symbol_name)

success load : ../models/base/context_symbol.pickle


In [14]:
from gensim.models import KeyedVectors
from tqdm import tqdm
import torch.nn.utils.rnn as rnn

def w2v(word, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    """Return the embedding of one token as a torch tensor.

    Lookup order:
      1. ``SYMBOL_w2v`` (special symbols take precedence),
      2. ``w2v_model`` (regular vocabulary),
      3. the ``"[UNK]"`` entry of ``SYMBOL_w2v`` as the fallback.

    Args:
        word: Token to look up.
        w2v_model: Word-vector container supporting ``in`` and ``[]``.
        SYMBOL_w2v: Mapping from special symbol to its vector; must hold
            the key ``"[UNK]"``.

    Returns:
        The tensor produced by ``torch.from_numpy`` for the selected vector
        (so the stored vectors have to be numpy arrays).

    Raises:
        KeyError: If the token is unknown and ``"[UNK]"`` is missing.
    """
    # Special symbols win over the regular vocabulary.
    if word in SYMBOL_w2v:
        return torch.from_numpy(SYMBOL_w2v[word])

    # Token registered in the word-vector model.
    if word in w2v_model:
        return torch.from_numpy(w2v_model[word])

    # Out-of-vocabulary token.
    return torch.from_numpy(SYMBOL_w2v["[UNK]"])

def sentence2formated(sen, w2v_model, SYMBOL_w2v):
    """Convert raw text into a stacked tensor of token embeddings.

    The text is split with ``sentence2morpheme(sen, sents_span=False)``.
    When the result has fewer than two entries it is handed to
    ``fill_SYMBOL_ONE`` directly; otherwise ``fill_SYMBOL_SEP`` is applied,
    the resulting lists are concatenated into one list, and that single
    list is passed to ``fill_SYMBOL_ONE``.  The first element of the
    ``fill_SYMBOL_ONE`` result is the token sequence that gets embedded.

    NOTE(review): the three helpers are not defined in this notebook;
    presumably they tokenize into per-sentence morpheme lists and insert
    separator / start / end symbols - confirm against datatools.analyzer.

    Args:
        sen: Input passed unchanged to ``sentence2morpheme``.
        w2v_model: Word-vector container forwarded to ``w2v``.
        SYMBOL_w2v: Special-symbol vectors forwarded to ``w2v``.

    Returns:
        ``torch.stack`` of one embedding per token, in token order.
    """
    morphemes = sentence2morpheme(sen, sents_span=False)

    if len(morphemes) >= 2:
        # Several entries: apply the separator helper, then merge the
        # resulting lists into one flat token list.
        separated = fill_SYMBOL_SEP(morphemes)
        merged = sum( separated, [] )
        tokens = fill_SYMBOL_ONE( [merged] )[0]
    else:
        # Zero or one entry: no separator handling is needed.
        tokens = fill_SYMBOL_ONE(morphemes)[0]

    vectors = [w2v(token, w2v_model, SYMBOL_w2v) for token in tokens]
    return torch.stack(vectors)

def make_X(utt_list:list, w2v_model, SYMBOL_w2v):
    """Embed every item of ``utt_list`` and pad them into one batch tensor.

    Each item is converted with ``sentence2formated``; a progress bar is
    shown while iterating.  The per-item tensors are then padded to a
    common length with ``rnn.pad_sequence(..., batch_first=True)``, so the
    batch dimension comes first in the result.

    Args:
        utt_list: Items to embed, one example each.
        w2v_model: Word-vector container forwarded to ``sentence2formated``.
        SYMBOL_w2v: Special-symbol vectors forwarded to ``sentence2formated``.

    Returns:
        The padded batch tensor.
    """
    # One embedded token sequence per example,
    # e.g. ["FOS", "aa", "[SEP]", "bb", "EOS"] for a single item.
    embedded = [
        sentence2formated(item, w2v_model, SYMBOL_w2v)
        for item in tqdm(utt_list)
    ]
    return rnn.pad_sequence(embedded, batch_first=True)

In [18]:
# Embed and pad every collected context; X is the padded batch tensor
# returned by make_X (batch dimension first).  The recorded run took
# about 2 minutes for 1386 examples.
X = make_X(utt_list, w2v_model, SYMBOL_w2v)

  
100%|██████████| 1386/1386 [02:18<00:00, 10.01it/s]


In [15]:
from gensim.models import KeyedVectors

class LSTMClassifier(nn.Module):
    """Bidirectional-LSTM sequence classifier with max-over-time pooling.

    Pipeline of ``forward``:
      1. run the embedded sequence through a bidirectional LSTM,
      2. concatenate the LSTM output with the input embeddings per time step,
      3. project with a linear layer followed by tanh,
      4. take the maximum over the time axis for every feature,
      5. project to ``tagset_size`` and apply log-softmax.

    Args:
        embedding_dim: Size of each input embedding vector.
        hidden_dim: Size of the intermediate representation.  Each LSTM
            direction uses ``hidden_dim // 2`` units.
        tagset_size: Number of output classes.
        w2v_model: Word-vector container; only stored on the instance.
        SYMBOL_w2v: Special-symbol vectors; only stored on the instance.
    """

    def __init__(self, embedding_dim, hidden_dim, tagset_size, w2v_model, SYMBOL_w2v):
        super(LSTMClassifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # Each direction gets half of hidden_dim, so the concatenated LSTM
        # output has 2 * (hidden_dim // 2) features.
        lstm_hidden = hidden_dim // 2
        self.bilstm = nn.LSTM(embedding_dim, lstm_hidden, batch_first=True, bidirectional=True)

        # forward() feeds [lstm_out, e] into this layer, so its input width
        # is the LSTM output width plus embedding_dim.  This equals the
        # former hard-coded embedding_dim * 3 when hidden_dim is exactly
        # 2 * embedding_dim, and also works for every other size.
        self.xtoy_2 = nn.Linear(2 * lstm_hidden + embedding_dim, hidden_dim)
        self.y3toy = nn.Linear(hidden_dim, tagset_size)

        # Log-softmax over the class axis of the (batch, tagset_size) logits.
        # dim is given explicitly; the implicit choice is deprecated.
        self.softmax = nn.LogSoftmax(dim=1)
        self.tanh = nn.Tanh()

        # Kept on the instance; not used by forward().
        self.w2v_model = w2v_model
        self.SYMBOL_w2v = SYMBOL_w2v

    def pooling(self, A):
        """Max-over-time pooling of a 3-D tensor.

        Args:
            A: Tensor indexed as ``A[batch, time, feature]``.

        Returns:
            Tensor of shape ``(batch, feature)`` holding, for every feature,
            the maximum value over the time axis.  The result lives on the
            same device as ``A`` (no GPU is required).
        """
        # A single reduction replaces the former per-element Python loop.
        return torch.max(A, dim=1)[0]

    def pooling_2(self, A):
        """Max-over-time pooling implemented with ``nn.MaxPool1d``.

        Args:
            A: Tensor indexed as ``A[batch, time, feature]``.  A 2-D tensor
                is treated as a batch containing a single sequence.

        Returns:
            Tensor of shape ``(batch, feature)``.
        """
        if len(A.shape) == 2:
            A = A.unsqueeze(0)
        b_len = len(A)
        seq_len = len(A[0])
        # One pooling window spanning the entire sequence.
        m = nn.MaxPool1d(seq_len, stride=seq_len)
        # MaxPool1d pools over the last axis, so move time to the end.
        B = A.permute((0, 2, 1))
        return m(B).reshape(b_len, -1)

    def forward(self, e):
        """Classify a batch of embedded sequences.

        Args:
            e: Float tensor of shape ``(batch, seq_len, embedding_dim)``.

        Returns:
            Log-probabilities of shape ``(batch, tagset_size)``.
        """
        out, hc = self.bilstm(e)
        # Per time step: LSTM features followed by the raw embedding.
        x = torch.cat([out, e], dim=2)
        y_2 = self.tanh(self.xtoy_2(x))
        y_3 = self.pooling_2(y_2)
        y = self.softmax(self.y3toy(y_3))
        return y

In [16]:
# Location of the stored classifier.
model_path = "../models/base/"
model_name = "context_form.pickle"
modelM = DataManager(model_path)
# NOTE(review): the file extension suggests load_data unpickles a whole model
# object; if so, the LSTMClassifier class must already be defined in this
# kernel, and the file must come from a trusted source because unpickling can
# execute arbitrary code - confirm against DataManager.
model = modelM.load_data(model_name)

success load : ../models/base/context_form.pickle


In [19]:
# Run inference on whatever device the loaded model already lives on instead
# of assuming 'cuda:0'.
device = next(model.parameters()).device
# Evaluation mode disables training-only behaviour in layers that have any.
model.eval()
with torch.no_grad():
    # X is already a tensor, so move/cast it rather than re-wrapping it with
    # torch.tensor(), which is the discouraged copy-construct form.
    X_tensor = X.to(device=device, dtype=torch.float)
    y_tensor = torch.tensor(y, device=device, dtype=torch.long)
    # Predicted class = index of the highest log-probability per example,
    # returned as a numpy array for the sklearn metrics below.
    y_pred = model(X_tensor).argmax(dim=1).cpu().numpy()

  
  self.dropout, self.training, self.bidirectional, self.batch_first)


In [20]:
# The scorers are taken from the explicitly imported `sklearn.metrics` module
# so this cell does not depend on names leaked by a wildcard import.
# Precision, recall and F1 are computed for the positive class (label 1).
print('confusion matrix = \n', metrics.confusion_matrix(y_true=y, y_pred=y_pred))
print('accuracy = ', metrics.accuracy_score(y_true=y, y_pred=y_pred))
print('precision = ', metrics.precision_score(y_true=y, y_pred=y_pred))
print('recall = ', metrics.recall_score(y_true=y, y_pred=y_pred))
print('f1 score = ', metrics.f1_score(y_true=y, y_pred=y_pred))

confusion matrix = 
 [[284 398]
 [148 556]]
accuracy =  0.6060606060606061
precision =  0.5828092243186582
recall =  0.7897727272727273
f1 score =  0.6706875753920386


- 過去5発話での結果

        confusion matrix = 
        [[284 398]
        [148 556]]
        accuracy =  0.6060606060606061
        precision =  0.5828092243186582
        recall =  0.7897727272727273
        f1 score =  0.6706875753920386