In [92]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [93]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

In [94]:
# Directory with the hand-labeled dialogues; the data read here feeds the
# train/test split further down.
path = "../hand_labeled/"
# Dataset identifiers handed to read_conv together with the directory.
datalist = ['DCM', 'DIT', 'IRS']
# read_conv is not defined in this notebook (it arrives via a star import);
# presumably it returns a list of conversations -- TODO confirm.
convs = read_conv(path, datalist)

In [95]:
def make_Xy_4test(convs, N=4):
    """Build context windows and binary labels from error-carrying utterances.

    Every utterance is cleaned with ``clean_text`` and appended to the running
    history of its conversation. For each utterance whose ``is_exist_error()``
    is truthy, the last ``N`` history entries (left-padded with empty strings
    at the start of a conversation) become one sample.

    Args:
        convs: iterable of conversations, each an iterable of utterance
            objects exposing ``utt``, ``is_exist_error()`` and
            ``is_error_included(errors)``.
        N: number of utterances per context window.

    Returns:
        ``(X, y)`` where ``X`` is a list of string lists (``N`` items each for
        ``N >= 1``) and ``y`` is the parallel list of labels: 1 when
        ``is_error_included`` is truthy for the target error types, else 0.
    """
    target_errors = ["Topic transition error", 'Lack of information', 'Unclear intention']

    samples, labels = [], []
    for conv in convs:
        # Padding guarantees a full window even for the first utterances.
        history = [""] * N
        for ut in conv:
            # Original note (translated): driven by user utterances.
            history.append(clean_text(ut.utt))
            if not ut.is_exist_error():
                continue
            samples.append(history[-N:])
            labels.append(1 if ut.is_error_included(target_errors) else 0)

    return samples, labels

In [96]:
# Context window size: number of consecutive utterances per sample.
N = 3
X_str, y = make_Xy_4test(convs, N=N)
# Number of samples labeled 1 (displayed as the cell output).
y.count(1)

715

In [97]:
from sentence_transformers import SentenceTransformer
# from sentence_transformers import models

# Local checkpoint directory to load; the commented-out path is an alternative
# checkpoint that can be swapped in.
# bert_path = "../../corpus/pretrained/sbert_unclear1"
bert_path = "../../corpus/pretrained/sbert_context_form2"
# Sentence encoder used below to embed every utterance.
sbert = SentenceTransformer(bert_path)

[11555] 2022-01-13 13:08:26,555 Info sentence_transformers.SentenceTransformer :Load pretrained SentenceTransformer: ../../corpus/pretrained/sbert_context_form2
[11555] 2022-01-13 13:08:27,221 Info sentence_transformers.SentenceTransformer :Use pytorch device: cuda


In [98]:
# Flatten the per-sample windows into one list so all utterances are encoded in
# a single call. The comprehension is linear in the number of utterances,
# whereas sum(list_of_lists, []) re-copies the accumulated list at every step.
X_forward_all_str = [utt for window in X_str for utt in window]

# Each window contributes N utterances, so this is the number of samples.
x_length = len(X_forward_all_str)//N
# Encode, then regroup the flat embeddings as (sample, position in window, embedding).
X_topic_vec = sbert.encode(X_forward_all_str).reshape(x_length, N, -1)

Batches:   0%|          | 0/127 [00:00<?, ?it/s]

In [99]:
def vec2feature(vector):
    """Turn a stacked pair of vectors into one flat feature vector.

    Args:
        vector: array whose first two entries along axis 0 are the vectors to
            compare.

    Returns:
        1-D array holding every element of ``vector`` (flattened), followed by
        the element-wise absolute difference ``|vector[0] - vector[1]|``.
    """
    first, second = vector[0], vector[1]
    abs_gap = np.abs(first - second)
    flat = vector.flatten()
    return np.concatenate((flat, abs_gap))

In [100]:
# Embedding width of the sentence encoder; also used further down to size the model.
emb_dim = 768
def sentence2formated(vectors):
    """Convert a sequence of embeddings into pairwise (previous, current) features.

    Each step's feature is ``vec2feature`` applied to the stacked pair
    ``[previous, current]``. The first step uses an all-zero vector as its
    "previous" embedding.

    The width of that zero vector is taken from the first input vector instead
    of the module-level ``emb_dim``, so the function keeps working when the
    encoder's embedding size differs from 768. The zero vector stays float64,
    as before.

    Args:
        vectors: iterable of 1-D embedding vectors of equal length.

    Returns:
        ``np.ndarray`` with one feature row per input vector; an empty array
        when ``vectors`` is empty.
    """
    features = []
    prev_vector = None
    for vector in vectors:
        if prev_vector is None:
            # No predecessor for the first item: compare against zeros.
            prev_vector = np.zeros(np.shape(vector)[-1])
        features.append(vec2feature(np.array([prev_vector, vector])))
        prev_vector = vector
    return np.array(features)

In [101]:
# Apply sentence2formated to each sample's embeddings in X_topic_vec and stack
# the per-sample results into one array.
X_topic = np.array([ sentence2formated(vec) for vec in X_topic_vec ])

In [102]:
# Wrap the features as a torch tensor.
X = torch.from_numpy(X_topic)
# NOTE(review): this rebinds y from the label list to a tensor, so re-running
# this cell operates on the tensor rather than on the original list.
y = torch.Tensor(y)

In [103]:
# Hold out 30% of the samples for testing. stratify=y splits in a label-aware
# way and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5, stratify=y)

In [104]:
class Datasets(torch.utils.data.Dataset):
    """Map-style dataset pairing each feature entry with its label."""

    def __init__(self, X_data, y_data):
        """Store the features and labels and remember the sample count.

        Args:
            X_data: indexable collection of features.
            y_data: indexable collection of labels, parallel to ``X_data``.
        """
        self.X_data, self.y_data = X_data, y_data
        # The length is fixed at construction time from the features.
        self.datanum = len(X_data)

    def __len__(self):
        """Return the number of samples recorded at construction."""
        return self.datanum

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair stored at ``idx``."""
        return self.X_data[idx], self.y_data[idx]

In [105]:
import copy
class TopicClassifier2(nn.Module):
    """LSTM classifier over a sequence of per-step feature vectors.

    Only the first ``topic_dim`` features of every time step are used. They are
    fed through a single LSTM, and the final hidden state is projected to
    ``tagset_size`` log-probabilities.

    Args:
        topic_dim: number of leading features per time step fed to the LSTM.
        forward_dim: stored as ``flen`` but not used by ``forward``; kept so the
            constructor signature stays compatible.
        topic_hid: hidden size of the LSTM.
        for_hid: unused; kept so the constructor signature stays compatible.
        tagset_size: number of output classes.
    """

    def __init__(self, topic_dim, forward_dim, topic_hid, for_hid, tagset_size):
        # Parent-class constructor (required boilerplate for nn.Module).
        super().__init__()
        self.tlen = topic_dim
        self.flen = forward_dim
        self.tlstm = nn.LSTM(topic_dim, topic_hid, batch_first=True)
        self.hid2out = nn.Linear(topic_hid, tagset_size)
        # Not used by forward(); kept so existing attribute access keeps working.
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        # forward() applies this to a (batch, tagset_size) tensor, so the class
        # axis is 1. Omitting dim relies on a deprecated implicit choice that
        # emits a warning on every call.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, tagset_size)``.

        Args:
            x: tensor indexed as ``(batch, step, feature)``; features beyond the
                first ``topic_dim`` are ignored.
        """
        x_topic = x[:, :, :self.tlen].to(torch.float)
        # The LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        _, (h_n, _c_n) = self.tlstm(x_topic)
        # h_n is (num_layers, batch, hidden); index 0 selects the only layer.
        out = self.hid2out(h_n[0])
        return self.softmax(out)

In [106]:
# Mini-batch size and number of passes over the training set.
BATCH_SIZE = 64
epoch_ = 150
trainset = Datasets(X_train, y_train)
# shuffle=True reorders the samples each epoch; num_workers=2 loads batches in
# two worker processes.
trainloader = torch.utils.data.DataLoader(trainset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

In [107]:
# Width of one time step: vec2feature concatenates the previous embedding, the
# current embedding and their absolute difference, i.e. three embeddings.
TOPIC_DIM = emb_dim*3
# FORWARD_DIM and FOR_HID_DIM are passed to TopicClassifier2, whose layers that
# would consume them are currently commented out.
FORWARD_DIM = 256
TOPIC_HID_DIM = emb_dim
FOR_HID_DIM = FORWARD_DIM//2
# Two output classes, matching the 0/1 labels built by make_Xy_4test.
OUTPUT_DIM = 2

In [108]:
model = TopicClassifier2(TOPIC_DIM, FORWARD_DIM, TOPIC_HID_DIM, FOR_HID_DIM, OUTPUT_DIM)
# Move the model to the GPU only when one is available.
if torch.cuda.is_available():
   model.cuda()
# NLLLoss pairs with the log-probabilities produced by the model's LogSoftmax.
loss_function = nn.NLLLoss()
# loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [109]:
# Train on whichever device the model lives on. The model is moved to the GPU
# only when CUDA is available, so calling .cuda() on every batch would crash on
# CPU-only machines.
device = next(model.parameters()).device

losses = []  # summed batch loss, one entry per epoch
for epoch in tqdm(range(epoch_)):
    all_loss = 0
    for batch_X, batch_y in trainloader:
        X_t_tensor = batch_X.to(device)
        # NLLLoss expects integer class indices as targets.
        y_t_tensor = batch_y.to(torch.long).to(device)

        # The optimizer was built from model.parameters(), so this clears every
        # gradient of the model; a separate model.zero_grad() is redundant.
        optimizer.zero_grad()

        score_ = model(X_t_tensor)
        loss_ = loss_function(score_, y_t_tensor)
        loss_.backward()
        all_loss += loss_.item()
        optimizer.step()
    losses.append(all_loss)
    # Report progress every 50 epochs.
    if (epoch+1) % 50 == 0:
        print("epoch", epoch+1, "\t" , "loss", all_loss)
print("done")

 33%|███▎      | 50/150 [00:13<00:27,  3.61it/s]

epoch 50 	 loss 0.0012779679309460334


 67%|██████▋   | 100/150 [00:26<00:13,  3.59it/s]

epoch 100 	 loss 0.00017367821874358924


100%|██████████| 150/150 [00:40<00:00,  3.73it/s]

epoch 150 	 loss 5.510256528395985e-05
done





In [110]:
with torch.no_grad():
    # Run on the model's own device instead of a hard-coded 'cuda:0', so the
    # cell also works on CPU-only machines. as_tensor accepts both tensors and
    # arrays and avoids the copy-construct warning that torch.tensor(<tensor>)
    # raises.
    device = next(model.parameters()).device
    X_tensor = torch.as_tensor(X_test, device=device).float()
    # Inference: the predicted class is the index of the largest log-probability.
    y_pred = model(X_tensor).cpu().numpy().argmax(axis=1)

  


In [111]:
# Evaluate on the held-out split. score is not defined in this notebook (it
# arrives via a star import); the recorded output shows it reporting a confusion
# matrix, accuracy, precision, recall and F1.
score(y_test, y_pred)

confusion matrix = 
 [[153  37]
 [ 35 180]]
accuracy =  0.8222222222222222
precision =  0.8294930875576036
recall =  0.8372093023255814
f1 score =  0.8333333333333333


In [112]:
# Second labeled set, read from a different directory than the training data
# and used below for evaluation only. Note that path and datalist are rebound here.
path = "../eval_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs_ = read_conv(path, datalist)

In [113]:
# Same window size as used for training.
N = 3
# NOTE(review): rebinds X_str and y, replacing the training-set values that
# earlier cells produced under the same names.
X_str, y = make_Xy_4test(convs_, N=N)

In [114]:
# Flatten the per-sample windows into one list so all utterances are encoded in
# a single call. The comprehension is linear in the number of utterances,
# whereas sum(list_of_lists, []) re-copies the accumulated list at every step.
X_forward_all_str = [utt for window in X_str for utt in window]

# Each window contributes N utterances, so this is the number of samples.
x_length = len(X_forward_all_str)//N
# Encode, then regroup the flat embeddings as (sample, position in window, embedding).
X_topic_vec = sbert.encode(X_forward_all_str).reshape(x_length, N, -1)

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

In [115]:
# Apply sentence2formated to each evaluation sample's embeddings and stack the
# per-sample results into one array (rebinds X_topic).
X_topic = np.array([ sentence2formated(vec) for vec in X_topic_vec ])

In [116]:
# Wrap the evaluation features and labels as torch tensors.
# NOTE(review): X and y are rebound here, replacing the tensors the
# train/test split was made from.
X = torch.from_numpy(X_topic)
y = torch.Tensor(y)

In [117]:
with torch.no_grad():
    # Run on the model's own device instead of a hard-coded 'cuda:0', so the
    # cell also works on CPU-only machines. X is already a tensor, so as_tensor
    # avoids the copy-construct warning that torch.tensor(<tensor>) raises.
    device = next(model.parameters()).device
    X_tensor = torch.as_tensor(X, device=device).float()
    # The predicted class is the index of the largest log-probability.
    y_pred = model(X_tensor).cpu().numpy().argmax(axis=1)

  


In [118]:
# Number of evaluation samples labeled 1. y is a tensor at this point, so it is
# turned into a list first in order to use list.count.
list(y).count(1)

704

In [119]:
# Evaluate the trained model on the separate evaluation set. score is not
# defined in this notebook (it arrives via a star import).
score(y, y_pred)

confusion matrix = 
 [[478 204]
 [172 532]]
accuracy =  0.7287157287157288
precision =  0.7228260869565217
recall =  0.7556818181818182
f1 score =  0.7388888888888889


- sbert context form v2

        confusion matrix = 
        [[478 204]
        [172 532]]
        accuracy =  0.7287157287157288
        precision =  0.7228260869565217
        recall =  0.7556818181818182
        f1 score =  0.7388888888888889

- sbert unclear1

        confusion matrix = 
        [[444 238]
        [181 523]]
        accuracy =  0.6976911976911977
        precision =  0.6872536136662286
        recall =  0.7428977272727273
        f1 score =  0.7139931740614335