In [1]:
import re
import sys
import json


import sys
sys.dont_write_bytecode = True
sys.path.append('../')
from datatools.analyzer import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics



In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

In [13]:
path = "../../corpus/func_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs = read_conv(path, datalist)

In [30]:
usr_sys = []
y = []
utt_list = []
errors = ["Ignore question", "Ignore offer", "Ignore proposal", "Ignore greeting"]
for conv in convs:
    for i, ut in enumerate( conv ) :
        utt_list.append(ut.utt)
        # システム発話で，無視系統のエラー
        # if ut.is_system() and ut.is_exist_error():
        if ut.is_system():
            usr_sys.append( [conv[i-1].utt, ut.utt] )
            if ut.is_error_included(errors):
                y.append(1)
            else:
                y.append(0)

In [48]:
from tqdm import tqdm
def make_vocab_dict(text:str):
    vocab_dict = dict()
    doc = nlp(text)
    print("analyzed vocab text")
    vocab_dict["[PAD]"] = 0
    for token in tqdm(doc):
        key = token.lemma_
        if key not in vocab_dict:
            vocab_dict[key] = len(vocab_dict)

    vocab_dict["[UNK]"] = len(vocab_dict)
    return vocab_dict


In [49]:
vocab_text = " ".join(utt_list)

In [50]:
vocab_dict = make_vocab_dict(vocab_text)
vocab_size = len(vocab_dict)

100%|██████████| 58287/58287 [00:00<00:00, 1859636.53it/s]

analyzed vocab text





In [53]:
vocab_path = "../X_y_data/base/"
vocab_name = "vocab_dict.pickle"
vocabM = DataManager(vocab_path)

In [54]:
vocabM.save_data(vocab_name, vocab_dict)

success save : ../X_y_data/base/vocab_dict.pickle


In [56]:
data_path = "../X_y_data/base/"
data_name = "response_Xy_ver{0}.pickle".format(1)
dataM = DataManager(data_path)

In [57]:
def sentence2ids(sentence:str, vocab_dict:dict):
    doc = nlp(sentence)
    ids = np.zeros(len(doc))
    for i, token in enumerate(doc):
        key = token.lemma_
        if key in vocab_dict:
            ids[i] = vocab_dict[key]
        else:
            ids[i] = vocab_dict["[UNK]"]
    return ids


In [107]:
from tqdm import tqdm
import torch.nn.utils.rnn as rnn

def padding_vector(Xseq):
    Xseq = [ torch.tensor( xseq[:, None] ) for xseq in Xseq]
    Xseq = rnn.pad_sequence(Xseq, batch_first=True)
    Xseq = torch.Tensor( [ torch.flatten(xseq) for xseq in Xseq ] )
    return Xseq


def make_X(usr_sys:list, vocab_dict:dict):
    usr_id_list = []
    sys_id_list = []
    for turn in tqdm( usr_sys ) :
        usr_id = sentence2ids(turn[0], vocab_dict)
        usr_id_list.append(usr_id)

        sys_id = sentence2ids(turn[1], vocab_dict)
        sys_id_list.append(sys_id)
    
    # usr_id_pad = rnn.pad_sequence(torch.Tensor( usr_id_list) , batch_first=True)
    # sys_id_pad = rnn.pad_sequence(torch.Tensor( sys_id_list), batch_first=True)
    usr_id_pad = padding_vector(usr_id_list)
    sys_id_pad = padding_vector(sys_id_list)

    X =  torch.Tensor( [ torch.Tensor([u, s]) for u, s in zip(usr_id_pad, sys_id_pad) ] ) 
    return X


In [108]:
X = make_X(usr_sys[:10], vocab_dict)

100%|██████████| 10/10 [00:00<00:00, 50.73it/s]


ValueError: only one element tensors can be converted to Python scalars

In [None]:
# dataM.save_data(data_name, [X, y])

success save : ../X_y_data/base/response_Xy_ver1.pickle


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

In [None]:
class Datasets(torch.utils.data.Dataset):
    def __init__(self, X_data, y_data):
        # self.transform = transform

        self.X_data = X_data
        self.y_data = y_data

        self.datanum = len(X_data)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        out_X = self.X_data[idx]
        out_y = self.y_data[idx]

        return out_X, out_y

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, batch_size):
        # 親クラスのコンストラクタ。決まり文句
        super(LSTMClassifier, self).__init__()
        # 隠れ層の次元数。これは好きな値に設定しても行列計算の過程で出力には出てこないので。    
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim,  padding_idx=0)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim//2, batch_first=True,  bidirectional=True )
        self.lstm2 = nn.LSTM(embedding_dim, hidden_dim//2, batch_first=True,  bidirectional=True )
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # softmaxのLog版。dim=0で列、dim=1で行方向を確率変換。
        self.softmax = nn.LogSoftmax()
    
    def forward(self, x):
        #embeds.size() = (batch_size × len(sentence) × embedding_dim)
        usr_ = x[0]
        sys_ = x[1]
        embeds = self.word_embeddings(x)
        _, lstm1_out = self.lstm1(embeds)
        # print(hidden_layer)
        # bilstm_out = torch.cat([lstm_out[0][0], lstm_out[0][1]], dim=1)
        tag_space = self.hidden2tag(lstm1_out[0])
        # y = self.hidden2tag(hidden_layer[0].view(batch_size, -1))

        # y = self.hidden2tag(bilstm_out)
        y =self.softmax(tag_space.squeeze())
        return y

In [None]:
BATCH_SIZE = 32
epoch_ = 600
trainset = Datasets(X_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

In [None]:
VOCAB_SIZE = len(vocab_dict)
EMBEDDING_DIM = 128
HIDDEN_DIM = EMBEDDING_DIM*2
OUTPUT_DIM = EMBEDDING_DIM

In [None]:
np.array(X_train)

ValueError: only one element tensors can be converted to Python scalars

In [None]:
type(X_train[0][0])

torch.Tensor