In [3]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim
import torch.nn.utils.rnn as rnn

In [5]:
# Read the persona dialogue corpus (JSON, UTF-8) into `convs`.
# make_Xy iterates over convs["convs"], where each entry is a dict whose first key
# maps to the list of utterances of one dialogue.
with open("../../corpus/NTT/persona.json", "r", encoding="utf-8") as f:
    convs = json.load(f)

In [6]:
import random
# Seed Python's global RNG: make_Xy shuffles its pool of replacement utterances
# with random.shuffle, so this fixes which utterances are used as negatives.
random.seed(0)

def make_Xy(convs, n=4, rate=3):
    """Build fixed-length utterance windows labelled as coherent (0) or corrupted (1).

    Every dialogue in ``convs["convs"]`` is a dict whose first key maps to the
    list of its utterances. A window of ``n`` consecutive utterances is emitted
    for each end position ``i`` in ``range(n-1, len(dialogue))``:

    * ``i % rate != 0`` -> the untouched window, label ``0`` (positive example);
    * ``i % rate == 0`` -> the first ``n-1`` utterances followed by an utterance
      taken from a shuffled pool of all utterances, label ``1`` (negative example).

    Pool utterances are consumed in order, one per negative example. The
    shuffle uses the global ``random`` state.

    Args:
        convs: Corpus dict with a ``"convs"`` list as described above.
        n: Number of utterances per window.
        rate: Modulus deciding which end positions become negative examples.

    Returns:
        Tuple ``(windows, labels)`` of equal length.
    """
    dialogues = convs["convs"]

    # Pool every utterance of every dialogue, then shuffle it once.
    distractors = []
    for dialogue in tqdm(dialogues):
        first_key = list(dialogue)[0]
        distractors.extend(dialogue[first_key])
    random.shuffle(distractors)

    windows, labels = [], []
    next_distractor = 0

    for dialogue in tqdm(dialogues):
        utterances = dialogue[list(dialogue)[0]]
        for last in range(n - 1, len(utterances)):
            start = last - n + 1
            corrupt = (last % rate == 0)
            if corrupt:
                # Negative example: keep the context, swap in a pooled utterance.
                context = utterances[start:start + n - 1]
                windows.append(context + [distractors[next_distractor]])
                next_distractor += 1
            else:
                # Positive example: the window exactly as it occurred.
                windows.append(utterances[start:start + n])
            labels.append(1 if corrupt else 0)

    return windows, labels


In [7]:
# n: window length in utterances.
# rate: the original note calls this the "probability of an error utterance"; in make_Xy
#       it is used as a modulus (window end positions with i % rate == 0 become negatives).
X_str, y = make_Xy(convs, n=4)

100%|██████████| 5016/5016 [00:00<00:00, 1359260.17it/s]
100%|██████████| 5016/5016 [00:00<00:00, 59041.55it/s]


In [8]:
from gensim.models import KeyedVectors

# Directory holding the pretrained word vectors.
w2v_path = "../../corpus/w2v/"
# fastText vectors (per the original note), see
# https://qiita.com/Hironsan/items/513b9f93752ecee9e670
# The file is read in word2vec text format. An earlier assignment of
# "dep-ja-300dim" to w2v_name was overwritten before use and has been dropped,
# so "model.vec" is the only file this cell ever loaded.
w2v_name = "model.vec"
w2v_model = KeyedVectors.load_word2vec_format(w2v_path+w2v_name)

[3131] 2021-12-31 16:49:04,665 Info gensim.models.keyedvectors :loading projection weights from ../../corpus/w2v/model.vec
[3131] 2021-12-31 16:50:06,571 Info gensim.utils :KeyedVectors lifecycle event {'msg': 'loaded (351122, 300) matrix of type float32 from ../../corpus/w2v/model.vec', 'binary': False, 'encoding': 'utf8', 'datetime': '2021-12-31T16:50:06.571591', 'gensim': '4.0.1', 'python': '3.6.9 (default, Jan 26 2021, 15:33:00) \n[GCC 8.4.0]', 'platform': 'Linux-5.4.72-microsoft-standard-WSL2-x86_64-with-Ubuntu-18.04-bionic', 'event': 'load_word2vec_format'}


In [10]:
# Embedding size, read off the vector of a word that is looked up in the model.
wsv_dim = w2v_model["あ"].shape[0]
# Special tokens that get their own vectors instead of a pretrained one.
add_keys = ["FOS", "EOS", "[SEP]", "[UNK]", "[NONE]"]
# Draw the vectors from a dedicated, seeded generator: the global NumPy RNG is
# never seeded in this notebook, so without this the special-token vectors
# would change on every run.
symbol_rng = np.random.RandomState(0)
add_weights = [symbol_rng.randn(wsv_dim) for _ in range(len(add_keys))]
# Scale every special-token vector to unit L2 norm.
add_weights = [ v/np.linalg.norm(v) for v in add_weights ]
SYMBOL_w2v = dict(zip(add_keys, add_weights))

In [11]:
# Persist the special-token vectors through the project's DataManager so other
# code can reuse the same ones (the cell output reports the target as
# ../models/context_topic/symbol.pickle).
# NOTE(review): re-running this cell presumably overwrites the saved file — confirm.
symbol_path = "../models/context_topic/"
symbol_name = "symbol.pickle"
symbolM = DataManager(symbol_path)
symbolM.save_data(symbol_name, SYMBOL_w2v)

success save : ../models/context_topic/symbol.pickle


In [57]:
# Load a precomputed matrix stored as <ppmi_dataname>.npy.
# NOTE(review): presumably a PPMI co-occurrence matrix, judging by the file name; it is
# not used anywhere in the visible part of the notebook — confirm it is still needed.
ppmi_dataname = "../../corpus/collocation/ppmi_ntt1"
ppmi_matrix2 = np.load(ppmi_dataname+".npy")

In [58]:
# Part-of-speech tags kept by doc2vec (compared against token.pos_ in filtering).
toyoshima_set = {"NOUN", "PROPN", "VERB", "ADJ"}

def w2v(word, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    """Look up the vector for ``word``.

    Lookup order: special-token table first, then the pretrained model, and
    finally the ``"[UNK]"`` entry of the special-token table.

    Args:
        word: Token to embed.
        w2v_model: Pretrained vectors supporting ``in`` and ``[]``.
        SYMBOL_w2v: Mapping of special tokens to vectors; must contain ``"[UNK]"``
            for the fallback to work.

    Returns:
        The vector stored for ``word`` or, if it is unknown to both sources,
        the ``"[UNK]"`` vector.
    """
    # Special tokens take precedence over the pretrained vocabulary.
    if word in SYMBOL_w2v:
        return SYMBOL_w2v[word]
    if word in w2v_model:
        return w2v_model[word]
    return SYMBOL_w2v["[UNK]"]

def filtering(doc, filter_set):
    """Collect the lemmas of the tokens whose POS tag is in ``filter_set``.

    Args:
        doc: Iterable of tokens exposing ``pos_`` and ``lemma_``.
        filter_set: Container of POS tags to keep.

    Returns:
        The kept lemmas in document order, or ``["[NONE]"]`` when no token
        matches.
    """
    kept = [token.lemma_ for token in doc if token.pos_ in filter_set]
    # An empty selection is replaced by the "[NONE]" placeholder token.
    return kept or ["[NONE]"]

def doc2vec(doc, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    """Embed ``doc`` as the mean vector of its ``toyoshima_set`` lemmas.

    The lemmas are selected with ``filtering(doc, toyoshima_set)`` and embedded
    with ``w2v``; the element-wise mean of those vectors is returned.
    """
    lemmas = filtering(doc, toyoshima_set)
    lemma_vectors = [w2v(lemma, w2v_model, SYMBOL_w2v) for lemma in lemmas]
    return np.mean(lemma_vectors, axis=0)

# Broader variant: keeps adverbs etc., i.e. almost every token.
def doc2vec2(doc, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    """Embed ``doc`` as the mean vector of its ``independent_set`` lemmas.

    Works like ``doc2vec`` but filters with ``independent_set``, which is not
    defined in this notebook section (presumably provided by one of the
    star imports — verify).
    """
    lemmas = filtering(doc, independent_set)
    lemma_vectors = [w2v(lemma, w2v_model, SYMBOL_w2v) for lemma in lemmas]
    return np.mean(lemma_vectors, axis=0)

def sentence2formated(sen, w2v_model:KeyedVectors, SYMBOL_w2v:dict):
    """Turn a sequence of utterances into per-step embedding-difference vectors.

    Every document returned by ``sentence2docs(sen, sents_span=False)`` is
    embedded with ``doc2vec2``. The result contains, for each document after
    the first, the element-wise absolute difference between its embedding and
    the previous document's embedding. Row 0 is an all-zero placeholder, so
    the output has one row per document.

    The differences are returned unnormalised.

    Args:
        sen: Input handed to ``sentence2docs`` (the notebook passes one window
            produced by ``make_Xy``).
        w2v_model: Pretrained word vectors, forwarded to ``doc2vec2``.
        SYMBOL_w2v: Special-token vectors, forwarded to ``doc2vec2``.

    Returns:
        ``np.ndarray`` with one row per document. If ``sentence2docs`` yields
        no documents, a single 300-dimensional zero row is returned (the
        historical behaviour).
    """
    docs = sentence2docs(sen, sents_span=False)
    doc_vectors = [doc2vec2(doc, w2v_model, SYMBOL_w2v) for doc in docs]

    if not doc_vectors:
        # No embedding to infer the size from: keep the original fixed-size row.
        return np.zeros((1, 300))

    # Size the placeholder row from the actual embeddings instead of assuming
    # 300 dimensions, so the rows always stack into a rectangular array.
    rows = [np.zeros(np.shape(doc_vectors[0]))]
    for prev_vector, current_vector in zip(doc_vectors, doc_vectors[1:]):
        rows.append(np.abs(prev_vector - current_vector))
    return np.array(rows)

In [59]:
# Sanity check: run one window produced by make_Xy through sentence2formated.
conv = X_str[7]
vector = sentence2formated(conv, w2v_model, SYMBOL_w2v)

In [60]:
# L2 norm of every row; the first row is the all-zero placeholder, so its norm is 0.
[np.linalg.norm(v) for v in vector]

[0.0, 2.1304090245956444, 1.270484948329428, 0.9575943953814545]

In [17]:
class LSTMClassifier(nn.Module):
    """Bidirectional-LSTM sequence classifier that also exposes its final hidden state.

    Token ids are embedded, run through one bidirectional LSTM, and the final
    forward and backward hidden states are concatenated into a context vector
    of size ``hidden_dim``. ``forward`` maps that context to log-probabilities
    over ``tagset_size`` classes; ``last_context`` / ``text2context`` return
    the context itself.

    NOTE(review): the notebook loads a saved instance of this class further
    down, so attribute names (``word_embeddings``, ``lstm1``, ``hidden2tag``,
    ``softmax``, ``vocab_dict``) presumably must stay stable — confirm.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the concatenated context vector; each LSTM
            direction uses ``hidden_dim // 2`` units, so it should be even.
        tagset_size: Number of output classes.
        vocab_dict: Mapping from token string to id; must contain ``"[UNK]"``
            for out-of-vocabulary tokens.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, vocab_dict):
        # Parent-class constructor (boilerplate).
        super(LSTMClassifier, self).__init__()
        self.embedding_dim = embedding_dim
        # Hidden size; free to choose because it does not appear in the output shape.
        self.hidden_dim = hidden_dim

        # Id 0 is the padding index (padded positions are filled with zeros by _make_X).
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim,  padding_idx=0)
        # Two directions of hidden_dim//2 units each, concatenated to hidden_dim.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim//2, batch_first=True,  bidirectional=True )
        self.hidden2tag = nn.Linear(hidden_dim , tagset_size)
        # Log-softmax over the class dimension. The dimension is given explicitly:
        # leaving it out relies on deprecated implicit-dimension selection.
        self.softmax = nn.LogSoftmax(dim=1)

        self.vocab_dict = vocab_dict

    def forward(self, x):
        """Return log-probabilities of shape (batch, tagset_size) for id tensor ``x``."""
        tag_space = self.hidden2tag(self.last_context(x))
        return self.softmax(tag_space)

    def last_context(self, x):
        """Return the concatenated final hidden states, shape (batch, hidden_dim).

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).
        """
        emb1 = self.word_embeddings(x)
        # The LSTM returns (output, (h_n, c_n)); only h_n is used.
        _, (h_n, _) = self.lstm1(emb1)
        # h_n[0] is the forward direction, h_n[1] the backward direction.
        return torch.cat([h_n[0], h_n[1]], dim=1)

    def text2context(self, text):
        """Encode raw text into context vectors.

        Args:
            text: A single sentence (``str``) or a ``list`` of sentences.

        Returns:
            Context tensor of shape (1, hidden_dim) for a string or
            (len(text), hidden_dim) for a list; ``0`` for any other input type.
        """
        # Build the inputs on the device the model's weights live on, rather than
        # hard-coding a CUDA device, so the model also works on CPU or other GPUs.
        device = self.word_embeddings.weight.device
        if isinstance(text, str):
            utt_id = self._sentence2ids(text, self.vocab_dict)
            utt_id_tensor = torch.as_tensor(utt_id, dtype=torch.int, device=device).unsqueeze(0)
            return self.last_context(utt_id_tensor)
        if isinstance(text, list):
            X = self._make_X(text, self.vocab_dict)
            utt_id_tensor = X.to(device=device, dtype=torch.int)
            return self.last_context(utt_id_tensor)
        # Unsupported input types keep returning the original sentinel value.
        return 0

    def _sentence2ids(self, sentence:str, vocab_dict:dict):
        """Convert a sentence to a NumPy array of token ids ("[UNK]" id for unknown tokens)."""
        doc = self._sentence2formated(sentence)
        ids = np.zeros(len(doc))
        for i, key in enumerate(doc):
            if key in vocab_dict:
                ids[i] = vocab_dict[key]
            else:
                ids[i] = vocab_dict["[UNK]"]
        return ids

    def _sentence2formated(self, sen):
        """Tokenise ``sen`` with the project helpers and flatten the result into one list.

        NOTE(review): sentence2normalize_noun and fill_SYMBOL_ONE are defined outside this
        notebook section; they are assumed to yield a list of token lists — confirm.
        """
        return sum( fill_SYMBOL_ONE( sentence2normalize_noun(sen) ), [] )

    def _padding_vector(self, Xseq):
        """Zero-pad a list of 1-D id arrays to a common length; returns a list of 1-D tensors."""
        Xseq = [ torch.tensor( xseq[:, None] ) for xseq in Xseq]
        Xseq = rnn.pad_sequence(Xseq, batch_first=True)
        Xseq = [ torch.flatten(xseq) for xseq in Xseq ]
        return Xseq

    def _make_X(self, utt_list:list, vocab_dict:dict):
        """Convert a list of sentences into one zero-padded id matrix.

        Returns:
            Float tensor of shape (len(utt_list), longest_sentence_length).
        """
        utt_id_list = []
        for utt in tqdm( utt_list) :
            utt_id = self._sentence2ids(utt, vocab_dict)
            utt_id_list.append(utt_id)

        utt_id_pad = self._padding_vector(utt_id_list)
        upl = len(utt_id_pad[0])
        X = torch.zeros( (len(utt_list), upl) )
        for i, u in enumerate(utt_id_pad):
            X[i, :upl] = u
        return X

In [18]:
# Load a previously saved model through the project's DataManager; TopicClassifier's
# set_forward_model is annotated to take an LSTMClassifier like this one.
# NOTE(review): the ".pickle" file name suggests load_data unpickles the file. Unpickling
# can execute arbitrary code, so only load files from a trusted source — confirm.
model_path = "../models/response2/"
model_name = "forward_v2.pickle"
modelM = DataManager(model_path)
fmodel = modelM.load_data(model_name)

success load : ../models/response2/forward_v2.pickle


In [19]:
import copy
class TopicClassifier(nn.Module):
    """LSTM-based classifier that is still a work in progress in the visible code.

    NOTE(review): as written, ``forward`` returns the undefined name ``y`` and will
    raise NameError when called; ``tagset_size`` is accepted but never used, and
    ``self.tanh`` / ``self.softmax`` are created but never applied. The intended
    output head cannot be determined from this section — complete before use.
    """
    def __init__(self, embedding_dim, hidden_dim, tagset_size, w2v_model, SYMBOL_w2v):
        # Parent-class constructor (boilerplate).
        super(TopicClassifier, self).__init__()
        # Hidden size; free to choose because it does not appear in the output shape.
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # (Original note: "define two models".) Only one LSTM is created here; a second
        # one is attached later by set_forward_model as self.lstm_f.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.tanh = nn.Tanh()
        # NOTE(review): LogSoftmax is built without an explicit `dim`, which relies on
        # PyTorch's deprecated implicit-dimension selection — pass dim explicitly.
        self.softmax = nn.LogSoftmax()
        
    
        # Word-vector sources kept on the instance; not used by any visible method.
        self.w2v_model = w2v_model
        self.SYMBOL_w2v = SYMBOL_w2v
    
    def forward(self, x):
        # Runs the batch-first LSTM over x; `out` and `hc` are currently unused.
        
        out, hc = self.lstm(x)
        
        # NOTE(review): `y` is never assigned in this method, so this line raises NameError.
        return y
    
    def set_forward_model(self, fmodel:LSTMClassifier):
        """Attach a trained LSTMClassifier and expose its ``lstm1`` as ``self.lstm_f``.

        Both objects are nn.Module instances, so assigning them as attributes registers
        them as submodules of this model (their parameters show up in ``parameters()``).
        """
        self.fmodel = fmodel
        self.lstm_f = self.fmodel.lstm1
        