In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import loss
import torch.optim as optim

In [3]:
from gensim.models import KeyedVectors
# Directory holding the pretrained word-vector files, relative to this notebook.
w2v_path = "../../corpus/w2v/"
# fasttext
# https://qiita.com/Hironsan/items/513b9f93752ecee9e670
# Alternative vector file, currently disabled:
# w2v_name =  "dep-ja-300dim"
w2v_name =  "model.vec"
# Loaded as a word2vec-format file. The recorded run log reports a
# (351122, 300) float32 matrix and about one minute of load time.
w2v_model = KeyedVectors.load_word2vec_format(w2v_path+w2v_name)

[3458] 2022-01-03 19:47:11,767 Info gensim.models.keyedvectors :loading projection weights from ../../corpus/w2v/model.vec
[3458] 2022-01-03 19:48:13,876 Info gensim.utils :KeyedVectors lifecycle event {'msg': 'loaded (351122, 300) matrix of type float32 from ../../corpus/w2v/model.vec', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-01-03T19:48:13.875976', 'gensim': '4.0.1', 'python': '3.6.9 (default, Jan 26 2021, 15:33:00) \n[GCC 8.4.0]', 'platform': 'Linux-5.4.72-microsoft-standard-WSL2-x86_64-with-Ubuntu-18.04-bionic', 'event': 'load_word2vec_format'}


In [5]:
class PNModel(nn.Module):
    """Small feed-forward classifier applied to one embedding vector per row.

    Layer widths are embedding_dim -> 2 * embedding_dim -> embedding_dim // 2
    -> tagset_size. Both hidden layers use ReLU and the output is a
    log-softmax over the class dimension.

    Args:
        embedding_dim: Size of each input vector.
        tagset_size: Number of output classes.
    """

    def __init__(self, embedding_dim, tagset_size):
        super().__init__()
        self.embedding_dim = embedding_dim
        # Hidden widths are derived from the embedding size.
        self.hid1 = 2 * embedding_dim
        self.hid2 = embedding_dim // 2
        # Layers are created in the same order as before so that seeded
        # initialisation yields the same weights.
        self.fc1 = nn.Linear(embedding_dim, self.hid1)
        self.fc2 = nn.Linear(self.hid1, self.hid2)
        # Final projection from the second hidden layer to class scores.
        self.hidden2tag = nn.Linear(self.hid2, tagset_size)

    def forward(self, x):
        """Return per-class log-probabilities for each row of ``x``.

        ``x`` is expected to be (batch, embedding_dim): the log-softmax is
        taken over dim=1.
        """
        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
        scores = self.hidden2tag(hidden)
        return F.log_softmax(scores, dim=1)

In [6]:
# Load the pretrained polarity classifier through the project's DataManager.
# The recorded output shows a ".pickle" file being loaded; presumably this
# unpickles a PNModel instance, which would require the PNModel class to be
# defined before this cell runs — TODO confirm against DataManager.load_data.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
model_path = "../models/social/"
model_name = "pn_dnn_v1.pickle"
modelM = DataManager(model_path)
PNmodel = modelM.load_data(model_name)

success load : ../models/social/pn_dnn_v1.pickle


In [7]:
def is_ADJ(token):
    """Return True when the token's coarse part-of-speech tag is "ADJ"."""
    return token.pos_ == "ADJ"

In [14]:
def dependent_ADJ_NOUN(text):
    """Collect [noun lemma, adjective lemma] pairs found in ``text``.

    Every adjective token in the parsed text is inspected, and each of its
    direct dependency children tagged "NOUN" contributes one pair. Pairs
    appear in token order, then child order.

    ``nlp`` is not defined in this cell; presumably it is a spaCy-style
    pipeline made available by an earlier import — verify.

    Returns:
        A list of two-element lists ``[noun_lemma, adjective_lemma]``;
        empty when no adjective has a noun child.
    """
    return [
        [dep.lemma_, head.lemma_]
        for head in nlp(text)
        if is_ADJ(head)
        for dep in head.children
        if dep.pos_ == "NOUN"
    ]

In [9]:
# Read the hand-labeled dialogue data for the listed sub-corpora.
# read_conv is not defined in this notebook (presumably supplied by one of the
# star imports — verify). Its result is later iterated as a collection of
# conversations, each a sequence of utterance objects.
path = "../hand_labeled/"
datalist = ['DCM', 'DIT', 'IRS']
convs = read_conv(path, datalist)

In [28]:
# Build the evaluation set: keep only system utterances that carry at least
# one error annotation, and label each 1 if the target error is among them,
# 0 otherwise.
error = "Lack of common sense"
sys_utt, y = [], []
for conv in convs:
    for ut in conv:
        if not (ut.is_system() and ut.is_exist_error()):
            continue
        sys_utt.append(ut.utt)
        y.append(1 if ut.is_error_included(error) else 0)

In [12]:
# Number of utterances labeled with the target error (positive examples).
y.count(1)

6

In [18]:
def w2v_list(words, w2v_model):
    """Look up the embedding of every word, all-or-nothing.

    Args:
        words: Iterable of vocabulary keys.
        w2v_model: Mapping-like object supporting ``in`` and ``[]``.

    Returns:
        The list of looked-up vectors in input order, or an empty list as
        soon as any word is missing from ``w2v_model``.
    """
    found = []
    for word in words:
        if word in w2v_model:
            found.append(w2v_model[word])
        else:
            # One missing word invalidates the whole group.
            return []
    return found

In [24]:
# Rule-based detector for the "Lack of common sense" error: an utterance is
# flagged (1) when at least one of its noun/adjective pairs has a noun the
# polarity model classifies as negative and an adjective it classifies as
# positive. Everything else is predicted 0.

# Class indices produced by PNmodel, per the original note:
# {'p': 2, 'e': 1, 'n': 0}
PN_NEGATIVE = 0
PN_POSITIVE = 2

# Run inference on whichever device the loaded model lives on, instead of
# assuming a GPU at 'cuda:0'.
pn_device = next(PNmodel.parameters()).device

y_pred = []
# NOTE(review): tqdm is not imported explicitly in this notebook; presumably
# it arrives through one of the star imports — consider importing it directly.
for utt in tqdm(sys_utt):
    adj_nouns = dependent_ADJ_NOUN(utt)
    is_lack_common = False
    for pair in adj_nouns:
        an_vector = w2v_list(pair, w2v_model)
        if len(an_vector) == 0:
            # At least one word of the pair has no embedding; skip the pair.
            continue

        with torch.no_grad():
            # Stack the two vectors into one array first: building a tensor
            # from a list of separate ndarrays is the slow path in PyTorch.
            an_tensor = torch.tensor(
                np.stack(an_vector), dtype=torch.float32, device=pn_device
            )
            pred = PNmodel(an_tensor).cpu().numpy().argmax(axis=1)

        # Row 0 is the noun and row 1 the adjective, matching the pair order
        # produced by dependent_ADJ_NOUN.
        noun_pn, adj_pn = pred[0], pred[1]

        # A negative noun described by a positive adjective.
        if noun_pn == PN_NEGATIVE and adj_pn == PN_POSITIVE:
            print(pair)
            is_lack_common = True
            break

    # Exactly one prediction per utterance, including those with no pairs.
    y_pred.append(1 if is_lack_common else 0)
        


  4%|▎         | 49/1349 [00:00<00:15, 85.57it/s]

['台風', '凄い']
['熱中症', 'いい']
['熱中症', '大丈夫']


  8%|▊         | 110/1349 [00:01<00:14, 85.57it/s]

['猛暑', '欲しい']
['猛暑', '嬉しい']
['猛暑', '欲しい']
['冷夏', '高い']
['冷夏', '高い']


 12%|█▏        | 166/1349 [00:01<00:13, 87.46it/s]

['熱中症', 'いい']
['死者', '良い']


 14%|█▎        | 184/1349 [00:02<00:14, 80.73it/s]

['ホント', '良い']


 17%|█▋        | 232/1349 [00:02<00:12, 89.19it/s]

['熱中症', 'いい']
['頭痛', '凄い']


 19%|█▉        | 259/1349 [00:03<00:13, 80.09it/s]

['ごまかし', 'いい']
['ごまかし', 'うまい']


 21%|██        | 277/1349 [00:03<00:13, 76.99it/s]

['やけど', '深い']


 29%|██▊       | 386/1349 [00:04<00:11, 86.37it/s]

['かっこ', 'よい']
['真夏日', 'うまい']


 63%|██████▎   | 850/1349 [00:12<00:08, 58.50it/s]

['治安', '良い']


 71%|███████   | 958/1349 [00:14<00:05, 73.62it/s]

['みぞれ', '好き']


 88%|████████▊ | 1187/1349 [00:17<00:02, 75.99it/s]

['偽装', 'よい']


100%|██████████| 1349/1349 [00:19<00:00, 68.72it/s]


In [30]:
# Compare the rule-based predictions against the hand labels. score is not
# defined in this notebook (presumably from a star import — verify); the
# recorded output shows a confusion matrix, accuracy, precision, recall and F1.
score(y, y_pred)

confusion matrix = 
 [[1328   14]
 [   0    7]]
accuracy =  0.9896219421793921
precision =  0.3333333333333333
recall =  1.0
f1 score =  0.5


- 完全勝利に近い（再現率 1.0 で見逃しはゼロ。ただし適合率は 0.33 で、誤検出が 14 件ある点に注意）

        confusion matrix = 
        [[1328   14]
        [   0    7]]
        accuracy =  0.9896219421793921
        precision =  0.3333333333333333
        recall =  1.0
        f1 score =  0.5

In [23]:
# Sanity check: the prediction loop appends exactly one entry per utterance,
# so this should equal len(sys_utt).
len(y_pred)

1367

In [29]:
# List every utterance hand-labeled with the target error together with the
# noun/adjective pairs extracted from it, to inspect what the rule can see.
for utt, p in zip(sys_utt, y):
    if p != 1:
        continue
    adj_nouns = dependent_ADJ_NOUN(utt)
    print(utt, adj_nouns)

熱中症はいいですね [['熱中症', 'いい']]
熱中症は大丈夫ですね [['熱中症', '大丈夫']]
猛暑は嬉しいですね [['猛暑', '嬉しい']]
猛暑は欲しいですよねー [['猛暑', '欲しい']]
熱中症はいいですね [['熱中症', 'いい']]
死者は良いですね [['死者', '良い']]
熱中症はいいです [['熱中症', 'いい']]
