In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from keras.layers import Dense, Flatten, Reshape
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adam

In [2]:
from gensim.models import word2vec
import MeCab
import json
import hashlib
from googletrans import Translator
from keras.layers import Dropout
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import torch
from transformers.modeling_bert import BertModel
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
%matplotlib inline

In [3]:
# Shared NLP resources used by the vectorisation helpers defined in this notebook.
# NOTE(review): `translator` is not referenced by any cell visible here.
translator = Translator()
# MeCab tagger pointed at the NEologd dictionary; the path is machine-specific.
mt = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/')
# Parse an empty string once before any parseToNode() call
# (presumably the usual mecab-python workaround for blank node.surface — confirm).
mt.parse('')
# Tokenizer paired with the BERT model loaded below (same pretrained name).
tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')

# Pre-trained embedding models: doc2vec and word2vec are read from local files,
# BERT is resolved by its pretrained model name.
model_doc = Doc2Vec.load("jawiki.doc2vec.dbow300d.model")
model_word = word2vec.Word2Vec.load("wiki_plus.model")
model_bert = BertModel.from_pretrained('bert-base-japanese-whole-word-masking')

In [4]:
def get_tags(text):
    """Count the content words of ``text`` that the word2vec model knows.

    The text is tokenised with the module-level MeCab tagger ``mt``. A token
    is counted when the first field of its feature string is a noun, verb or
    adjective tag and its surface form is present in ``model_word.wv``.

    Args:
        text: String to tokenise.

    Returns:
        dict mapping each accepted surface form to its number of occurrences.
    """
    content_pos = ('名詞', '動詞', '形容詞')
    counts = {}
    node = mt.parseToNode(text)
    while node:
        # The part-of-speech tag is the first comma-separated feature field.
        pos = node.feature.split(",")[0]
        if pos in content_pos and node.surface in model_word.wv:
            surface = node.surface
            counts[surface] = counts.get(surface, 0) + 1
        node = node.next
    return counts

def weighted_mean_vec(text):
    """Return a count-weighted average of the word2vec vectors found in ``text``.

    Each word returned by ``get_tags(text)`` contributes its vector multiplied
    by its occurrence count; the sum is divided by ``1.0 + total count``.

    NOTE(review): because the denominator starts at 1.0 the result is
    ``sum / (1 + n)`` rather than a strict mean. This also yields a zero
    vector instead of a division by zero when no word is accepted —
    presumably intentional, confirm.

    Args:
        text: String to vectorise.

    Returns:
        numpy array of length ``model_word.vector_size``.
    """
    acc = np.zeros(model_word.vector_size)
    counts = get_tags(text)
    for token in counts:
        acc += counts[token] * model_word.wv[token]
    return acc / (1.0 + sum(counts.values()))

def get_tags_for_doc2vec(text):
    """Tokenise ``text`` into the word list fed to ``model_doc.infer_vector``.

    The text is split with the module-level MeCab tagger ``mt``. Every
    non-empty surface form that is present in ``model_doc.wv`` is kept, in
    document order and with repeats.

    Args:
        text: String to tokenise.

    Returns:
        list of surface-form strings.
    """
    tokens = []
    node = mt.parseToNode(text)
    while node:
        surface = node.surface
        # Screen out empty surfaces before the vocabulary lookup.
        if surface != '' and surface in model_doc.wv:
            tokens.append(surface)
        node = node.next
    return tokens

#bertのベクトル化
def get_vector_cls(text):
    """Embed ``text`` with BERT and return the vector at the first token position.

    The text is encoded with the module-level ``tokenizer`` and passed through
    ``model_bert``. The first element of the model output is indexed at
    batch 0, token 0 (presumably the [CLS] position, as the function name
    suggests — confirm against the tokenizer settings).

    Args:
        text: String to embed.

    Returns:
        numpy array holding an independent copy of the selected vector.
    """
    input_ids = tokenizer.encode(text, return_tensors='pt')
    # Inference only: disable autograd so no graph is recorded for the pass.
    with torch.no_grad():
        outputs = model_bert(input_ids)
    first_token = outputs[0][0][0]
    # copy() detaches the result from the tensor's storage.
    return first_token.detach().to('cpu').numpy().copy()

In [5]:
class Event:
    """One record of the event set loaded from JSON.

    Attributes:
        id: Identifier used as the key of the event dictionary.
        type: Sequence whose last element is compared against category
            strings elsewhere in this notebook.
        score: Value stored as given (meaning not visible here).
        desc: Description text; used as the input of the vectorisers.
        links: Related events — ids as loaded, replaced by ``Event`` objects
            in ``load_events``.
    """

    def __init__(self, id, type, score, desc, links):
        self.id, self.type, self.score = id, type, score
        self.desc, self.links = desc, links

In [6]:
# JSON ファイルから event set をロード
def load_events(jsonfile):
    """Load the event set from a JSON file and resolve the links between events.

    The file must contain a list of objects with the keys ``id``, ``type``,
    ``score``, ``desc`` and ``links`` (a list of event ids).

    Args:
        jsonfile: Path of the JSON file.

    Returns:
        dict mapping each event id to its ``Event``; every ``Event.links``
        holds ``Event`` objects instead of ids.

    Raises:
        KeyError: if a record is missing a key or links to an unknown id.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform default.
    with open(jsonfile, encoding='utf-8') as f:
        records = json.load(f)
    events = {
        rec['id']: Event(rec['id'], rec['type'], rec['score'], rec['desc'], rec['links'])
        for rec in records
    }
    # Second pass: swap each id in `links` for the Event it refers to.
    for event in events.values():
        event.links = [events[link_id] for link_id in event.links]
    return events

In [7]:
# Load the event set: `events` maps event id -> Event, with links resolved to Event objects.
events = load_events('sesaku2.json')

In [8]:
import pandas as pd

# Not used by the visible cells; kept because later cells may reference them.
data = []
labels = []

# Rows: description of every event whose last type entry is '部品' (part).
index = [ev.desc for ev in events.values() if ev.type[-1] == '部品']

# Columns: unique descriptions of '対策' (countermeasure) events, first-seen order.
columns = list(dict.fromkeys(
    ev.desc for ev in events.values() if ev.type[-1] == '対策'))

# df[part, countermeasure] is 1 when the part links to a countermeasure with
# that description, else 0. Start from zeros and only ever write 1s: several
# countermeasure events can share one description (hence the de-duplicated
# columns), and writing a 0 for an unlinked one would erase the 1 set by a
# linked one.
df = pd.DataFrame(0, index=index, columns=columns)
for part in events.values():
    if part.type[-1] != '部品':
        continue
    for linked in part.links:
        if linked.type[-1] == '対策':
            df.at[part.desc, linked.desc] = 1

In [9]:
# Pre-compute one vector per countermeasure (DataFrame column label) for each
# embedding model, so the pairing loops below can reuse them.
taisaku_vec_word = {desc: weighted_mean_vec(desc) for desc in df}

taisaku_vec_doc = {
    desc: model_doc.infer_vector(get_tags_for_doc2vec(desc)) for desc in df
}

taisaku_vec_bert = {desc: get_vector_cls(desc) for desc in df}

class Label:
    """Binary target values for a (part, countermeasure) pair."""
    NASI = 0     # the table cell for the pair is not 1
    TAISAKU = 1  # the table cell for the pair is 1

def _build_pair_dataset(table, embed_part, taisaku_vecs):
    """Build one (part ++ countermeasure) vector and one label per table cell.

    Args:
        table: DataFrame with parts as rows and countermeasures as columns;
            a cell equal to 1 marks a linked pair.
        embed_part: Callable mapping a row label (part text) to its vector.
            It is called once per row.
        taisaku_vecs: dict mapping each column label to its vector.

    Returns:
        (pairs, targets): ``pairs`` is a list of concatenated vectors and
        ``targets`` the matching ``Label`` values, both in row-major order.
    """
    pairs = []
    targets = []
    for part_desc, row in table.iterrows():
        part_vec = embed_part(part_desc)
        for taisaku_desc in table:
            pairs.append(np.append(part_vec, taisaku_vecs[taisaku_desc]))
            targets.append(
                Label.TAISAKU if row[taisaku_desc] == 1 else Label.NASI)
    return pairs, targets


# One dataset per embedding model; built in the order word2vec, doc2vec, BERT.
data_word, labels_word = _build_pair_dataset(
    df, weighted_mean_vec, taisaku_vec_word)

data_doc, labels_doc = _build_pair_dataset(
    df,
    lambda desc: model_doc.infer_vector(get_tags_for_doc2vec(desc)),
    taisaku_vec_doc)

data_BERT_cls, labels_BERT_cls = _build_pair_dataset(
    df, get_vector_cls, taisaku_vec_bert)

In [10]:
#word,doc,bert のベクトルの範囲を確認する
def max_min(data):
    """Print and return the largest and smallest value found in ``data``.

    Args:
        data: Iterable of numpy arrays.

    Returns:
        (data_max, data_min). For an empty ``data`` both are 0.
    """
    vectors = list(data)
    if not vectors:
        data_max, data_min = 0, 0
    else:
        # Take the true extremes. Seeding both with 0 would report 0 as the
        # minimum of all-positive data and as the maximum of all-negative data.
        data_max = max(vec.max() for vec in vectors)
        data_min = min(vec.min() for vec in vectors)
    print(data_max,data_min)
    return data_max, data_min

# Value range of each pair dataset; train() reads these module-level names
# when it samples generator input.
word_max, word_min = max_min(data_word)
doc_max, doc_min = max_min(data_doc)
bert_max, bert_min = max_min(data_BERT_cls)

4.848580956459045 -6.142003456751506
0.5652471 -0.57631934
10.4636545 -1.5997354


In [11]:
 #生成器
def build_generator(data_size, z_dim, hidden_units=128, output_activation='tanh'):
    """Build the generator: latent vector -> one hidden layer -> fake data vector.

    Args:
        data_size: Length of the vectors to generate.
        z_dim: Length of the latent input vector.
        hidden_units: Width of the hidden Dense layer.
        output_activation: Activation of the output layer. The default
            'tanh' bounds every output to [-1, 1]; the ranges printed by
            ``max_min`` in this notebook are wider than that (e.g. about
            -6.1 to 4.8 for the word vectors), so either rescale the
            training data or pass a different activation such as 'linear'.

    Returns:
        An uncompiled Sequential model.
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=z_dim))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dense(data_size, activation=output_activation))
    return model

In [12]:
#識別器

def build_discriminator(data_size):
    """Build the discriminator: data vector -> one hidden layer -> sigmoid score.

    Args:
        data_size: Length of the input vectors.

    Returns:
        An uncompiled Sequential model with a single sigmoid output unit.
    """
    model = Sequential()
    model.add(Dense(128, input_shape=(data_size,)))
    # LeakyReLU is a layer, so add it as one (as build_generator does)
    # rather than passing a layer instance as `activation=`.
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [13]:
def build_gan(generator, discriminator):
    """Stack ``generator`` and ``discriminator`` into one Sequential model.

    Args:
        generator: Model added first.
        discriminator: Model added second.

    Returns:
        The uncompiled stacked model.
    """
    stacked = Sequential()
    for part in (generator, discriminator):
        stacked.add(part)
    return stacked


def build_compile(data, z_dim):
    """Build and compile the discriminator, the generator and the stacked GAN.

    Args:
        data: Sequence of training vectors; only the length of the first
            element is used, as the data size.
        z_dim: Length of the generator's latent input.

    Returns:
        (discriminator, generator, gan). The discriminator and the stacked
        model are compiled with binary cross-entropy and their own Adam
        optimizer; the generator itself is not compiled.
    """
    vec_len = len(data[0])

    # Build and compile the discriminator first.
    disc = build_discriminator(vec_len)
    disc.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    # Build the generator.
    gen = build_generator(vec_len, z_dim)

    # Freeze the discriminator, then build and compile the stacked model
    # that is used to train the generator.
    disc.trainable = False
    stacked = build_gan(gen, disc)
    stacked.compile(loss='binary_crossentropy', optimizer=Adam())

    return disc, gen, stacked

In [14]:
def train_data(data,labels):
    """Return a class-balanced dataset: all positives plus as many random negatives.

    Every example labelled 1 is kept, and an equally sized random sample of
    the examples labelled 0 is drawn with ``random.sample``.

    Args:
        data: Sequence of vectors.
        labels: Sequence of 0/1 labels aligned with ``data``.

    Returns:
        (data, labels) as numpy arrays, positives first, then the negatives.

    Raises:
        ValueError: from ``random.sample`` when there are fewer negatives
            than positives.
    """
    pos_idx, neg_idx = [], []
    for position, label in enumerate(labels):
        if label == 1:
            pos_idx.append(position)
        elif label == 0:
            neg_idx.append(position)
    neg_idx = random.sample(neg_idx, len(pos_idx))

    picked = [data[k] for k in pos_idx] + [data[k] for k in neg_idx]
    targets = [Label.TAISAKU] * len(pos_idx) + [Label.NASI] * len(neg_idx)
    return np.array(picked), np.array(targets)

In [15]:
def x_y_train(data,labels):
    """Split a balanced sample of ``data`` into negative and positive arrays.

    Every example labelled 1 is kept, and an equally sized random sample of
    the examples labelled 0 is drawn with ``random.sample``.

    Args:
        data: Sequence of vectors.
        labels: Sequence of 0/1 labels aligned with ``data``.

    Returns:
        (data_0, labels_0, data_1, labels_1) as numpy arrays: the sampled
        negatives with their labels, then the positives with theirs.

    Raises:
        ValueError: from ``random.sample`` when there are fewer negatives
            than positives.
    """
    def take(indices):
        # Gather the selected rows into one array.
        return np.array([data[k] for k in indices])

    positives = [k for k, label in enumerate(labels) if label == 1]
    negatives = random.sample(
        [k for k, label in enumerate(labels) if label == 0], len(positives))

    positive_targets = np.array([Label.TAISAKU] * len(positives))
    negative_targets = np.array([Label.NASI] * len(negatives))
    return take(negatives), negative_targets, take(positives), positive_targets

In [26]:
# Module-level training history; train() appends to these at every
# `sample_interval`-th iteration.
losses = []
accuracies = []
iteration_checkpoints = []


def _sample_latent(vec_len, batch_size, z_dim):
    """Draw a (batch_size, z_dim) batch of generator input for a pair-vector length.

    The distribution parameters are picked by ``vec_len``: 400 uses the word
    range, 600 the doc range and 1536 the BERT range (module-level
    ``*_min`` / ``*_max`` values).

    NOTE(review): ``np.random.normal`` takes (loc, scale), so this draws from
    a normal distribution with mean ``*_min`` and standard deviation
    ``*_max``; it does not sample the interval [min, max]. The call is kept
    unchanged because any code that samples from the trained generator must
    use the same distribution. If a range was intended, switch both places
    to ``np.random.uniform(low, high, size)``.

    Raises:
        ValueError: if ``vec_len`` is not one of the supported lengths.
    """
    if vec_len == 400:
        loc, scale = word_min, word_max
    elif vec_len == 600:
        loc, scale = doc_min, doc_max
    elif vec_len == 1536:
        loc, scale = bert_min, bert_max
    else:
        raise ValueError(
            "unsupported vector length %d (expected 400, 600 or 1536)" % vec_len)
    return np.random.normal(loc, scale, (batch_size, z_dim))


def train(data, labels, iterations, batch_size, sample_interval, z_dim=100):
    """Train the module-level GAN on the positive (label 1) pair vectors.

    Uses the module-level ``discriminator``, ``generator`` and ``gan`` models
    and appends to the module-level ``losses``, ``accuracies`` and
    ``iteration_checkpoints`` lists. One line of progress is printed per
    iteration.

    Args:
        data: Sequence of pair vectors.
        labels: 0/1 labels aligned with ``data``.
        iterations: Number of training iterations.
        batch_size: Examples per batch.
        sample_interval: Record loss/accuracy every this many iterations.
        z_dim: Length of the generator's latent input; must match the value
            the generator was built with.

    Returns:
        The module-level ``generator``.

    Raises:
        ValueError: if there is no positive example, or if the vector length
            is not 400, 600 or 1536.
    """
    # Only the positives are used as "real" samples.
    # (Original author's note: 7303 examples of each label.)
    _, _, data_1, _ = x_y_train(data, labels)
    if len(data_1) == 0:
        raise ValueError("no positive (label 1) examples to train on")
    vec_len = len(data_1[0])

    # Discriminator targets: 1 for real vectors, 0 for generated ones.
    real = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))

    for iteration in range(iterations):
        # ---- train the discriminator ----
        # Random batch of real (linked) pair vectors.
        idx = np.random.randint(0, len(data_1), batch_size)
        vecs = data_1[idx]

        z = _sample_latent(vec_len, batch_size, z_dim)
        gen_vec = generator.predict(z)

        d_loss_real = discriminator.train_on_batch(vecs, real)
        d_loss_fake = discriminator.train_on_batch(gen_vec, fake)
        d_loss, accuracy = 0.5 * np.add(d_loss_real, d_loss_fake)

        # ---- train the generator ----
        # The stacked model generates from z itself, so no separate
        # generator.predict() call is needed here.
        z = _sample_latent(vec_len, batch_size, z_dim)
        g_loss = gan.train_on_batch(z, real)

        if (iteration + 1) % sample_interval == 0:
            losses.append((d_loss, g_loss))
            accuracies.append(100.0 * accuracy)
            iteration_checkpoints.append(iteration + 1)

        print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" %(iteration +1,d_loss, 100.0*accuracy,g_loss))
    return generator

In [27]:
# Scratch check of the balanced split on the word dataset.
# NOTE(review): `lables_1` is a typo of `labels_1`; left as-is in case later cells use the name.
data_0, labels_0, data_1, lables_1 = x_y_train(data_word,labels_word)

In [28]:
# Scratch check: five random row indices into data_1 (despite the name `z`,
# these are indices, not generator input).
z = np.random.randint(0,len(data_1),5)

In [29]:
# Length of one concatenated pair vector; train() selects its noise parameters by this length.
len(data_1[1])

400

In [30]:
# Build the models for the word2vec pair vectors and train them.
# train() reads `discriminator`, `generator` and `gan` as module-level names,
# so they must be bound here before it is called.
z_dim = 100

discriminator, generator, gan = build_compile(data_word, z_dim)

# With sample_interval equal to iterations, the history lists receive a
# single entry, at the last iteration.
iterations =1000
batch_size = 128
sample_interval = 1000

generator = train(data_word,labels_word,iterations,batch_size,sample_interval)

1 [D loss: 0.367484, acc.: 85.16%] [G loss: 1.631282]
2 [D loss: 0.315841, acc.: 83.98%] [G loss: 1.672086]
3 [D loss: 0.338613, acc.: 80.08%] [G loss: 1.641703]
4 [D loss: 0.321846, acc.: 80.86%] [G loss: 1.565677]
5 [D loss: 0.312600, acc.: 81.64%] [G loss: 1.592006]
6 [D loss: 0.285491, acc.: 84.77%] [G loss: 1.609353]
7 [D loss: 0.310954, acc.: 79.30%] [G loss: 1.786027]
8 [D loss: 0.274887, acc.: 83.98%] [G loss: 1.979512]
9 [D loss: 0.227357, acc.: 91.02%] [G loss: 1.879003]
10 [D loss: 0.164901, acc.: 96.09%] [G loss: 2.244214]
11 [D loss: 0.136092, acc.: 98.83%] [G loss: 2.536335]
12 [D loss: 0.099817, acc.: 99.22%] [G loss: 2.831724]
13 [D loss: 0.089699, acc.: 98.83%] [G loss: 3.255048]
14 [D loss: 0.059855, acc.: 100.00%] [G loss: 3.548979]
15 [D loss: 0.048536, acc.: 100.00%] [G loss: 3.621669]
16 [D loss: 0.042818, acc.: 100.00%] [G loss: 3.789325]
17 [D loss: 0.038055, acc.: 99.61%] [G loss: 3.735094]
18 [D loss: 0.046841, acc.: 100.00%] [G loss: 3.601462]
19 [D loss: 0.0

134 [D loss: 0.000224, acc.: 100.00%] [G loss: 8.561267]
135 [D loss: 0.000204, acc.: 100.00%] [G loss: 8.634319]
136 [D loss: 0.000231, acc.: 100.00%] [G loss: 8.525746]
137 [D loss: 0.000185, acc.: 100.00%] [G loss: 8.399386]
138 [D loss: 0.000307, acc.: 100.00%] [G loss: 8.269097]
139 [D loss: 0.000268, acc.: 100.00%] [G loss: 8.030260]
140 [D loss: 0.000381, acc.: 100.00%] [G loss: 7.655058]
141 [D loss: 0.000538, acc.: 100.00%] [G loss: 7.301559]
142 [D loss: 0.000884, acc.: 100.00%] [G loss: 6.768270]
143 [D loss: 0.001180, acc.: 100.00%] [G loss: 6.370023]
144 [D loss: 0.001400, acc.: 100.00%] [G loss: 6.317873]
145 [D loss: 0.001292, acc.: 100.00%] [G loss: 6.387547]
146 [D loss: 0.001483, acc.: 100.00%] [G loss: 6.360975]
147 [D loss: 0.001470, acc.: 100.00%] [G loss: 6.246059]
148 [D loss: 0.001734, acc.: 100.00%] [G loss: 6.321857]
149 [D loss: 0.001558, acc.: 100.00%] [G loss: 6.413896]
150 [D loss: 0.001572, acc.: 100.00%] [G loss: 6.318861]
151 [D loss: 0.001313, acc.: 10

278 [D loss: 0.000108, acc.: 100.00%] [G loss: 8.699661]
279 [D loss: 0.000146, acc.: 100.00%] [G loss: 8.564693]
280 [D loss: 0.000116, acc.: 100.00%] [G loss: 8.623411]
281 [D loss: 0.000106, acc.: 100.00%] [G loss: 8.726044]
282 [D loss: 0.000107, acc.: 100.00%] [G loss: 8.814136]
283 [D loss: 0.000097, acc.: 100.00%] [G loss: 8.838055]
284 [D loss: 0.000134, acc.: 100.00%] [G loss: 8.744215]
285 [D loss: 0.000113, acc.: 100.00%] [G loss: 8.610493]
286 [D loss: 0.000126, acc.: 100.00%] [G loss: 8.497830]
287 [D loss: 0.000144, acc.: 100.00%] [G loss: 8.554782]
288 [D loss: 0.000123, acc.: 100.00%] [G loss: 8.452472]
289 [D loss: 0.000136, acc.: 100.00%] [G loss: 8.408625]
290 [D loss: 0.000182, acc.: 100.00%] [G loss: 8.471846]
291 [D loss: 0.000153, acc.: 100.00%] [G loss: 8.531819]
292 [D loss: 0.000118, acc.: 100.00%] [G loss: 8.591497]
293 [D loss: 0.000137, acc.: 100.00%] [G loss: 8.679623]
294 [D loss: 0.000130, acc.: 100.00%] [G loss: 8.721046]
295 [D loss: 0.000102, acc.: 10

422 [D loss: 0.000017, acc.: 100.00%] [G loss: 11.012309]
423 [D loss: 0.000016, acc.: 100.00%] [G loss: 11.027472]
424 [D loss: 0.000013, acc.: 100.00%] [G loss: 11.042933]
425 [D loss: 0.000034, acc.: 100.00%] [G loss: 11.058044]
426 [D loss: 0.000016, acc.: 100.00%] [G loss: 11.073267]
427 [D loss: 0.000019, acc.: 100.00%] [G loss: 11.088463]
428 [D loss: 0.000014, acc.: 100.00%] [G loss: 11.103693]
429 [D loss: 0.000010, acc.: 100.00%] [G loss: 11.119038]
430 [D loss: 0.000017, acc.: 100.00%] [G loss: 11.134266]
431 [D loss: 0.000021, acc.: 100.00%] [G loss: 11.149232]
432 [D loss: 0.000012, acc.: 100.00%] [G loss: 11.164167]
433 [D loss: 0.000017, acc.: 100.00%] [G loss: 11.178942]
434 [D loss: 0.000011, acc.: 100.00%] [G loss: 11.193684]
435 [D loss: 0.000024, acc.: 100.00%] [G loss: 11.208045]
436 [D loss: 0.000030, acc.: 100.00%] [G loss: 11.221914]
437 [D loss: 0.000016, acc.: 100.00%] [G loss: 11.235670]
438 [D loss: 0.000025, acc.: 100.00%] [G loss: 11.248985]
439 [D loss: 0

566 [D loss: 0.000007, acc.: 100.00%] [G loss: 11.733463]
567 [D loss: 0.000008, acc.: 100.00%] [G loss: 11.743974]
568 [D loss: 0.000017, acc.: 100.00%] [G loss: 11.754217]
569 [D loss: 0.000019, acc.: 100.00%] [G loss: 11.764277]
570 [D loss: 0.000007, acc.: 100.00%] [G loss: 11.774398]
571 [D loss: 0.000008, acc.: 100.00%] [G loss: 11.784488]
572 [D loss: 0.000013, acc.: 100.00%] [G loss: 11.794421]
573 [D loss: 0.000010, acc.: 100.00%] [G loss: 11.804272]
574 [D loss: 0.000009, acc.: 100.00%] [G loss: 11.814028]
575 [D loss: 0.000008, acc.: 100.00%] [G loss: 11.823761]
576 [D loss: 0.000008, acc.: 100.00%] [G loss: 11.833481]
577 [D loss: 0.000008, acc.: 100.00%] [G loss: 11.843099]
578 [D loss: 0.000007, acc.: 100.00%] [G loss: 11.852702]
579 [D loss: 0.000007, acc.: 100.00%] [G loss: 11.862280]
580 [D loss: 0.000008, acc.: 100.00%] [G loss: 11.871799]
581 [D loss: 0.000008, acc.: 100.00%] [G loss: 11.881229]
582 [D loss: 0.000011, acc.: 100.00%] [G loss: 11.890476]
583 [D loss: 0

710 [D loss: 0.000005, acc.: 100.00%] [G loss: 12.431799]
711 [D loss: 0.000006, acc.: 100.00%] [G loss: 12.437815]
712 [D loss: 0.000006, acc.: 100.00%] [G loss: 12.443857]
713 [D loss: 0.000004, acc.: 100.00%] [G loss: 12.449972]
714 [D loss: 0.000004, acc.: 100.00%] [G loss: 12.456144]
715 [D loss: 0.000006, acc.: 100.00%] [G loss: 12.462295]
716 [D loss: 0.000010, acc.: 100.00%] [G loss: 12.468410]
717 [D loss: 0.000003, acc.: 100.00%] [G loss: 12.474571]
718 [D loss: 0.000004, acc.: 100.00%] [G loss: 12.480762]
719 [D loss: 0.000007, acc.: 100.00%] [G loss: 12.486830]
720 [D loss: 0.000003, acc.: 100.00%] [G loss: 12.492950]
721 [D loss: 0.000005, acc.: 100.00%] [G loss: 12.499076]
722 [D loss: 0.000005, acc.: 100.00%] [G loss: 12.505175]
723 [D loss: 0.000004, acc.: 100.00%] [G loss: 12.511221]
724 [D loss: 0.000006, acc.: 100.00%] [G loss: 12.517282]
725 [D loss: 0.000006, acc.: 100.00%] [G loss: 12.523239]
726 [D loss: 0.000004, acc.: 100.00%] [G loss: 12.529190]
727 [D loss: 0

854 [D loss: 0.000003, acc.: 100.00%] [G loss: 13.027767]
855 [D loss: 0.000002, acc.: 100.00%] [G loss: 13.032124]
856 [D loss: 0.000002, acc.: 100.00%] [G loss: 13.036502]
857 [D loss: 0.000005, acc.: 100.00%] [G loss: 13.040788]
858 [D loss: 0.000005, acc.: 100.00%] [G loss: 13.045000]
859 [D loss: 0.000004, acc.: 100.00%] [G loss: 13.049166]
860 [D loss: 0.000002, acc.: 100.00%] [G loss: 13.053356]
861 [D loss: 0.000006, acc.: 100.00%] [G loss: 13.057394]
862 [D loss: 0.000003, acc.: 100.00%] [G loss: 13.061480]
863 [D loss: 0.000004, acc.: 100.00%] [G loss: 13.065536]
864 [D loss: 0.000003, acc.: 100.00%] [G loss: 13.069626]
865 [D loss: 0.000003, acc.: 100.00%] [G loss: 13.073702]
866 [D loss: 0.000005, acc.: 100.00%] [G loss: 13.077714]
867 [D loss: 0.000003, acc.: 100.00%] [G loss: 13.081749]
868 [D loss: 0.000006, acc.: 100.00%] [G loss: 13.085670]
869 [D loss: 0.000003, acc.: 100.00%] [G loss: 13.089624]
870 [D loss: 0.000003, acc.: 100.00%] [G loss: 13.093585]
871 [D loss: 0

998 [D loss: 0.000001, acc.: 100.00%] [G loss: 13.366163]
999 [D loss: 0.000002, acc.: 100.00%] [G loss: 13.369995]
1000 [D loss: 0.000004, acc.: 100.00%] [G loss: 13.373755]
