In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from keras.layers import Activation, BatchNormalization, Dense, Dropout, Flatten, Reshape
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import Conv1D, Conv2D,Conv2DTranspose
from keras.models import Sequential
from keras.optimizers import Adam

In [2]:
from tensorflow.python.keras.layers.convolutional import Conv1DTranspose

In [3]:
from gensim.models import word2vec
import MeCab
import json
import hashlib
from googletrans import Translator
from keras.layers import Dropout
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import torch
from transformers.modeling_bert import BertModel
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
%matplotlib inline

In [4]:
translator = Translator()
mt = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/')
mt.parse('')
tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')

model_doc = Doc2Vec.load("jawiki.doc2vec.dbow300d.model")
model_word = word2vec.Word2Vec.load("wiki_plus.model")
model_bert = BertModel.from_pretrained('bert-base-japanese-whole-word-masking')

In [5]:
def get_tags(text):
    word = {}
    node = mt.parseToNode(text)
    while node:
        fields = node.feature.split(",")
        if (fields[0] == '名詞' or fields[0] == '動詞' or fields[0] == '形容詞') and node.surface in model_word.wv:
            w = node.surface
            word[w] = word.get(w, 0) + 1
        node = node.next
    return word

def weighted_mean_vec(text):
    v = np.zeros(model_word.vector_size)
    s = 1.0
    for w,weight in get_tags(text).items():
        v += weight * model_word.wv[w]  #Eventクラスeの単語wの個数＊単語wのベクトル
        s += weight
    return v / s

def get_tags_for_doc2vec(text):
    word = []
    node = mt.parseToNode(text)
    while node:
        fields = node.feature.split(",")
        if node.surface in model_doc.wv and node.surface !='':
            w = node.surface
            word.append(w)
        node = node.next
    return word

#bertのベクトル化
def get_vector_cls(text):
    input_ids = tokenizer.encode(text, return_tensors='pt') 
    result = model_bert(input_ids)
    tensor_result = result[0][0][0]
    numpy_result = tensor_result.to('cpu').detach().numpy().copy()
    return numpy_result

In [6]:
class Event:
    def __init__(self, id, type, score, desc, links):
        self.id = id
        self.type = type
        self.score = score
        self.desc = desc
        self.links = links

In [7]:
# JSON ファイルから event set をロード
def load_events(jsonfile):
    with open(jsonfile) as f:
        df = json.load(f)
    events = {x['id']: Event(x['id'], x['type'], x['score'], x['desc'], x['links']) for x in df} #eventsにidをkeyとしそのオブジェクトをvalueとした辞書を生成
    for k,x in events.items():
        x.links = [events[e] for e in x.links] #Event.linkの中身をidの配列からEventの配列に変更
    return events

In [8]:
events = load_events('sesaku2.json')

In [9]:
import pandas as pd
data = []
labels = []
columns=[]
index=[]
for k1, v1 in events.items():
    if v1.type[-1]=='部品':
        index.append(v1.desc)
for k1, v1 in events.items():
    if v1.type[-1]=='対策':
        if not v1.desc in columns:
            columns.append(v1.desc)
df = pd.DataFrame(index=index, columns=columns)
for k1, v1 in events.items():
    if v1.type[-1]=='部品':
        for k2, v2 in events.items():
            if v2.type[-1] == '対策':
                    if v2 in v1.links:
                        df.at[v1.desc, v2.desc] = 1
                    else:
                        df.at[v1.desc, v2.desc] = 0

In [10]:
taisaku_vec_word = {}
for i in df:
    taisaku_vec_word[i]=weighted_mean_vec(i)

taisaku_vec_doc = {}
for i in df:
    taisaku_vec_doc[i]=model_doc.infer_vector(get_tags_for_doc2vec(i))

taisaku_vec_bert = {}
for i in df:
    taisaku_vec_bert[i]=get_vector_cls(i)

class Label:
    TAISAKU = 1
    NASI = 0

data_word = []
labels_word = []
# data 
for index, row in df.iterrows():#部品
    x1 = weighted_mean_vec(index)
    for i in df:#対策
        x2 =  taisaku_vec_word[i]#対策
        data_word.extend([np.append(x1, x2)])
        if row[i] ==1:
            labels_word.append(Label.TAISAKU)
        else:
            labels_word.append(Label.NASI)

data_doc = []
labels_doc = []
# data 
for index, row in df.iterrows():#部品
    x1 = model_doc.infer_vector(get_tags_for_doc2vec(index))
    for i in df:#対策
        x2 =  taisaku_vec_doc[i]#対策
        data_doc.extend([np.append(x1, x2)])
        if row[i] ==1:
            labels_doc.append(Label.TAISAKU)
        else:
            labels_doc.append(Label.NASI)

data_BERT_cls = []
labels_BERT_cls = []
# data 
for index, row in df.iterrows():#部品
    x1 = get_vector_cls(index) #部品
    for i in df:#対策
        x2=taisaku_vec_bert[i]
        data_BERT_cls.extend([np.append(x1, x2)])
        if row[i] ==1:
            labels_BERT_cls.append(Label.TAISAKU)
        else:
            labels_BERT_cls.append(Label.NASI)

In [11]:
 #生成器
def build_generator(data_size, z_dim):
        model = Sequential()
        
        #全結合層によって、100×16のテンソルに変換  テンソル＝配列
        model.add(Dense(100*32, input_dim = z_dim))
        model.add(Reshape((100,32)))
        
        #転置畳み込み層により,100×16を200×16のテンソルに変換
        model.add(Conv1DTranspose(16, kernel_size=3, strides=2, padding='same'))
        
        #バッチ正規化
        model.add(BatchNormalization())
        
        model.add(LeakyReLU(alpha=0.01))
        
        #転置畳み込み層により,200×16を200×8のテンソルに変換
        model.add(Conv1DTranspose(8, kernel_size=3, strides=1, padding='same'))
        
        model.add(BatchNormalization())
        
        model.add(LeakyReLU(alpha=0.01))
        
        #転置畳み込み層により,200×64を400×1をのテンソルに変換
        model.add(Conv1DTranspose(1, kernel_size=3, strides=2, padding='same'))
        
        model.add(Activation('tanh'))
        
        return model

In [12]:
discriminator = build_generator(200,100)
discriminator.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 3200)              323200    
_________________________________________________________________
reshape (Reshape)            (None, 100, 32)           0         
_________________________________________________________________
conv1d_transpose (Conv1DTran (None, 200, 16)           1552      
_________________________________________________________________
batch_normalization (BatchNo (None, 200, 16)           64        
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 200, 16)           0         
_________________________________________________________________
conv1d_transpose_1 (Conv1DTr (None, 200, 8)            392       
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 8)            3

In [13]:
#識別器

def build_discriminator(data_size):
    model = Sequential()

    #400×1を200×32のテンソルにする畳み込み層

    model.add(Conv1D(32, kernel_size=3, strides=2,input_shape=(data_size,1), padding='same'))
    
    model.add(LeakyReLU(alpha=0.01))
    
    #200×32を100×64のテンソルにする畳み込み層
    model.add(Conv1D(64, kernel_size=3, strides=2, input_shape=(data_size,1), padding='same'))
    
    #バッチ正規化
    model.add(BatchNormalization())
    
    model.add(LeakyReLU(alpha=0.01))
    
    #7×7×64を3×3×128のテンソルにする畳み込み層
    #model.add(Conv2D(128, kernel_size=3, strides=2, imput_shape=(data_size,), padding='same'))
    model.add(Conv1D(128, kernel_size=3, strides=2, input_shape=(data_size,1), padding='same'))
    
    #バッチ正規化
    model.add(BatchNormalization())
    
    model.add(LeakyReLU(alpha=0.01))
        
    #出力にシグモイド関数を適用
    model.add(Dense(1, activation='sigmoid'))
    
    return model

In [14]:
def build_gan(generator, discriminator):
    
    model = Sequential()
    
    #生成器と識別器のモデルを組み合わせる
    model.add(generator)
    model.add(discriminator)
    
    return model


def build_compile(data, z_dim):
    data_size = len(data[0])
    #識別器の構築とコンパイル
    discriminator = build_discriminator(data_size)
    discriminator.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    #生成器の構築
    generator = build_generator(data_size, z_dim)

    #生成器の構築中は識別器のパラメータを固定
    discriminator.trainable = False

    #生成器の訓練のため、識別器は固定し、GANモデルの構築とコンパイルを行う
    gan = build_gan(generator, discriminator)
    gan.compile(loss='binary_crossentropy', optimizer=Adam())
    
    return discriminator, generator, gan

In [15]:
def x_y_train(data,labels):
    index_1 = [i for i, x in enumerate(labels) if x == 1]
    index_0 = [i for i, x in enumerate(labels) if x == 0]
    index_0 = random.sample(index_0, len(index_1))
    data_1 = [data[i] for i in index_1]
    data_0 = [data[i] for i in index_0]

    labels_1 = [Label.TAISAKU]*len(data_1)
    labels_0 = [Label.NASI]*len(data_0) 
    data_0 = np.array(data_0)
    labels_0 = np.array(labels_0)
    data_1 = np.array(data_1)
    labels_1 = np.array(labels_1)
    return data_0, labels_0, data_1, labels_1

In [33]:
losses = []
accuracies = []
iteration_checkpoints = []


def train(data, labels, iterations, batch_size, sample_interval):
    
    data_0, labels_0, data_1, lables_1 = x_y_train(data,labels)
    #1の数7303
    #0の数7303

    data_0 = np.reshape(data_0, (-1, data_0.shape[1], 1))
    data_1 = np.reshape(data_1, (-1, data_1.shape[1], 1))
    
    #ラベル1
    real = np.ones((batch_size,1))
    #ラベル0
    fake = np.zeros((batch_size,1))
    
    
    for iteration in range(iterations):
        
        #-------------------
        #識別器の訓練
        #-------------------
        
        #ランダムに関係があるベクトルをとる
        idx = np.random.randint(0,len(data_1),batch_size)
        vecs = data_1[idx]
        
        
        z = np.random.normal(0, 1,(batch_size, 100))
        
        gen_vec = generator.predict(z)
        
        d_loss_real = discriminator.train_on_batch(vecs, real)
        d_loss_fake = discriminator.train_on_batch(gen_vec, fake)
        d_loss, accuracy = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        #-------------------
        #生成器の訓練
        #-------------------

        z = np.random.normal(0, 1,(batch_size, 100))
            
        gen_vec = generator.predict(z)
        
        g_loss = gan.train_on_batch(z , fake)  #real→fake
        
        if(iteration +1) % sample_interval ==0:
            losses.append((d_loss, g_loss))
            accuracies.append(100.0 * accuracy)
            iteration_checkpoints.append(iteration +1)
        
        print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" %(iteration +1,d_loss, 100.0*accuracy,g_loss))
    return generator

In [34]:
z_dim = 100

discriminator, generator, gan = build_compile(data_word, z_dim)

iterations =1000
batch_size = 128
sample_interval = 1000

generator = train(data_word,labels_word,iterations,batch_size,sample_interval)

1 [D loss: 0.798666, acc.: 46.90%] [G loss: 0.704229]
2 [D loss: 0.751533, acc.: 51.23%] [G loss: 0.701942]
3 [D loss: 0.729438, acc.: 51.64%] [G loss: 0.697114]
4 [D loss: 0.707229, acc.: 54.36%] [G loss: 0.694444]
5 [D loss: 0.690716, acc.: 54.93%] [G loss: 0.693570]
6 [D loss: 0.670387, acc.: 57.04%] [G loss: 0.693211]
7 [D loss: 0.653779, acc.: 59.80%] [G loss: 0.692172]
8 [D loss: 0.637908, acc.: 61.73%] [G loss: 0.690932]
9 [D loss: 0.625756, acc.: 63.94%] [G loss: 0.689723]
10 [D loss: 0.609733, acc.: 67.12%] [G loss: 0.688601]
11 [D loss: 0.596043, acc.: 70.75%] [G loss: 0.687069]
12 [D loss: 0.583942, acc.: 73.30%] [G loss: 0.684661]
13 [D loss: 0.567541, acc.: 76.98%] [G loss: 0.681941]
14 [D loss: 0.556607, acc.: 79.55%] [G loss: 0.679264]
15 [D loss: 0.542930, acc.: 82.63%] [G loss: 0.676641]
16 [D loss: 0.526616, acc.: 86.04%] [G loss: 0.674018]
17 [D loss: 0.516362, acc.: 87.45%] [G loss: 0.670906]
18 [D loss: 0.500471, acc.: 89.79%] [G loss: 0.667483]
19 [D loss: 0.48180

149 [D loss: 0.057206, acc.: 100.00%] [G loss: 0.892993]
150 [D loss: 0.055491, acc.: 100.00%] [G loss: 0.897211]
151 [D loss: 0.054246, acc.: 100.00%] [G loss: 0.901840]
152 [D loss: 0.053887, acc.: 100.00%] [G loss: 0.907368]
153 [D loss: 0.053568, acc.: 100.00%] [G loss: 0.912024]
154 [D loss: 0.053130, acc.: 100.00%] [G loss: 0.918216]
155 [D loss: 0.052036, acc.: 100.00%] [G loss: 0.922428]
156 [D loss: 0.051704, acc.: 100.00%] [G loss: 0.926845]
157 [D loss: 0.051911, acc.: 100.00%] [G loss: 0.927650]
158 [D loss: 0.049817, acc.: 100.00%] [G loss: 0.924920]
159 [D loss: 0.050550, acc.: 100.00%] [G loss: 0.926183]
160 [D loss: 0.050290, acc.: 100.00%] [G loss: 0.925262]
161 [D loss: 0.050828, acc.: 100.00%] [G loss: 0.927368]
162 [D loss: 0.051035, acc.: 100.00%] [G loss: 0.934911]
163 [D loss: 0.050872, acc.: 100.00%] [G loss: 0.938515]
164 [D loss: 0.050100, acc.: 100.00%] [G loss: 0.942398]
165 [D loss: 0.052167, acc.: 100.00%] [G loss: 0.943060]
166 [D loss: 0.052625, acc.: 10

294 [D loss: 0.033903, acc.: 99.93%] [G loss: 1.584532]
295 [D loss: 0.031989, acc.: 99.94%] [G loss: 1.608773]
296 [D loss: 0.031227, acc.: 99.94%] [G loss: 1.632663]
297 [D loss: 0.031446, acc.: 99.91%] [G loss: 1.654647]
298 [D loss: 0.030295, acc.: 99.90%] [G loss: 1.671344]
299 [D loss: 0.031308, acc.: 99.91%] [G loss: 1.698087]
300 [D loss: 0.030250, acc.: 99.88%] [G loss: 1.734038]
301 [D loss: 0.028519, acc.: 99.95%] [G loss: 1.766835]
302 [D loss: 0.029075, acc.: 99.93%] [G loss: 1.796423]
303 [D loss: 0.027907, acc.: 99.93%] [G loss: 1.821559]
304 [D loss: 0.027092, acc.: 99.97%] [G loss: 1.860124]
305 [D loss: 0.027601, acc.: 99.92%] [G loss: 1.892839]
306 [D loss: 0.026511, acc.: 99.97%] [G loss: 1.923232]
307 [D loss: 0.026042, acc.: 99.99%] [G loss: 1.954529]
308 [D loss: 0.025645, acc.: 99.98%] [G loss: 1.985574]
309 [D loss: 0.026216, acc.: 99.97%] [G loss: 2.015791]
310 [D loss: 0.026568, acc.: 99.97%] [G loss: 2.053775]
311 [D loss: 0.025916, acc.: 99.98%] [G loss: 2.

439 [D loss: 0.016439, acc.: 100.00%] [G loss: 2.812070]
440 [D loss: 0.015347, acc.: 100.00%] [G loss: 2.835972]
441 [D loss: 0.014984, acc.: 99.99%] [G loss: 2.843692]
442 [D loss: 0.013621, acc.: 100.00%] [G loss: 2.844773]
443 [D loss: 0.013402, acc.: 100.00%] [G loss: 2.883596]
444 [D loss: 0.012467, acc.: 100.00%] [G loss: 2.902140]
445 [D loss: 0.011996, acc.: 100.00%] [G loss: 2.931898]
446 [D loss: 0.011569, acc.: 100.00%] [G loss: 2.980362]
447 [D loss: 0.011298, acc.: 99.99%] [G loss: 3.019136]
448 [D loss: 0.010848, acc.: 100.00%] [G loss: 3.039509]
449 [D loss: 0.010436, acc.: 99.99%] [G loss: 3.030984]
450 [D loss: 0.010044, acc.: 100.00%] [G loss: 3.020078]
451 [D loss: 0.009959, acc.: 100.00%] [G loss: 2.997826]
452 [D loss: 0.009899, acc.: 100.00%] [G loss: 2.968577]
453 [D loss: 0.009936, acc.: 100.00%] [G loss: 2.908720]
454 [D loss: 0.009660, acc.: 100.00%] [G loss: 2.844471]
455 [D loss: 0.009461, acc.: 100.00%] [G loss: 2.793481]
456 [D loss: 0.009513, acc.: 100.0

585 [D loss: 0.021812, acc.: 100.00%] [G loss: 2.755517]
586 [D loss: 0.021468, acc.: 100.00%] [G loss: 2.731687]
587 [D loss: 0.020664, acc.: 100.00%] [G loss: 2.733606]
588 [D loss: 0.021200, acc.: 100.00%] [G loss: 2.742759]
589 [D loss: 0.020431, acc.: 100.00%] [G loss: 2.743729]
590 [D loss: 0.020376, acc.: 100.00%] [G loss: 2.739049]
591 [D loss: 0.020385, acc.: 100.00%] [G loss: 2.755546]
592 [D loss: 0.020807, acc.: 100.00%] [G loss: 2.780550]
593 [D loss: 0.019890, acc.: 100.00%] [G loss: 2.808397]
594 [D loss: 0.019864, acc.: 100.00%] [G loss: 2.818349]
595 [D loss: 0.019496, acc.: 100.00%] [G loss: 2.822618]
596 [D loss: 0.019099, acc.: 100.00%] [G loss: 2.842066]
597 [D loss: 0.018869, acc.: 100.00%] [G loss: 2.861638]
598 [D loss: 0.018009, acc.: 100.00%] [G loss: 2.898223]
599 [D loss: 0.018268, acc.: 99.99%] [G loss: 2.930706]
600 [D loss: 0.017918, acc.: 100.00%] [G loss: 2.967257]
601 [D loss: 0.017720, acc.: 100.00%] [G loss: 2.986391]
602 [D loss: 0.018245, acc.: 100

731 [D loss: 0.007007, acc.: 100.00%] [G loss: 4.122212]
732 [D loss: 0.007026, acc.: 100.00%] [G loss: 4.122414]
733 [D loss: 0.006968, acc.: 100.00%] [G loss: 4.149658]
734 [D loss: 0.007242, acc.: 100.00%] [G loss: 4.179526]
735 [D loss: 0.007217, acc.: 100.00%] [G loss: 4.210777]
736 [D loss: 0.006992, acc.: 100.00%] [G loss: 4.225337]
737 [D loss: 0.006925, acc.: 100.00%] [G loss: 4.240183]
738 [D loss: 0.007075, acc.: 100.00%] [G loss: 4.237346]
739 [D loss: 0.007020, acc.: 100.00%] [G loss: 4.222518]
740 [D loss: 0.007331, acc.: 99.98%] [G loss: 4.220695]
741 [D loss: 0.006773, acc.: 100.00%] [G loss: 4.243931]
742 [D loss: 0.006989, acc.: 100.00%] [G loss: 4.262623]
743 [D loss: 0.006849, acc.: 100.00%] [G loss: 4.269698]
744 [D loss: 0.006973, acc.: 100.00%] [G loss: 4.261486]
745 [D loss: 0.007037, acc.: 100.00%] [G loss: 4.266458]
746 [D loss: 0.006730, acc.: 100.00%] [G loss: 4.271847]
747 [D loss: 0.006755, acc.: 99.99%] [G loss: 4.323784]
748 [D loss: 0.006640, acc.: 100.

875 [D loss: 0.004631, acc.: 100.00%] [G loss: 1.306555]
876 [D loss: 0.004818, acc.: 100.00%] [G loss: 1.315861]
877 [D loss: 0.004636, acc.: 100.00%] [G loss: 1.341340]
878 [D loss: 0.004637, acc.: 100.00%] [G loss: 1.375164]
879 [D loss: 0.004907, acc.: 100.00%] [G loss: 1.418127]
880 [D loss: 0.004727, acc.: 100.00%] [G loss: 1.464967]
881 [D loss: 0.004834, acc.: 100.00%] [G loss: 1.509828]
882 [D loss: 0.004505, acc.: 100.00%] [G loss: 1.545273]
883 [D loss: 0.004540, acc.: 100.00%] [G loss: 1.559448]
884 [D loss: 0.004384, acc.: 100.00%] [G loss: 1.558810]
885 [D loss: 0.004665, acc.: 100.00%] [G loss: 1.557967]
886 [D loss: 0.004468, acc.: 100.00%] [G loss: 1.562344]
887 [D loss: 0.004657, acc.: 100.00%] [G loss: 1.569088]
888 [D loss: 0.004088, acc.: 100.00%] [G loss: 1.590155]
889 [D loss: 0.004698, acc.: 100.00%] [G loss: 1.632544]
890 [D loss: 0.004393, acc.: 100.00%] [G loss: 1.675312]
891 [D loss: 0.004678, acc.: 99.99%] [G loss: 1.721993]
892 [D loss: 0.004278, acc.: 100

In [35]:
z = np.random.normal(0,1,(1, 100))
vec = generator.predict(z)
vec = vec[0].reshape(1,400)
vec.shape

(1, 400)

In [36]:
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [38]:
_ ,  _, data_1, _ = x_y_train(data_word,labels_word)
data_1.shape
result = 0
for i in range(7303):
    if result < cos_sim(vec[0],data_1[i]):
        result = cos_sim(vec[0],data_1[i])
result

0.112584877604407