In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from keras.layers import Dense, Flatten, Reshape
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adam

In [2]:
from gensim.models import word2vec
import MeCab
import json
import hashlib
from googletrans import Translator
from keras.layers import Dropout
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import torch
from transformers.modeling_bert import BertModel
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
%matplotlib inline

In [3]:
# Shared NLP resources used by the embedding helpers defined below. Everything
# here is loaded eagerly, so the dictionary and model files must already exist.
translator = Translator()  # NOTE(review): not referenced by any other visible cell - confirm it is still needed
# MeCab tagger pointed at the mecab-ipadic-neologd dictionary (hard-coded absolute path).
mt = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/')
# Parse an empty string once before any parseToNode() call.
# NOTE(review): presumably the usual workaround for unreliable node surfaces in the MeCab Python binding - confirm.
mt.parse('')
tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')

# Pre-trained models: doc2vec and word2vec are read from local files, BERT by model name.
model_doc = Doc2Vec.load("jawiki.doc2vec.dbow300d.model")
model_word = word2vec.Word2Vec.load("wiki_plus.model")
model_bert = BertModel.from_pretrained('bert-base-japanese-whole-word-masking')

In [4]:
def get_tags(text):
    """Count the content words of ``text`` that the word2vec model knows.

    The text is tokenised with the module-level MeCab tagger ``mt``. A token is
    counted when the first field of its feature string is one of 名詞 / 動詞 /
    形容詞 (noun / verb / adjective) and its surface form is present in
    ``model_word.wv``.

    Args:
        text: The string to tokenise.

    Returns:
        dict mapping each accepted surface form to its number of occurrences.
    """
    content_pos = ('名詞', '動詞', '形容詞')
    counts = {}
    node = mt.parseToNode(text)
    # MeCab hands back a linked list of nodes; follow ``next`` until it runs out.
    while node:
        pos = node.feature.split(",")[0]
        if pos in content_pos and node.surface in model_word.wv:
            surface = node.surface
            if surface in counts:
                counts[surface] += 1
            else:
                counts[surface] = 1
        node = node.next
    return counts

def weighted_mean_vec(text, smoothing=1.0):
    """Embed ``text`` as the count-weighted average of its word2vec vectors.

    Each word returned by ``get_tags(text)`` contributes its vector multiplied
    by its occurrence count. The sum is divided by ``smoothing`` plus the total
    count.

    Args:
        text: The string to embed.
        smoothing: Constant added to the denominator. The default of 1.0 keeps
            the historical behaviour: it avoids a division by zero for texts
            with no known words, but it also shrinks every vector (a text with
            a single known word yields half of that word's vector). Pass 0 to
            get the exact weighted mean.

    Returns:
        numpy array of length ``model_word.vector_size``; all zeros when the
        denominator is zero (no known words and ``smoothing == 0``).
    """
    total = np.zeros(model_word.vector_size)
    weight_sum = float(smoothing)
    for w, count in get_tags(text).items():
        total += count * model_word.wv[w]  # occurrences of w in the text * vector of w
        weight_sum += count
    if weight_sum == 0:
        # Nothing to average and no smoothing: return the zero vector rather than NaNs.
        return total
    return total / weight_sum

def get_tags_for_doc2vec(text):
    """Tokenise ``text`` into the surface forms known to the doc2vec model.

    The text is tokenised with the module-level MeCab tagger ``mt``. Tokens are
    kept in their original order, repeats included; no part-of-speech filter is
    applied.

    Args:
        text: The string to tokenise.

    Returns:
        list of non-empty surface strings that are present in ``model_doc.wv``.
    """
    words = []
    node = mt.parseToNode(text)
    while node:
        surface = node.surface
        # Reject empty surfaces first so they never reach the vocabulary lookup.
        if surface != '' and surface in model_doc.wv:
            words.append(surface)
        node = node.next
    return words

# BERT sentence embedding
def get_vector_cls(text):
    """Embed ``text`` with BERT and return the vector at the first token position.

    The text is encoded with the module-level ``tokenizer`` and passed through
    ``model_bert``. The result is taken from the first model output, first
    batch item, first token.
    NOTE(review): that slot is presumably the [CLS] token, assuming ``encode``
    adds special tokens by default - confirm for the installed transformers
    version.

    Args:
        text: The string to embed.

    Returns:
        A fresh numpy array (copied, detached from the torch graph) holding the
        selected hidden vector.
    """
    input_ids = tokenizer.encode(text, return_tensors='pt')
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every embedded string.
    with torch.no_grad():
        result = model_bert(input_ids)
    cls_hidden = result[0][0][0]
    return cls_hidden.to('cpu').detach().numpy().copy()

In [5]:
class Event:
    """One record of the event set.

    Attributes are stored exactly as passed in:
        id:    identifier of the record.
        type:  type information for the record.
        score: score value of the record.
        desc:  description text of the record.
        links: the records this one is linked to.
    """

    def __init__(self, id, type, score, desc, links):
        # Plain value holder: no validation or copying is performed.
        self.id, self.type, self.score = id, type, score
        self.desc, self.links = desc, links

In [6]:
# Load the event set from a JSON file
def load_events(jsonfile):
    """Read a JSON list of event records and return them as linked ``Event`` objects.

    Each record must provide the keys ``id``, ``type``, ``score``, ``desc`` and
    ``links``, where ``links`` is a list of ids of other records in the same
    file.

    Args:
        jsonfile: Path of the JSON file to read.

    Returns:
        dict mapping each record id to its ``Event``. After loading, every
        ``Event.links`` holds the referenced ``Event`` objects instead of ids.

    Raises:
        KeyError: if a record lacks a required key or links to an unknown id.
    """
    # Read explicitly as UTF-8: the notebook matches Japanese strings in this
    # data, and the platform default encoding is not UTF-8 everywhere.
    with open(jsonfile, encoding='utf-8') as f:
        records = json.load(f)
    # First pass: build id -> Event, with links still holding raw ids.
    events = {r['id']: Event(r['id'], r['type'], r['score'], r['desc'], r['links']) for r in records}
    # Second pass: every Event exists now, so ids can be swapped for objects.
    for ev in events.values():
        ev.links = [events[link_id] for link_id in ev.links]
    return events

In [7]:
# Load the event set; load_events() returns {id: Event} with each Event.links resolved to Event objects.
events = load_events('sesaku2.json')

In [8]:
import pandas as pd

data = []
labels = []

# Build the part x countermeasure link table `df`:
#   rows    = description of every event whose last type entry is 部品 (part)
#   columns = distinct descriptions of events whose last type entry is 対策 (countermeasure)
#   cell    = 1 if the part links to a countermeasure with that description, else 0

# Row labels keep file order; duplicate part descriptions are kept as before.
index = [ev.desc for ev in events.values() if ev.type[-1] == '部品']
# Column labels are de-duplicated while preserving first-seen order.
columns = list(dict.fromkeys(ev.desc for ev in events.values() if ev.type[-1] == '対策'))

# Start from all zeros and only ever write 1s. Several countermeasure events can
# share one description (hence the de-duplication above); writing a 0 for every
# non-linked event would let a later event erase a link recorded by an earlier
# one with the same description.
df = pd.DataFrame(0, index=index, columns=columns)
for part in events.values():
    if part.type[-1] != '部品':
        continue
    # Walk the part's own links instead of testing every countermeasure for membership.
    for linked in part.links:
        if linked.type[-1] == '対策':
            df.at[part.desc, linked.desc] = 1

In [9]:
class Label:
    """Binary target of a (part, countermeasure) pair."""
    TAISAKU = 1  # the link table holds 1 for this pair
    NASI = 0     # anything else


def _embed_doc2vec(text):
    """Infer a doc2vec vector for ``text`` from its in-vocabulary tokens."""
    return model_doc.infer_vector(get_tags_for_doc2vec(text))


def _build_pair_dataset(link_table, embed):
    """Turn the link table into concatenated pair vectors plus labels.

    Args:
        link_table: DataFrame whose index holds part descriptions, whose
            columns hold countermeasure descriptions and whose cells are 1
            for linked pairs.
        embed: Callable mapping a description string to a 1-D vector.

    Returns:
        (measure_vecs, pairs, targets) where ``measure_vecs`` maps each column
        label to its embedding, ``pairs`` is a list with one
        ``np.append(part_vec, measure_vec)`` per (row, column) in row-major
        order, and ``targets`` is the matching list of ``Label`` values.
    """
    # Embed every countermeasure once up front; they are reused for every part.
    measure_vecs = {measure: embed(measure) for measure in link_table}
    pairs, targets = [], []
    for part_desc, row in link_table.iterrows():
        part_vec = embed(part_desc)
        for measure in link_table:
            pairs.append(np.append(part_vec, measure_vecs[measure]))
            targets.append(Label.TAISAKU if row[measure] == 1 else Label.NASI)
    return measure_vecs, pairs, targets


# The same pair dataset under three different text embeddings.
# word2vec (count-weighted mean of word vectors)
taisaku_vec_word, data_word, labels_word = _build_pair_dataset(df, weighted_mean_vec)
# doc2vec (inferred document vector)
# NOTE(review): infer_vector is presumably stochastic unless seeded, so reruns may differ - confirm.
taisaku_vec_doc, data_doc, labels_doc = _build_pair_dataset(df, _embed_doc2vec)
# BERT (vector at the first token position)
taisaku_vec_bert, data_BERT_cls, labels_BERT_cls = _build_pair_dataset(df, get_vector_cls)

In [10]:
# Generator
def build_generator(data_size, z_dim):
    """Build the (uncompiled) generator network.

    Architecture: Dense(128) on a ``z_dim``-wide input, LeakyReLU(alpha=0.01),
    then Dense(``data_size``) with tanh activation.

    Args:
        data_size: Width of the vectors the generator must produce.
        z_dim: Width of the latent noise input.

    Returns:
        A ``Sequential`` model mapping (batch, z_dim) to (batch, data_size).
    """
    layers = [
        Dense(128, input_dim=z_dim),
        LeakyReLU(alpha=0.01),
        # tanh keeps every generated component inside [-1, 1].
        Dense(data_size, activation='tanh'),
    ]
    return Sequential(layers)

In [11]:
# Discriminator

def build_discriminator(data_size):
    """Build the (uncompiled) discriminator network.

    Architecture: Dense(128) on a ``data_size``-wide input, LeakyReLU(alpha=0.01),
    then Dense(1) with sigmoid activation.

    Args:
        data_size: Width of the vectors to classify.

    Returns:
        A ``Sequential`` model mapping (batch, data_size) to (batch, 1).
    """
    model = Sequential()

    model.add(Dense(128, input_shape=(data_size,)))

    # LeakyReLU is added as its own layer, exactly as build_generator does.
    # Keras warns against passing a layer instance through `activation=`.
    model.add(LeakyReLU(alpha=0.01))

    model.add(Dense(1, activation='sigmoid'))

    return model

In [12]:
def build_gan(generator, discriminator):
    """Chain the generator into the discriminator as one (uncompiled) model.

    Args:
        generator: Model turning latent noise into vectors.
        discriminator: Model scoring vectors.

    Returns:
        A ``Sequential`` model whose layers are ``generator`` followed by
        ``discriminator``.
    """
    return Sequential([generator, discriminator])


def build_compile(data, z_dim):
    """Build and compile the discriminator, the generator and the stacked GAN.

    The order of the statements matters: the discriminator is compiled while
    it is still trainable, and only then frozen before the stacked model is
    compiled.

    Args:
        data: Sequence of sample vectors; only ``len(data[0])`` is read, to
            size the networks.
        z_dim: Width of the generator's latent noise input.

    Returns:
        Tuple ``(discriminator, generator, gan)``. The generator itself is not
        compiled; it is trained through ``gan``.
    """
    data_size = len(data[0])
    # Build and compile the discriminator
    discriminator = build_discriminator(data_size)
    discriminator.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    # Build the generator
    generator = build_generator(data_size, z_dim)

    # Freeze the discriminator's parameters while the generator is being trained.
    # NOTE(review): relies on Keras reading `trainable` at compile time, so the
    # already-compiled discriminator still learns on its own - confirm for the installed Keras version.
    discriminator.trainable = False

    # Build and compile the stacked GAN (with the discriminator frozen) to train the generator
    gan = build_gan(generator, discriminator)
    gan.compile(loss='binary_crossentropy', optimizer=Adam())
    
    return discriminator, generator, gan

In [13]:
def train_data(data, labels):
    """Return a class-balanced dataset: all positives plus as many random negatives.

    Every sample labelled 1 is kept. From the samples labelled 0,
    ``random.sample`` draws exactly as many as there are positives.

    Args:
        data: Indexable collection of sample vectors.
        labels: Iterable of 0/1 labels aligned with ``data``.

    Returns:
        ``(data, labels)`` as numpy arrays, positives first and the sampled
        negatives after them.

    Raises:
        ValueError: from ``random.sample`` when there are fewer negatives than
            positives.
    """
    positives = []
    negatives = []
    for pos, lab in enumerate(labels):
        if lab == 1:
            positives.append(pos)
        elif lab == 0:
            negatives.append(pos)
    # Undersample the negative class down to the size of the positive class.
    negatives = random.sample(negatives, len(positives))

    picked = [data[j] for j in positives] + [data[j] for j in negatives]
    targets = [Label.TAISAKU] * len(positives) + [Label.NASI] * len(negatives)
    return np.array(picked), np.array(targets)

In [14]:
def x_y_train(data, labels):
    """Split the samples into balanced negative and positive numpy arrays.

    Every sample labelled 1 is kept. From the samples labelled 0,
    ``random.sample`` draws exactly as many as there are positives.

    Args:
        data: Indexable collection of sample vectors.
        labels: Iterable of 0/1 labels aligned with ``data``.

    Returns:
        ``(data_0, labels_0, data_1, labels_1)``: the sampled negatives with
        their labels, then the positives with theirs, all as numpy arrays.

    Raises:
        ValueError: from ``random.sample`` when there are fewer negatives than
            positives.
    """
    pos_idx = [k for k, lab in enumerate(labels) if lab == 1]
    neg_pool = [k for k, lab in enumerate(labels) if lab == 0]
    # Undersample the negative class down to the size of the positive class.
    neg_idx = random.sample(neg_pool, len(pos_idx))

    positives = np.array([data[k] for k in pos_idx])
    negatives = np.array([data[k] for k in neg_idx])
    positive_targets = np.array([Label.TAISAKU] * len(pos_idx))
    negative_targets = np.array([Label.NASI] * len(neg_idx))
    return negatives, negative_targets, positives, positive_targets

In [15]:
# Training history, appended to by train() every `sample_interval` iterations.
losses = []
accuracies = []
iteration_checkpoints = []


def train(data, labels, iterations, batch_size, sample_interval, z_dim=100):
    """Run the GAN training loop on the positive (label 1) samples.

    Uses the module-level ``discriminator``, ``generator`` and ``gan`` models
    (they are not parameters) and appends to the module-level ``losses``,
    ``accuracies`` and ``iteration_checkpoints`` lists.

    Args:
        data: Sample vectors.
        labels: 0/1 labels aligned with ``data``; only samples labelled 1 are
            used as "real" examples.
        iterations: Number of training iterations.
        batch_size: Number of real and of generated samples per iteration.
        sample_interval: Record loss/accuracy every this many iterations.
        z_dim: Width of the latent noise fed to the generator; must match the
            ``z_dim`` the generator was built with. Defaults to 100, the value
            that used to be hard-coded here.

    Returns:
        The module-level ``generator`` after training.
    """
    # Only the positive pairs serve as real data; the other outputs are not needed.
    _, _, data_1, _ = x_y_train(data, labels)

    # Discriminator targets: 1 for real vectors, 0 for generated ones.
    real = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))

    for iteration in range(iterations):

        # -------------------
        # Train the discriminator
        # -------------------

        # Random batch of real (linked) pair vectors, drawn with replacement.
        idx = np.random.randint(0, len(data_1), batch_size)
        vecs = data_1[idx]

        z = np.random.normal(0, 1, (batch_size, z_dim))
        gen_vec = generator.predict(z)

        d_loss_real = discriminator.train_on_batch(vecs, real)
        d_loss_fake = discriminator.train_on_batch(gen_vec, fake)
        # Each result is [loss, accuracy]; average the real and fake halves.
        d_loss, accuracy = 0.5 * np.add(d_loss_real, d_loss_fake)

        # -------------------
        # Train the generator
        # -------------------

        # Fresh noise from a normal distribution with mean 0 and standard deviation 1.
        # The stacked model runs the generator itself, so no separate predict() is needed.
        z = np.random.normal(0, 1, (batch_size, z_dim))

        # Label generated samples as "real" so the generator learns to fool the discriminator.
        g_loss = gan.train_on_batch(z, real)

        if (iteration + 1) % sample_interval == 0:
            losses.append((d_loss, g_loss))
            accuracies.append(100.0 * accuracy)
            iteration_checkpoints.append(iteration + 1)

        print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" %(iteration +1,d_loss, 100.0*accuracy,g_loss))
    return generator

In [16]:
# Balanced split of the word2vec pair dataset: sampled negatives and all positives.
data_0, labels_0, data_1, labels_1 = x_y_train(data_word, labels_word)

In [17]:
# Scratch check: five random row indices into data_1. `z` is reassigned before its next visible use.
z = np.random.randint(0,len(data_1),5)

In [18]:
# Width of one pair vector (part embedding concatenated with countermeasure embedding).
len(data_1[1])

400

In [19]:
# Width of the generator's latent noise input.
# NOTE: train() is called below without a latent size and uses 100 for its noise,
# so keep this at 100 unless train() is adjusted as well.
z_dim = 100

# Networks are sized from the word2vec pair vectors.
discriminator, generator, gan = build_compile(data_word, z_dim)

iterations =1000
batch_size = 128
# Loss/accuracy are recorded every `sample_interval` iterations; with the value
# equal to `iterations`, only the final iteration is recorded.
sample_interval = 1000

# train() works on the module-level discriminator/generator/gan created above and returns that generator.
generator = train(data_word,labels_word,iterations,batch_size,sample_interval)

1 [D loss: 0.962333, acc.: 41.41%] [G loss: 0.944094]
2 [D loss: 0.417514, acc.: 68.75%] [G loss: 0.766589]
3 [D loss: 0.444082, acc.: 59.38%] [G loss: 0.633709]
4 [D loss: 0.521503, acc.: 52.73%] [G loss: 0.535652]
5 [D loss: 0.580824, acc.: 51.17%] [G loss: 0.527657]
6 [D loss: 0.634639, acc.: 50.39%] [G loss: 0.425568]
7 [D loss: 0.652550, acc.: 50.39%] [G loss: 0.440438]
8 [D loss: 0.683285, acc.: 50.39%] [G loss: 0.448032]
9 [D loss: 0.652323, acc.: 50.78%] [G loss: 0.487600]
10 [D loss: 0.598742, acc.: 50.78%] [G loss: 0.584463]
11 [D loss: 0.515210, acc.: 53.52%] [G loss: 0.658654]
12 [D loss: 0.454722, acc.: 61.33%] [G loss: 0.908089]
13 [D loss: 0.355974, acc.: 78.12%] [G loss: 1.071683]
14 [D loss: 0.263202, acc.: 92.58%] [G loss: 1.363299]
15 [D loss: 0.221081, acc.: 95.31%] [G loss: 1.654232]
16 [D loss: 0.166246, acc.: 98.44%] [G loss: 1.921894]
17 [D loss: 0.137351, acc.: 99.61%] [G loss: 2.164061]
18 [D loss: 0.109910, acc.: 99.61%] [G loss: 2.283379]
19 [D loss: 0.11359

150 [D loss: 0.009086, acc.: 100.00%] [G loss: 5.661693]
151 [D loss: 0.010081, acc.: 100.00%] [G loss: 5.505837]
152 [D loss: 0.011247, acc.: 100.00%] [G loss: 5.356610]
153 [D loss: 0.011699, acc.: 100.00%] [G loss: 5.224894]
154 [D loss: 0.013641, acc.: 100.00%] [G loss: 5.028386]
155 [D loss: 0.013060, acc.: 100.00%] [G loss: 5.056720]
156 [D loss: 0.015436, acc.: 100.00%] [G loss: 5.191667]
157 [D loss: 0.015532, acc.: 100.00%] [G loss: 5.198680]
158 [D loss: 0.015750, acc.: 100.00%] [G loss: 5.061482]
159 [D loss: 0.019354, acc.: 99.61%] [G loss: 5.284676]
160 [D loss: 0.011670, acc.: 100.00%] [G loss: 5.379464]
161 [D loss: 0.015175, acc.: 100.00%] [G loss: 5.618203]
162 [D loss: 0.008695, acc.: 100.00%] [G loss: 5.430086]
163 [D loss: 0.012093, acc.: 100.00%] [G loss: 5.277867]
164 [D loss: 0.011534, acc.: 100.00%] [G loss: 5.133811]
165 [D loss: 0.015996, acc.: 99.61%] [G loss: 5.208025]
166 [D loss: 0.014212, acc.: 100.00%] [G loss: 5.349712]
167 [D loss: 0.011878, acc.: 100.

296 [D loss: 0.062916, acc.: 100.00%] [G loss: 2.332694]
297 [D loss: 0.068883, acc.: 99.61%] [G loss: 2.319421]
298 [D loss: 0.064677, acc.: 100.00%] [G loss: 2.308158]
299 [D loss: 0.064334, acc.: 99.61%] [G loss: 2.293146]
300 [D loss: 0.064444, acc.: 100.00%] [G loss: 2.282376]
301 [D loss: 0.061227, acc.: 100.00%] [G loss: 2.300202]
302 [D loss: 0.061059, acc.: 100.00%] [G loss: 2.311715]
303 [D loss: 0.060406, acc.: 100.00%] [G loss: 2.338560]
304 [D loss: 0.058862, acc.: 100.00%] [G loss: 2.417393]
305 [D loss: 0.056909, acc.: 100.00%] [G loss: 2.372139]
306 [D loss: 0.057509, acc.: 100.00%] [G loss: 2.428709]
307 [D loss: 0.061683, acc.: 100.00%] [G loss: 2.433014]
308 [D loss: 0.057318, acc.: 100.00%] [G loss: 2.427903]
309 [D loss: 0.056421, acc.: 100.00%] [G loss: 2.398629]
310 [D loss: 0.059191, acc.: 100.00%] [G loss: 2.415679]
311 [D loss: 0.054647, acc.: 100.00%] [G loss: 2.418190]
312 [D loss: 0.054907, acc.: 100.00%] [G loss: 2.467316]
313 [D loss: 0.053830, acc.: 100.

440 [D loss: 0.033870, acc.: 100.00%] [G loss: 3.872492]
441 [D loss: 0.031464, acc.: 99.22%] [G loss: 4.001492]
442 [D loss: 0.026807, acc.: 100.00%] [G loss: 4.125230]
443 [D loss: 0.032095, acc.: 100.00%] [G loss: 4.321135]
444 [D loss: 0.021100, acc.: 100.00%] [G loss: 4.400486]
445 [D loss: 0.028898, acc.: 100.00%] [G loss: 4.326350]
446 [D loss: 0.024080, acc.: 100.00%] [G loss: 4.231606]
447 [D loss: 0.020967, acc.: 100.00%] [G loss: 4.263068]
448 [D loss: 0.018470, acc.: 100.00%] [G loss: 4.442733]
449 [D loss: 0.024968, acc.: 100.00%] [G loss: 4.340350]
450 [D loss: 0.017115, acc.: 100.00%] [G loss: 4.492451]
451 [D loss: 0.025344, acc.: 99.61%] [G loss: 4.297660]
452 [D loss: 0.019850, acc.: 100.00%] [G loss: 4.397995]
453 [D loss: 0.028031, acc.: 100.00%] [G loss: 4.255454]
454 [D loss: 0.026982, acc.: 99.61%] [G loss: 4.276411]
455 [D loss: 0.030842, acc.: 100.00%] [G loss: 4.182442]
456 [D loss: 0.037723, acc.: 99.61%] [G loss: 4.233121]
457 [D loss: 0.043081, acc.: 99.61%

588 [D loss: 0.010629, acc.: 100.00%] [G loss: 4.733716]
589 [D loss: 0.010664, acc.: 100.00%] [G loss: 4.776690]
590 [D loss: 0.011754, acc.: 100.00%] [G loss: 4.687382]
591 [D loss: 0.013627, acc.: 100.00%] [G loss: 4.896551]
592 [D loss: 0.014387, acc.: 100.00%] [G loss: 4.739288]
593 [D loss: 0.017210, acc.: 100.00%] [G loss: 4.538580]
594 [D loss: 0.020345, acc.: 100.00%] [G loss: 4.208739]
595 [D loss: 0.019502, acc.: 100.00%] [G loss: 4.217289]
596 [D loss: 0.015863, acc.: 100.00%] [G loss: 4.284447]
597 [D loss: 0.015714, acc.: 100.00%] [G loss: 4.571493]
598 [D loss: 0.023032, acc.: 100.00%] [G loss: 4.585365]
599 [D loss: 0.022891, acc.: 100.00%] [G loss: 4.349692]
600 [D loss: 0.028181, acc.: 100.00%] [G loss: 4.208442]
601 [D loss: 0.024043, acc.: 100.00%] [G loss: 4.056297]
602 [D loss: 0.026036, acc.: 100.00%] [G loss: 4.342232]
603 [D loss: 0.031042, acc.: 100.00%] [G loss: 4.350864]
604 [D loss: 0.032189, acc.: 100.00%] [G loss: 4.369738]
605 [D loss: 0.030635, acc.: 10

736 [D loss: 0.020905, acc.: 100.00%] [G loss: 4.853936]
737 [D loss: 0.021291, acc.: 100.00%] [G loss: 4.833130]
738 [D loss: 0.015168, acc.: 100.00%] [G loss: 4.490783]
739 [D loss: 0.018454, acc.: 100.00%] [G loss: 4.699024]
740 [D loss: 0.017219, acc.: 100.00%] [G loss: 5.090503]
741 [D loss: 0.019528, acc.: 100.00%] [G loss: 5.104705]
742 [D loss: 0.022696, acc.: 100.00%] [G loss: 4.992892]
743 [D loss: 0.017184, acc.: 100.00%] [G loss: 4.839030]
744 [D loss: 0.014698, acc.: 100.00%] [G loss: 5.068901]
745 [D loss: 0.011846, acc.: 100.00%] [G loss: 5.332542]
746 [D loss: 0.025231, acc.: 100.00%] [G loss: 4.703808]
747 [D loss: 0.017069, acc.: 100.00%] [G loss: 4.492939]
748 [D loss: 0.023261, acc.: 100.00%] [G loss: 4.666101]
749 [D loss: 0.017040, acc.: 100.00%] [G loss: 4.935949]
750 [D loss: 0.019073, acc.: 100.00%] [G loss: 4.994081]
751 [D loss: 0.017474, acc.: 100.00%] [G loss: 4.964401]
752 [D loss: 0.020953, acc.: 100.00%] [G loss: 4.633874]
753 [D loss: 0.029620, acc.: 10

880 [D loss: 0.017349, acc.: 99.61%] [G loss: 4.989862]
881 [D loss: 0.013116, acc.: 100.00%] [G loss: 5.089453]
882 [D loss: 0.010344, acc.: 100.00%] [G loss: 5.199720]
883 [D loss: 0.014746, acc.: 100.00%] [G loss: 5.252855]
884 [D loss: 0.011887, acc.: 100.00%] [G loss: 5.216047]
885 [D loss: 0.010316, acc.: 100.00%] [G loss: 5.142741]
886 [D loss: 0.011910, acc.: 100.00%] [G loss: 5.350079]
887 [D loss: 0.017410, acc.: 99.61%] [G loss: 4.883411]
888 [D loss: 0.014303, acc.: 100.00%] [G loss: 4.892160]
889 [D loss: 0.018330, acc.: 100.00%] [G loss: 4.805072]
890 [D loss: 0.022695, acc.: 100.00%] [G loss: 4.642638]
891 [D loss: 0.028508, acc.: 100.00%] [G loss: 4.791015]
892 [D loss: 0.020400, acc.: 100.00%] [G loss: 5.093263]
893 [D loss: 0.027202, acc.: 100.00%] [G loss: 5.087263]
894 [D loss: 0.022749, acc.: 99.61%] [G loss: 4.708878]
895 [D loss: 0.021174, acc.: 99.61%] [G loss: 4.773655]
896 [D loss: 0.012782, acc.: 100.00%] [G loss: 4.854096]
897 [D loss: 0.012147, acc.: 100.00

In [68]:
# Generate one vector from fresh latent noise.
# The noise width must equal the z_dim the generator was built with, so reuse it
# instead of repeating the literal 100.
z = np.random.normal(0, 1, (1, z_dim))
vec = generator.predict(z)
vec.shape

(1, 400)

In [69]:
# Re-extract the positive (label 1) pair vectors of the word2vec dataset; the other three outputs are discarded.
# NOTE(review): assigning to `_` shadows IPython's last-output variable for the rest of the session.
_ ,  _, data_1, _ = x_y_train(data_word,labels_word)
data_1.shape

(7303, 400)

In [70]:
def cos_sim(v1, v2):
    """Cosine similarity of two 1-D vectors.

    Args:
        v1: First vector.
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or 0.0 when either vector has zero
        norm (the ratio is undefined there and would otherwise come out as NaN
        with a runtime warning).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

In [73]:
# Highest cosine similarity between the generated vector and any real positive
# pair vector, i.e. how close the sample is to its nearest real neighbour.
# Iterates over data_1 itself rather than a hard-coded row count, and starts
# from -inf so a maximum below zero is reported as such.
result = float('-inf')
for real_vec in data_1:
    sim = cos_sim(vec[0], real_vec)
    if sim > result:  # a NaN similarity compares False and is skipped
        result = sim
result

0.8048938099471006