<a href="https://colab.research.google.com/github/yujiimt/NLP/blob/master/book/deeplearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams, make_sampling_table
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Input, Dot, Flatten, Embedding, Dense
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

from pprint import pprint
import numpy as np

In [2]:
# Download the SlothLib Japanese stop-word list next to the notebook.
import urllib.request as request
url = "https://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
csvfile = "Japanese.txt"  # local destination path (a .txt file, despite the variable name)

# urlretrieve returns a (local_path, headers) tuple; as the last expression it is echoed by the cell.
request.urlretrieve(url,csvfile)

('Japanese.txt', <http.client.HTTPMessage at 0x7f0651526e80>)

In [3]:
# Fetch the ja.text8 corpus archive and unpack it into data/.
# -p / -nc / -n keep the cell safe to re-run: an existing directory, archive or
# extracted file is left untouched instead of erroring, being duplicated, or
# triggering an interactive overwrite prompt.
!mkdir -p data
!wget -nc https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/ja.text8.zip -P data/
!unzip -n data/ja.text8.zip -d data/

--2020-05-01 07:29:30--  https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/ja.text8.zip
Resolving s3-ap-northeast-1.amazonaws.com (s3-ap-northeast-1.amazonaws.com)... 52.219.0.206
Connecting to s3-ap-northeast-1.amazonaws.com (s3-ap-northeast-1.amazonaws.com)|52.219.0.206|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33905114 (32M) [application/zip]
Saving to: ‘data/ja.text8.zip’


2020-05-01 07:29:35 (8.25 MB/s) - ‘data/ja.text8.zip’ saved [33905114/33905114]

Archive:  data/ja.text8.zip
  inflating: data/ja.text8           


In [4]:
# Demo: fit a Tokenizer on one whitespace-separated sentence and inspect its index.
# The recorded output shows the OOV token '<UNK>' taking index 1 and the words following from 2.
tokenizer = Tokenizer(num_words = 10, oov_token='<UNK>')
texts = ['今日 は 良い 天気 だ 。']
tokenizer.fit_on_texts(texts)
# Bare expression that is not the last line of the cell, so its value is not displayed.
tokenizer.word_index
# Last expression of the cell: the index -> word mapping is what gets displayed.
tokenizer.index_word

{1: '<UNK>', 2: '今日', 3: 'は', 4: '良い', 5: '天気', 6: 'だ', 7: '。'}

In [5]:
# Demo: build skip-gram (target, context) couples with a window of 1.
# The result is a (couples, labels) pair; in the recorded output the couples taken
# from the sequence carry label 1 and the couples paired with a random integer carry label 0.
sequence = ['猫', 'は', 'かわいい']
pprint(skipgrams(sequence, vocabulary_size=4, window_size=1))

([['かわいい', 1],
  ['猫', 'は'],
  ['は', '猫'],
  ['かわいい', 'は'],
  ['は', 3],
  ['は', 2],
  ['猫', 3],
  ['は', 'かわいい']],
 [0, 1, 1, 1, 0, 0, 0, 1])


In [0]:
def load_data(filepath, encoding = 'utf-8'):
    """Read a whole text file into memory.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        The complete file contents as a single string.
    """
    with open(filepath, encoding=encoding) as corpus_file:
        contents = corpus_file.read()
    return contents

In [0]:
def build_vocabulary(text, num_words = None):
    """Fit a ``Tokenizer`` on a single text and return it.

    Args:
        text: The corpus as one string; it is wrapped in a one-element list
            before being passed to ``fit_on_texts``.
        num_words: Forwarded unchanged to ``Tokenizer(num_words=...)``.

    Returns:
        The fitted ``Tokenizer``, created with ``'<UNK>'`` as its OOV token.
    """
    vocab = Tokenizer(oov_token='<UNK>', num_words=num_words)
    corpus = [text]
    vocab.fit_on_texts(corpus)
    return vocab

In [0]:
def create_dataset(text, vocab, num_words, window_size, negative_samples):
    """Turn a raw text into skip-gram training arrays.

    Args:
        text: The corpus as one string.
        vocab: Fitted tokenizer; its ``texts_to_sequences`` converts the text
            into a sequence of word indices.
        num_words: Vocabulary size passed to both ``make_sampling_table`` and
            ``skipgrams``.
        window_size: Forwarded to ``skipgrams(window_size=...)``.
        negative_samples: Forwarded to ``skipgrams(negative_samples=...)``.

    Returns:
        ``([word_target, word_context], labels)`` where both inputs are integer
        arrays of shape ``(n, 1)`` and ``labels`` is an array of length ``n``.
        When no couple is generated the inputs have shape ``(0, 1)``.
    """
    data = vocab.texts_to_sequences([text]).pop()
    sampling_table = make_sampling_table(num_words)
    couples, labels = skipgrams(data, num_words, window_size=window_size,
                                negative_samples=negative_samples,
                                sampling_table=sampling_table)
    # reshape(-1, 2) keeps a well-formed (0, 2) array when nothing was sampled;
    # unpacking zip(*couples) into two names raised ValueError in that case.
    pairs = np.asarray(couples, dtype=int).reshape(-1, 2)
    # Column slices keep the trailing axis, giving the (n, 1) shape the two
    # single-token model inputs expect; copy() makes each array contiguous.
    word_target = pairs[:, :1].copy()
    word_context = pairs[:, 1:].copy()
    labels = np.asarray(labels)
    return [word_target, word_context], labels

In [0]:
class EmbeddingModel:
    """Two-input network scoring a (word, context) index pair.

    Each input is a single token index that goes through its own embedding
    layer; the two embeddings are combined with a dot product, flattened, and
    fed to one sigmoid unit.
    """

    def __init__(self, vocab_size, emb_dim = 100):
        """Create the layers; nothing is wired together until ``build``.

        Args:
            vocab_size: ``input_dim`` of both embedding layers.
            emb_dim: ``output_dim`` of both embedding layers.
        """
        # Both embedding layers share every setting except their name.
        embedding_args = dict(input_dim=vocab_size,
                              output_dim=emb_dim,
                              input_length=1)

        self.word_input = Input(shape=(1,), name='word_input')
        self.word_embed = Embedding(name='word_embedding', **embedding_args)

        self.context_input = Input(shape=(1,), name='context_input')
        self.context_embed = Embedding(name='context_embedding', **embedding_args)

        self.dot = Dot(axes=2)
        self.flatten = Flatten()
        self.output = Dense(1, activation='sigmoid')

    def build(self):
        """Connect the layers and return the resulting ``Model``.

        Returns:
            A ``Model`` taking ``[word_input, context_input]`` and producing
            the output of the sigmoid ``Dense`` layer.
        """
        target_vec = self.word_embed(self.word_input)
        context_vec = self.context_embed(self.context_input)
        score = self.flatten(self.dot([target_vec, context_vec]))
        prediction = self.output(score)
        return Model(inputs=[self.word_input, self.context_input],
                     outputs=prediction)

In [0]:
class InferenceAPI:
    """Similarity queries over the trained ``word_embedding`` matrix.

    Attributes:
        vocab: Tokenizer-like object exposing ``word_index`` (word -> index)
            and ``index_word`` (index -> word).
        weights: First weight array of the model's ``word_embedding`` layer;
            row ``i`` is the vector of token index ``i``.
    """

    # Index used for words that cannot be looked up (the '<UNK>' token).
    _UNK_INDEX = 1

    def __init__(self, model, vocab):
        """Store the vocabulary and pull the embedding matrix out of ``model``."""
        self.vocab = vocab
        self.weights = model.get_layer('word_embedding').get_weights()[0]

    def _lookup(self, word):
        """Return the embedding row index for ``word``, or the UNK index.

        ``word_index`` can contain indices past the last embedding row (Keras
        keeps every seen word in ``word_index`` even when ``num_words`` caps
        the vocabulary the model is built with), so out-of-range indices are
        treated like unknown words instead of raising ``IndexError``.
        """
        index = self.vocab.word_index.get(word, self._UNK_INDEX)
        if index >= len(self.weights):
            return self._UNK_INDEX
        return index

    def most_similar(self, word, topn=10):
        """Return the ``topn`` vocabulary words closest to ``word``.

        Args:
            word: Query word; unknown words fall back to the UNK vector.
            topn: Maximum number of neighbours to return.

        Returns:
            A list of ``(word, cosine_similarity)`` tuples, most similar first.
            The query word itself is never included.
        """
        word_index = self._lookup(word)
        sim = self._cosine_similarity(word_index)
        index_word = self.vocab.index_word
        # Skip the query itself and any row without a vocabulary entry (row 0
        # is never assigned to a word), which would raise KeyError below.
        pairs = [(s, i) for i, s in enumerate(sim)
                 if i != word_index and i in index_word]
        pairs.sort(reverse=True)
        return [(index_word[i], s) for s, i in pairs[:topn]]

    def similarity(self, word1, word2):
        """Return the cosine similarity between the vectors of two words.

        Unknown words fall back to the UNK vector.
        """
        weight1 = self.weights[self._lookup(word1)]
        weight2 = self.weights[self._lookup(word2)]
        # scipy's cosine() is a distance (1 - similarity); convert it back.
        return 1 - cosine(weight1, weight2)

    def _cosine_similarity(self, target_idx):
        """Return a 1-D array with the cosine similarity of every row to row ``target_idx``."""
        target_weight = self.weights[target_idx]
        similarity = cosine_similarity(self.weights, [target_weight])
        return similarity.flatten()

In [0]:
def main():
    """Train the skip-gram model on ``data/ja.text8`` and print neighbours of '日本'.

    Steps: load the corpus, fit the vocabulary, build the training arrays,
    train with early stopping while checkpointing the best model to
    ``model.h5``, reload that checkpoint, and pretty-print the words most
    similar to '日本'.
    """
    # Training configuration.
    num_words = 10000
    window_size = 1
    negative_samples = 1
    emb_dim = 50
    epochs = 10
    model_path = 'model.h5'

    # Data preparation.
    corpus = load_data(filepath = 'data/ja.text8')
    vocab = build_vocabulary(corpus, num_words)
    inputs, labels = create_dataset(corpus, vocab, num_words,
                                    window_size, negative_samples)

    # Model definition.
    network = EmbeddingModel(num_words, emb_dim).build()
    network.compile(optimizer='adam', loss='binary_crossentropy')

    # Stop once validation stops improving and keep only the best checkpoint.
    training_callbacks = [
        EarlyStopping(patience=1),
        ModelCheckpoint(model_path, save_best_only=True),
    ]
    network.fit(x=inputs,
                y=labels,
                batch_size=128,
                epochs=epochs,
                validation_split=0.2,
                callbacks=training_callbacks)

    # Query the checkpointed model rather than the in-memory one.
    best_model = load_model(model_path)
    api = InferenceAPI(best_model, vocab)
    pprint(api.most_similar(word = '日本'))

In [0]:
# Run the full training pipeline when this code is executed directly (not imported).
if __name__ == "__main__" :
    main()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10