In [1]:
%matplotlib inline
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
from matplotlib import pylab
from six.moves import range
from sklearn.manifold import TSNE
import MeCab
import pickle

import emoji as Emoji
from sequence import Sequence
from load_emojis import load_emojis

# Shared tokenizer. The cells below call mecab.parse(text).split() and treat
# the resulting whitespace-separated pieces as tokens.
mecab = MeCab.Tagger("-Owakati")

In [2]:
def build_dictionary():
  """Build the word and emoji vocabularies from every item in Sequence.objects.

  Each sequence's content is tokenized with mecab. Every token except the
  last is registered as a word; the last token is registered as that
  sequence's emoji label. Ids are handed out in first-seen order, and word id
  0 is reserved for 'EOS'.

  Sequences that produce no tokens are skipped.

  Returns:
    A tuple (dictionary, reverse_dictionary, emoji_dictionary,
    reverse_emoji_dictionary): word -> id, id -> word, emoji -> id and
    id -> emoji.
  """
  dictionary = {'EOS': 0}
  emoji_dictionary = {}

  for sequence in Sequence.objects:
    words = mecab.parse(sequence.content).split()
    if not words:
      # No tokens means no text and no trailing emoji to register.
      continue

    for word in words[:-1]:
      # len() is evaluated before insertion, so a new word gets the next free id.
      dictionary.setdefault(word, len(dictionary))
    emoji_dictionary.setdefault(words[-1], len(emoji_dictionary))

  # Invert once after the scan. Rebuilding these inside the loop made the
  # function quadratic in corpus size and left them undefined for an empty corpus.
  reverse_dictionary = {word_id: word for word, word_id in dictionary.items()}
  reverse_emoji_dictionary = {
      emoji_id: emoji for emoji, emoji_id in emoji_dictionary.items()
  }

  return dictionary, reverse_dictionary, emoji_dictionary, reverse_emoji_dictionary
    
# Build the four lookup tables in a single pass over the corpus.
(dictionary,
 reverse_dictionary,
 emoji_dictionary,
 reverse_emoji_dictionary) = build_dictionary()

# Vocabulary size, then the first ten keys of each table as a sanity check.
print(len(dictionary))
print(list(dictionary)[:10])
print(list(reverse_dictionary)[:10])
print(list(emoji_dictionary)[:10])
print(list(reverse_emoji_dictionary)[:10])

KeyboardInterrupt: 

In [6]:
# Cache the lookup tables on disk. Mode 'xb' is exclusive creation: open()
# raises FileExistsError instead of overwriting a pickle that already exists.
for pickle_path, table in (
    ('dictionary.pickle', dictionary),
    ('reverse_dictionary.pickle', reverse_dictionary),
    ('emoji_dictionary.pickle', emoji_dictionary),
    ('reverse_emoji_dictionary.pickle', reverse_emoji_dictionary),
):
  with open(pickle_path, 'xb') as handle:
    pickle.dump(table, handle)

FileExistsError: [Errno 17] File exists: 'dictionary.pickle'

In [2]:
# Reload the cached lookup tables instead of rebuilding them from the corpus.
# Word tables first ...
with open('dictionary.pickle', 'rb') as word_file:
  dictionary = pickle.load(word_file)

with open('reverse_dictionary.pickle', 'rb') as reverse_word_file:
  reverse_dictionary = pickle.load(reverse_word_file)

# ... then the emoji (label) tables.
with open('emoji_dictionary.pickle', 'rb') as emoji_file:
  emoji_dictionary = pickle.load(emoji_file)

with open('reverse_emoji_dictionary.pickle', 'rb') as reverse_emoji_file:
  reverse_emoji_dictionary = pickle.load(reverse_emoji_file)

In [14]:
# Tunable model hyperparameters.
embedding_size = 128  # second dimension of the embedding matrix
lstm_size = 64        # size passed to the LSTM cell
max_length = 50       # every input is padded or truncated to this many ids

# Sizes taken from the lookup tables.
emoji_size = len(emoji_dictionary)
vocabulary_size = len(dictionary)

In [4]:
def seq2id(sequence):
  """Convert a raw text sequence into fixed-length model input.

  The text is tokenized with mecab. The last token is looked up as the emoji
  label; all preceding tokens are looked up as word ids.

  Args:
    sequence: text whose final token is the emoji label.

  Returns:
    A tuple (input_ids, label_id, input_length):
      input_ids: exactly max_length word ids. Words missing from `dictionary`
        map to 0, short inputs are right-padded with 0, and long inputs keep
        only their last max_length ids.
      label_id: id of the last token in `emoji_dictionary`, or -1 when that
        token is unknown or the sequence has no tokens.
      input_length: number of real (unpadded) ids, capped at max_length.
  """
  words = mecab.parse(sequence).split()
  if not words:
    # Nothing to encode: all-padding row, no label, zero length.
    return [0] * max_length, -1, 0

  label_id = emoji_dictionary.get(words[-1], -1)
  input_ids = [dictionary.get(word, 0) for word in words[:-1]]

  input_length = len(input_ids)
  if input_length > max_length:
    # Keep the tail: the tokens closest to the emoji.
    input_length = max_length
    input_ids = input_ids[-max_length:]
  else:
    input_ids = input_ids + [0] * (max_length - input_length)

  return input_ids, label_id, input_length

def id2onehot(id, dictionary_size):
  """Return a one-hot float vector of length dictionary_size with a 1.0 at `id`.

  Args:
    id: index to set; must satisfy 0 <= id < dictionary_size.
    dictionary_size: length of the returned vector.

  Raises:
    IndexError: if `id` is out of range. Negative ids are rejected explicitly;
      otherwise numpy's negative indexing would accept the -1 "unknown"
      sentinel returned by seq2id and silently mark the last class.
  """
  if not 0 <= id < dictionary_size:
    raise IndexError(
        'id %r is out of range for a one-hot vector of size %r' % (id, dictionary_size))
  onehot = np.zeros(dictionary_size)
  onehot[id] = 1.0
  return onehot

# Smoke-test the helpers on the first stored sequence ...
sequence = Sequence.objects[0].content
train_ids, label_id, sequence_length = seq2id(sequence)
for value in (train_ids, label_id, sequence_length):
  print(value)
print(id2onehot(label_id, len(emoji_dictionary)))

# ... and on a hand-written example.
print(seq2id('お休みなさい😊'))

[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0
2
[ 1.  0.  0. ...,  0.  0.  0.]
([68, 1211, 1914, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 43, 3)


In [16]:
# Build the training graph plus a validation path that reuses its variables.
graph = tf.Graph()
with graph.as_default():

  # Per example: max_length token ids, a one-hot emoji label, and the number
  # of unpadded tokens (the three values produced by seq2id / id2onehot).
  inputs = tf.placeholder(tf.int32, shape=[None, max_length])
  labels = tf.placeholder(tf.float32, shape=[None, emoji_size])
  sequence_lengths = tf.placeholder(tf.int32, shape=(None,))

  print(inputs.get_shape())

  embedding = tf.get_variable("embedding", shape=[vocabulary_size, embedding_size], dtype=tf.float32)

  embed_inputs = tf.nn.embedding_lookup(embedding, inputs)
  print(embed_inputs.get_shape())

  cell = tf.nn.rnn_cell.BasicLSTMCell(lstm_size, state_is_tuple=True)
  with tf.variable_scope("train_valid"):
    outputs, state = tf.nn.dynamic_rnn(cell,
                      embed_inputs,
                      dtype=tf.float32,
                      sequence_length=sequence_lengths
                      )
  print(outputs.get_shape())
  print(len(state))
  print(state.h.get_shape())

  # Classifier weights and biases.
  # truncated_normal takes (shape, mean, stddev): the former positional
  # (-0.1, 0.1) meant mean=-0.1, stddev=0.1 rather than a [-0.1, 0.1] range,
  # which biased every initial weight negative. Use a zero-mean init instead.
  w = tf.Variable(tf.truncated_normal([lstm_size, emoji_size], mean=0.0, stddev=0.1), name='w')
  b = tf.Variable(tf.zeros([emoji_size]), name='b')

  # Classify from the final LSTM hidden state.
  logits = tf.nn.xw_plus_b(state.h, w, b)
  print(labels.get_shape())
  # Keyword arguments: later TensorFlow releases reject positional arguments
  # to this op, and naming them rules out swapping logits and labels.
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
  optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
  train_predictions = tf.nn.softmax(logits)

  # Validation eval: separate placeholders, same embedding / LSTM / classifier
  # variables (the scope is re-entered with reuse=True).
  valid_inputs = tf.placeholder(tf.int32, shape=[None, max_length])
  valid_sequence_lengths = tf.placeholder(tf.int32, shape=(None,))
  valid_embed_inputs = tf.nn.embedding_lookup(embedding, valid_inputs)
  with tf.variable_scope("train_valid", reuse = True):
    valid_outputs, valid_state = tf.nn.dynamic_rnn(cell,
                                                   valid_embed_inputs,
                                                   dtype=tf.float32,
                                                   sequence_length=valid_sequence_lengths
                                                   )
  valid_predictions = tf.nn.softmax(tf.nn.xw_plus_b(valid_state.h, w, b))
  saver = tf.train.Saver()

(?, 50)
(?, 50, 128)
(?, 50, 64)
2
(?, 64)
(?, 22686)


In [17]:
def predict_emoji(predictions):
  """Return the emoji whose id scores highest in the first row of `predictions`."""
  best_id = np.argmax(predictions[0])
  return reverse_emoji_dictionary[best_id]

In [22]:
# Training schedule. Step i trains on Sequence.objects[i]; every report_every
# steps one held-out example at index i + valid_offset is scored as well.
num_steps = 100001
valid_offset = 100000
report_every = 100
checkpoint_path = "model/model.ckpt"

# Create the checkpoint directory up front so saving does not depend on it
# already existing.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

with tf.Session(graph=graph) as sess:
  sess.run(tf.initialize_all_variables())

  print("Run session...")
  for step in range(num_steps):
    sequence = Sequence.objects[step].content
    train_ids, label_id, sequence_length = seq2id(sequence)
    if label_id < 0:
      # seq2id found no known emoji label. Skip the sample (and this step's
      # report) rather than train on a bogus one-hot target.
      continue
    train_label = id2onehot(label_id, emoji_size)
    feed_dict = { inputs: [train_ids], labels: [train_label], sequence_lengths: [sequence_length] }

    _, l, predictions = sess.run([optimizer, loss, train_predictions], feed_dict=feed_dict)

    if step % report_every == 0:
      print('Step %d:' % step)
      print('Training set:')
      print('  Loss       : ', l)
      print('  Input      : ', sequence[:-1])
      print('  Label      : ', reverse_emoji_dictionary[label_id])
      print('  Prediction : ', predict_emoji(predictions))

      valid_sequence = Sequence.objects[step + valid_offset].content
      valid_ids, valid_label_id, valid_sequence_length = seq2id(valid_sequence)

      valid_feed_dict = { valid_inputs: [valid_ids], valid_sequence_lengths: [valid_sequence_length] }
      _valid_predictions = valid_predictions.eval(feed_dict=valid_feed_dict)
      print('Validation   :')
      print('  Input      : ', valid_sequence[:-1])
      # .get(): an unknown validation label must not abort a long training run.
      print('  Label      : ', reverse_emoji_dictionary.get(valid_label_id, '(unknown)'))
      print('  Prediction : ', predict_emoji(_valid_predictions))

      # Save the variables to disk.
      save_path = saver.save(sess, checkpoint_path)
      print("Model saved in file: %s" % save_path)

Run session...
Step 0:
Training set:
  Loss       :  10.0308
  Input      :  筋トレガンバ
  Label      :  💪
  Prediction :  🌼🌼💗
Validation   :
  Input      :  ４枚目読んでください
  Label      :  🙏
  Prediction :  💪
Model saved in file: model/model.ckpt
Step 100:
Training set:
  Loss       :  9.99927
  Input      :  ギュッてされる
  Label      :  👫
  Prediction :  💕
Validation   :
  Input      :  お迎えに参りました🚶
  Label      :  🚶💨
  Prediction :  💕
Model saved in file: model/model.ckpt
Step 200:
Training set:
  Loss       :  10.0306
  Input      :  ☁☁☁☁☁
  Label      :  ☁☁☁☁☁☁
  Prediction :  💭
Validation   :
  Input      :  お泊り
  Label      :  💗
  Prediction :  💭
Model saved in file: model/model.ckpt
Step 300:
Training set:
  Loss       :  10.3099
  Input      :  コーンなに🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽
  Label      :  🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽🌽
  Prediction :  💭
Validation   :
  Input      :  任せですよん〜！！！
  Label      :  🐼
  Prediction :  💭
Model saved in file: model/model.ckpt
Step 400:
Training set:
  Loss       :  4.0

In [23]:
# Reload the latest checkpoint and score one hand-written example.
with tf.Session(graph=graph) as sess:
  # Restore variables from disk.
  saver.restore(sess, "model/model.ckpt")
  print("Model restored.")

  valid_sequence = 'お休みなさい💪'
  valid_ids, valid_label_id, valid_sequence_length = seq2id(valid_sequence)
  valid_feed_dict = {
      valid_inputs: [valid_ids],
      valid_sequence_lengths: [valid_sequence_length],
  }
  # Run the validation softmax through the session opened above.
  _valid_predictions = sess.run(valid_predictions, feed_dict=valid_feed_dict)

  print('Validation   :')
  print('  Input      : ', valid_sequence[:-1])
  print('  Label      : ', reverse_emoji_dictionary[valid_label_id])
  print('  Prediction : ', predict_emoji(_valid_predictions))

Model restored.
Validation   :
  Input      :  お休みなさい
  Label      :  💪
  Prediction :  😭
