In [0]:
"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need these
"""
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/tensorflow1/knowledge_graph_completion/wn18/main')

In [0]:
import tensorflow as tf
# Print the TensorFlow version and GPU availability so recorded results can be
# tied to the runtime they were produced on.
print("TensorFlow Version", tf.__version__)
print('GPU Enabled:', tf.test.is_gpu_available())
import pprint
import logging

from pathlib import Path

TensorFlow Version 1.13.0-rc1
GPU Enabled: True


In [0]:
def get_vocab(f_path):
  """Build a token -> index vocabulary from a file with one token per line.

  Args:
    f_path: path to the vocabulary file.

  Returns:
    dict mapping each line (trailing whitespace removed) to its 0-based line
    number. If the same token occurs on several lines, the last line number
    is kept.
  """
  word2idx = {}
  # Explicit UTF-8: without it the decoding depends on the platform's default
  # locale encoding and the same file could yield different keys.
  with open(f_path, encoding='utf-8') as f:
    for i, line in enumerate(f):
      word2idx[line.rstrip()] = i
  return word2idx

In [0]:
"""
we use 1vN fast evaluation as purposed in ConvE paper:
"https://arxiv.org/abs/1707.01476"
sp2o is a dictionary that maps a pair of (subject, predicate)
to multiple possible corresponding objects in graph
"""
def make_sp2o(f_paths, e2idx, r2idx):
    sp2o = {}
    for f_path in f_paths:
      with open(f_path) as f:
        for line in f:
            line = line.rstrip()
            s, p, o = line.split()
            s, p, o = e2idx[s], r2idx[p], e2idx[o]
            if (s,p) not in sp2o:
                sp2o[(s,p)] = [o]
            else:
                if o not in sp2o[(s,p)]:
                    sp2o[(s,p)].append(o)
    return sp2o

In [0]:
def gen_fn(f_path, params, sp2o):
  """Stream examples from a triple text file, one per non-blank line.

  Args:
    f_path: path to a file whose lines are "subject predicate object".
    params: dict holding the vocabularies under 'e2idx' (entities) and
      'r2idx' (relations).
    sp2o: dict mapping (subject_idx, predicate_idx) to a list of object
      indices.

  Yields:
    ((s, p), ((sparse_i, sparse_v, sparse_s), o, num_pos)) where
      s, p, o    are the integer ids of the triple on the current line,
      sparse_i   is [[obj_idx], ...] for every object in sp2o[(s, p)],
      sparse_v   is a list of 1.0 with one entry per index,
      sparse_s   is [number of entities],
      num_pos    is the number of objects in sp2o[(s, p)].

  Raises:
    KeyError: if a token is not in the vocabularies or (s, p) is not in sp2o.
  """
  # Read the entity count from `params` rather than from a notebook-global
  # `e2idx`, so the generator works regardless of kernel state.
  num_entities = len(params['e2idx'])
  with open(f_path, encoding='utf-8') as f:
    print('Reading', f_path)
    for line in f:
      line = line.strip()
      if not line:
        # Skip blank lines instead of failing on the 3-way unpack below.
        continue
      s, p, o = line.split()
      s, p, o = params['e2idx'][s], params['r2idx'][p], params['e2idx'][o]
      sparse_i = [[x] for x in sp2o[(s, p)]]
      sparse_v = [1.] * len(sparse_i)
      sparse_s = [num_entities]
      yield (s, p), ((sparse_i, sparse_v, sparse_s), o, len(sparse_i))


def map_fn(x, y):
  """Turn the (indices, values, dense_shape) triple in `y` into a SparseTensor.

  Args:
    x: features, passed through unchanged.
    y: sequence whose first item is an (indices, values, dense_shape) triple
      and whose second and third items are passed through unchanged.

  Returns:
    (x, (sparse_labels, y[1], y[2])).
  """
  indices, values, dense_shape = y[0]
  sparse_labels = tf.SparseTensor(indices=indices,
                                  values=values,
                                  dense_shape=dense_shape)
  target, num_pos = y[1], y[2]
  return x, (sparse_labels, target, num_pos)
      

def input_fn(mode, params, sp2o):
  """Build the tf.data input pipeline for training or evaluation.

  Args:
    mode: tf.estimator.ModeKeys.TRAIN or tf.estimator.ModeKeys.EVAL.
    params: dict providing 'train_path', 'test_path', 'num_samples' and
      'batch_size', plus whatever gen_fn reads from it.
    sp2o: (subject, predicate) -> objects mapping forwarded to gen_fn.

  Returns:
    A batched tf.data.Dataset whose elements are
    ((s, p), (sparse_labels, o, num_pos)) as produced by gen_fn + map_fn.
    In TRAIN mode the data is shuffled with a buffer of params['num_samples']
    and repeated indefinitely; in EVAL mode it is a single ordered pass.

  Raises:
    ValueError: if `mode` is neither TRAIN nor EVAL (previously this surfaced
      as an UnboundLocalError on the return statement).
  """
  # Per-example structure emitted by gen_fn: ((s, p), ((i, v, s), o, num_pos)).
  _shapes = (([], []), (([None, 1], [None], [1]), [], []))
  _types = ((tf.int32, tf.int32),
            ((tf.int64, tf.float32, tf.int64), tf.int32, tf.int32))

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)
  if is_training:
    f_path = params['train_path']
  elif mode == tf.estimator.ModeKeys.EVAL:
    # NOTE(review): evaluation reads the *test* split; 'valid_path' is not
    # used by this function.
    f_path = params['test_path']
  else:
    raise ValueError('input_fn supports only TRAIN and EVAL modes, got %r' % (mode,))

  ds = tf.data.Dataset.from_generator(
    lambda: gen_fn(f_path, params, sp2o),
    output_shapes = _shapes,
    output_types = _types,)
  if is_training:
    ds = ds.shuffle(params['num_samples'])
    ds = ds.repeat()
  ds = ds.map(map_fn)
  ds = ds.batch(params['batch_size'])

  return ds

In [0]:
def rank_obj(scores, query):
  """Create streaming ranking metrics for the queried object.

  Candidates are sorted by descending score; the rank of each query is the
  1-based position of its id in that ordering.

  Args:
    scores: score tensor; assumed to be [batch, num_candidates] since the
      candidate count is taken from its second dimension.
    query: ids of the objects to rank; assumed to be [batch] and of the same
      integer dtype as the indices returned by tf.nn.top_k (TODO confirm).

  Returns:
    dict with keys 'MRR', 'Hits10', 'Hits3' and 'Hits1', each holding the
    result of tf.metrics.mean over the per-example values.
  """
  num_candidates = tf.shape(scores)[1]
  # Full sort: asking top_k for every candidate returns the complete ranking.
  _, ranked_ids = tf.nn.top_k(scores, k=num_candidates, sorted=True)
  is_target = tf.to_float(tf.equal(ranked_ids, tf.expand_dims(query, 1)))
  # argmax locates the single 1 in each row; +1 makes the rank 1-based.
  rank = tf.argmax(is_target, -1) + 1

  def _hits_at(k):
    # 1.0 when the target is ranked within the first k positions, else 0.0.
    return tf.to_float(tf.less_equal(rank, k))

  reciprocal_rank = 1. / tf.to_float(rank)
  hits_at_10 = _hits_at(10)
  hits_at_3 = _hits_at(3)
  hits_at_1 = _hits_at(1)

  metrics = {}
  metrics['MRR'] = tf.metrics.mean(reciprocal_rank)
  metrics['Hits10'] = tf.metrics.mean(hits_at_10)
  metrics['Hits3'] = tf.metrics.mean(hits_at_3)
  metrics['Hits1'] = tf.metrics.mean(hits_at_1)
  return metrics

In [0]:
def label_smoothing(inputs, params):
  """Smooth labels towards a uniform value.

  Computes (1 - epsilon) * inputs + epsilon / K, where K is the number of
  entries in params['e2idx'] and epsilon is params['epsilon'].

  Args:
    inputs: labels to smooth (anything supporting * and + with a float).
    params: dict with 'e2idx' (sized container) and 'epsilon' (float).

  Returns:
    The smoothed labels, same kind of object as `inputs`.
  """
  num_classes = len(params['e2idx'])
  eps = params['epsilon']
  keep = 1 - eps
  uniform_mass = eps / num_classes
  return (keep * inputs) + uniform_mass

In [0]:
def graph_fn(s, p, params):
  """Score every entity as a candidate object for each (subject, predicate).

  The subject and predicate embeddings are multiplied element-wise and the
  result is matched against the whole entity embedding table, followed by a
  per-entity bias.

  Args:
    s: subject ids used to index the entity embedding table.
    p: predicate ids used to index the relation embedding table.
    params: dict with 'e2idx', 'r2idx' (their sizes give the table heights)
      and 'embed_dim' (embedding width).

  Returns:
    Logits tensor with one column per entity; assumed [batch, num_entities]
    when s and p are [batch] (TODO confirm against caller).
  """
  num_entities = len(params['e2idx'])
  num_relations = len(params['r2idx'])
  dim = params['embed_dim']

  # Variable names and creation order are kept as-is so existing checkpoints
  # remain loadable.
  e_embed = tf.get_variable(
    'e_embed', [num_entities, dim],
    initializer=tf.initializers.truncated_normal())
  r_embed = tf.get_variable(
    'r_embed', [num_relations, dim],
    initializer=tf.initializers.truncated_normal())

  subj_vec = tf.nn.embedding_lookup(e_embed, s)
  pred_vec = tf.nn.embedding_lookup(r_embed, p)

  # transpose_b=True: multiply by the entity table transposed, i.e. compare
  # against every entity embedding at once.
  scores = tf.matmul(subj_vec * pred_vec, e_embed, transpose_b=True)

  bias = tf.get_variable('bias', [num_entities])
  return tf.nn.bias_add(scores, bias)

In [0]:
def model_fn(features, labels, mode, params):
  """Estimator model function: build the graph for PREDICT, TRAIN or EVAL.

  Args:
    features: pair (s, p) of subject and predicate id tensors.
    labels: None, or a triple (sparse_labels, o, num_pos) where sparse_labels
      is a SparseTensor of known objects, o the queried object ids and
      num_pos the number of known objects per example.
    mode: a tf.estimator.ModeKeys value.
    params: dict with 'e2idx', 'use_balanced_loss', 'lr' and everything
      graph_fn / label_smoothing read from it.

  Returns:
    tf.estimator.EstimatorSpec with
      PREDICT: sigmoid of the logits as predictions,
      TRAIN:   loss and an Adam train op,
      EVAL:    loss and the filtered ranking metrics from rank_obj.
  """
  s, p = features
  num_entities = len(params['e2idx'])

  if labels is not None:
    labels, o, num_pos = labels
    labels = tf.sparse.to_dense(labels, validate_indices=False)

  logits = graph_fn(s, p, params)

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=tf.sigmoid(logits))

  # ---- loss (shared by TRAIN and EVAL) ----
  num_neg = num_entities - num_pos
  smooth = label_smoothing(labels, params)
  # Per-example negatives/positives ratio, expanded so it broadcasts over the
  # entity axis of the logits.
  pos_weight = tf.expand_dims(tf.to_float(num_neg/num_pos), 1)

  if params['use_balanced_loss']:
    per_entity_loss = tf.nn.weighted_cross_entropy_with_logits(
      targets = smooth,
      logits = logits,
      pos_weight = pos_weight,)
  else:
    per_entity_loss = tf.nn.sigmoid_cross_entropy_with_logits(
      labels = smooth,
      logits = logits,)
  loss_op = tf.reduce_mean(per_entity_loss)

  if mode == tf.estimator.ModeKeys.TRAIN:
    tf.logging.info('\n'+pprint.pformat(tf.trainable_variables()))

    step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(params['lr'])
    train_op = optimizer.minimize(loss_op, global_step=step)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss_op,
                                      train_op=train_op,)

  if mode == tf.estimator.ModeKeys.EVAL:
    # Create masks for Filtered MRR: every labelled object except the queried
    # one gets its score zeroed, so it cannot outrank the target.
    target_one_hot = tf.one_hot(o, num_entities)
    other_positives = labels - target_one_hot
    keep_mask = tf.to_float(tf.equal(other_positives, 0.))
    filtered_scores = tf.sigmoid(logits) * keep_mask

    metric_ops = rank_obj(scores=filtered_scores,
                          query=o,)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss_op,
                                      eval_metric_ops=metric_ops)

In [0]:
# Run configuration. All paths are relative to the current working directory.
params = dict(
    # Outputs
    model_dir='../model/distmult_1-N',
    log_path='../log/distmult_1-N.txt',
    # Dataset splits and vocabularies
    train_path='../data/wn18/train.txt',
    valid_path='../data/wn18/valid.txt',
    test_path='../data/wn18/test.txt',
    entity_path='../vocab/entity.txt',
    relation_path='../vocab/relation.txt',
    # Model / optimisation
    batch_size=128,
    embed_dim=300,
    # Shuffle buffer size; also used to derive steps per evaluation interval.
    # Presumably the number of training triples - confirm against train.txt.
    num_samples=141442,
    lr=1e-3,           # Adam learning rate
    num_patience=5,    # evaluation intervals without improvement before stopping
    epsilon=0.1,       # label-smoothing strength
    use_balanced_loss=True,  # weight positives by the negatives/positives ratio
)

In [0]:
# Vocabularies: entity / relation string -> integer id.
e2idx, r2idx = (get_vocab(params[key])
                for key in ('entity_path', 'relation_path'))

# (subject, predicate) -> known objects.
# sp2o_tr is built from the training triples only; sp2o_all merges all three
# splits.
sp2o_tr = make_sp2o([params['train_path']], e2idx, r2idx)
sp2o_all = make_sp2o(
    [params[split] for split in ('train_path', 'test_path', 'valid_path')],
    e2idx, r2idx)

# Expose the vocabularies through `params` for the functions that read them
# from there.
params.update(e2idx=e2idx, r2idx=r2idx)

In [0]:
# Create the log and checkpoint directories if they do not exist.
# pathlib is used for both, so this cell does not rely on `os`, which is only
# imported in the optional Colab cell; `parents=True` is applied to both for
# consistency.
Path(params['log_path']).parent.mkdir(exist_ok=True, parents=True)
Path(params['model_dir']).mkdir(exist_ok=True, parents=True)

# Logging: mirror TensorFlow's INFO log into the log file.
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)
fh = logging.FileHandler(params['log_path'])
# Re-running this cell must not attach a second handler for the same file,
# otherwise every log line is written twice (three times, ...).
if any(getattr(h, 'baseFilename', None) == fh.baseFilename
       for h in logger.handlers):
  fh.close()
else:
  logger.addHandler(fh)

# Create an estimator.
# _eval_steps = number of batches in params['num_samples'] examples; a
# checkpoint is saved every _eval_steps training steps.
_eval_steps = params['num_samples']//params['batch_size']
config = tf.estimator.RunConfig(
  save_checkpoints_steps=_eval_steps,
  keep_checkpoint_max=params['num_patience']+2,)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  model_dir=params['model_dir'],
  config=config,
  params=params)

# This hook stops training early when the evaluated 'Hits10' has not increased
# for num_patience * _eval_steps training steps.
hook = tf.contrib.estimator.stop_if_no_increase_hook(
  estimator=estimator,
  metric_name='Hits10',
  max_steps_without_increase=params['num_patience']*_eval_steps,
  run_every_secs=None,
  run_every_steps=_eval_steps)

# Train on training data and evaluate on testing data.
# NOTE(review): input_fn reads params['test_path'] in EVAL mode, so the early
# stopping above is driven by test-set metrics; 'valid_path' only contributes
# to sp2o_all.
train_spec = tf.estimator.TrainSpec(
  input_fn=lambda: input_fn(mode=tf.estimator.ModeKeys.TRAIN,
                            params=params,
                            sp2o=sp2o_tr,),
  hooks=[hook])

eval_spec = tf.estimator.EvalSpec(
  input_fn=lambda: input_fn(mode=tf.estimator.ModeKeys.EVAL,
                            params=params,
                            sp2o=sp2o_all,),
  steps=None,
  throttle_secs=10,)

tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Using config: {'_model_dir': '../model/distmult_1-N', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1105, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 7, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff90e4c2a90>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you de

({'Hits1': 0.6532,
  'Hits10': 0.9496,
  'Hits3': 0.9322,
  'MRR': 0.79194087,
  'global_step': 54146,
  'loss': 0.29487878},
 [])