# Relational ERM in the Skipgram model

The skipgram model is a commonly used model in language and graph representation learning. In this notebook, we demonstrate how the skipgram model can be implemented simply within the relational ERM framework.

In [None]:
# book-keeping
import sys
import tensorflow as tf
import numpy as np

sys.path.append('../')

# Model

We first create the model for the skipgram. The skipgram model attempts to predict the presence or absence of an edge between two vertices from the vertex embeddings. Here, we create a tensorflow model which models this description for a given minibatch. For simplicity, we are using the tensorflow [estimator](https://www.tensorflow.org/programmers_guide/estimators) API.

In [None]:
def model_fn(features, labels, mode, params):
    """Build the skipgram model for semi-supervised node classification.

    Creates the vertex embedding variable, computes an edge (skipgram) loss and a
    vertex-label loss on the given minibatch, and builds a gradient-descent training
    operation on their weighted combination.

    Args:
        features: dict providing the entries 'vertex_index', 'edge_list' and 'weights'.
        labels: dict providing the entries 'labels' and 'split'.
        mode: the estimator mode; it is passed through unchanged to the returned spec.
        params: dict providing 'num_vertices', 'embedding_dimension', 'num_labels',
            'l2_regularization' and 'learning_rate'.

    Returns:
        A `tf.estimator.EstimatorSpec` carrying the total loss, the training operation
        and an 'edge_accuracy' evaluation metric.
    """
    # This variable corresponds to the vertex level embeddings: one row per vertex of the
    # full graph, one column per embedding dimension.
    embedding_variables = tf.get_variable(
        'input_layer/vertex_index_embedding/embedding_weights',
        shape=(params['num_vertices'], params['embedding_dimension']),
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=1 / params['embedding_dimension']),
        trainable=True)

    # vertex_index is a vector which represents the indices of the vertices that are present
    # in the subsample.
    vertex_index = features['vertex_index']

    # We gather the embeddings for the vertices in the subgraph.
    embeddings = tf.nn.embedding_lookup(embedding_variables, vertex_index)

    # In the semi-supervised node classification problem, each vertex is also given a label.
    # A portion of the labels are censored at training time, given by `split`.
    vertex_labels = labels['labels']
    vertex_labels_split = labels['split']

    # Our strategy computes two different losses: a skipgram loss on edges, and a logistic
    # regression loss on vertices. This weight sets the share of the vertex loss in the
    # total; the edge loss receives the complement.
    label_task_weight = 1e-3

    # -------------------- SKIPGRAM LOSS ------------------------
    # presumably edge_list holds pairs of positions into `vertex_index` (it is used to index
    # the pairwise product matrix below) -- TODO confirm against the sampler.
    edge_list = features['edge_list']

    # We use weight to denote whether an edge in the edge list is an actual edge or
    # a non-edge. The trailing singleton dimension is dropped so that it lines up with
    # the gathered logits.
    edge_weight = tf.squeeze(features['weights'], axis=-1)

    # In the skipgram model, the edge prediction is based on a bernoulli model
    # conditional on the dot product of the embeddings of the vertices.
    #
    # For computational efficiency, we compute the product for all pairs of vertices,
    # then select the ones corresponding to the edges and non-edges in the subsample.
    embeddings_prod = tf.matmul(embeddings, embeddings, transpose_b=True)
    edge_logit = tf.gather_nd(embeddings_prod, edge_list)

    # The loss is given by the sigmoid cross entropy, summed over the sampled edges.
    edge_loss_per_edge = tf.nn.sigmoid_cross_entropy_with_logits(labels=edge_weight, logits=edge_logit)
    edge_loss = tf.reduce_sum(edge_loss_per_edge)

    # An edge is predicted when its probability exceeds one half. Since `edge_logit` is a
    # logit and sigmoid(0) = 0.5, the decision threshold on the logit is 0 (not 0.5).
    edge_accuracy = tf.metrics.accuracy(labels=edge_weight, predictions=tf.to_float(tf.greater(edge_logit, 0.)))

    # -------------------- VERTEX LOSS ----------------------------
    # The other aspect we tackle is a logistic regression of the vertex label onto the
    # corresponding embeddings.
    #
    # NOTE(review): the kernel regularizer is registered on the layer, but no regularization
    # term is added to `total_loss` below -- confirm whether it is meant to be included
    # (e.g. via tf.losses.get_regularization_loss()).
    vertex_logits = tf.layers.dense(
        embeddings, params['num_labels'], activation=None, use_bias=True,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=label_task_weight * params['l2_regularization']))

    # `split` is used as a per-vertex weight on the loss, broadcast over the label dimension.
    vertex_loss_per_vertex = tf.losses.sigmoid_cross_entropy(
        vertex_labels, logits=vertex_logits, weights=tf.expand_dims(vertex_labels_split, -1),
        reduction=tf.losses.Reduction.NONE)

    vertex_loss = tf.reduce_sum(vertex_loss_per_vertex)

    # -------------------- Optimization -----------------------------
    # Having computed the vertex and edge loss, we combine them in a weighted fashion and
    # apply a simple stochastic optimizer.

    total_loss = (1 - label_task_weight) * edge_loss + label_task_weight * vertex_loss
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=params['learning_rate'])

    train_op = optimizer.minimize(
        total_loss, global_step=tf.train.get_or_create_global_step())

    return tf.estimator.EstimatorSpec(
        mode, loss=total_loss, train_op=train_op,
        eval_metric_ops={
            'edge_accuracy': edge_accuracy
        })

# Sampler

In the relational ERM framework, the loss function and predictor are only half the story. The way in which we sample is central to the problem and is a part of the model definition. We use custom samplers and adapters we have developed to create an efficient input pipeline to produce the samples. In this case, we are using the tensorflow dataset API.

In [None]:
from relational_sgd.sampling import adapters, negative_sampling
from relational_sgd.tensorflow_ops import dataset_ops

def make_input_fn(graph, labels):
    """Return an estimator input function sampling from `graph` with vertex `labels`.

    The returned function takes the estimator `params` dict and reads 'walk_length',
    'window_size' and 'num_negative_samples' from it. It produces a dataset of random
    walks that have been passed through the chain of adapters built below.
    """
    def input_fn(params):
        # Source of samples: uniform random walks over the graph, i.e. lists of
        # vertex indices of the requested length.
        walks = dataset_ops.RandomWalkDataset(
            params['walk_length'], graph.neighbours, graph.lengths, graph.offsets)

        vertex_count = len(graph.lengths)

        # Step 1: turn each walk into the edge list of its windowed edges.
        to_windowed_edges = adapters.adapt_random_walk_window(params['window_size'])

        # Step 2: add negative edges according to the negative sampling scheme, with
        # the vertex distribution logits built from `graph.lengths` as prior.
        negatives_per_vertex = params['num_negative_samples']
        unigram_logits = negative_sampling.make_learned_unigram_logits(
            vertex_count, prior=graph.lengths)
        with_negative_edges = negative_sampling.add_negative_sample(
            vertex_count,
            num_samples_per_vertex=negatives_per_vertex,
            vertex_distribution_logit=unigram_logits)

        # Steps 3-6: bookkeeping which augments the sample with the metadata the
        # model function needs.
        relabel = adapters.relabel_subgraph()
        attach_labels = adapters.append_vertex_labels(labels)
        censor_labels = adapters.split_vertex_labels(vertex_count, proportion_censored=0.5)
        to_features_labels = adapters.format_features_labels()

        pipeline = adapters.compose(
            to_windowed_edges,
            with_negative_edges,
            relabel,
            attach_labels,
            censor_labels,
            to_features_labels)

        samples = walks.map(pipeline, num_parallel_calls=2)
        return samples.prefetch(tf.contrib.data.AUTOTUNE)

    return input_fn

# Training the Model

Once we have defined the above two components, we can load the data and train the model using the usual tensorflow tools. Below we illustrate with an example using the protein-protein interaction network of Homo sapiens.

In [None]:
def load_data(path=None):
    """Load a graph and its vertex labels from an `.npz` archive.

    The archive is read for the arrays 'edge_list' and 'group'. Self-edges are removed,
    every remaining edge is given weight one, and the result is converted into a packed
    adjacency list.

    Args:
        path: location of the archive. Defaults to 'data/homo_sapiens.npz' when None.

    Returns:
        A pair `(adjacency_list, labels)`: the value returned by
        `create_packed_adjacency_list`, and the 'group' array cast to int32.
    """
    from relational_sgd.graph_ops.representations import create_packed_adjacency_list, edge_list_to_adj_list

    if path is None:
        path = 'data/homo_sapiens.npz'

    with tf.gfile.Open(path, mode='rb') as f:
        loaded = np.load(f, allow_pickle=False)

        # An .npz archive is read lazily: each array is only fetched from the file when it
        # is indexed. Materialise everything we need while the file handle is still open.
        edge_list = loaded['edge_list'].astype(np.int32)
        labels = loaded['group'].astype(np.int32)

    # Drop self-edges (both endpoints equal).
    not_self_edge = edge_list[:, 0] != edge_list[:, 1]
    edge_list = edge_list[not_self_edge, :]

    # Every remaining edge carries unit weight.
    weights = np.ones(edge_list.shape[0], dtype=np.float32)

    adjacency_list = edge_list_to_adj_list(edge_list, weights)
    adjacency_list = create_packed_adjacency_list(adjacency_list)

    return adjacency_list, labels

In [None]:
# Load the graph and the vertex labels from the default path (load_data is called
# without an argument).
graph, labels = load_data()
# The number of vertices is taken as the number of entries in graph.lengths.
num_vertices = len(graph.lengths)

In [None]:
# Hyper-parameters shared by the model function and the input function. The
# number of labels is read from the second dimension of the label array.
params = dict(
    num_vertices=num_vertices,
    embedding_dimension=128,
    num_labels=labels.shape[1],
    walk_length=80,
    window_size=10,
    num_negative_samples=5,
    learning_rate=0.025,
    l2_regularization=1,
)

# Wrap the model function in an estimator configured with these parameters.
estimator = tf.estimator.Estimator(model_fn=model_fn, params=params)

In [None]:
# Log at INFO level, then run 10000 training steps on samples drawn by the
# input function built from the loaded graph and labels.
tf.logging.set_verbosity(tf.logging.INFO)
estimator.train(make_input_fn(graph, labels), steps=10000)

In [None]:
# Run 20 evaluation steps, using a freshly built input function over the same
# graph and labels as for training.
estimator.evaluate(make_input_fn(graph, labels), steps=20)