In [None]:
Name:

In [21]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [22]:
import pickle
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [23]:
CORA='/content/gdrive/MyDrive/CORA/'

with open(CORA + 'vocab-lower.pickle', 'rb') as f:
    #vocab dic is list of all words in the dictionery
    vocab_dic = list(map(lambda x: x.decode('utf-8'), pickle.load(f, encoding='utf-8')))


with open(CORA + 'vocab_labels.pickle', 'rb') as f:
    #The list of possible labels 
    label_dic = list(map(lambda x: x.decode('utf-8'), pickle.load(f, encoding='bytes')))
    
with open(CORA + 'trained_embedding.pickle', 'rb') as f:
    #The trained embedding on Wikipedia using GLOVE algorithm 
    #Each column is the embedding of a word (with respect to the vocab dictionary) 
    embedding = pickle.load(f, encoding='bytes')
    vocabulary_size = 20608
    embedding_size = 100


with open(CORA + 'xdata-lower.pickle', 'rb') as f:
    #Each row is a citation. Each cell is the token id (with respect to the vocab dictionary) within the citation.
    xdata = pickle.load(f, encoding='bytes')

with open(CORA + 'Y_train.pickle', 'rb') as f:
    ydata = pickle.load(f, encoding='bytes')

with open(CORA + 'xval-lower.pickle', 'rb') as f:
    xval = pickle.load(f, encoding='bytes')

with open(CORA + 'Y_dev.pickle', 'rb') as f:
    yval = pickle.load(f, encoding='bytes')

with open(CORA + 'xtest-lower.pickle', 'rb') as f:
    xtest = pickle.load(f, encoding='bytes')

with open(CORA + 'Y_test.pickle', 'rb') as f:
    ytest = pickle.load(f, encoding='bytes')





In [24]:

print("vocab size:", len(vocab_dic))
print(vocab_dic[100], vocab_dic[200])

vocab size: 18049
no formal


In [25]:
def cosine_similarity(v1, v2):
  norm_v1 = np.linalg.norm(v1)
  norm_v2 = np.linalg.norm(v2)
  return np.dot(v1,v2)/(norm_v1*norm_v2)

In [26]:
v1_id = vocab_dic.index('positive')
v2_id = vocab_dic.index('constructive')
v3_id = vocab_dic.index('conference')

v1_vec = embedding[v1_id]
v2_vec = embedding[v2_id]
v3_vec = embedding[v3_id]


In [27]:
cosine_similarity(v1_vec,v2_vec)

0.40829688468111025

In [28]:
cosine_similarity(v1_vec, v3_vec)

0.226298426910638

In [29]:
#construct the sentence
print(xdata[1])
#each cell is the token id and 0 means padding.


[   16     1  4426     2    33     1  2014     2     4    38     1  8853
     3 14152     1   189     5  1901    37    59   839   128     4   319
   155     1   102   282    62     2    46     7    46     8     2   297
     1     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0]


In [204]:
def get_sentence(x):
  x_nopad = list(filter(lambda i: True if i > 0 else False, x))
  return ' '.join([vocab_dic[id] for id in x_nopad])

def get_label(y):
  y_nopad = list(filter(lambda i: True if i > 0 else False, y))
  return ' '.join([label_dic[id] for id in y_nopad])

def print_sentence_label_pair(x, y):
  x_nopad = list(filter(lambda i: True if i > 0 else False, x))
  y_nopad = list(filter(lambda i: True if i > 0 else False, y))
  for i in range(len(x_nopad)):
    print(label_dic[y_nopad[i]],'\t\t' ,vocab_dic[x_nopad[i]],)

def print_sentence_pred_label(x, y, yt):
  x_nopad = list(filter(lambda i: True if i > 0 else False, x))
  for i in range(len(x_nopad)):
    print(label_dic[yt[i]], label_dic[y[i]],'\t\t'  ,vocab_dic[x_nopad[i]],)


In [205]:
print_sentence_label_pair(xdata[10], ydata[10])
# print(get_sentence(xdata[10]))
# print(get_label(ydata[10]))

author 		 ramadge
author 		 ,
author 		 p
author 		 .
author 		 ,
author 		 &
author 		 wonham
author 		 ,
author 		 w
author 		 .
date 		 (
date 		 1989
date 		 )
date 		 .
title 		 the
title 		 control
title 		 of
title 		 discrete
title 		 event
title 		 systems
title 		 .
booktitle 		 proceedings
booktitle 		 of
booktitle 		 the
booktitle 		 ieee
booktitle 		 ,
volume 		 77
volume 		 (
volume 		 1
volume 		 )
volume 		 ,
pages 		 81
pages 		 -
pages 		 98
pages 		 .


In [206]:
#calucate the hamming loss between gold data and predication
def token_level_loss(cpred, ctrue):
  try:
    label_num = np.shape(cpred)[1]
    pred = cpred.astype(np.int)
    true = ctrue.astype(np.int)
    n1 = len(pred)
    n2 = len(true)
    assert n1 == n2
    hloss = 0.0
    exact_acc = 0.0
    for i in range(n1):
      sample_loss = 0.0
      exact = 0.0
      xp = pred[i]
      xt = true[i]
      cnp = 0
      for j in range(label_num):
        if (xt[j] == 0):
          continue
        cnp+=1
        if xp[j] != xt[j]:
          sample_loss += 1.0
        else:
          exact +=1
      sample_loss = sample_loss / cnp
      exact = exact / cnp
      hloss += sample_loss
      exact_acc += exact
    return (hloss / n1, exact_acc / n1)
  except Warning:
    return (0.0,0.0);

def perf(ytr_pred, yval_pred, yts_pred, ydata, yval, ytest):
    global best_val
    global test_val

    hm_ts, ex_ts = token_level_loss(yts_pred, ytest)
    hm_tr, ex_tr = token_level_loss(ytr_pred, ydata)
    hm_val, ex_val = token_level_loss(yval_pred, yval)
    if ex_val > best_val:
        best_val = ex_val
        test_val = ex_ts
    return ("Train: %0.3f Val: %0.3f Test: %0.3f -- Best Val: %0.3f Test: %0.3f" % (ex_tr, ex_val, ex_ts, best_val, test_val))

In [207]:
output_size = len(label_dic)
max_length_output = 118
class CitationNetwork(tf.keras.Model):
  def __init__(self, network_type = 'SimpleLSTM'):
      super(CitationNetwork, self).__init__()

      self.optimizer = tf.keras.optimizers.Adam(1e-5)
      self.embedding = layers.Embedding(input_dim=vocabulary_size, output_dim=embedding_size)
      self.rnn = layers.LSTM(128, return_sequences=True, return_state=True)

      self.fc = layers.Dense(output_size)
  
 
  def call(self, x, predict=True):
      emb = self.embedding(x)
      output, _ , _ = self.rnn(emb)
      output = self.fc(output)
      if predict:
        return tf.math.argmax(output, axis=-1)
      return output
  
  def get_loss(self, ylogits, yt):
    ylabels = tf.one_hot(ybatch, output_size)
    cross_ent = tf.nn.softmax_cross_entropy_with_logits(logits=ylogits, labels=ylabels);
    return tf.reduce_sum(cross_ent)

@tf.function
def train_step(xbatch, ybatch):
    loss = 0
    with tf.GradientTape() as tape:
        prediction = cite(xbatch, False)
        loss = cite.get_loss(prediction, ybatch)
        
    gradients = tape.gradient(loss, cite.trainable_variables)
    cite.optimizer.apply_gradients(zip(gradients, cite.trainable_variables))
    return loss

cite = CitationNetwork();

        

In [208]:
train_size = xdata.shape[0]
batch_size = 10
train_dataset = (tf.data.Dataset.from_tensor_slices(np.hstack((xdata, ydata)))
                 .shuffle(train_size).batch(batch_size))
# print(xdata.shape)
num_epoch = 1000
i = 0;
best_val = 0
test_val = 0

while i < num_epoch:
  loss = 0;
  for batch in train_dataset:
    xbatch = batch[:, :max_length_output]
    ybatch = batch[:, max_length_output:]
    loss += train_step(xbatch, ybatch)

  ypred_test = cite(xtest, predict=True).numpy()
  ypred_val = cite(xval, predict=True).numpy()
  ypred_train = cite(xdata, predict=True).numpy()
  print(i, loss.numpy(), perf(ypred_train, ypred_val, ypred_test, ydata, yval, ytest) )
  i = i+1
  # break

0 93806.17 Train: 0.104 Val: 0.108 Test: 0.112 -- Best Val: 0.108 Test: 0.112
1 93276.89 Train: 0.109 Val: 0.112 Test: 0.119 -- Best Val: 0.112 Test: 0.119
2 92731.08 Train: 0.108 Val: 0.111 Test: 0.120 -- Best Val: 0.112 Test: 0.119
3 92155.65 Train: 0.103 Val: 0.103 Test: 0.114 -- Best Val: 0.112 Test: 0.119
4 91520.74 Train: 0.084 Val: 0.088 Test: 0.097 -- Best Val: 0.112 Test: 0.119
5 90792.98 Train: 0.060 Val: 0.066 Test: 0.069 -- Best Val: 0.112 Test: 0.119
6 89926.64 Train: 0.039 Val: 0.040 Test: 0.049 -- Best Val: 0.112 Test: 0.119
7 88854.76 Train: 0.025 Val: 0.023 Test: 0.027 -- Best Val: 0.112 Test: 0.119
8 87374.99 Train: 0.015 Val: 0.016 Test: 0.015 -- Best Val: 0.112 Test: 0.119
9 85164.234 Train: 0.007 Val: 0.009 Test: 0.008 -- Best Val: 0.112 Test: 0.119
10 81209.71 Train: 0.004 Val: 0.006 Test: 0.005 -- Best Val: 0.112 Test: 0.119
11 71110.586 Train: 0.002 Val: 0.002 Test: 0.002 -- Best Val: 0.112 Test: 0.119
12 53208.266 Train: 0.001 Val: 0.002 Test: 0.001 -- Best Val

KeyboardInterrupt: ignored

In [182]:
print_sentence_pred_label(xtest[1], ypred[1], ytest[1])

editor author 		 in
editor author 		 bouma
editor author 		 ,
editor author 		 h
editor author 		 .
editor author 		 ,
editor author 		 &
editor title 		 elsendoorn
editor author 		 ,
editor title 		 a
editor title 		 .
editor title 		 g
editor title 		 .
editor title 		 (
editor title 		 eds
editor title 		 .
editor title 		 )
editor title 		 ,
title title 		 working
title title 		 models
title title 		 of
title title 		 human
title title 		 perception
title title 		 ,
pages title 		 pp
pages title 		 .
pages title 		 391
pages title 		 -
pages title 		 410
pages title 		 .
publisher title 		 academic
publisher title 		 press
publisher title 		 ,
location title 		 london
location title 		 ,
location PAD 		 england
location PAD 		 .
