In [None]:
# general imports
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import AutoTokenizer
from transformers import AutoModel

import torch
import torch.nn as nn
from torch import optim

import numpy as np

import networkx as nx
import spacy

import pandas as pd

import ast

import pprint

import json

import glob

from torch_geometric.utils.convert import from_networkx
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

import pickle

In [2]:
# if we use embedding only from last layer, this should stay as it is
# it could be changed for some experiments ?
layers = [-1]

#we load the model
#we could experiment with other models as well
model = AutoModel.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

#these are the spacy models used to tokenize the texts and extract linguistic information
nlp_pt = spacy.load("pt_core_news_sm")
nlp_it = spacy.load("it_core_news_sm")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
#the device variable can be changed in case a GPU is available
device = torch.device('cpu')
#uncomment the next line to use gpu
#device = torch.device('gpu')

#the next two functions are used to extract the embeddings from tokens / sentences
def get_hidden_states(encoded, model, layers):
    with torch.no_grad():
         output = model(**encoded)
    # Get all hidden states
    states = output.hidden_states
    # Stack and sum all requested layers
    output = torch.stack([states[i] for i in layers]).sum(0).squeeze()

    return output

def get_words_vector(sent, tokenizer, model, layers):
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
    # get all token idxs that belong to the word of interest
    #token_ids_word = np.where(np.array(encoded.word_ids()) == idx)

    return get_hidden_states(encoded, model, layers)



In [9]:
data = [json.load(open(file)) for file in glob.glob('./data/*.json')]

get_words_vector(data[0]['sentences_0']['it-text'], tokenizer, model, layers).shape

torch.Size([83, 768])

In [57]:
# this function is used to create a graph in networkx
# + a dictionary of {token_index:embedding}
# the graph and the dictionary are kept divided in case we would want to visualize just the graph

def createGraph(tokens, sentence, relations, language):

    graph = nx.Graph()
    edge_list = []
    dict_embeddings = {}
    sent_embeddings = get_words_vector(sentence, tokenizer, model, layers)
    
    skipped_tokens = 0
    
    for enum, token in enumerate(tokens):
        token_string = tokens[token]['string']
        token_dep_id = tokens[token]['dep_id']
        token_head = tokens[token]['dep_head_id']
        
        #we tokenize each word separately so we get bert subwords
        
        token_bert = tokenizer.tokenize(token_string, add_special_tokens=False)
        token_idx = tokenizer.encode(token_string, add_special_tokens=False)
        token_embeddings = []
        
        #if a word is divided in subwords, the final embedding is the mean of the embeddings of each subword
        for enum_idx, token_id in enumerate(token_idx):
            token_embeddings.append(sent_embeddings[int(token)+enum_idx+skipped_tokens])
            
        skipped_tokens += enum_idx
        print(token,token_string, token_embeddings)
        if len(token_embeddings) > 1:
            token_embeddings = torch.stack(token_embeddings).to(device)
            token_embeddings = torch.mean(token_embeddings, -2)
        else:
            token_embeddings = token_embeddings[0]
 
        
        
        edge = (int(token), token_head)
        edge_list.append(edge)

        graph.add_node(int(token), label=token_string, type='token', idx=token_idx)
        dict_embeddings[int(token)] = token_embeddings
    
    
    last_added_node = list(graph.nodes())[-1]
    #this next variable is used to extract the correct value from the dictionary
    rel_key = f"{language}-rel"
    
    for rel in relations:
        subj_id = last_added_node 
        obj_id = last_added_node +1
        last_added_node +=2
        
        embeddings_subj = []
        embeddings_obj = []
        
        graph.add_node(subj_id, label=rel[rel_key]['subj']['text'], type='entity', idx=[])
        graph.add_node(obj_id, label=rel[rel_key]['obj']['text'], type='entity', idx=[])
        
        #for each annotated entity, the embedding is equal to the mean of the embeddings of the tokens that are present in it
        #for instance the embedding of 'Il presidente Mario Draghi' = mean([w_il, w_presidente, w_mario, w_draghi]),
        #with w_i being the embedding of i
        
        for subj_tokens in rel[rel_key]['subj']['id_tokens']:
            graph.add_edge(int(subj_tokens), subj_id)
            embeddings_subj.append(dict_embeddings[int(subj_tokens)])
            for id_ in graph.nodes[int(subj_tokens)]['idx']:
                graph.nodes[subj_id]['idx'].append(id_)
                
        for obj_tokens in rel[rel_key]['obj']['id_tokens']:
            graph.add_edge(int(obj_tokens), obj_id) 
            embeddings_obj.append(dict_embeddings[int(obj_tokens)])
            for id_ in graph.nodes[int(obj_tokens)]['idx']:
                graph.nodes[obj_id]['idx'].append(id_)
        
        #the next two exceptions are needed for some italian annotation that lacks entity
        #unfortunately this is the only fix I could think about since the data was automatically annotated
        try:
            embeddings_subj = torch.stack(embeddings_subj).to(device)
            dict_embeddings[int(subj_tokens)] = torch.mean(embeddings_subj, -2)
        except RuntimeError:
            pass
        try:
            embeddings_obj = torch.stack(embeddings_obj).to(device)
            dict_embeddings[int(obj_tokens)] = torch.mean(embeddings_obj, -2)
        except RuntimeError:
            pass
    
    for edge in edge_list:
        graph.add_edge(edge[0], edge[1])
        
    
    return graph, dict_embeddings

In [58]:
#the next function create pairs of (source_sentence_graph, target_sentence_graph)

def get_pair(data, l1, l2):
    all_pairs = []
    for enum, d in enumerate(data):
        print(f"{enum}//{len(data)}", end='\r')
        for sentence in d:
            G_source, G_emb_source = createGraph(
                                d[sentence][f'{l1}-tokens'],
                                d[sentence][f'{l1}-text'],
                                d[sentence]['relations'],
                                l1)
            G_target, G_emb_target = createGraph(
                                d[sentence][f'{l2}-tokens'],
                                d[sentence][f'{l2}-text'],
                                d[sentence]['relations'],
                                l2)
            all_pairs.append(([G_source, G_emb_source],[G_target, G_emb_target]))
        
    return all_pairs

In [59]:
pairs = get_pair(data,'pt', 'it')

0 bandos [tensor([-1.3044e-01,  1.1840e-01, -2.5689e-01, -1.3554e-01,  2.6830e-01,
         8.6189e-02,  1.4512e-02,  2.6113e-02,  1.0715e-01,  1.9583e-01,
         3.5018e-02,  1.8181e-01,  1.7123e-01,  2.3724e-01, -7.1631e-01,
         4.1730e-02,  1.3107e-01,  1.3381e-01, -9.3239e-02, -2.9428e-02,
         2.5364e-01,  1.5361e-01,  5.4660e-02, -2.9812e-02, -2.4345e-01,
        -2.1660e-01, -1.0629e-01,  7.6632e-02,  2.5843e-01, -1.2880e-01,
         2.7978e-01, -1.0295e-01, -2.6071e-01, -1.5524e-01,  1.7449e-01,
        -6.8282e-02, -1.9245e+00, -1.9406e-01,  1.6369e-01,  6.2921e-02,
        -8.7172e-02,  5.0502e-01, -3.9920e-02,  1.2925e-01,  7.3942e-02,
         1.1204e+00,  1.3366e-01, -1.0384e-01,  1.2570e+00, -1.0754e-01,
         1.3077e-01, -5.3340e-01, -1.5294e-01, -1.5558e+00, -2.7788e-02,
         1.0674e-01,  4.7726e-02, -9.4458e-02,  1.9616e-01,  1.5093e-01,
        -8.6653e-02,  1.5124e-01,  1.9240e-01,  1.8507e-02, -1.1009e-01,
        -4.5655e-02, -1.0680e-01, -2.5181

16 com [tensor([-7.5373e-01,  4.8733e-01,  1.3604e+00, -2.5192e-01,  1.3436e+00,
        -1.6855e-01, -3.2552e-01,  1.1857e-01, -1.0881e-01,  2.1181e-02,
        -2.7171e-01,  8.6600e-01,  3.1023e-02,  5.9614e-01, -5.6855e-01,
        -7.9085e-01,  1.2306e+00, -5.4357e-01, -1.0198e+00,  5.4038e-01,
         1.9200e-01,  4.4835e-01,  4.7510e-01, -3.5790e-02,  1.2587e+00,
        -7.0539e-02, -3.6292e-01,  5.1344e-01,  2.2613e-03, -1.8452e-01,
         8.3927e-01,  3.7729e-01, -2.5676e-01,  2.5602e-01, -5.6123e-01,
        -4.7917e-01,  2.6765e-01, -1.9772e-02,  4.6104e-01, -8.5714e-01,
        -2.1491e-01,  7.5394e-01, -3.2655e-01,  4.6374e-02,  1.9133e-01,
         7.1806e-02,  7.7625e-01,  2.1032e-01, -1.3118e+00, -6.0414e-01,
        -3.5635e-01, -4.3440e-01,  6.8747e-01,  1.7462e-01,  5.6837e-01,
         1.9802e+00,  1.4116e+00,  9.3603e-01,  6.5041e-01, -1.7131e-01,
         2.2810e-01, -1.3399e-01,  3.9199e-01, -1.1908e-01, -1.8062e-01,
        -8.9742e-02,  9.5419e-01, -1.4526e-

33 , [tensor([-8.7145e-01, -2.8291e-01,  9.0250e-01, -1.9781e-01,  6.8509e-01,
         7.2860e-01, -3.2691e-01,  1.7204e-01, -3.3096e-01,  7.5319e-01,
        -1.0966e-01,  7.6861e-01, -5.1553e-01, -1.1486e-01, -5.7151e-01,
         4.2269e-01,  4.7650e-01,  3.9194e-01, -1.1492e+00, -1.8570e-01,
        -3.9680e-01, -1.9184e-01,  4.9698e-01, -1.7705e-01, -1.1924e+00,
         6.0900e-02, -3.7793e-01, -2.2957e-01,  1.0941e-01, -5.3661e-01,
         1.1026e+00,  3.8511e-01, -4.3572e-01, -5.2565e-01, -4.5760e-01,
        -3.2761e-01, -3.4485e-01,  7.8459e-01,  7.1666e-01,  1.3620e-01,
        -7.0933e-01,  3.2259e-02, -3.0555e-01, -2.5533e-01,  9.5400e-01,
        -6.5229e-02,  7.3159e-02,  4.2400e-01, -7.5088e-01,  2.0026e+00,
        -8.2702e-01,  1.9367e-01, -3.2707e-01, -8.9515e-02, -2.2004e-01,
         7.6993e-01, -2.2836e-02,  9.8373e-01,  6.9804e-01,  1.4899e-01,
        -4.2836e-01, -4.0679e-01, -1.3817e-01,  1.7914e-01, -2.4435e-01,
        -2.5216e-01,  5.4203e-02, -5.2589e-01

47 dominar [tensor([-2.4177e-01, -1.2283e-01,  6.2377e-01,  3.5232e-01,  4.3207e-01,
        -2.4054e-01, -2.3321e-01, -5.1577e-02,  2.2208e-03,  5.0082e-01,
         7.3660e-01,  1.6690e-01,  8.4466e-01,  9.4212e-02,  2.6027e-01,
        -5.6853e-01,  1.2504e+00, -1.0109e+00, -9.4159e-01,  9.8465e-02,
         2.5102e-01,  3.5079e-01, -2.3534e-01, -5.5355e-01, -2.6416e-01,
        -3.1565e-01, -1.4424e+00,  7.5460e-02, -4.4567e-01, -5.8488e-01,
         6.0301e-01, -9.8057e-03, -5.9311e-02, -3.0431e-01, -1.7692e-01,
         1.2430e-01,  2.2199e-01, -4.7175e-01,  1.5950e+00,  6.6752e-01,
         4.8086e-01,  1.5990e+00, -4.2087e-01, -5.1064e-01,  1.5797e-01,
        -8.5195e-01,  6.2594e-01, -6.6860e-01, -1.1526e+00, -4.5869e-01,
        -2.2975e-04, -5.6852e-03, -9.5940e-01,  1.6607e-02, -1.9098e-01,
         9.2106e-01, -2.0267e-01,  4.9502e-01,  4.1264e-01, -6.6027e-01,
        -9.0586e-02,  9.5754e-01, -9.1379e-03, -1.2488e-01,  1.4557e-01,
        -3.1056e-01,  1.9838e-01,  2.67

IndexError: list index out of range

In [30]:
# this is an example of a basic 2-layer GCN


layer1 = GCNConv(in_channels=768, out_channels=20)
layer2 = GCNConv(in_channels=20, out_channels=768)

def tensorFromSentence(sentence):
    G = sentence[0]
    G_emb = sentence[1]
    for node in G.nodes():
        if node in G_emb:
            G.nodes[node]['embedding'] = G_emb[node]
        else:
            G.nodes[node]['embedding'] = torch.rand(768)

    pyg_graph = from_networkx(G)
    out1 = layer1(torch.stack(pyg_graph['embedding']), pyg_graph.edge_index)
    out2 = layer2(out1, pyg_graph.edge_index)
    
    return out1, out2

In [40]:
test = tensorFromSentence(pairs[0][0])

In [31]:
#------ the tested code ends here! ------#
#------ here starts the fun part --------#
#this decoder is a basic one from torch
#we need to understand how this can be implemented with our embeddings

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        output = self.embedding(input).view(1,1,-1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size, device=device)

In [94]:
# the training function should go here
#once again, this is a basic one from pytorch
#we should customize it 

teacher_forcing_ration = 0.5

decoder = DecoderRNN(hidden_size=20, output_size = 768)

def train(input_tensor, target_tensor, decoder):
    max_length=100
    
    loss = 0
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ration else False
    
    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                input_tensor, 
            )
            
    

In [18]:
pairs = get_pair(data[:1], 'it', 'pt')

0//1

In [25]:
tensors = tensorFromSentence(pairs[0][0])

In [26]:
tensors

(tensor([[  1.4758,   0.8212,   1.4488,  ...,  -0.9026,  -2.5898,  -1.4582],
         [  6.0610,   1.9357,   4.7576,  ...,  -1.4969, -13.8911,  -7.7268],
         [  6.8169,   1.9038,   5.1857,  ...,  -1.7661, -16.4115,  -9.0232],
         ...,
         [  4.6705,   1.3710,   3.2550,  ...,  -0.9674, -11.6142,  -6.5362],
         [ -0.2498,   0.1870,   0.3529,  ...,   0.5391,  -1.5391,  -0.9721],
         [  4.7955,   1.5626,   3.3891,  ...,  -1.0123, -11.9010,  -6.2971]],
        grad_fn=<AddBackward0>),
 tensor([[-0.6971, -0.0369,  0.1276,  ..., -0.6505, -0.4651,  0.9091],
         [-1.4891, -0.1901,  0.3322,  ..., -1.4354, -0.9088,  1.9527],
         [-1.5921, -0.2484,  0.3736,  ..., -1.6160, -0.9641,  2.1743],
         ...,
         [-1.3078, -0.2117,  0.2626,  ..., -1.4092, -0.7851,  1.9219],
         [-0.4061, -0.0460,  0.1764,  ..., -0.3664, -0.1687,  0.2669],
         [-1.2866, -0.2163,  0.2448,  ..., -1.3974, -0.7908,  1.9115]],
        grad_fn=<AddBackward0>))