In [2]:
from pykeen.datasets import WN18RR, FB15k237
import pandas as pd
import numpy as np
import torch
from pykeen.sampling.basic_negative_sampler import BasicNegativeSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification,Trainer, TrainingArguments
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import wandb
import os

### 0 Load dataset

In [4]:
#dataset = FB15k237()
# Tab-separated lookup files that map FB15k-237 ids to text; read by load_ent_rel_def.
path_ent = "data/fb15k237/fb15k237_entity2text.txt"
path_rel = 'data/fb15k237/fb15k237_relation2text.txt'
# Fall back to the CPU when no GPU is visible instead of hard-coding "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
#dataset

You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out


FB15k237(num_entities=14505, num_relations=237, create_inverse_triples=False)

#### Get train, test and validation triples and store them in a DataFrame

In [5]:
def triple_labels(dataset):
    """Return the label-form triples of the three dataset splits.

    Label form looks like ['/m/010016', '/location/', '/m/0mr_8'].

    Args:
        dataset: object exposing ``training``, ``testing`` and ``validation``
            attributes, each with a ``triples`` attribute.

    Returns:
        Tuple ``(train, test, val)`` of the splits' ``triples``.
    """
    split_names = ("training", "testing", "validation")
    train_triples, test_triples, val_triples = (
        getattr(dataset, split_name).triples for split_name in split_names
    )
    return train_triples, test_triples, val_triples

def triple_ids(dataset):
    """Return the ID-form triples of the three dataset splits.

    ID form looks like [0, 120, 13647].

    Args:
        dataset: object exposing ``training``, ``testing`` and ``validation``
            attributes, each with a ``mapped_triples`` attribute.

    Returns:
        Tuple ``(train, test, val)`` of the splits' ``mapped_triples``.
    """
    split_names = ("training", "testing", "validation")
    train_mapped, test_mapped, val_mapped = (
        getattr(dataset, split_name).mapped_triples for split_name in split_names
    )
    return train_mapped, test_mapped, val_mapped
    
def neg_sampling(sampler, triple_ids, triple_factory):
    """Draw filtered negative samples for ``triple_ids`` and return them in label form.

    Args:
        sampler: negative-sampler class (e.g. ``BasicNegativeSampler``). It is
            instantiated with ``mapped_triples=triple_ids`` and ``filtered=True``.
        triple_ids: ID-form triples, e.g. rows like [0, 120, 13647].
        triple_factory: object whose ``label_triples`` turns ID triples into labels.

    Returns:
        Whatever ``triple_factory.label_triples`` returns for the negatives that
        survived filtering.
    """
    # Honour the sampler that was passed in; the argument used to be ignored in
    # favour of a hard-coded BasicNegativeSampler.
    neg_sampler = sampler(mapped_triples=triple_ids, filtered=True)

    # Compute negative samples for the given triples.
    neg_triples, filter_mask = neg_sampler.sample(triple_ids)

    if filter_mask is None:
        # No filtering information was returned: keep every sample, one triple per row.
        kept_triples = neg_triples.reshape(-1, 3)
    else:
        # Drop the negatives that are actually known positive triples of the KG.
        # Assumes neg_triples is (batch, num_negs, 3) and filter_mask is a boolean
        # (batch, num_negs) tensor as described in the PyKEEN docs - TODO confirm.
        # Boolean indexing then yields rows of shape (3,) for any num_negs.
        kept_triples = neg_triples[filter_mask]

    # Translate the remaining ID triples back to their labels.
    return triple_factory.label_triples(kept_triples)


def load_ent_rel_def(dataset, path_ent, path_rel):
    """Load the entity and relation text lookup tables of a dataset.

    Args:
        dataset: dataset name; only "fb15k237" is implemented.
        path_ent: path of the tab-separated ``id<TAB>entity`` file.
        path_rel: path of the tab-separated ``id<TAB>definition`` file.

    Returns:
        Tuple ``(df_entity2text, df_rel2text)``. The entity frame has columns
        ``id``, ``entity`` and ``segmented_entities`` (entity text split on spaces).
        The relation frame has ``id``, ``definition`` and the readable id parts
        ``property_1_id`` / ``property_2_id``.

    Raises:
        NotImplementedError: for "wn18rr", which is not implemented yet.
        ValueError: for any other unsupported dataset name.
    """
    # Fail with a clear error instead of printing and then hitting an
    # UnboundLocalError on the return statement.
    if dataset == "wn18rr":
        raise NotImplementedError("Not implemented yet")
    if dataset != "fb15k237":
        raise ValueError("Only datasets 'fb15k237' and 'wn18rr' are supported")

    df_entity2text = pd.read_csv(path_ent, delimiter="\t", header=None, names=["id", "entity"])
    df_entity2text["segmented_entities"] = df_entity2text["entity"].str.split(' ')

    df_rel2text = pd.read_csv(path_rel, delimiter="\t", header=None, names=["id", "definition"])
    # A relation id may consist of two paths joined by '.'; split at the first dot only.
    df_rel2text[["property_1_id", "property_2_id"]] = df_rel2text["id"].str.split('.', n=1, expand=True)
    for column in ("property_1_id", "property_2_id"):
        df_rel2text[column] = (
            df_rel2text[column]
            .str.replace("/", ", ", regex=False)  # "/a/b" -> ", a, b"
            .str[2:]                              # drop the leading ", "
            .str.replace("_", " ", regex=False)
        )

    return df_entity2text, df_rel2text
   
    

def triple_def(df_entity2text, df_rel2text, triples, target):
    """Attach text labels and a binary target to label-form triples.

    Args:
        df_entity2text: frame with ``id`` and ``entity`` columns.
        df_rel2text: frame with ``id`` and ``definition`` columns.
        triples: rows of ``(head, rel, tail)`` ids.
        target: "pos" (target 1) or "neg" (target 0).

    Returns:
        DataFrame with columns ``head``, ``rel``, ``tail``, ``head_label``,
        ``rel_label``, ``tail_label`` and ``target``.

    Raises:
        ValueError: if ``target`` is neither "pos" nor "neg". Previously this
            only printed a message and returned a frame without ``target``.
    """
    target_values = {"pos": 1, "neg": 0}
    if target not in target_values:
        raise ValueError("This parameter can only be either 'pos' or 'neg'")

    # Build each id -> text lookup once; the entity lookup serves head and tail.
    entity_text = df_entity2text.set_index('id')['entity']
    relation_text = df_rel2text.set_index('id')['definition']

    df = pd.DataFrame(triples, columns=['head', 'rel', 'tail'])
    df['head_label'] = df['head'].map(entity_text)
    df['rel_label'] = df['rel'].map(relation_text)
    df['tail_label'] = df['tail'].map(entity_text)
    df["target"] = target_values[target]

    # Ids missing from a lookup table show up as NaN labels.
    if df.isnull().values.any():
        print("Dataframe contains nan values, review input")
    return df
    
    
def merge_pos_neg_triples(pos_triples, neg_triples):
    """Combine positive and negative triples into shuffled text/label lists.

    Args:
        pos_triples: frame with ``head_label``, ``rel_label``, ``tail_label``
            and ``target`` columns.
        neg_triples: frame with the same columns.

    Returns:
        Dict with "triples" (strings "head, rel, tail") and "labels" (targets),
        both in the same shuffled order (fixed ``random_state=5``).
    """
    # Stack both frames, renumber the rows, then shuffle deterministically.
    stacked = pd.concat([pos_triples, neg_triples]).reset_index(drop=True)
    shuffled = stacked.sample(frac=1, random_state=5)

    # One text per triple: the three labels joined by ", ".
    shuffled = shuffled.assign(
        triple=lambda frame: frame["head_label"] + ", " + frame["rel_label"] + ", " + frame["tail_label"]
    )

    texts = shuffled["triple"].tolist()
    targets = shuffled["target"].tolist()
    return {"triples": texts, "labels": targets}

class RobertaInput(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a per-example sequence. Indexing returns a dict of tensors
    with one entry per encoding field plus ``'labels'``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

def post_tokenizer(tokenized_triples, old_token_id=6, new_token_id=2):
    """Replace one token id by another in every tokenized sequence, in place.

    Args:
        tokenized_triples: mapping whose ``'input_ids'`` entry is a list of
            token-id lists.
        old_token_id: id to replace. The default 6 is presumably ',' in the
            roberta-base vocabulary - confirm with
            ``tokenizer.convert_ids_to_tokens``.
        new_token_id: replacement id. The default 2 is presumably the '</s>'
            separator - confirm likewise.

    Returns:
        The same ``tokenized_triples`` object with updated ``'input_ids'``.
    """
    sequences = tokenized_triples['input_ids']
    for position, token_ids in enumerate(sequences):
        sequences[position] = [
            new_token_id if token_id == old_token_id else token_id
            for token_id in token_ids
        ]
    return tokenized_triples

In [5]:
# Load FB15k-237 once and keep a handle to it: the negative-sampling cell below reads
# `dataset.training` / `dataset.testing` / `dataset.validation`, and building the
# dataset twice repeats the loading work.
dataset = FB15k237()
train_label, test_label, val_label = triple_labels(dataset)
train_ids, test_ids, val_ids = triple_ids(dataset)

You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out
Reconstructing all label-based triples. This is expensive and rarely needed.
Reconstructing all label-based triples. This is expensive and rarely needed.
You're trying to map triples with 9 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 9 from 17535 triples were filtered out
Reconstructing all label-based triples. This is expensive and rarely needed.
You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out
You're trying to map triples with 9 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 9 from 17535 triples were filtered o

In [6]:
# Label-form negative samples for each split, drawn from that split's own triples factory.
neg_train_triple_labels = neg_sampling(
    sampler=BasicNegativeSampler, triple_ids=train_ids, triple_factory=dataset.training
)
neg_test_triple_labels = neg_sampling(
    sampler=BasicNegativeSampler, triple_ids=test_ids, triple_factory=dataset.testing
)
neg_val_triple_labels = neg_sampling(
    sampler=BasicNegativeSampler, triple_ids=val_ids, triple_factory=dataset.validation
)

You're trying to map triples with 9 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 9 from 17535 triples were filtered out


In [7]:
# Number of negative validation triples returned by neg_sampling.
len(neg_val_triple_labels) 

17472

In [8]:
# Load the FB15k-237 entity and relation text lookup tables from the configured paths.
df_ent, df_rel = load_ent_rel_def("fb15k237", path_ent, path_rel)

In [9]:
# Training split: text labels plus target 1 for the given triples, 0 for the sampled negatives.
train_triple_def_pos = triple_def(
    df_entity2text=df_ent, df_rel2text=df_rel, triples=train_label, target="pos"
)
train_triple_def_neg = triple_def(
    df_entity2text=df_ent, df_rel2text=df_rel, triples=neg_train_triple_labels, target="neg"
)

In [10]:
# Test split: text labels plus target 1 for the given triples, 0 for the sampled negatives.
test_triple_def_pos = triple_def(
    df_entity2text=df_ent, df_rel2text=df_rel, triples=test_label, target="pos"
)
test_triple_def_neg = triple_def(
    df_entity2text=df_ent, df_rel2text=df_rel, triples=neg_test_triple_labels, target="neg"
)

In [11]:
# Validation split: text labels plus target 1 for the given triples, 0 for the sampled negatives.
val_triple_def_pos = triple_def(
    df_entity2text=df_ent, df_rel2text=df_rel, triples=val_label, target="pos"
)
val_triple_def_neg = triple_def(
    df_entity2text=df_ent, df_rel2text=df_rel, triples=neg_val_triple_labels, target="neg"
)

In [12]:
# Row count of the negative validation frame.
len(val_triple_def_neg)

17472

In [13]:
# Per split: concatenate positives and negatives, shuffle, and build the text/label lists.
train_triples = merge_pos_neg_triples(
    pos_triples=train_triple_def_pos, neg_triples=train_triple_def_neg
)
test_triples = merge_pos_neg_triples(
    pos_triples=test_triple_def_pos, neg_triples=test_triple_def_neg
)
val_triples = merge_pos_neg_triples(
    pos_triples=val_triple_def_pos, neg_triples=val_triple_def_neg
)

In [14]:
# Tokenizer for the same "roberta-base" checkpoint that the classifier is loaded from later.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [15]:
# Tokenize the training triple strings with padding and truncation enabled.
train_tokenized = tokenizer(train_triples["triples"], padding=True, truncation=True)

In [16]:
# Tokenize the test and validation triple strings with the same settings as the training split.
test_tokenized = tokenizer(test_triples["triples"], padding=True, truncation=True)
val_tokenized = tokenizer(val_triples["triples"], padding=True, truncation=True)

In [17]:
# Rewrite token id 6 to 2 in the input ids of every split (post_tokenizer modifies them in place).
train_tokenized = post_tokenizer(train_tokenized)
test_tokenized = post_tokenizer(test_tokenized)
val_tokenized = post_tokenizer(val_tokenized)

In [18]:
# Spot-check the token ids of the first test example after post-processing.
test_tokenized["input_ids"][0]

[0,
 26145,
 625,
 27171,
 3683,
 13,
 2700,
 8479,
 16038,
 154,
 2,
 2354,
 2354,
 4120,
 11357,
 4,
 2354,
 2354,
 5757,
 7076,
 13,
 2,
 496,
 815,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [19]:
# Pair the training encodings with the training labels.
train_data = RobertaInput(train_tokenized, train_triples["labels"])

In [20]:
# Pair the test encodings with the test labels.
test_data = RobertaInput(test_tokenized, test_triples["labels"])

In [21]:
# Validation encodings must be paired with the validation labels; the test labels were
# passed here before, which misaligns every evaluation metric.
val_data = RobertaInput(val_tokenized, val_triples["labels"])

In [22]:
# Drop the names of the intermediates that are no longer needed once the datasets exist.
del train_tokenized, test_tokenized, val_tokenized
del tokenizer
del train_triples, test_triples, val_triples

### 1 Load model

In [23]:
#Double check if tokenizer worked out as it should 
#tokenizer.convert_ids_to_tokens(test_tokenized["input_ids"][0])
#tokenizer.convert_tokens_to_ids(4)

#### -------------------- ROBERTA for Text Classification -------------------

In [24]:
# Load the pretrained roberta-base weights into a sequence-classification model.
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [25]:
# Displays the bound `parameters` method, whose repr includes the module structure.
# NOTE(review): there are no call parentheses here - confirm that showing the repr was the intent.
model.base_model.parameters

<bound method Module.parameters of RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwi

In [26]:
# Freeze every parameter of the base model; only parameters outside `base_model` keep gradients.
for param in model.base_model.parameters():
    param.requires_grad_(False)

In [27]:
# define accuracy metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    Args:
        pred: object with ``label_ids`` (true labels) and ``predictions``
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        Dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)
    # Binary averaging: the scores refer to the positive class.
    prf_scores = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')
    metrics = {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'f1': prf_scores[2],
        'precision': prf_scores[0],
        'recall': prf_scores[1],
    }
    return metrics

In [28]:
# define the training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs go, and how the run is named.
    output_dir='07_roberta/1st_try_1epoch',
    logging_dir='07_roberta/logs',
    run_name='roberta-frozen-layers-fb15k237',
    # Optimisation: one epoch, batches of 1 accumulated over 32 steps.
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    weight_decay=0.01,
    fp16=False,
    #warmup_steps=500,
    # Evaluation and checkpointing share the same 3000-step schedule.
    evaluation_strategy="steps",
    eval_steps=3000,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    save_strategy="steps",
    save_steps=3000,
    load_best_model_at_end=True,
    # Progress reporting.
    logging_steps=8,
    disable_tqdm=False,
    #dataloader_num_workers = 8,
)

In [29]:
# Pick the GPU when one is available; the value is shown as the cell output.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Train on the training set and evaluate on the validation set with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
device

'cuda'

In [None]:
# Run training with the configured Trainer.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvjolacl[0m ([33mnlm_kgc[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


### WN11 Dataset ----------------------

In [15]:
# WN11 entity lookup: tab-separated file without a header row, read as id / entity text.
df_entity2text = pd.read_csv("data/wn11/entity2text.txt", delimiter="\t", header = None, names=["id", "entity"])

In [6]:
# WN11 relation lookup, read the same way as the entity file.
# NOTE(review): the second column is named "entity" although this file is the relation lookup -
# presumably copied from the entity cell; confirm before renaming.
df_relation2text = pd.read_csv("data/wn11/relation2text.txt", delimiter="\t", header = None, names=["id", "entity"])

In [17]:
# Preview the first three entity rows.
df_entity2text.head(3)

Unnamed: 0,id,entity
0,__east_indian_1,east indian
1,__lindesnes_1,lindesnes
2,__chlamydosaurus_kingi_1,chlamydosaurus kingi


In [41]:
# WN11 splits: tab-separated, no header row, columns head / relation / tail.
train = pd.read_csv(
    "data/wn11/train.tsv", sep="\t", header=None, names=["head", "relation", "tail"]
)
test = pd.read_csv(
    "data/wn11/test.tsv", sep="\t", header=None, names=["head", "relation", "tail"]
)
dev = pd.read_csv(
    "data/wn11/dev.tsv", sep="\t", header=None, names=["head", "relation", "tail"]
)

In [74]:
# Boolean mask of rows that repeat an earlier row, then print exactly those rows.
duplicate_rows = train.duplicated()
print(train.loc[duplicate_rows])

                                head                  relation  \
109782              __house_of_god_1             _has_instance   
109783             __confrontation_2                  _type_of   
109784  __african_scented_mahogany_1                  _type_of   
109785        __family_graminaceae_1           _member_meronym   
109786                 __aplectrum_1           _member_holonym   
...                              ...                       ...   
112576                   __hamelin_1  _subordinate_instance_of   
112577                    __anuran_1             _has_instance   
112578                  __bring_up_2             _has_instance   
112579                   __dreamer_3                  _type_of   
112580        __american_red_elder_1                  _type_of   

                           tail  
109782          __conventicle_2  
109783         __disagreement_3  
109784             __mahogany_2  
109785       __genus_glyceria_1  
109786   __family_orchidaceae_1  
...

In [87]:
# NOTE(review): `training` is only assigned in the following cell (TriplesFactory.from_path);
# on a fresh top-to-bottom run this cell executes first and would raise NameError.
training

TriplesFactory(num_entities=38194, num_relations=11, create_inverse_triples=False, num_triples=110361, path="/pfs/data5/home/kit/aifb/ho8030/data/wn11/train.tsv")

In [78]:
# Build a PyKEEN triples factory per WN11 split directly from the TSV files.
from pykeen.triples import TriplesFactory
training = TriplesFactory.from_path("data/wn11/train.tsv")
testing = TriplesFactory.from_path("data/wn11/test.tsv")
# NOTE(review): this rebinds `dev`, which held the dev.tsv DataFrame read with pd.read_csv above.
dev = TriplesFactory.from_path("data/wn11/dev.tsv")

In [88]:
# Convert the ID-form training triples back to their label form for inspection.
training.label_triples(training.mapped_triples)

array([['__0_1', '_similar_to', '__cardinal_4'],
       ['__0_1', '_type_of', '__digit_1'],
       ['__1000000000000_1', '_type_of', '__large_integer_1'],
       ...,
       ['__zymosis_2', '_has_instance', '__fungal_infection_1'],
       ['__zymosis_2', '_synset_domain_topic', '__medical_specialty_1'],
       ['__zymosis_2', '_type_of', '__infection_2']], dtype='<U68')

In [35]:
# Show the summary of the test-split triples factory.
testing

TriplesFactory(num_entities=17336, num_relations=11, create_inverse_triples=False, num_triples=21034, path="/pfs/data5/home/kit/aifb/ho8030/data/wn11/test.tsv")

In [50]:
# Row count of the raw train.tsv DataFrame, duplicated rows included.
len(train)

112581