In [1]:
import ast

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pykeen.datasets import WN18RR, FB15k237
from pykeen.sampling.basic_negative_sampler import BasicNegativeSampler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaForSequenceClassification,
    RobertaModel,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

from functions import *

2023-06-17 18:53:43.051538: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-17 18:53:43.840991: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 0 Load dataset

In [2]:
# Benchmark dataset object whose train/test/validation triples are used below.
dataset = WN18RR()
# Tab-separated "id -> text" lookup files consumed by load_ent_rel_def.
path_ent = "data/wn18rr/wn18rr_entity2text.txt"
path_rel = 'data/wn18rr/wn18rr_relation2text.txt'
# NOTE(review): hard-coded to "cuda" and not referenced by any visible cell —
# confirm it is still needed; if so, guard it with torch.cuda.is_available().
device = "cuda"

In [3]:
dataset

WN18RR(training_path="/pfs/data5/home/kit/aifb/ho8030/.data/pykeen/datasets/wn18rr/train.txt", testing_path="/pfs/data5/home/kit/aifb/ho8030/.data/pykeen/datasets/wn18rr/test.txt", validation_path="/pfs/data5/home/kit/aifb/ho8030/.data/pykeen/datasets/wn18rr/valid.txt")

#### Get train, test and validation triples and store them in DataFrames

In [63]:
def triple_labels(dataset):
    """Return the label-form triples of the three dataset splits.

    A label-form triple looks like ['/m/010016', '/location/', '/m/0mr_8'].

    Args:
        dataset: Object exposing ``training``, ``testing`` and ``validation``
            attributes, each of which has a ``triples`` attribute.

    Returns:
        Tuple ``(train, test, val)`` with the ``triples`` of each split.
    """
    # Splits are read in the order training -> testing -> validation.
    split_names = ("training", "testing", "validation")
    return tuple(getattr(dataset, split).triples for split in split_names)

def triple_ids(dataset):
    """Return the ID-form triples of the three dataset splits.

    An ID-form triple looks like [0, 120, 13647].

    Args:
        dataset: Object exposing ``training``, ``testing`` and ``validation``
            attributes, each of which has a ``mapped_triples`` attribute.

    Returns:
        Tuple ``(train, test, val)`` with the ``mapped_triples`` of each split.
    """
    train_mapped, test_mapped, val_mapped = (
        getattr(dataset, split).mapped_triples
        for split in ("training", "testing", "validation")
    )
    return train_mapped, test_mapped, val_mapped
    
def neg_sampling(sampler, triple_ids, triple_factory):
    """Draw filtered negative triples for ``triple_ids`` and return them as labels.

    Args:
        sampler: Negative-sampler class; it is instantiated with
            ``mapped_triples=triple_ids, filtered=True``. Anything that is not
            a class falls back to ``BasicNegativeSampler`` (the sampler this
            function always used, regardless of the argument, before).
        triple_ids: ID-form triples. They initialise the sampler and also
            serve as the positive batch that gets corrupted.
        triple_factory: Object whose ``label_triples`` method converts
            ID-form triples back into label-form triples.

    Returns:
        The result of ``triple_factory.label_triples`` for the negatives that
        survived filtering.
    """
    # Honour the `sampler` argument instead of silently ignoring it.
    sampler_cls = sampler if isinstance(sampler, type) else BasicNegativeSampler
    neg_sampler = sampler_cls(mapped_triples=triple_ids, filtered=True)

    # Corrupt the positive triples; the mask marks the negatives to keep.
    neg_triples, filter_mask = neg_sampler.sample(triple_ids)

    # Drop negatives flagged by the filter (i.e. known positive triples).
    # Boolean indexing replaces the former numpy round-trip; it also copes
    # with a missing mask and with more than one negative per positive.
    if filter_mask is not None:
        neg_triples = neg_triples[filter_mask]

    # Flatten to one (head, relation, tail) row per negative before labelling.
    neg_triples = neg_triples.reshape(-1, 3)
    return triple_factory.label_triples(neg_triples)


def load_ent_rel_def(dataset, path_ent, path_rel):
    """Load the entity and relation text lookup tables of a dataset.

    Args:
        dataset: Either ``"fb15k237"`` or ``"wn18rr"``.
        path_ent: Path to the tab-separated entity file (id, text).
        path_rel: Path to the tab-separated relation file (id, text).

    Returns:
        Tuple ``(df_entity2text, df_rel2text)`` of DataFrames.

        * fb15k237: the entity frame has columns ``id``, ``entity`` and
          ``segmented_entities``; the relation frame has ``id``,
          ``definition``, ``property_1_id`` and ``property_2_id``.
        * wn18rr: the entity frame has columns ``id`` (string, left-padded
          with zeros to 8 characters), ``definition``, ``entity`` and
          ``description``; the relation frame has ``id`` and ``definition``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    supported = ("fb15k237", "wn18rr")
    if dataset not in supported:
        # Previously this case only printed a message and then crashed with an
        # UnboundLocalError at the return statement.
        raise ValueError(
            f"Only datasets 'fb15k237' and 'wn18rr' are supported, got {dataset!r}"
        )

    if dataset == "fb15k237":
        df_entity2text = pd.read_csv(path_ent, delimiter="\t", header=None, names=["id", "entity"])
        df_entity2text["segmented_entities"] = df_entity2text["entity"].str.split(' ')

        df_rel2text = pd.read_csv(path_rel, delimiter="\t", header=None, names=["id", "definition"])
        # A relation id may chain two properties with '.'; split at the first dot.
        property_cols = ["property_1_id", "property_2_id"]
        df_rel2text[property_cols] = df_rel2text["id"].str.split('.', n=1, expand=True)
        for col in property_cols:
            # "/a/b_c" -> ", a, b_c" -> "a, b_c" -> "a, b c"
            df_rel2text[col] = (
                df_rel2text[col]
                .str.replace("/", ", ", regex=False)
                .str[2:]
                .str.replace("_", " ", regex=False)
            )
    else:  # dataset == "wn18rr"
        df_entity2text = pd.read_csv(path_ent, delimiter="\t", header=None, names=["id", "definition"])
        # The text before the first comma is the entity name, the rest its description.
        df_entity2text[["entity", "description"]] = df_entity2text["definition"].str.split(',', n=1, expand=True)
        # Restore leading zeros of the ids (8 characters wide).
        df_entity2text["id"] = df_entity2text["id"].astype(str).str.rjust(8, '0')

        df_rel2text = pd.read_csv(path_rel, delimiter="\t", header=None, names=["id", "definition"])

    return df_entity2text, df_rel2text
   
    

def triple_def(df_entity2text, df_rel2text, triples, target):
    """Attach text labels and a binary target to a collection of triples.

    Args:
        df_entity2text: DataFrame with columns ``id`` and ``entity``.
        df_rel2text: DataFrame with columns ``id`` and ``definition``.
        triples: Rows of ``(head, rel, tail)`` identifiers.
        target: ``"pos"`` (target 1) or ``"neg"`` (target 0).

    Returns:
        DataFrame with columns ``head``, ``rel``, ``tail``, ``head_label``,
        ``rel_label``, ``tail_label`` and ``target``. A message is printed if
        any value is missing, e.g. because an identifier has no text entry.

    Raises:
        ValueError: If ``target`` is neither ``"pos"`` nor ``"neg"``.
    """
    target_values = {"pos": 1, "neg": 0}
    if target not in target_values:
        # Previously this only printed a message and returned a frame without
        # a 'target' column, which then failed later in the pipeline.
        raise ValueError("This parameter can only be either 'pos' or 'neg'")

    # Build each lookup once; the entity lookup serves both heads and tails.
    entity_by_id = df_entity2text.set_index('id')['entity']
    relation_by_id = df_rel2text.set_index('id')['definition']

    df = pd.DataFrame(triples, columns=['head', 'rel', 'tail'])
    df['head_label'] = df['head'].map(entity_by_id)
    df['rel_label'] = df['rel'].map(relation_by_id)
    df['tail_label'] = df['tail'].map(entity_by_id)
    df["target"] = target_values[target]

    if df.isnull().values.any():
        print("Dataframe contains nan values, review input")
    return df
    
    
def merge_pos_neg_triples(pos_triples, neg_triples, approach, random_state=5):
    """Concatenate positive and negative triples, shuffle them and build model input.

    Args:
        pos_triples: DataFrame with columns ``head_label``, ``rel_label``,
            ``tail_label`` and ``target``.
        neg_triples: DataFrame with the same columns as ``pos_triples``.
        approach: ``"one_sequence"`` joins the three labels of a triple into a
            single space-separated string; ``"sep_sequences"`` keeps them as a
            list ``[head_label, rel_label, tail_label]``.
        random_state: Seed of the shuffle. Defaults to 5, the value that used
            to be hard-coded.

    Returns:
        Dict with key ``"triples"`` (one entry per triple, see ``approach``)
        and key ``"labels"`` (the matching ``target`` values), both as lists
        in the same shuffled order.

    Raises:
        ValueError: If ``approach`` is not one of the two supported values.
    """
    if approach not in ("one_sequence", "sep_sequences"):
        # Previously this only printed a message and then failed with a
        # KeyError because the 'triple' column was never created.
        raise ValueError("Only the two approaches one_sequence and sep_sequences are supported.")

    # Stack both frames with a fresh index, then shuffle reproducibly.
    df = pd.concat([pos_triples, neg_triples], ignore_index=True)
    df = df.sample(frac=1, random_state=random_state)

    if approach == "one_sequence":
        triples = (df["head_label"] + " " + df["rel_label"] + " " + df["tail_label"]).tolist()
    else:
        # Vectorized replacement for a row-wise DataFrame.apply.
        triples = df[["head_label", "rel_label", "tail_label"]].values.tolist()

    # Store the triples in a dictionary along with their labels.
    return {"triples": triples, "labels": df["target"].tolist()}

class RobertaInput(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... and add the label under the key 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def post_tokenizer(tokenized_triples):
    """Replace every token id 6 by token id 2 in ``tokenized_triples['input_ids']``.

    Each row of ``input_ids`` is replaced in place by a new list, and the same
    ``tokenized_triples`` object is returned.

    NOTE(review): 2 is displayed as '</s>' in this notebook's tokenizer
    output; what id 6 stands for is not visible here — confirm.
    """
    for row_idx, row in enumerate(tokenized_triples['input_ids']):
        tokenized_triples['input_ids'][row_idx] = [2 if tok == 6 else tok for tok in row]
    return tokenized_triples

In [48]:
train_label, test_label, val_label =  triple_labels(dataset)
train_ids, test_ids, val_ids =  triple_ids(dataset)

You're trying to map triples with 212 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 210 from 3134 triples were filtered out
Reconstructing all label-based triples. This is expensive and rarely needed.
Reconstructing all label-based triples. This is expensive and rarely needed.
You're trying to map triples with 211 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 210 from 3034 triples were filtered out
Reconstructing all label-based triples. This is expensive and rarely needed.


In [49]:
# Draw filtered negative triples for every split; each call returns them in
# label form, produced by the split's own triples factory.
neg_train_triple_labels = neg_sampling(BasicNegativeSampler, train_ids, dataset.training)
neg_test_triple_labels = neg_sampling(BasicNegativeSampler, test_ids, dataset.testing)
neg_val_triple_labels = neg_sampling(BasicNegativeSampler, val_ids, dataset.validation)

In [50]:
len(neg_val_triple_labels) 

2822

In [64]:
df_ent, df_rel = load_ent_rel_def("wn18rr", path_ent, path_rel)

In [72]:
train_triple_def_pos = triple_def(df_ent, df_rel, train_label, "pos")
train_triple_def_neg = triple_def(df_ent, df_rel, neg_train_triple_labels , "neg")

In [73]:
test_triple_def_pos = triple_def(df_ent, df_rel, test_label , "pos")
test_triple_def_neg = triple_def(df_ent, df_rel, neg_test_triple_labels , "neg")

In [74]:
val_triple_def_pos = triple_def(df_ent, df_rel, val_label , "pos")
val_triple_def_neg = triple_def(df_ent, df_rel, neg_val_triple_labels , "neg")

In [75]:
len(val_triple_def_neg)

2822

In [85]:
# Merge and shuffle positives and negatives per split. With "one_sequence" the
# head, relation and tail labels of a triple are joined into one
# space-separated string.
train_triples = merge_pos_neg_triples(train_triple_def_pos, train_triple_def_neg, "one_sequence")
test_triples = merge_pos_neg_triples(test_triple_def_pos, test_triple_def_neg, "one_sequence")
val_triples = merge_pos_neg_triples(val_triple_def_pos, val_triple_def_neg, "one_sequence")

In [89]:
def save_dict_to_file(dic, name):
    """Write the string representation of ``dic`` to the file ``name``.

    Args:
        dic: Object to persist; ``str(dic)`` is what gets written.
        name: Path of the output file. An existing file is overwritten.
    """
    # The context manager closes the file even if the write fails.
    with open(name, 'w') as f:
        f.write(str(dic))

In [92]:
#save_dict_to_file(test_triples,'test_one_sequence')

In [88]:
train_triples["triples"][:10]

['silkwood hypernym tree',
 'distend derivationally related form distention',
 'caramelize hypernym strike',
 'border hypernym edge',
 'reciprocal hypernym help',
 'unnaturalness derivationally related form wait',
 'signal tower hypernym exist',
 'vaticination derivationally related form vaticinate',
 'ziziphus member meronym ziziphus jujuba',
 'gymnosporangium juniperi-virginianae hypernym rust fungus']

#### Tokenize Input

In [2]:
def load_dict(path):
    """Read a Python literal (e.g. a dict saved via ``str(dict)``) from ``path``.

    Args:
        path: Path of a text file containing a Python literal.

    Returns:
        The object described by the file contents.

    Raises:
        ValueError, SyntaxError: If the file does not contain a valid literal.
    """
    with open(path, 'r') as f:
        data = f.read()
    # SECURITY: this used to call eval(), which executes arbitrary code found
    # in the file. ast.literal_eval only accepts Python literals (dicts,
    # lists, strings, numbers, ...), which is all a file holding str(dict)
    # of plain data contains.
    return ast.literal_eval(data)

In [2]:
# Reload previously saved splits from disk instead of recomputing them.
# NOTE(review): these paths point at fb15k237 files, whereas the cells above
# build the wn18rr splits — confirm which dataset is intended here.
train_triples = load_dict('07_roberta/processed_input/fb15k237/train_one_sequence')
test_triples = load_dict('07_roberta/processed_input/fb15k237/test_one_sequence')
val_triples = load_dict('07_roberta/processed_input/fb15k237/val_one_sequence')

In [18]:
#tokenizer_input = [[train_triples['triples'][i]] for i in range(len(train_triples['triples']))]

In [23]:
tokenizer_input[0]

['Juliette Binoche award award nominee award nominations. award award nomination award nominee Brian K. Vaughan']

In [3]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [7]:
# Quick experiments with how the tokenizer handles different input shapes.
test1 = tokenizer(["Hello world."])
# NOTE(review): "<SEP>" is presumably tokenized as ordinary text; the decoded
# output further down shows "</s>" as this tokenizer's closing token — confirm.
test2 = tokenizer(["Hello world <SEP> It is sunny today."])
# Each inner list holds two strings — presumably treated as a text pair; TODO confirm.
test3 = tokenizer([["Hello", "world"],
                  ["It", "is"]])

In [4]:
# Tokenize each split as one batch. padding=True brings the sequences of a
# split to a common length (visible as trailing 1s in the ids shown below);
# truncation=True cuts over-long inputs — presumably at the model maximum, confirm.
train_tokenized = tokenizer(train_triples['triples'], padding=True, truncation=True)
test_tokenized = tokenizer(test_triples["triples"], padding=True, truncation=True)
val_tokenized = tokenizer(val_triples["triples"], padding=True, truncation=True)

In [7]:
train_tokenized["input_ids"][:3]

[[0,
  26239,
  330,
  1845,
  8944,
  2855,
  119,
  3907,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [0,
  17165,
  1397,
  45087,
  29688,
  1330,
  1026,
  7018,
  19774,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [0,
  5901,
  22044,
  2072,
  8944,
  2855,
  119,
  2506,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]]

In [9]:
len(train_tokenized["input_ids"])

173549

In [96]:
len(train_tokenized["input_ids"])

173549

In [5]:
train_data = RobertaInput(train_tokenized, train_triples["labels"])

In [6]:
test_data = RobertaInput(test_tokenized, test_triples["labels"])

In [7]:
val_data = RobertaInput(val_tokenized, val_triples["labels"])

In [14]:
# Drop names that are no longer needed once the RobertaInput datasets exist.
# Note that `del` only removes the names: the RobertaInput objects still hold
# references to the encodings and to the label lists.
# NOTE(review): a later cell still calls `tokenizer` and `train_tokenized`
# (the tokenizer double-check); running the notebook top to bottom will raise
# a NameError there after these deletions.
del train_tokenized
del test_tokenized
del val_tokenized
del tokenizer
del train_triples
del test_triples
del val_triples

### 1 Load model

In [46]:
#Double check if tokenizer worked out as it should 
tokenizer.convert_ids_to_tokens(train_tokenized["input_ids"][0])
#tokenizer.convert_tokens_to_ids(4)

['<s>',
 'sil',
 'k',
 'wood',
 'Ġhyper',
 'ny',
 'm',
 'Ġtree',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [24]:
tokenizer.convert_tokens_to_ids([0])

[3]

#### RoBERTa for text classification

In [8]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [12]:
model.base_model.parameters

<bound method Module.parameters of RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwi

In [9]:
# Freeze the RoBERTa encoder (`model.base_model`) so that only the parameters
# outside of it — the classification head — remain trainable.
for param in model.base_model.parameters():
    param.requires_grad = False

# To fine-tune the whole network instead, set requires_grad = True for every
# parameter in model.parameters().
    

In [10]:
# define accuracy metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    Args:
        pred: Object with ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Binary averaging; the scores come back as (precision, recall, f1, support).
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [11]:
# Define the training arguments.
# Each optimizer step accumulates 32 batches of 8 examples per device,
# i.e. 8 * 32 = 256 examples per device and step.
training_args = TrainingArguments(
    output_dir = 'test9',
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 32,
    eval_accumulation_steps = 8,
    per_device_eval_batch_size= 8,
    # Evaluate and checkpoint on the same step-based schedule (every 400 steps).
    evaluation_strategy = "steps",
    save_strategy = "steps",
    disable_tqdm = False,
    load_best_model_at_end=True,
    eval_steps = 400,
    save_steps = 400,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps = 8,
    fp16 = False,
    # Logs are written to the same directory as the checkpoints.
    logging_dir='test9',
    #dataloader_num_workers = 8,
    run_name = 'test9'
)

In [12]:
# Wire the model, the training arguments and the datasets together.
# The validation split serves as evaluation set during training; the test
# split is only used later via trainer.predict.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data
)

In [13]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvjolacl[0m ([33mnlm_kgc[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
400,0.6874,0.688347,0.530882,0.672464,0.516943,0.961828
800,0.6815,0.676731,0.61453,0.612348,0.616689,0.608068
1200,0.6743,0.668423,0.621043,0.681275,0.588428,0.808912


KeyboardInterrupt: 

In [18]:
trainer.save_model('07_roberta/fb15k237_frozenL_oneseq_P1/1epoch_P1')

In [19]:
output = trainer.predict(test_dataset=test_data)

In [109]:
len(test_data)

5845

In [20]:
output

PredictionOutput(predictions=array([[ 0.11438958,  0.03335938],
       [-0.09979224,  0.23526353],
       [-0.0358229 ,  0.19228782],
       ...,
       [ 0.10056299,  0.03405912],
       [-0.16386166,  0.3011089 ],
       [ 0.17861867, -0.03435762]], dtype=float32), label_ids=array([1, 1, 0, ..., 0, 1, 0]), metrics={'test_loss': 0.666879415512085, 'test_accuracy': 0.6142237683431735, 'test_f1': 0.6924594262054957, 'test_precision': 0.576239232894523, 'test_recall': 0.8674038555631667, 'test_runtime': 50.0683, 'test_samples_per_second': 815.266, 'test_steps_per_second': 101.921})

### Custom classifier head on top of RoBERTa

In [31]:
# Model with classifier layers on top of RoBERTa
class ROBERTAClassifier(torch.nn.Module):
    """RoBERTa encoder followed by a small classification head.

    The head applies dropout -> Linear(768, 64) -> LayerNorm(64) -> tanh ->
    dropout -> Linear(64, 2) to the second element of the encoder output.

    Args:
        dropout_rate: Dropout probability used by both dropout layers.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()

        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        # Despite its name this is a LayerNorm; the attribute name is kept so
        # that existing state dicts still load.
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 2)

    def forward(self, input_ids, attention_mask):
        """Return the head's output (last dimension 2) for a tokenized batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: with dict-style model outputs,
        # `_, x = outputs` iterates over the field names and would bind `x`
        # to a string. Integer indexing works for tuples and for model output
        # objects alike; element 1 is the pooled output.
        x = outputs[1]
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.tanh(x)
        x = self.d2(x)
        x = self.l2(x)

        return x