<a href="https://colab.research.google.com/github/yalopez84/Goog-Negative-Sampling/blob/master/GoodNegativePrediction_Freebase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytorch-pretrained-bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import os
from shutil import rmtree
import csv
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss, MSELoss
import random
import numpy as np

In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Folder with the FB13 files; `main` passes this value as its "data_dir" setting.
data_dir="/content/drive/MyDrive/NegativeStrategies/GoodNegativeSampling/FB13/"
# Make it the working directory: `main` loads each model from the relative
# path "./output_FB13_<n>", which is resolved against this folder.
os.chdir(data_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class InputExample:
    """One raw (untokenized) example.

    Attributes:
        guid: identifier of the example.
        text_a: first text segment.
        text_b: optional second text segment.
        text_c: optional third text segment.
        label: optional label of the example.
    """

    def __init__(self, guid, text_a, text_b=None, text_c=None, label=None):
        # Identity and target first, then the three text slots.
        self.guid, self.label = guid, label
        self.text_a, self.text_b, self.text_c = text_a, text_b, text_c

In [None]:
class InputFeatures:
    """Numeric features of a single example.

    Attributes:
        input_ids: token ids of the example.
        input_mask: mask aligned with ``input_ids``.
        segment_ids: segment ids aligned with ``input_ids``.
        label_id: numeric label of the example.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        # The three aligned sequences, then the target.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        self.label_id = label_id

In [None]:
class TripleEvaluationReport:
    """Evaluation outcome for one triple.

    Attributes:
        id_triple: identifier of the evaluated triple.
        id_label: label id of the triple.
        id_prediction: predicted id for the triple.
    """

    def __init__(self, id_triple, id_label, id_prediction):
        # Keep the triple id next to its label and its prediction.
        self.id_triple, self.id_label, self.id_prediction = id_triple, id_label, id_prediction

In [None]:
class DataProcessor:
    """Abstract base for dataset readers; subclasses provide the examples and labels."""

    def get_test_examples(self, data_dir):
        """Return the test examples for ``data_dir``; must be overridden."""
        raise NotImplementedError()

    def get_labels(self, data_dir):
        """Return the list of labels for the dataset; must be overridden."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a UTF-8 tab-separated file and return its rows as a list of lists."""
        with open(input_file, "r", encoding="utf-8") as handle:
            rows = csv.reader(handle, delimiter="\t", quotechar=quotechar)
            # Materialize while the file is still open.
            return list(rows)

In [None]:
class KGProcessor(DataProcessor):
    """Reads knowledge-graph triples and turns them into text examples.

    All files are looked up inside the ``data_dir`` passed to each method and
    are read as UTF-8, matching ``DataProcessor._read_tsv``.
    """

    def __init__(self):
        self.labels = set()

    @staticmethod
    def _read_stripped_lines(path):
        """Return every line of the UTF-8 text file at ``path``, whitespace-stripped."""
        with open(path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    def get_test_examples(self, data_dir):
        """Build the examples for the triples listed in ``test_reduced.tsv``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test_reduced.tsv")), "test", data_dir)

    def get_relations(self, data_dir):
        """Return the stripped lines of ``relations.txt``."""
        return self._read_stripped_lines(os.path.join(data_dir, "relations.txt"))

    def get_labels(self, data_dir):
        """Return the two class labels, ``"0"`` and ``"1"``."""
        return ["0", "1"]

    def get_entities(self, data_dir):
        """Return the stripped lines of ``entities.txt``."""
        return self._read_stripped_lines(os.path.join(data_dir, "entities.txt"))

    def _create_examples(self, lines, set_type, data_dir):
        """Convert triple rows into ``InputExample`` objects carrying text descriptions.

        Args:
            lines: rows of the form ``[head, relation, tail, label]``.
            set_type: prefix used for each example's ``guid`` (``"<set_type>-<index>"``).
            data_dir: folder containing ``entity2text.txt`` and ``relation2text.txt``.

        Returns:
            A list with one ``InputExample`` per row, where ``text_a`` is the head
            text, ``text_b`` the relation text and ``text_c`` the tail text.

        Raises:
            KeyError: if a row mentions an entity or relation with no text entry.
        """
        # Entity id -> description; only well-formed "id<TAB>text" lines are kept.
        ent2text = {}
        with open(os.path.join(data_dir, "entity2text.txt"), "r", encoding="utf-8") as f:
            for line in tqdm(f.readlines()):
                temp = line.strip().split('\t')
                if len(temp) == 2:
                    ent2text[temp[0]] = temp[1]

        # Relation id -> description; blank or tab-less lines are skipped
        # instead of raising IndexError.
        rel2text = {}
        with open(os.path.join(data_dir, "relation2text.txt"), "r", encoding="utf-8") as f:
            for line in f:
                temp = line.strip().split('\t')
                if len(temp) >= 2:
                    rel2text[temp[0]] = temp[1]

        examples = []
        for (i, line) in enumerate(lines):
            # Turn each row into an example that carries the textual descriptions.
            # Any label other than "1" is treated as the negative class "0".
            label = "1" if line[3] == "1" else "0"

            guid = "%s-%s" % (set_type, i)
            head_ent_text = ent2text[line[0]]
            tail_ent_text = ent2text[line[2]]
            relation_text = rel2text[line[1]]

            examples.append(InputExample(guid=guid, text_a=head_ent_text, text_b=relation_text,
                                         text_c=tail_ent_text, label=label))

        return examples

In [None]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, print_info = True):
    """Tokenize examples and pack them into fixed-length ``InputFeatures``.

    Layout of each sequence: ``[CLS] a [SEP]`` with segment id 0, then
    ``b [SEP]`` with segment id 1 and ``c [SEP]`` with segment id 0 when both
    ``text_b`` and ``text_c`` are present. Every sequence is zero-padded up to
    ``max_seq_length``.

    Args:
        examples: objects exposing ``text_a``, ``text_b``, ``text_c`` and ``label``.
        label_list: labels; a label's position in the list becomes its id.
        max_seq_length: length of every produced sequence.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        print_info: accepted but not used by this function.

    Returns:
        A list of ``InputFeatures``, one per example, in input order.
    """
    label_to_id = {}
    for position, label in enumerate(label_list):
        label_to_id[label] = position

    converted = []
    for example in examples:
        head_tokens = tokenizer.tokenize(example.text_a)
        middle_tokens = None
        tail_tokens = None

        if example.text_b and example.text_c:
            middle_tokens = tokenizer.tokenize(example.text_b)
            tail_tokens = tokenizer.tokenize(example.text_c)
            # Four positions are reserved for [CLS] and the three [SEP] markers.
            _truncate_seq_triple(head_tokens, middle_tokens, tail_tokens, max_seq_length - 4)
        elif len(head_tokens) > max_seq_length - 2:
            # Single-segment case: only [CLS] and one [SEP] are added.
            head_tokens = head_tokens[:(max_seq_length - 2)]

        tokens = ["[CLS]"] + head_tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        # Append the optional segments, each closed by its own [SEP].
        for part, segment in ((middle_tokens, 1), (tail_tokens, 0)):
            if part:
                tokens += part + ["[SEP]"]
                segment_ids += [segment] * (len(part) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # Zero-pad all three sequences up to the fixed length.
        pad = [0] * (max_seq_length - len(input_ids))
        input_ids.extend(pad)
        input_mask.extend(pad)
        segment_ids.extend(pad)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        converted.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_to_id[example.label]))
    return converted

In [None]:
def _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b) and len(tokens_a) > len(tokens_c):
            tokens_a.pop()
        elif len(tokens_b) > len(tokens_a) and len(tokens_b) > len(tokens_c):
            tokens_b.pop()
        elif len(tokens_c) > len(tokens_a) and len(tokens_c) > len(tokens_b):
            tokens_c.pop()
        else:
            tokens_c.pop()

In [None]:
def simple_accuracy(preds, labels):
    """Return the mean of the element-wise comparison ``preds == labels``."""
    matches = preds == labels
    return matches.mean()

In [None]:
def compute_metrics(preds, labels):
    """Return a dict whose ``"acc"`` entry is ``simple_accuracy(preds, labels)``."""
    accuracy = simple_accuracy(preds, labels)
    return {"acc": accuracy}


In [None]:
def main(numExecution):
    """Evaluate the saved model of run ``numExecution`` on the test triples.

    The model and tokenizer are loaded from ``./output_FB13_<numExecution>``.
    The test triples are scored in batches. Accuracy and the mean per-batch
    cross-entropy loss are written to
    ``test_results_good_negative_sampling_freebase<numExecution>.txt`` inside
    ``data_dir``.

    Args:
        numExecution: run number; appended to the model folder name and to the
            results file name.

    Raises:
        ValueError: if the test set produces no batches.
    """
    arg_dict ={
        "data_dir": data_dir,
        "bert_model": "bert-base-cased",
        "max_seq_length": 200,
        "train_batch_size": 32,
        "eval_batch_size": 512,
        "learning_rate": 5e-5,
        "num_train_epochs": 3.0,
        "output_dir": "./output_FB13_",
        "gradient_accumulation_steps": 1,
        "seed":42,
        "do_lower_case":False,
        "loss_scale":0,
        "warmup_proportion":0.1
        }

    dirFolder = arg_dict["output_dir"] + str(numExecution)
    device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
    print("Device",device)

    # NOTE(review): the configured seed (42) is replaced by a random value on
    # every call, so the seed differs between runs - confirm this is intended.
    arg_dict["seed"] = random.randint(1, 200)
    random.seed(arg_dict["seed"])
    np.random.seed(arg_dict["seed"])
    torch.manual_seed(arg_dict["seed"])

    processor = KGProcessor()
    label_list = processor.get_labels(arg_dict["data_dir"])
    num_labels = len(label_list)

    model = BertForSequenceClassification.from_pretrained(dirFolder, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(dirFolder, do_lower_case=arg_dict["do_lower_case"])
    model.to(device)

    # Build the evaluation tensors; the sequential sampler keeps predictions
    # aligned with the order of the test examples.
    test_examples = processor.get_test_examples(arg_dict["data_dir"])
    test_features = convert_examples_to_features(
            test_examples, label_list, arg_dict["max_seq_length"], tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=arg_dict["eval_batch_size"])

    # The loss module is stateless here, so one instance serves all batches.
    loss_fct = CrossEntropyLoss()
    eval_loss = 0
    nb_eval_steps = 0
    batch_logits = []
    model.eval()
    for input_ids, input_mask, segment_ids, label_ids in tqdm(test_dataloader, desc="Testing"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)

        tmp_eval_loss = loss_fct(logits, label_ids)
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Collect per-batch logits and join them once after the loop instead
        # of re-copying the growing array on every batch.
        batch_logits.append(logits.detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("The test set is empty: no batches were evaluated.")

    eval_loss = eval_loss / nb_eval_steps
    preds = np.concatenate(batch_logits, axis=0)
    preds_ids = np.argmax(preds, axis=1)  # index of the highest logit
    labels_np = all_label_ids.numpy()
    result = compute_metrics(preds_ids, labels_np)

    # NOTE(review): this per-triple report is built but not returned or saved.
    triples_Evaluation_Report = [
        TripleEvaluationReport(id_triple=cont, id_label=label, id_prediction=preds_ids[cont])
        for cont, label in enumerate(labels_np)
    ]

    result['eval_loss'] = eval_loss
    output_test_file = os.path.join(arg_dict["data_dir"], "test_results_good_negative_sampling_freebase"+str(numExecution)+".txt")
    with open(output_test_file, "w") as writer:
        # Entries are written back-to-back with no line break between them.
        for key in sorted(result.keys()):
            writer.write("  %s = %s" % (key, str(result[key])))


In [None]:
if __name__ == "__main__":
    # Evaluate the saved models of runs 32 and 33, one after the other.
    for model_number in (32, 33):
        main(model_number)
        print ("Modelo: ",model_number)
    print('fin')

Device cuda


100%|██████████| 75042/75042 [00:00<00:00, 285105.93it/s]
Testing: 100%|██████████| 5/5 [00:32<00:00,  6.51s/it]


Modelo:  32
Device cuda


100%|██████████| 75042/75042 [00:00<00:00, 508964.04it/s]
Testing: 100%|██████████| 5/5 [00:32<00:00,  6.40s/it]

Modelo:  33
fin



