<a href="https://colab.research.google.com/github/yalopez84/Goog-Negative-Sampling/blob/master/PseudoTypedNegativeTraining_Freebase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytorch-pretrained-bert  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import os
from shutil import rmtree
import csv
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss, MSELoss
import random
import numpy as np

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Folder holding the FB13 files; main() reads this module-level name as its data directory.
data_dir="/content/drive/MyDrive/NegativeStrategies/GoodNegativeSampling/FB13/"
# Make the data folder the working directory, so relative paths used later
# (such as main()'s "./output_FB13_<n>" folder) resolve inside it.
os.chdir(data_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class InputExample:
    """A single raw example: an identifier, up to three texts and a label."""

    def __init__(self, guid, text_a, text_b=None, text_c=None, label=None):
        # Identifier plus the mandatory first text.
        self.guid, self.text_a = guid, text_a
        # Optional second and third texts (None when absent).
        self.text_b, self.text_c = text_b, text_c
        # Optional label, stored exactly as given.
        self.label = label

In [None]:
class InputFeatures:
    """The model-ready form of one example: three parallel id lists and a label id."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        # Store every constructor argument under an attribute of the same name.
        for attr_name, attr_value in (
            ("input_ids", input_ids),
            ("input_mask", input_mask),
            ("segment_ids", segment_ids),
            ("label_id", label_id),
        ):
            setattr(self, attr_name, attr_value)

In [None]:
class TrainingReport:
    """Pairs an epoch number with the loss recorded for that epoch."""

    def __init__(self, epoch, loss):
        # Both values are kept exactly as passed in.
        self.epoch, self.loss = epoch, loss

In [None]:
class DataProcessor:
    """Base class for dataset readers; subclasses supply examples and labels."""

    def get_train_examples(self, data_dir):
        """Return the training examples found under ``data_dir`` (abstract)."""
        raise NotImplementedError()

    def get_labels(self, data_dir):
        """Return the list of labels of the dataset (abstract)."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a UTF-8 tab-separated file and return its rows as lists of fields."""
        with open(input_file, "r", encoding="utf-8") as handle:
            rows = csv.reader(handle, delimiter="\t", quotechar=quotechar)
            # Materialise the rows while the file is still open.
            return list(rows)

In [None]:
class KGProcessor(DataProcessor):
    """Reader for the knowledge-graph triple-classification files in a data folder."""

    def __init__(self):
        self.labels = set()

    def get_train_examples(self, data_dir):
        """Load the training TSV from ``data_dir`` and turn its rows into examples."""
        train_path = os.path.join(
            data_dir, "train_reduced_11082_neg_and_descrip_seudotyped.tsv")
        return self._create_examples(self._read_tsv(train_path), "train", data_dir)

    def get_relations(self, data_dir):
        """Return the whitespace-stripped lines of ``relations.txt`` in ``data_dir``."""
        return self._read_stripped_lines(os.path.join(data_dir, "relations.txt"))

    def get_labels(self, data_dir):
        """Return the two class labels as strings."""
        return ["0", "1"]

    def get_entities(self, data_dir):
        """Return the whitespace-stripped lines of ``entities.txt`` in ``data_dir``."""
        return self._read_stripped_lines(os.path.join(data_dir, "entities.txt"))

    @staticmethod
    def _read_stripped_lines(path):
        """Read a text file and return each of its lines with surrounding whitespace removed."""
        # Decode explicitly as UTF-8, like _read_tsv does, instead of relying
        # on the platform's default encoding.
        with open(path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    def _create_examples(self, lines, set_type, data_dir):
        """Build one InputExample per row of ``lines``.

        Each row supplies, by position: guid, text_a, text_b, text_c, label.
        ``set_type`` and ``data_dir`` are accepted for interface compatibility
        and are not used.
        """
        # csv.reader yields an empty list for a blank line; skip those rows
        # instead of failing with an IndexError on line[0].
        return [
            InputExample(guid=line[0], text_a=line[1], text_b=line[2],
                         text_c=line[3], label=line[4])
            for line in lines
            if line
        ]

In [None]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, print_info = True):
    """Turn examples into fixed-length ``InputFeatures``.

    Every example becomes ``[CLS] text_a [SEP]``, followed by
    ``text_b [SEP] text_c [SEP]`` when both extra texts are present. The
    sequence is converted to ids and zero-padded to ``max_seq_length``.
    Segment ids are 0 for the first and third parts and 1 for the second.
    The label id is the position of the example's label in ``label_list``.
    ``print_info`` is accepted but not used.
    """
    label_to_id = {}
    for position, label in enumerate(label_list):
        label_to_id[label] = position

    converted = []
    for example in examples:
        first = tokenizer.tokenize(example.text_a)
        second, third = None, None

        if example.text_b and example.text_c:
            second = tokenizer.tokenize(example.text_b)
            third = tokenizer.tokenize(example.text_c)
            # Leave room for [CLS] and the three [SEP] markers.
            _truncate_seq_triple(first, second, third, max_seq_length - 4)
        elif len(first) > max_seq_length - 2:
            # Single text: leave room for [CLS] and one [SEP].
            first = first[:(max_seq_length - 2)]

        tokens = ["[CLS]"] + first + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        # Append the remaining parts, each closed by its own [SEP].
        for part, segment in ((second, 1), (third, 0)):
            if part:
                tokens += part + ["[SEP]"]
                segment_ids += [segment] * (len(part) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # Zero-pad all three lists up to max_seq_length.
        zeros = [0] * (max_seq_length - len(input_ids))
        input_ids += zeros
        input_mask += zeros
        segment_ids += zeros

        for sequence in (input_ids, input_mask, segment_ids):
            assert len(sequence) == max_seq_length

        converted.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_to_id[example.label]))
    return converted

In [None]:
def _truncate_seq_triple(tokens_a, tokens_b, tokens_c, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b) and len(tokens_a) > len(tokens_c):
            tokens_a.pop()
        elif len(tokens_b) > len(tokens_a) and len(tokens_b) > len(tokens_c):
            tokens_b.pop()
        elif len(tokens_c) > len(tokens_a) and len(tokens_c) > len(tokens_b):
            tokens_c.pop()
        else:
            tokens_c.pop()

In [None]:
def main(numExecution):
    """Fine-tune a BERT sequence classifier on the training triples and save it.

    Builds the training set through ``KGProcessor``, trains
    ``BertForSequenceClassification`` for the configured number of epochs,
    writes one "<epoch> <summed loss>" line per epoch to
    ``TrainingReport<numExecution>.txt`` inside the data directory, and stores
    the weights, config and vocabulary in ``./output_FB13_<numExecution>``
    (the folder is wiped first if it already exists).

    Args:
        numExecution: Identifier of this run; it is appended to the output
            folder name and to the training report file name.
    """
    arg_dict ={
        "data_dir": data_dir,
        "bert_model": "bert-base-cased",
        "max_seq_length": 200,
        "train_batch_size": 32,
        "eval_batch_size": 512,
        "learning_rate": 5e-5,
        "num_train_epochs": 3.0,
        "output_dir": "./output_FB13_",
        "gradient_accumulation_steps": 1,
        "seed":42,
        "do_lower_case":False,
        "loss_scale":0,
        "warmup_proportion":0.1
        }

    # Always start from an empty output folder for this run.
    dirFolder = arg_dict["output_dir"] + str(numExecution)
    if os.path.exists(dirFolder):
        rmtree(dirFolder)
    os.makedirs(dirFolder)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device", device)
    n_gpu = 1 if device.type == "cuda" else 0

    # The configured seed is replaced by a random one, so every execution
    # trains with different randomness.
    arg_dict["seed"] = random.randint(1, 200)
    random.seed(arg_dict["seed"])
    np.random.seed(arg_dict["seed"])
    torch.manual_seed(arg_dict["seed"])
    if n_gpu > 0:
        torch.cuda.manual_seed_all(arg_dict["seed"])

    processor = KGProcessor()
    label_list = processor.get_labels(arg_dict["data_dir"])
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(arg_dict["bert_model"], do_lower_case=arg_dict["do_lower_case"])

    # Build the training tensors.
    train_examples = processor.get_train_examples(arg_dict["data_dir"])
    train_features = convert_examples_to_features(
            train_examples, label_list, arg_dict["max_seq_length"], tokenizer)
    print("len(train_features)",len(train_features))

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=arg_dict["train_batch_size"])

    # Derive the schedule length from the real number of batches (the final,
    # partial batch included) so the learning-rate schedule does not reach
    # t_total before training ends. NOTE: the loop below calls
    # optimizer.step() on every batch, so this figure is exact only while
    # gradient_accumulation_steps is 1.
    num_epochs = int(arg_dict["num_train_epochs"])
    num_train_optimization_steps = (
        len(train_dataloader) // arg_dict["gradient_accumulation_steps"]) * num_epochs

    model = BertForSequenceClassification.from_pretrained(arg_dict["bert_model"],
              num_labels=num_labels)
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=arg_dict["learning_rate"],
                         warmup=arg_dict["warmup_proportion"],
                         t_total=num_train_optimization_steps)

    model.train()

    # The loss module is stateless, so one instance serves the whole training.
    loss_fct = CrossEntropyLoss()
    trainingReports = []

    for epoch in trange(num_epochs, desc="Epoch"):
        # Sum of the per-batch losses of this epoch.
        tr_loss = 0
        for batch in tqdm(train_dataloader, desc="Iteration"):
            input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)

            # With labels=None the model returns the logits, not a loss.
            logits = model(input_ids, segment_ids, input_mask, labels=None)
            loss = loss_fct(logits, label_ids)
            loss.backward()

            tr_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
        # Interpolate the values; passing them as extra print() arguments
        # would print the raw "%s" placeholders.
        print("Training loss in epoch %s es: %s" % (epoch + 1, tr_loss))
        trainingReports.append(TrainingReport(epoch=epoch+1, loss=tr_loss))

    # One "<epoch> <loss>" line per epoch, written next to the data files.
    trainingReport_file = os.path.join(arg_dict["data_dir"], "TrainingReport"+str(numExecution)+".txt")
    with open(trainingReport_file, "w") as writer:
        for report in trainingReports:
            writer.write("%s %s \n" % (report.epoch, report.loss))

    # Persist weights, config and vocabulary into the run's output folder.
    output_model_file = os.path.join(dirFolder, WEIGHTS_NAME)
    output_config_file = os.path.join(dirFolder, CONFIG_NAME)
    torch.save(model.state_dict(), output_model_file)
    model.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(dirFolder)

In [None]:
if __name__ == "__main__":
    # Train 32 independent models, numbered 1 through 32.
    for run_id in range(1, 33):
        main(run_id)
        print("*******modelo********", run_id)
    print("fin")

probando
probando
probando
probando
probando
probando
probando
probando
probando
probando
probando
probando
probando
probando
fin
