In [None]:
# Install exact, pinned versions of the PyTorch packages used by this notebook.
!pip install torch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 torchtext==0.10.0
#!pip uninstall transformers -y
# Pin transformers to 4.8.0.
!pip install transformers==4.8.0
# transformers is already requested by the previous line; this line additionally
# requests sentencepiece (unpinned).
!pip install transformers sentencepiece
# Progress bars used by the training loop (unpinned).
!pip install tqdm



In [None]:
import json
import torch, time, gc
from transformers import AutoModelForQuestionAnswering
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import gc

# Select the compute device once (GPU when CUDA is usable, otherwise CPU)
# and report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')




There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
# Show the GPU model, driver/CUDA versions and current memory usage.
!nvidia-smi

Tue Mar  1 21:49:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8    26W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def read_squad(path):
    """Load a flat question-answering JSON file into three parallel lists.

    The file must contain an iterable of records, each exposing the keys
    'context', 'question' and 'answer'. A record that lacks one of those keys
    (or is not subscriptable by string key) is reported and skipped, so a few
    malformed entries do not abort the whole load. A record is only added
    once all three fields have been read, which keeps the lists aligned.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A tuple ``(contexts, questions, answers)`` of equally long lists.
    """
    # Open the JSON file and load its content.
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []

    for index, record in enumerate(squad_dict):
        try:
            context = record['context']
            question = record['question']
            answer = record['answer']
        except (KeyError, TypeError, IndexError) as exc:
            # Only malformed records are skipped; anything else (for example
            # KeyboardInterrupt) is allowed to propagate.
            print(f"Skipping record {index}: {exc!r}")
            continue
        contexts.append(context)
        questions.append(question)
        answers.append(answer)

    return contexts, questions, answers

# Run the reader on the training and validation files stored on Drive.
train_contexts, train_questions, train_answers = read_squad('/content/drive/MyDrive/Colab Notebooks/ajusteBert/datosunido.json')
val_contexts, val_questions, val_answers = read_squad('/content/drive/MyDrive/Colab Notebooks/ajusteBert/datosCopy.json')

In [None]:
# Show the first training example (context, question, answer), one per line.
print(train_contexts[0], train_questions[0], train_answers[0], sep="\n")

def add_end_idx(answers, contexts):
    """Add an exclusive 'answer_end' character index to every answer, in place.

    For each answer/context pair the labelled span
    ``context[answer_start:answer_start + len(text)]`` is compared with the
    answer text. If it does not match, the span is retried one and two
    characters to the left. If that fails too, the text is searched for in the
    whole context. As a last resort the originally computed end (clamped to
    the context length) is stored, so 'answer_end' is always present for
    downstream code.

    Args:
        answers: List of dicts with keys 'text' and 'answer_start'. Each dict
            is mutated: 'answer_end' is added and 'answer_start' may be
            corrected.
        contexts: List of context strings, parallel to ``answers``.
    """
    for answer, context in zip(answers, contexts):
        # gold_text is the answer we expect to find inside the context.
        gold_text = answer['text']
        start_idx = answer['answer_start']
        # Ideal (exclusive) end index if the labelled start is exact.
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
            continue

        # Labels are sometimes off by one or two characters.
        for n in (1, 2):
            shifted_start = start_idx - n
            # Skip negative starts: a negative slice index would wrap around
            # to the end of the context instead of shifting left.
            if shifted_start >= 0 and context[shifted_start:end_idx - n] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - n
                break
        else:
            # No nearby match: fall back to locating the text anywhere in the
            # context so the span at least covers the right words.
            found = context.find(gold_text)
            if found != -1:
                answer['answer_start'] = found
                answer['answer_end'] = found + len(gold_text)
            else:
                # Text is absent from the context; keep the labelled span but
                # make sure the key exists and stays within bounds.
                answer['answer_end'] = min(end_idx, len(context))


# Apply the function to both answer lists (the answer dicts are updated in place).
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

 March 2020, the World Health Organization (WHO) declared COVID-19 a global pandemic. Since then, the rate of infections around the world has increase
what is covid-19?
{'text': 'a global pandemic', 'answer_start': 67}
a global pandemic
global pandemic
an infected person transmits the virus to many more people than average
identifying other infected people
airborne transmission
wear a mask in public, maintain social distancing of at least six feet
face mask/covering when outside
talking to my friends about hand washing, face masks, etc
Skin-to-skin contact was not recommended in women with symptoms, regardless of the severity of the illness; early clamping of the cord was promoted; and breastfeeding was not recommended for COVID-19+ mothers
Skin-to-skin contact was not recommended in women with symptoms, regardless of the severity of the illness
recommending the presence of a companion during birth with the adequate personal protective equipment (PPE), skin-to-skin contact after birth 

In [None]:

# Sanity check: show the first five training answers.
print(train_answers[:5])
from transformers import AutoTokenizer
# Initialise the tokenizer from the same checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
# Tokenize (context, question) pairs with truncation and padding enabled.
# NOTE(review): contexts are passed as the first sequence and questions as the
# second — confirm this matches the pair order used at inference time.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
# Decode the first encoded training example to inspect the resulting token sequence.
print(tokenizer.decode(train_encodings['input_ids'][0]))

def add_token_positions(encodings, answers, max_length=None):
    """Convert character-level answer spans into token positions, in place.

    For every answer the token index of its first and last character is looked
    up with ``encodings.char_to_token``. The results are stored on
    ``encodings`` under 'start_positions' and 'end_positions'.

    'answer_end' is an exclusive character index, so the last answer character
    is ``answer_end - 1``. Looking up ``answer_end`` itself can return the
    token that follows the answer (for example trailing punctuation).

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``.
        answers: List of dicts with 'answer_start' and 'answer_end', parallel
            to the encoded examples.
        max_length: Sentinel position used when a character has no token
            (for example because the answer was truncated away). Defaults to
            ``tokenizer.model_max_length`` of the notebook-level tokenizer.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        # Last character of the answer. Never go left of the start, so that an
        # empty answer still probes the start character.
        end_char = max(answer['answer_end'] - 1, start_char)

        start_token = encodings.char_to_token(i, start_char)
        if start_token is None:
            # The start of the answer has no token (answer truncated).
            start_token = max_length

        end_token = encodings.char_to_token(i, end_char)
        # The end character may be whitespace or truncated: walk left, but
        # never past the answer start, so the loop always terminates.
        while end_token is None and end_char > start_char:
            end_char -= 1
            end_token = encodings.char_to_token(i, end_char)
        if end_token is None:
            end_token = max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Attach the token-level start/end positions to the encodings object.
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Apply the function to the training and validation encodings.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

[{'text': 'a global pandemic', 'answer_start': 67, 'answer_end': 84}, {'text': 'global pandemic', 'answer_start': 68, 'answer_end': 83}, {'text': 'an infected person transmits the virus to many more people than average', 'answer_start': 40, 'answer_end': 111}, {'text': 'identifying other infected people', 'answer_start': 59, 'answer_end': 92}, {'text': 'airborne transmission', 'answer_start': 65, 'answer_end': 86}]
[CLS] March 2020, the World Health Organization ( WHO ) declared COVID - 19 a global pandemic. Since then, the rate of infections around the world has increase [SEP] what is covid - 19? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings for question answering.

    Each item is a dict holding, for every field of the encodings, the entry
    of the requested example converted to a tensor.
    """

    def __init__(self, encodings):
        # Mapping-like object: field name -> per-example sequences.
        self.encodings = encodings

    def __len__(self):
        # One example per row of input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap the training and validation encodings as datasets.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
#Entrenamiento

from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Fine-tune the pretrained extractive-QA checkpoint on the training dataset
# and save the resulting model and tokenizer to Drive.
model_path = "deepset/bert-base-cased-squad2"

model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Release cached memory left over from previous runs of this cell.
gc.collect()
torch.cuda.empty_cache()
# Select the device and move the model onto it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
# AdamW optimizer with the chosen learning rate.
# NOTE(review): no weight_decay is passed here; confirm the library default
# actually applies decay if regularisation is intended.
# NOTE(review): this name shadows the `torch.optim as optim` module imported
# at the top of the notebook.
optim = AdamW(model.parameters(), lr=2e-5)
# Data loader over the training set.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

for epoch in range(7):
    # Make sure the model is in training mode at the start of every epoch.
    model.train()
    # tqdm wraps the loader to display a progress bar.
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        # Reset gradients accumulated by the previous step.
        optim.zero_grad()
        # Move the tensors required for training onto the device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Segment ids tell the model which tokens belong to which sequence of
        # the pair; forward them when the tokenizer produced them.
        token_type_ids = batch['token_type_ids'].to(device) if 'token_type_ids' in batch else None
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # Forward pass; since the positions are supplied, the first output is the loss.
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        # Back-propagate and update the parameters.
        loss.backward()
        optim.step()
        # Show the current loss in the progress bar.
        loop.set_postfix(loss=loss.item())

# SAVE THE FINE-TUNED MODEL
# NOTE(review): the inference cell loads from ".../ajusteBert/model" and
# ".../ajusteBert/tokenizer", not from the paths written here — confirm which
# artefacts are meant to be evaluated.
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/ajusteBert/tokenizerv3")
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/ajusteBert/modelv")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline, QuestionAnsweringPipeline

## LOAD THE MODEL
# Load the model and tokenizer from Drive through the Auto* classes.
# NOTE(review): the training cell saves to ".../ajusteBert/modelv" and
# ".../ajusteBert/tokenizerv3", while the paths below point to ".../model" and
# ".../tokenizer" — confirm these are the intended artefacts.
loaded_model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/MyDrive/Colab Notebooks/ajusteBert/model")
load_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/ajusteBert/tokenizer")
#BertModel.from_pretrained("C:/Users/Alexis/PycharmProjects/ModeloBERT/model/FineTune_BERT.pt")

# Question-answering pipeline built from the loaded model and tokenizer.
question_answerer = QuestionAnsweringPipeline(model=loaded_model, tokenizer=load_tokenizer)



In [None]:
# Ask one question against one context and print the predicted answer.
questions = "Covid-19 risk factors?"
contexto = "To identify risk factors for hospital deaths from COVID-19, the OpenSAFELY platform examined electronic health records from 17.4 million UK adults. The authors used multivariable Cox proportional hazards model to identify the association of risk of death with older age, lower socio-economic status, being male, non-white ethnic background and certain clinical conditions (diabetes, obesity, cancer, respiratory diseases, heart, kidney, liver, neurological and autoimmune conditions). Notably, asthma was identified as a risk factor, despite prior suggestion of a potential protective role. Interestingly, higher risks due to ethnicity or lower socio-economic status could not be completely attributed to pre-existing health conditions."
result = question_answerer(question=questions, context=contexto)
print(result)
# Format the score with 4 significant digits: rounding to 4 decimal places
# would display very small scores (e.g. 2.7e-05) as a misleading 0.0.
print(f"Answer: '{result['answer']}', score: {result['score']:.4g}, start: {result['start']}, end: {result['end']}")

{'score': 2.6897090720012784e-05, 'start': 373, 'end': 399, 'answer': 'diabetes, obesity, cancer,'}
Answer: 'diabetes, obesity, cancer,', score: 0.0, start: 373, end: 399


In [None]:
# Display the tokenizer object that is currently bound to `tokenizer`.
tokenizer

PreTrainedTokenizerFast(name_or_path='deepset/bert-base-cased-squad2', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
# Display the configuration of the base model inside `model`.
model.base_model.config

BertConfig {
  "_name_or_path": "deepset/bert-base-cased-squad2",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "name": "Bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}