Text infilling: context, T, attribute, A, and the value, V , in a sentence as “T. A is V .” where the attribute value V is masked as blank

Answer generation: generate V as the answer, considering T as the context and A as the question.

Answer generation seems to be plausible. I just train the a model on that. I will just use T5 because its not important for me to have the best performance. 

In [18]:
import pandas as pd
import numpy as np
import torch
from datasets import DatasetDict
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import transformers
from transformers import(
    AdamW,
    DataCollatorForSeq2Seq,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [19]:
from datasets import load_dataset
jnlpba = load_dataset('jnlpba', split=['train[:10]', "validation[:10]"])
print(jnlpba[0][0])

Found cached dataset jnlpba (/Users/maxhager/.cache/huggingface/datasets/jnlpba/jnlpba/1.0.0/3062f220823930cffde7976b694aa67bac3b06c322a02ced92d3761519810ce4)


  0%|          | 0/2 [00:00<?, ?it/s]

{'id': '1', 'tokens': ['IL-2', 'gene', 'expression', 'and', 'NF-kappa', 'B', 'activation', 'through', 'CD28', 'requires', 'reactive', 'oxygen', 'production', 'by', '5-lipoxygenase', '.'], 'ner_tags': [1, 2, 0, 0, 9, 10, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0]}


I need a df with question, context and answer like:

What are the entities?

IL-2', 'gene', 'expression', 'and', 'NF-kappa', 'B', 'activation', 'through', 'CD28', 'requires', 'reactive', 'oxygen', 'production','by', '5-lipoxygenase', '.

attr1, attr2, attr3

The question I still have is how i need to tokenize everything - how i am going to do that?

In [24]:
df_train = pd.DataFrame(jnlpba[0])
print(df_train.head(1))
df_val = pd.DataFrame(jnlpba[1])

mapping = {
    0: "O",
    1: "B-DNA",
    2: "I-DNA",
    3: "B-RNA",
    4: "I-RNA",
    5: "B-cell_line",
    6: "I-cell_line",
    7: "B-cell_type",
    8: "I-cell_type",
    9: "B-protein",
    10: "I-protein"
}

def map_tags(row):
    row['ner_tags'] = [' '.join([mapping[tag] for tag in row['ner_tags']])][0]
    return row

df_train = df_train.apply(map_tags, axis=1)
df_val = df_val.apply(map_tags, axis=1)

def join_tags(row):
    row['tokens'] = ' '.join(row['tokens'])
    return row

df_train = df_train.apply(join_tags, axis=1)
df_val = df_val.apply(join_tags, axis=1)

print(df_train.head(3))

data_train = DatasetDict({"context": df_train['tokens'], "question": "What are the attributes?", "answer": df_train['ner_tags']})
data_val = DatasetDict({"context": df_val['tokens'], "question": "What are the attributes?", "answer": df_val['ner_tags']})

  id                                             tokens  \
0  1  [IL-2, gene, expression, and, NF-kappa, B, act...   

                                            ner_tags  
0  [1, 2, 0, 0, 9, 10, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0]  
  id                                             tokens  \
0  1  IL-2 gene expression and NF-kappa B activation...   
1  2  Activation of the CD28 surface receptor provid...   
2  3  In primary T lymphocytes we show that CD28 lig...   

                                            ner_tags  
0  B-DNA I-DNA O O B-protein I-protein O O B-prot...  
1  O O O B-protein I-protein I-protein O O O O O ...  
2  O B-cell_type I-cell_type I-cell_type O O O B-...  


### Tokenization

In [21]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# tokenize input (context)
input_tensors = [tokenizer.encode(sentence, return_tensors='pt') for sentence in data_train['context']]

# tokenize output (answer)
output_tensors = [tokenizer.encode(sentence, return_tensors='pt') for sentence in data_train['answer']]

# create traning arguments
#todo check if 16 is appropriate
batch_size = 16
args = TrainingArguments(
    "T5-fine-tuned-NER-QA",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    #report_to="wandb",
    #push_to_hub=True,
    #todo hide token 
    #push_to_hub_token="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"
)

# train 
model = transformers.T5Model.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) # created based on collator which is used in this example https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb#scrollTo=TlqNaB8jIrJW
trainer = Trainer(
    model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics
)

In [22]:
# Convert the input data to tensors and build the dataset
input_tensors = [tokenizer.encode(sentence, return_tensors='pt') for sentence in data_train['context']]
output_tensors = [torch.tensor(tags, dtype=torch.long) for tags in data_train['answer']]
dataset = torch.utils.data.TensorDataset(input_tensors, output_tensors)

# Use the BatchTransform class to create batches of input and output tensors
data_transform = transformers.BatchTransform(batch_size=batch_size)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=data_transform.collate_fn)

# Set the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

# Define the number of


TypeError: new(): invalid data type 'str'

In [None]:
#print(data_train)
context_max_token_len = max(data_train['context'], key=len)
encoding_context = tokenizer.encode(
    data_train['context'][0],
    return_tensors='pt'
)
#i need somehow convert 
#i can just try to use the tokenizer on datatrain
#now I tokenize this shit but what is next?
#do i pu
print(encoding_context)

tensor([[    3,  3502,  4949,  6510,  3893,    11,     3, 12619,    18,   157,
          3096,     9,   272,  5817,   257,   190,  3190,  2577,  2311, 28360,
         11035,   999,    57,  7670,  7446,  9773,   729,     9,     7,    15,
             3,     5,     1]])


In [None]:
#i need to tokenize the data. now i am doing this with the ner dataset class. instead i could do it manually. 

class NERDataset(Dataset):
    def __init__(self, data: pd.DataFrame, tokenizer: T5Tokenizer, answer_max_token_len: int = 0 , question_max_token_len: int=0, context_max_token_len: int=0):
        self.tokenizer = tokenizer
        self.data = pd.DataFrame(data)
        self.answer_max_token_len = max(data['context'], key=len)
        self.question_max_token_len = max(data['question'], key=len)
        self.context_max_token_len = max(data['context'], key=len)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        
        encoding_context = tokenizer(    
            data_row["context"],
            max_length=int(self.context_max_token_len),
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        
        encoding_answer = tokenizer(
            data_row["answer"],
            max_length=int(self.answer_max_token_len),
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"   
        )
        
        encoding_question = tokenizer(
            data_row["question"],
            max_length=int(self.question_max_token_len),
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        
        labels = encoding_answer['input_ids']
        labels[labels == 0] = -100
        
        return dict(
            question = data_row["question"],
            context = data_row["context"],
            answer = data_row["answer"],
            input_ids = encoding_context['input_ids'].flatten(),
            attention_mask = encoding_context['attention_mask'].flatten(),
            labels = labels.flatten()   
        )

In [None]:
sample_dataset_train = NERDataset(data_train, tokenizer)
sample_dataset_val = NERDataset(data_val, tokenizer)

In [None]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

In [None]:
for i in sample_dataset_train:
    print(i)
    break
#next i need to train my model - i can do this direct with huggingface instead with pytorch lightning 
#first i need to check why my dataset is not converted to tensors
#i just want to tokenize my data and thats it

ValueError: invalid literal for int() with base 10: 'In primary T lymphocytes we show that CD28 ligation leads to the rapid intracellular formation of reactive oxygen intermediates ( ROIs ) which are required for CD28 -mediated activation of the NF-kap

In [None]:
#still need to turn the attributes representation into a string and not list done
#what is next. nex