Text infilling: context, T, attribute, A, and the value, V , in a sentence as “T. A is V .” where the attribute value V is masked as blank

Answer generation: generate V as the answer, considering T as the context and A as the question.

Answer generation seems to be plausible. I just train the a model on that. I will just use T5 because its not important for me to have the best performance. 

In [113]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import(
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [114]:
from datasets import load_dataset
jnlpba = load_dataset('jnlpba', split=['train[:10]', "validation[:10]"])
print(jnlpba[0][0])

Found cached dataset jnlpba (/Users/maxhager/.cache/huggingface/datasets/jnlpba/jnlpba/1.0.0/3062f220823930cffde7976b694aa67bac3b06c322a02ced92d3761519810ce4)


  0%|          | 0/2 [00:00<?, ?it/s]

{'id': '1', 'tokens': ['IL-2', 'gene', 'expression', 'and', 'NF-kappa', 'B', 'activation', 'through', 'CD28', 'requires', 'reactive', 'oxygen', 'production', 'by', '5-lipoxygenase', '.'], 'ner_tags': [1, 2, 0, 0, 9, 10, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0]}


I need a df with question, context and answer like:

What are the entities?

IL-2', 'gene', 'expression', 'and', 'NF-kappa', 'B', 'activation', 'through', 'CD28', 'requires', 'reactive', 'oxygen', 'production','by', '5-lipoxygenase', '.

attr1, attr2, attr3

The question I still have is how i need to tokenize everything - how i am going to do that?

In [115]:
df_train = pd.DataFrame(jnlpba[0])
print(df_train.head(1))
df_val = pd.DataFrame(jnlpba[1])

mapping = {
    0: "O",
    1: "B-DNA",
    2: "I-DNA",
    3: "B-RNA",
    4: "I-RNA",
    5: "B-cell_line",
    6: "I-cell_line",
    7: "B-cell_type",
    8: "I-cell_type",
    9: "B-protein",
    10: "I-protein"
}

def map_tags(row):
    row['ner_tags'] = [' '.join([mapping[tag] for tag in row['ner_tags']])][0]
    return row

df_train = df_train.apply(map_tags, axis=1)
df_val = df_val.apply(map_tags, axis=1)

def join_tags(row):
    row['tokens'] = ' '.join(row['tokens'])
    return row

df_train = df_train.apply(join_tags, axis=1)
df_val = df_val.apply(join_tags, axis=1)

print(df_train.head(3))

data_train = {"context": df_train['tokens'], "question": "What are the attributes?", "answer": df_train['ner_tags']}
data_val = {"context": df_val['tokens'], "question": "What are the attributes?", "answer": df_val['ner_tags']}

  id                                             tokens  \
0  1  [IL-2, gene, expression, and, NF-kappa, B, act...   

                                            ner_tags  
0  [1, 2, 0, 0, 9, 10, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0]  
  id                                             tokens  \
0  1  IL-2 gene expression and NF-kappa B activation...   
1  2  Activation of the CD28 surface receptor provid...   
2  3  In primary T lymphocytes we show that CD28 lig...   

                                            ner_tags  
0  B-DNA I-DNA O O B-protein I-protein O O B-prot...  
1  O O O B-protein I-protein I-protein O O O O O ...  
2  O B-cell_type I-cell_type I-cell_type O O O B-...  


### Toknization

In [116]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [117]:
class NERDataset(Dataset):
    def __init__(self, data: pd.DataFrame, tokenizer: T5Tokenizer, answer_max_token_len: int = 0 , question_max_token_len: int=0, context_max_token_len: int=0):
        self.tokenizer = tokenizer
        self.data = data
        self.answer_max_token_len = max(data['context'], key=len)
        self.question_max_token_len = max(data['question'], key=len)
        self.context_max_token_len = max(data['context'], key=len)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        encoding_context = tokenizer(    
            data_row["context"].tolist(),
            max_length=self.context_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        
        encoding_answer = tokenizer(
            data_row["answer"].tolist(),
            max_length=self.answer_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"   
        )
        
        encoding_question = tokenizer(
            data_row["question"].tolist(),
            max_length=self.question_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        
        labels = encoding_answer['input_ids']
        labels[labels == 0] = -100
        
        return dict(
            question = data_row["question"],
            context = data_row["context"],
            answer = data_row["answer"],
            input_ids = encoding_context['input_ids'].flatten(),
            attention_mask = encoding_context['attention_mask'].flatten(),
            labels = labels.flatten()   
        )

In [118]:
sample_dataset_train = NERDataset(data_train, tokenizer)
sample_dataset_val = NERDataset(data_val, tokenizer)

In [120]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [126]:
for i in sample_dataset_train:
    print(i)
    break


#next i need to train my model - i can do this direct with huggingface instead with pytorch lightning 
#first i need to check why my dataset is not converted to tensors

AttributeError: 'dict' object has no attribute 'iloc'

In [None]:
#still need to turn the attributes representation into a string and not list done
#what is next. nex