# T5 MODEL
This Python file contains the code for training, testing and evaluating the T5 variant of the pre-trained-model-based AKE models.
  1. Complete the parameter settings: 't5-base' for the T5-base model, 't5-large' for the T5-large model.
  2. Load the data and construct the dataset, which includes words, cognitive features and labels.
  3. Build the model and start training, adding the cognitive features.
  4. Run testing: read the best model parameters and the data, build the model, then make predictions and evaluate the results.

In [1]:
import transformers
import torch
from transformers import BertModel, BertTokenizerFast

In [2]:
# Checkpoint name passed to `from_pretrained` for the T5 backbone:
# 't5-base' for the T5-base model, 't5-large' for the T5-large model.
weight = 't5-base'

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Token length every sentence is padded or truncated to by the tokenizer calls.
max_len = 35

#### NER label problem

#### load data

In [4]:
import json

# Locations of the annotated train/test corpora; the commented-out lines are
# alternative files that can be swapped in.
train_path = '../dataset/GT-train.json'
# train_path = '../dataset/ET-train.json'
test_path = '../dataset/Election/GT-test.json'
# test_path = '../dataset/ET-test.json'

# Context managers close the file handles as soon as parsing finishes instead
# of leaving them open until garbage collection.
with open(train_path, 'r', encoding='utf-8') as train_fp:
    train_file = json.load(train_fp)
with open(test_path, 'r', encoding='utf-8') as test_fp:
    test_file = json.load(test_fp)

In [5]:
# Flatten the training corpus into parallel per-sentence lists.
# Every value of `train_file` is a list of per-word records laid out as
# [word, feature_1, ..., feature_k, tag].
train_sens, train_features, train_tags = [], [], []
train_word_nums = []

for items in train_file.values():
    words = [item[0] for item in items]
    # str.join replaces repeated `+=` on a string (quadratic in the worst case).
    train_sens.append(' '.join(words).strip())
    train_word_nums.append(len(words))
    train_tags.append([item[-1] for item in items])
    train_features.append([item[1:-1] for item in items])

In [6]:
# Flatten the test corpus into parallel per-sentence lists.
# Every value of `test_file` is a list of per-word records laid out as
# [word, feature_1, ..., feature_k, tag].
test_sens, test_features, test_tags = [], [], []
test_word_nums = []

for items in test_file.values():
    words = [item[0] for item in items]
    # str.join replaces repeated `+=` on a string (quadratic in the worst case).
    test_sens.append(' '.join(words).strip())
    test_word_nums.append(len(words))
    test_tags.append([item[-1] for item in items])
    test_features.append([item[1:-1] for item in items])

In [7]:
# Notebook sanity check: number of test sentences collected in `test_sens`.
len(test_sens)

3027

#### build dataset

In [8]:
from torch.utils.data import Dataset, DataLoader

In [None]:
from transformers import T5TokenizerFast

# Load the fast tokenizer for the checkpoint selected by `weight`, so that
# switching between 't5-base' and 't5-large' only requires editing `weight`.
tokenizer = T5TokenizerFast.from_pretrained(weight)

In [30]:
# Tag string -> class index. 'none' (index 0) is the placeholder assigned to
# token positions that receive no word-level tag; the remaining entries are
# the word-level tags found in the data.
label_to_ids = {'none': 0, 'B': 1, 'I': 2, 'E': 3, 'S': 4, "O": 5}
# Alternative mapping without a separate 'none' class:
# label_to_ids = {'O': 0, 'B': 1, 'I': 2, 'E': 3, 'S': 4}

In [31]:
# When True, every sub-token of a word inherits that word's tag and features;
# when False, only the first sub-token does and the rest get the placeholder.
label_all_tokens = True


def align_label(text, labels, features):
    """Project word-level tags and features onto the tokenizer's sub-tokens.

    Args:
        text: Sentence string whose whitespace-separated words line up with
            ``labels`` and ``features``.
        labels: Per-word tag strings (keys of ``label_to_ids``).
        features: Per-word feature vectors.

    Returns:
        A ``(label_ids, tokens, new_features)`` tuple with one entry per token
        position of the padded/truncated encoding: integer class ids, token
        strings, and feature vectors.
    """
    encoding = tokenizer(text, max_length=max_len, add_special_tokens=True,
                         padding='max_length', truncation=True, return_tensors='pt')
    word_ids = encoding.word_ids()
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

    # Placeholder feature vector (25 zeros) for positions without a word.
    no_features = [0] * 25

    previous_word_idx = None
    new_labels = []
    new_features = []
    for word_idx in word_ids:
        label, feature = 'none', no_features
        starts_new_word = word_idx != previous_word_idx
        if word_idx is not None and (starts_new_word or label_all_tokens):
            try:
                # Both lookups happen before anything is appended, so a failed
                # feature lookup can no longer leave an extra label behind and
                # shift the two lists out of alignment.
                label, feature = labels[word_idx], features[word_idx]
            except IndexError:
                # The tokenizer found more words than were annotated; keep the
                # placeholder for this position.
                pass
        new_labels.append(label)
        new_features.append(feature)
        previous_word_idx = word_idx

    label_ids = [label_to_ids[label] for label in new_labels]

    return label_ids, tokens, new_features

In [32]:
from tqdm import tqdm
import numpy as np

class MyDataset(Dataset):
    """Sentence-level dataset pairing token ids with aligned tags and features.

    Construct it with the raw sentences, per-word features and per-word tags,
    then call :meth:`encode` once before indexing or wrapping it in a
    ``DataLoader``.
    """

    def __init__(self, texts, old_features, tags):
        """Store the raw inputs; no tokenization happens here.

        Args:
            texts: List of sentence strings.
            old_features: Per-sentence lists of per-word feature vectors.
            tags: Per-sentence lists of per-word tag strings.
        """
        self.texts = texts
        self.tags = tags
        self.old_features = old_features

        # Filled by encode(): one entry per sentence, aligned to token positions.
        self.labels = []
        self.tokens = []
        self.features = []

        self.inputs = None
        self.input_ids = None
        self.attention_masks = None

    def encode(self):
        """Tokenize all sentences and align tags/features to token positions.

        Results are collected in local lists and assigned at the end, so the
        method can be called again without appending duplicates or hitting the
        ndarray that ``self.features`` becomes after the first call.
        """
        labels, tokens, features = [], [], []
        for i in tqdm(range(len(self.texts))):
            label_ids, sentence_tokens, sentence_features = align_label(
                self.texts[i], self.tags[i], self.old_features[i])
            labels.append(label_ids)
            tokens.append(sentence_tokens)
            features.append(sentence_features)

        self.labels = labels
        self.tokens = tokens
        self.features = np.array(features, float)
        self.inputs = tokenizer(self.texts, max_length=max_len, add_special_tokens=True,
                                padding='max_length', truncation=True, return_tensors='pt')
        self.input_ids = self.inputs['input_ids']
        self.attention_masks = self.inputs['attention_mask']

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, tokens, features, labels) for one sentence."""
        return (
            self.input_ids[idx, :],
            self.attention_masks[idx, :],
            self.tokens[idx],
            torch.tensor(self.features[idx], dtype=torch.float32),
            torch.tensor(self.labels[idx]),
        )

    def __len__(self):
        """Number of sentences; valid before encode() has run as well."""
        return len(self.texts)

In [33]:
# Build the training dataset, then tokenize it and align tags/features to
# token positions (encode() must run before the dataset is indexed).
train_dataset = MyDataset(train_sens, train_features, train_tags)
train_dataset.encode()

100%|██████████| 24210/24210 [00:05<00:00, 4603.04it/s]


In [35]:
# Build the test dataset, then tokenize it and align tags/features to
# token positions (encode() must run before the dataset is indexed).
test_dataset = MyDataset(test_sens, test_features, test_tags)
test_dataset.encode()

100%|██████████| 3027/3027 [00:00<00:00, 4338.21it/s]


In [36]:
# Shuffle only the training batches. The metrics computed on the test set are
# sums over all sentences, so shuffling it adds nothing and only makes the
# batch order non-reproducible.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=128)

#### construct model

In [42]:
import torch.nn as nn
import torch
from transformers import T5ForConditionalGeneration

# Define the model architecture and load the weights


class T5NerModel(nn.Module):
    """Token classifier over T5 encoder states joined with per-token features.

    The pre-trained checkpoint named by the module-level ``weight`` is loaded
    as ``T5ForConditionalGeneration`` (keeping the same attribute name and
    state-dict layout), but only its encoder stack is run: the per-token
    encoder hidden states are what the classifier needs. The decoder and LM
    head are loaded but not used in ``forward``.
    """

    def __init__(self, num_labels, num_features=25):
        """
        Args:
            num_labels: Number of output classes per token.
            num_features: Width of the extra per-token feature vector that is
                concatenated to the encoder state (default 25).
        """
        super().__init__()

        self.T5 = T5ForConditionalGeneration.from_pretrained(weight)
        self.dropout = nn.Dropout(0.1)
        # Read the hidden width from the checkpoint config rather than
        # hard-coding 768, so larger checkpoints such as 't5-large' also work.
        hidden_size = self.T5.config.d_model
        self.classifier = nn.Linear(hidden_size + num_features, num_labels)

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Return per-token class logits.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Attention mask tensor matching ``input_ids``.
            extra_features: Per-token feature tensor whose last dimension is
                ``num_features``; concatenated to the encoder states.
            labels: Accepted for call-site compatibility and ignored. Passing
                tag ids to the seq2seq model would make its first output the
                language-modelling loss instead of hidden states.

        Returns:
            Logits with ``num_labels`` entries per token position.
        """
        encoder_outputs = self.T5.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = self.dropout(encoder_outputs.last_hidden_state)

        # Adding cognitive signals here: append them to each token's state.
        combined = torch.cat((hidden_states, extra_features), dim=-1)

        return self.classifier(combined)

#### evaluation methods

In [37]:
def TagConvert(raw_tags, words_set, poss=None):
    """Decode per-token tag ids into keyphrase strings, one list per sequence.

    Tag ids are interpreted as: 0 = skip, 1 = start a phrase, 2 = continue the
    phrase, 3 = finish the phrase and emit it, 4 = emit the token on its own.
    Any other id is ignored.

    Args:
        raw_tags: 2-D array or tensor of tag ids exposing ``.shape`` and
            indexed as ``raw_tags[sequence][position]``.
        words_set: Tokens indexed as ``words_set[position][sequence]``.
        poss: Optional per-sequence collections of positions to skip.

    Returns:
        A list with one list of keyphrase strings per sequence.
    """
    true_tags = []
    for i in range(raw_tags.shape[0]):
        # A set gives O(1) membership tests instead of scanning a list per token.
        skipped = frozenset(poss[i]) if poss is not None else frozenset()
        kw_list = []
        phrase = ""
        for j in range(len(raw_tags[i])):
            tag = int(raw_tags[i][j])
            if tag == 0 or j in skipped:
                continue
            if tag == 4:
                kw_list.append(str(words_set[j][i]))
            elif tag == 1:
                phrase += str(words_set[j][i])
            elif tag in (2, 3):
                phrase += " " + str(words_set[j][i])
                if tag == 3:
                    kw_list.append(phrase)
                    phrase = ""

        true_tags.append(kw_list)
    return true_tags

In [38]:
def evaluate(predict_data, target_data, topk=3):
    """Compute micro-averaged precision, recall and F1 over the top-k predictions.

    Args:
        predict_data: One prediction per sample. Each prediction is either a
            sequence of keyphrases already in rank order, or a dict mapping
            keyphrase -> score (ranked by descending score, ties broken
            alphabetically).
        target_data: Gold keyphrases per sample, index-aligned with
            ``predict_data``.
        topk: Number of top-ranked predictions kept per sample.

    Returns:
        ``(p, r, f1)`` as percentages rounded to two decimals; a metric whose
        denominator is zero is reported as 0.
    """
    true_count, pred_count, gold_count = 0.0, 0.0, 0.0
    for index, words in enumerate(predict_data):
        y_true = target_data[index]

        # Decide per sample, not on the container: the previous check tested
        # the type of `predict_data` itself and left y_pred as None for
        # anything that was not exactly a list.
        if isinstance(words, dict):
            ranked = sorted(words.items(), key=lambda item: (-item[1], item[0]))
            y_pred = [phrase for phrase, _ in ranked]
        else:
            y_pred = list(words)

        y_pred = y_pred[:topk]
        true_count += len(set(y_pred) & set(y_true))
        pred_count += len(y_pred)
        gold_count += len(y_true)

    p = true_count / pred_count if pred_count != 0 else 0
    r = true_count / gold_count if gold_count != 0 else 0
    f1 = (2 * r * p) / (r + p) if (r + p) != 0 else 0

    p = round(p * 100, 2)
    r = round(r * 100, 2)
    f1 = round(f1 * 100, 2)

    return p, r, f1

In [39]:
import numpy as np

def calculate_f1(y_pred, y_true):
    """Flatten predicted and gold tag tensors and drop positions whose gold tag is 0.

    Despite its name this helper does not compute F1 itself; it only prepares
    the two label arrays that the caller passes on to a scoring function.

    Args:
        y_pred: Tensor of predicted tag ids.
        y_true: Tensor of gold tag ids with the same number of elements.

    Returns:
        ``(y_pred, y_true)`` as 1-D numpy arrays restricted to the positions
        where the gold tag is non-zero.
    """
    # reshape() also accepts non-contiguous tensors, where view(-1) raises.
    y_true = y_true.reshape(-1).detach().cpu().numpy()
    y_pred = y_pred.reshape(-1).detach().cpu().numpy()

    keep = y_true != 0

    return y_pred[keep], y_true[keep]

#### start training

In [43]:
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, AdamW

# One output class per entry of the tag vocabulary, so the classifier size
# follows `label_to_ids` instead of a separately maintained constant.
model = T5NerModel(num_labels=len(label_to_ids))
model = model.to(device)

optim = AdamW(model.parameters(), lr=5e-5, weight_decay=1e-2)
# ignore_index=0 zeroes the loss at positions labelled with class 0;
# reduction='none' returns per-position losses that the training loop averages.
loss_fn = CrossEntropyLoss(reduction='none', ignore_index=0)
loss_fn = loss_fn.to(device)

In [None]:
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import f1_score


import os

epochs = 5
best_f1 = 0.0
# Create the checkpoint directory up front so the first save cannot fail
# after a full epoch of training.
os.makedirs('./pretrain_pt', exist_ok=True)

for epoch in tqdm(range(epochs)):
    # ---- training pass ----
    loss_value = 0.0
    model.train()
    label_true, label_pred = [], []
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids, attention_masks, _, features, tags = batch
        pred_tags = model(input_ids.to(device), attention_masks.to(device), features.to(device), tags.to(device))

        # CrossEntropyLoss wants the class dimension second, hence the permute.
        loss = loss_fn(pred_tags.permute(0, 2, 1), tags.to(device))
        loss = loss.mean()
        loss.backward()
        optim.step()

        # argmax over logits equals argmax over their softmax.
        pred_tags = torch.argmax(pred_tags, dim=-1)

        y_pred, y_true = calculate_f1(pred_tags, tags)
        label_true.extend(y_true)
        label_pred.extend(y_pred)

        loss_value += loss.item()

    label_train_f1 = f1_score(label_true, label_pred, average='macro')

    # ---- evaluation pass ----
    # model.eval() already turns dropout off. Setting `module.p = 0` here, as
    # before, would also leave dropout disabled for every later training epoch.
    model.eval()
    kw_true, kw_pred = [], []
    label_true, label_pred = [], []
    for batch in test_dataloader:
        input_ids, attention_masks, tokens, features, tags = batch
        with torch.no_grad():
            pred_tags = model(input_ids.to(device), attention_masks.to(device), features.to(device), tags.to(device))
            pred_tags = torch.argmax(pred_tags, dim=-1).cpu()

        y_pred, y_true = calculate_f1(pred_tags, tags)
        label_true.extend(y_true)
        label_pred.extend(y_pred)

        # more balance evaluate: predictions at positions whose gold tag is 0
        # are excluded when decoding the predicted keyphrases.
        poss = [torch.nonzero(row == 0).flatten().tolist() for row in tags]

        kw_true.extend(TagConvert(tags, tokens))
        kw_pred.extend(TagConvert(pred_tags, tokens, poss))

    label_f1 = f1_score(label_true, label_pred, average='macro')
    # evaluate(predict_data, target_data): predictions go first, so that top-k
    # truncation and precision apply to the predictions rather than the gold set.
    P, R, F1 = evaluate(kw_pred, kw_true)

    # Keep only the checkpoint with the best keyphrase-level F1 so far.
    if F1 > best_f1:
        best_f1 = F1
        torch.save(model.state_dict(), './pretrain_pt/T5.pt')

    print("epoch{}:  loss:{:.2f}   train_f1_value:{:.2f}  test_f1_value:{:.2f}  kw_f1_value:{:.2f}".format(
        epoch+1, loss_value / len(train_dataloader), label_train_f1, label_f1, F1
    ))

#### inference

In [25]:
# Rebuild the architecture and restore the best checkpoint saved during
# training. map_location lets a checkpoint written on a GPU load on whatever
# `device` is available now (including CPU-only machines).
model = T5NerModel(num_labels=len(label_to_ids))
model.load_state_dict(torch.load('./pretrain_pt/T5.pt', map_location=device))
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import f1_score

# model.eval() already turns dropout off; no per-module `p = 0` patching needed.
model.eval()
kw_true, kw_pred = [], []
label_true, label_pred = [], []
for batch in test_dataloader:
    input_ids, attention_masks, tokens, features, tags = batch
    with torch.no_grad():
        # The model's forward signature takes the tags as a fourth argument,
        # exactly as the evaluation pass of the training loop calls it.
        pred_tags = model(input_ids.to(device), attention_masks.to(device), features.to(device), tags.to(device))
        # argmax over logits equals argmax over their softmax.
        pred_tags = torch.argmax(pred_tags, dim=-1).cpu()

    y_pred, y_true = calculate_f1(pred_tags, tags)
    label_true.extend(y_true)
    label_pred.extend(y_pred)

    # more balance evaluate: predictions at positions whose gold tag is 0
    # are excluded when decoding the predicted keyphrases.
    poss = [torch.nonzero(row == 0).flatten().tolist() for row in tags]

    kw_true.extend(TagConvert(tags, tokens))
    kw_pred.extend(TagConvert(pred_tags, tokens, poss))

label_f1 = f1_score(label_true, label_pred, average='macro')
# evaluate(predict_data, target_data): predictions go first, so that top-k
# truncation and precision apply to the predictions rather than the gold set.
P, R, F1 = evaluate(kw_pred, kw_true)

In [None]:
# Print the three values returned by evaluate(), one per line.
for metric in (P, R, F1):
    print(metric)