In [1]:
import json
from datasets import load_dataset, load_metric, load_from_disk
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AlbertTokenizer, AlbertModel
from transformers import Adafactor
import torch
from torch import nn
import torch.nn.functional as F
import collections
from typing import List
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
class DataClass:

  def __init__(self, data_dir):
    self.data_dir = data_dir

  def data_csv(self, f, output):

    answers = []
    rewrites = []
    passages = []

    filepath = self.data_dir+f

    with open(filepath) as fl:
      data = json.load(fl)
      
      for d in data:
        answers.append(d['answer'])
        rewrites.append(d['rewrite'])
        passages.append(d['passage'])

      data = {'answer':answers, 'passage':passages, 'rewrite':rewrites}
      df = pd.DataFrame(data)
      df.to_csv(output, index=False)


data = DataClass('/home/ujan/Documents/conv-qa/data/interim/')

data.data_csv('qrecc_train.json', 'train.csv')
data.data_csv('qrecc_test.json', 'test.csv')

qrecc = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})

Using custom data configuration default-c532fae75c1b277d


Downloading and preparing dataset csv/default to /home/ujan/.cache/huggingface/datasets/csv/default-c532fae75c1b277d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/ujan/.cache/huggingface/datasets/csv/default-c532fae75c1b277d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
#@title models and hyperparameters

max_length= 384
batch_size = 16
pretrained_model = 't5-base'

t5_tokenizer = T5Tokenizer.from_pretrained(pretrained_model)

t5_model = T5ForConditionalGeneration.from_pretrained(pretrained_model)

In [None]:
def tokenize_dataset(batch):

  passages = t5_tokenizer(batch['rewrite'], batch['passage'], padding='max_length', truncation='only_second', max_length=max_length, add_special_tokens=True)
  answers = t5_tokenizer(batch['answer'], padding='max_length', truncation=True, max_length=max_length, add_special_tokens=True)

  batch['psg_input_ids'] = passages.input_ids
  batch['ans_input_ids']  = answers.input_ids
  batch['psg_attention_mask'] = passages.attention_mask

  return batch


# handle examples with no answers
def no_ans(x):
  if isinstance(x['answer'], str): return x
  x['answer'] = 'no_ans'
  return x


# removing examples with no context
qrecc = qrecc.filter(lambda x: isinstance(x['passage'], str))

# removing examples with passage length > 384
#qrecc = qrecc.filter(lambda x: len(e_tokenizer(x['passage']).input_ids) <= max_length)

# no answers
qrecc = qrecc.map(no_ans)

# tokenizing
dataset = qrecc.map(
    tokenize_dataset, 
    batch_size = batch_size,
    batched=True,
    remove_columns=['passage', 'answer', 'rewrite']
)


dataset.set_format(
    type='torch', columns=['psg_input_ids', 'ans_input_ids', 'psg_attention_mask'],)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/24037 [00:00<?, ?ex/s]

  0%|          | 0/6478 [00:00<?, ?ex/s]

  0%|          | 0/1503 [00:00<?, ?ba/s]

  0%|          | 0/405 [00:00<?, ?ba/s]

In [None]:
train_loader = torch.utils.data.DataLoader(dataset['train'], batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(dataset['test'], batch_size=batch_size)

In [4]:
def valid_loss():
  
  val_loss = 0
  idx = 0

  for batch in test_loader:
    # rewrite+passge inputs for encoder
    psg_input = batch['psg_input_ids'].to(device)
    psg_attention = batch['psg_attention_mask'].to(device)

    # answer inputs for rc
    ans_input = batch['ans_input_ids'] 
    ans_input[ans_input == t5_tokenizer.pad_token_id] = -100 # tokens with indices set to -100 are ignored (masked)
    ans_input = ans_input.to(device)

    # rc loss
    loss = t5_model(input_ids=psg_input, attention_mask=psg_attention, labels=ans_input).loss

    val_loss += loss.item()

    del psg_input, psg_attention, ans_input, loss

    idx += 1

  return val_loss/idx

In [7]:
num_epochs = 6

device = torch.device('cuda')
t5_model.to(device)

#t5_model.load_state_dict(torch.load('/home/ujan/Documents/conv-qa/models/finetuned_weights/rc_gen5.pth'))

t5_model.train()


optim = optimizer = Adafactor(
    t5_model.parameters(),
    lr=1e-5,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False
)

for epoch in range(num_epochs):
  
  epoch_loss = 0

  for batch in train_loader:

    # rewrite+passge inputs for encoder
    psg_input = batch['psg_input_ids'].to(device)
    psg_attention = batch['psg_attention_mask'].to(device)

    # answer inputs for rc
    ans_input = batch['ans_input_ids'] 
    ans_input[ans_input == t5_tokenizer.pad_token_id] = -100
    ans_input = ans_input.to(device)

    # rc loss
    loss = t5_model(input_ids=psg_input, attention_mask=psg_attention, labels=ans_input).loss

    epoch_loss += loss.item()

    loss.backward()
    optim.step()
    optim.zero_grad()
    

  print('Train loss after epoch {} : {}'.format(epoch+1, epoch_loss / len(train_loader)))
  t5_model.eval()
  print('Valid loss after epoch {} : {}'.format(epoch+1, valid_loss()))
  
  print('\n')
  t5_model.train()
  torch.save(t5_model.state_dict(), '/home/ujan/Desktop/rc_gen'+str(epoch+1)+'.pth')

Train loss after epoch 1 : 0.8047845386419785
Valid loss after epoch 1 : 0.6113779529377267


Train loss after epoch 2 : 0.6315083886116882
Valid loss after epoch 2 : 0.5707246081696616


Train loss after epoch 3 : 0.5835389507784497
Valid loss after epoch 3 : 0.5542257139344274


Train loss after epoch 4 : 0.5532140438051439
Valid loss after epoch 4 : 0.5449160399260344


Train loss after epoch 5 : 0.5301274461640728
Valid loss after epoch 5 : 0.5391939025602223


Train loss after epoch 6 : 0.5120165204593363
Valid loss after epoch 6 : 0.5360729444174119




Train loss after epoch 5 : 0.5301274461640728
Valid loss after epoch 5 : 0.5391939025602223


Train loss after epoch 6 : 0.5120165204593363
Valid loss after epoch 6 : 0.5360729444174119

batch size 16


In [None]:
device = torch.device('cuda')
t5_model.to(device)

t5_model.load_state_dict(torch.load('/storage/qrecc/models/e2e/rc3.pth'))
t5_model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [None]:
def compute_f1_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> float:
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())

  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)

  if num_same == 0:
    return 0

  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1

idx = 0
f1 = 0

for batch in test_loader:

  # rewrite+passge inputs for encoder
  psg_input = batch['psg_input_ids'].to(device)
  psg_attention = batch['psg_attention_mask'].to(device)
  ans_input = batch['ans_input_ids'] 

  outputs = t5_model.generate(input_ids=psg_input, attention_mask=psg_attention)
  output_text = t5_tokenizer.batch_decode(outputs, skip_special_tokens=True)
  true_text = t5_tokenizer.batch_decode(ans_input, skip_special_tokens=True)

  for i in range(len(output_text)):
    score = compute_f1_from_tokens(true_text, output_text)
    #if score < 0: score = 0
    f1 += score
    idx += 1

print(f1 / idx)

0.3013275702377277


0.3173819079962952

Compute rc loss for paraphrased rewrites

In [5]:
qrecc = qrecc.filter(lambda x: isinstance(x['passage'], str))

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [9]:
para_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")  
para_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

In [13]:
device = torch.device('cuda')
t5_model.to(device)
para_model.to(device)

t5_model.load_state_dict(torch.load('/home/ujan/Documents/conv-qa/models/finetuned_weights/new_rc_gen5.pth'))

<All keys matched successfully>

Augment dataset

In [14]:
def update(example):
    
    passage = t5_tokenizer(example['rewrite'], example['passage'], padding=True, truncation='only_second',
                           max_length=max_length, add_special_tokens=True, return_tensors="pt")
    answer = t5_tokenizer(example['answer'], padding=True, truncation='only_second',
                          max_length=max_length, add_special_tokens=True, return_tensors="pt")
    
    psg_input = passage.input_ids.to(device)
    psg_attention = passage.attention_mask.to(device)
    ans_input = answer.input_ids
    ans_input[ans_input == t5_tokenizer.pad_token_id] = -100
    ans_input = ans_input.to(device)
    
    org_loss = t5_model(input_ids=psg_input, attention_mask=psg_attention, labels=ans_input).loss.item()
    
    para_loss = {}
    losses = []
    
    sentence = example['rewrite']

    text =  "paraphrase: " + sentence + " </s>"

    encoding = para_tokenizer.encode_plus(text, padding=True, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)
    
    outputs = para_model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=256,
    do_sample=True,
    top_k=120, # 120
    top_p=0.95,
    early_stopping=True,
    num_return_sequences=10)

    for output in outputs:
        line = para_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        passage = t5_tokenizer(line, example['passage'], padding=True, truncation='only_second',
                           max_length=max_length, add_special_tokens=True, return_tensors="pt")
        
        psg_input = passage.input_ids.to(device)
        psg_attention = passage.attention_mask.to(device)
    
        loss = t5_model(input_ids=psg_input, attention_mask=psg_attention, labels=ans_input).loss.item()
        para_loss[line] = loss
        losses.append(loss)
        
    if any(p < org_loss for p in losses):
        example['rewrite'] = min(para_loss, key=para_loss.get)
    
    return example

In [15]:
qrecc['train'] = qrecc['train'].map(update)

  0%|          | 0/24037 [00:00<?, ?ex/s]

In [16]:
qrecc.save_to_disk("/home/ujan/Desktop/qrecc")

Flattening the indices:   0%|          | 0/7 [00:00<?, ?ba/s]

In [16]:
qrecc = load_from_disk("/home/ujan/Desktop/qrecc")

In [17]:
def tokenize_dataset(batch):

  passages = t5_tokenizer(batch['rewrite'], batch['passage'], padding='max_length', truncation='only_second', max_length=max_length, add_special_tokens=True)
  answers = t5_tokenizer(batch['answer'], padding='max_length', truncation=True, max_length=max_length, add_special_tokens=True)

  batch['psg_input_ids'] = passages.input_ids
  batch['ans_input_ids']  = answers.input_ids
  batch['psg_attention_mask'] = passages.attention_mask

  return batch


# handle examples with no answers
def no_ans(x):
  if isinstance(x['answer'], str): return x
  x['answer'] = 'no_ans'
  return x


# removing examples with no context
qrecc = qrecc.filter(lambda x: isinstance(x['passage'], str))

# removing examples with passage length > 384
#qrecc = qrecc.filter(lambda x: len(e_tokenizer(x['passage']).input_ids) <= max_length)

# no answers
qrecc = qrecc.map(no_ans)

# tokenizing
dataset = qrecc.map(
    tokenize_dataset, 
    batch_size = batch_size,
    batched=True,
    remove_columns=['passage', 'answer', 'rewrite']
)


dataset.set_format(
    type='torch', columns=['psg_input_ids', 'ans_input_ids', 'psg_attention_mask'],)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/24037 [00:00<?, ?ex/s]

  0%|          | 0/6478 [00:00<?, ?ex/s]

  0%|          | 0/1503 [00:00<?, ?ba/s]

  0%|          | 0/405 [00:00<?, ?ba/s]

In [18]:
train_loader = torch.utils.data.DataLoader(dataset['train'], batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(dataset['test'], batch_size=batch_size)

train para e2e with rc

In [19]:
num_epochs = 6

device = torch.device('cuda')
t5_tokenizer = T5Tokenizer.from_pretrained(pretrained_model)

t5_model = T5ForConditionalGeneration.from_pretrained(pretrained_model)
t5_model.to(device)

#t5_model.load_state_dict(torch.load('/home/ujan/Documents/conv-qa/models/finetuned_weights/rc_gen5.pth'))

t5_model.train()


optim = optimizer = Adafactor(
    t5_model.parameters(),
    lr=1e-5,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False
)

for epoch in range(num_epochs):
  
  epoch_loss = 0

  for batch in train_loader:

    # rewrite+passge inputs for encoder
    psg_input = batch['psg_input_ids'].to(device)
    psg_attention = batch['psg_attention_mask'].to(device)

    # answer inputs for rc
    ans_input = batch['ans_input_ids'] 
    ans_input[ans_input == t5_tokenizer.pad_token_id] = -100
    ans_input = ans_input.to(device)

    # rc loss
    loss = t5_model(input_ids=psg_input, attention_mask=psg_attention, labels=ans_input).loss

    epoch_loss += loss.item()

    loss.backward()
    optim.step()
    optim.zero_grad()
    

  print('Train loss after epoch {} : {}'.format(epoch+1, epoch_loss / len(train_loader)))
  t5_model.eval()
  print('Valid loss after epoch {} : {}'.format(epoch+1, valid_loss()))
  
  print('\n')
  t5_model.train()
  torch.save(t5_model.state_dict(), '/home/ujan/Documents/rc_gen'+str(epoch+1)+'.pth')

Train loss after epoch 1 : 0.7852077122854536
Valid loss after epoch 1 : 0.6146532967870618


Train loss after epoch 2 : 0.6046948571722268
Valid loss after epoch 2 : 0.581035594255836




KeyboardInterrupt: 

Train qr with new dataset