In [1]:
import numpy as np
import evaluate

2023-06-21 00:16:43.317698: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
vk = 'dev_docs/'
ms = '../msmarco_data/'

import pandas as pd
import transformers
import evaluate

vk_qrels = pd.read_csv(vk + 'qrels.tsv', names=['id', 'query_id', 'doc_id'],  sep='\t')
vk_docs = pd.read_csv(vk + 'docs.tsv', names=['id', 'doc_id', 'data'],  sep='\t')
vk_queries = pd.read_csv(vk + 'queries.tsv', names=['id', 'query_id', 'data'],  sep='\t')

# ms_qrels = pd.read_csv(ms + 'qrels.tsv', names=['id', 'query_id', 'doc_id'],  sep='\t')
# ms_docs = pd.read_csv(ms + 'docs.tsv', names=['id', 'doc_id', 'data'],  sep='\t')
# ms_queries = pd.read_csv(ms + 'queries.tsv', names=['id', 'query_id', 'data'],  sep='\t')

In [5]:
def create_joined_file(df_docs, df_qrels, df_queries, path_processed_joined=None):
    joined_df = df_qrels.merge(df_queries, on='query_id').merge(df_docs, on='doc_id', how='left')[['query_id', 'data_x', 'doc_id', 'data_y']]
    joined_df.rename(columns={'data_x':'query_data', 'data_y':'doc_data'}, inplace=True)
    if path_processed_joined:
        joined_df.to_csv(path_processed_joined, sep='\t', index=None, header=None)
    return joined_df

vk_joined = create_joined_file(vk_docs, vk_qrels, vk_queries)

In [7]:
len(vk_qrels), len(vk_docs) #, len(ms_qrels), len(ms_docs), len(vk_joined)

(5200, 5175)

### Train-Test split

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

MODEL_NAME = 'cointegrated/rut5-base-absum'
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [10]:
dataset = vk_joined[['doc_data', 'query_data']]

dataset = dataset.sample(frac=1).reset_index(drop=True)
train_size = int(len(dataset) * 0.8)
train_dataset = dataset[:train_size]
test_dataset = dataset[train_size:]

In [11]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {}
        item['input_ids'] = self.encodings[idx]
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels) 

### Rouge

In [12]:
metric = evaluate.load('rouge')

Downloading builder script: 0.00B [00:00, ?B/s]

### Train

In [17]:
model.cuda()
max_input_length = 400
max_target_input = 400

def preprocess_function(dataset):
    docs = [str(i) for i in dataset['doc_data'].values]
    lbls = [str(i) for i in dataset['query_data'].values]
    inputs = tokenizer(docs, max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(lbls, max_length=max_target_input, truncation=True)
    inputs['labels'] = labels['input_ids']
    return inputs

In [18]:
tokenized_train_dataset = preprocess_function(train_dataset)
tokenized_test_dataset = preprocess_function(test_dataset)

In [19]:
tokenized_test_dataset.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [20]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [21]:
batch_size = 2
model_name = MODEL_NAME.split('/')[-1]

training_set = Dataset(tokenized_train_dataset['input_ids'], tokenized_train_dataset['labels'])
validation_set = Dataset(tokenized_test_dataset['input_ids'], tokenized_test_dataset['labels'])


args = Seq2SeqTrainingArguments(
    f'{model_name}-finetuned-rut5',
    evaluation_strategy='epoch', 
    learning_rate=2e-5, 
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1, 
    num_train_epochs=2, 
    predict_with_generate=True, 
    fp16=False,
    push_to_hub=False    
)

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [23]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels!= - 100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: v * 100 for k, v in result.items()}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result['gen_len'] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [24]:
nltk.download('punkt')

torch.cuda.empty_cache()

trainer = Seq2SeqTrainer(
    model, 
    args, 
    train_dataset=training_set,
    eval_dataset=validation_set,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()


[nltk_data] Downloading package punkt to /home/tatiana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.8509,2.48146,10.2668,2.0027,10.0711,10.0346,10.7212
2,2.6836,2.450127,10.2989,1.7137,10.1021,10.0461,10.6731


TrainOutput(global_step=4160, training_loss=2.843358978858361, metrics={'train_runtime': 1236.4783, 'train_samples_per_second': 6.729, 'train_steps_per_second': 3.364, 'total_flos': 4393040504924160.0, 'train_loss': 2.843358978858361, 'epoch': 2.0})

In [26]:
prediction = trainer.predict(test_dataset=validation_set)

In [36]:
prediction.predictions.shape

(1040, 20)

In [37]:
preds, labels = prediction.predictions, prediction.label_ids
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.where(labels!= - 100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
decoded_labels = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_labels]

In [2]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, T5Tokenizer, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained('rut5-base-absum-finetuned-rut5/checkpoint-4000')
tokenizer = T5Tokenizer.from_pretrained('rut5-base-absum-finetuned-rut5/checkpoint-4000')

### Prediction

In [95]:
tokenized_test_dataset = preprocess_function(test_dataset)



In [101]:
validation_set = Dataset(tokenized_test_dataset['input_ids'], tokenized_test_dataset['labels'])

### Pytorch version of training loop

In [103]:
del model
del trainer
torch.cuda.empty_cache()

In [136]:
model = AutoModelForSeq2SeqLM.from_pretrained('rut5-base-absum-finetuned-rut5/checkpoint-4000')
tokenizer = T5Tokenizer.from_pretrained('rut5-base-absum-finetuned-rut5/checkpoint-4000')

In [145]:
validation_set

<__main__.Dataset at 0x7f25a6878580>

In [156]:
[b['input_ids'] for b in batch]

[[3256,
  259,
  735,
  6865,
  7994,
  259,
  279,
  20619,
  10469,
  661,
  259,
  28860,
  27656,
  259,
  9078,
  1074,
  11846,
  3187,
  4686,
  259,
  279,
  8286,
  446,
  15827,
  425,
  15701,
  8286,
  259,
  279,
  259,
  27918,
  8096,
  12447,
  308,
  446,
  15827,
  17067,
  657,
  5097,
  8096,
  259,
  18806,
  308,
  13951,
  661,
  2156,
  3689,
  4251,
  657,
  1229,
  1987,
  9852,
  259,
  7451,
  833,
  617,
  807,
  814,
  3030,
  9671,
  1296,
  3543,
  259,
  5925,
  259,
  396,
  10893,
  2058,
  5530,
  748,
  259,
  735,
  4763,
  259,
  3624,
  3057,
  7784,
  388,
  3691,
  5097,
  259,
  18806,
  308,
  315,
  5583,
  324,
  259,
  279,
  22833,
  22000,
  1229,
  543,
  259,
  7258,
  1004,
  259,
  18806,
  315,
  259,
  1],
 [1132,
  259,
  18140,
  259,
  17029,
  685,
  6533,
  6687,
  5229,
  433,
  446,
  6875,
  259,
  12887,
  259,
  735,
  11782,
  279,
  259,
  17029,
  19570,
  14859,
  6620,
  1296,
  259,
  19044,
  653,
  410,
  404,
  2

In [154]:
eval_dataloader = DataLoader(validation_set, batch_size=2, collate_fn=lambda x: x )
for batch in eval_dataloader:
    #batch = [{k: v for k, v in b.items()} for b in batch]
    break
    outputs = model(**batch)
    break

In [112]:
tokenized_train_dataset = preprocess_function(train_dataset)
tokenized_test_dataset = preprocess_function(test_dataset)
print(tokenized_train_dataset.keys())

training_set = Dataset(tokenized_train_dataset['input_ids'], tokenized_train_dataset['labels'])
validation_set = Dataset(tokenized_test_dataset['input_ids'], tokenized_test_dataset['labels'])

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [117]:
training_set

<__main__.Dataset at 0x7f25967df070>

In [125]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(training_set, shuffle=True, batch_size=2, collate_fn=lambda x: x )
eval_dataloader = DataLoader(validation_set, batch_size=2, collate_fn=lambda x: x )

In [126]:
from torch.optim import AdamW
from transformers import get_scheduler
import torch

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(30000, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(30000, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [130]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = [{k: v for k, v in b.items()} for b in batch]
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2080 [00:00<?, ?it/s]

### RuT5 Pytorch

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split