In [None]:
!pip install datasets
!pip install transformers
# !pip install sentencepiece

Collecting datasets
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
[?25l[K     |█                               | 10 kB 28.4 MB/s eta 0:00:01[K     |██                              | 20 kB 34.7 MB/s eta 0:00:01[K     |██▉                             | 30 kB 13.9 MB/s eta 0:00:01[K     |███▉                            | 40 kB 7.6 MB/s eta 0:00:01[K     |████▉                           | 51 kB 7.9 MB/s eta 0:00:01[K     |█████▊                          | 61 kB 9.3 MB/s eta 0:00:01[K     |██████▊                         | 71 kB 9.4 MB/s eta 0:00:01[K     |███████▋                        | 81 kB 8.8 MB/s eta 0:00:01[K     |████████▋                       | 92 kB 9.8 MB/s eta 0:00:01[K     |█████████▋                      | 102 kB 8.7 MB/s eta 0:00:01[K     |██████████▌                     | 112 kB 8.7 MB/s eta 0:00:01[K     |███████████▌                    | 122 kB 8.7 MB/s eta 0:00:01[K     |████████████▌                   | 133 kB 8.7 MB/s eta 0:00:01

In [None]:
from datasets import load_dataset
import torch
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
from transformers import Trainer, TrainingArguments, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5ForConditionalGeneration, T5TokenizerFast
import datasets
from datasets import Dataset
import random
from datetime import datetime
import requests
from urllib.parse import urlparse

In [None]:
def _netloc_and_path(url):
  """Split `url` into ``(netloc, path)``, dropping one trailing '/' from the path."""
  parsed = urlparse(url)
  path = parsed[2]
  # endswith() also copes with an empty path (e.g. 'https://example.com'),
  # where indexing path[-1] would raise IndexError.
  if path.endswith('/'):
    path = path[:-1]
  return parsed[1], path


def retrieval_eval(query, gold_urls, api_key='<your_key>'):
  """Score how often the fact-check claim search returns the gold URL.

  Every query is sent to the ``claims:search`` endpoint of
  factchecktools.googleapis.com with ``pageSize`` 50. A returned claim is a
  hit when the URL of its first claim review has the same netloc and the
  same path (ignoring one trailing '/') as the gold URL at the same position.
  Only the first hit per query is counted.

  Args:
    query: sequence of query strings. It is iterated positionally, so a
      pandas Series with a non-default index works as well as a list.
    gold_urls: sequence of gold URLs, aligned with `query`.
    api_key: value sent as the ``key`` request parameter.

  Returns:
    dict with 'retrieval_1', 'retrieval_5', 'retrieval_10', 'retrieval_20'
    and 'retrieval_50' (percentage of ``len(query)`` whose gold URL was found
    within that many top results) and 'mrr' (mean reciprocal rank over the
    queries that were actually evaluated; misses contribute 0). All values
    are 0 when `query` is empty.

  Note:
    If a request raises, a message is printed and the evaluation stops at
    that query: it and the remaining queries are not added to the MRR
    average, but they still count in the percentage denominators.
  """
  cutoffs = (1, 5, 10, 20)
  hits = {k: 0 for k in cutoffs}
  hits[50] = 0
  reciprocal_ranks = []

  for cand, gold in zip(query, gold_urls):
    gold_key = _netloc_and_path(gold)

    params = {
        'query': cand.strip(),
        'pageSize': '50',
        'key': api_key,
    }
    try:
      response_dict = requests.get(
          'https://factchecktools.googleapis.com/v1alpha1/claims:search',
          params=params).json()
    except Exception:
      print("Google API is not working")
      break

    reciprocal_rank = 0
    # `rank` is 0-based; a response without 'claims' is treated as a miss.
    for rank, claim in enumerate(response_dict.get('claims', [])):
      if _netloc_and_path(claim['claimReview'][0]['url']) == gold_key:
        reciprocal_rank = 1/(rank+1)
        for k in cutoffs:
          if rank < k:
            hits[k] += 1
        # Any match among the returned results counts towards retrieval_50.
        hits[50] += 1
        break
    reciprocal_ranks.append(reciprocal_rank)

  mrr = 0
  if len(reciprocal_ranks) > 0:
    mrr = sum(reciprocal_ranks)/len(reciprocal_ranks)

  total = len(query)

  def percentage(count):
    # Guard against an empty query list instead of dividing by zero.
    return (count*100)/total if total else 0.0

  return {
      'retrieval_1': percentage(hits[1]),
      'retrieval_5': percentage(hits[5]),
      'retrieval_10': percentage(hits[10]),
      'retrieval_20': percentage(hits[20]),
      'retrieval_50': percentage(hits[50]),
      'mrr': mrr,
      }

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# later `.to(device)` calls do not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Seed every RNG this notebook imports so runs are repeatable.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [None]:
# Preprocessed dataset variants; the training cell selects one by position.
filenames = [
    'summ_data_eng_preprocessed_hashtag_removed.csv',
    'summ_data_eng_preprocessed_mention_removed.csv',
]

In [None]:
def t5_summarizer(df, model, min_len, max_len):
  """Generate one summary per input text with `model`.

  Relies on the module-level `tokenizer` and `device`.

  Args:
    df: iterable of strings (despite the name, it is iterated item by item).
    model: object whose `generate` method produces the summary token ids.
    min_len: passed to `generate` as `min_length`.
    max_len: passed to `generate` as `max_length`.

  Returns:
    list of decoded summaries, in input order.
  """
  def summarize(text):
    # Trim surrounding whitespace and drop newlines before encoding.
    cleaned = text.strip().replace("\n","")
    generated = model.generate(
        tokenizer.encode(cleaned, return_tensors="pt").to(device),
        num_beams=6,
        no_repeat_ngram_size=2,
        min_length=min_len,
        max_length=max_len,
        early_stopping=True)
    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

  return [summarize(text) for text in df]

In [None]:
# Fine-tune T5 on (tweet -> reviewed claim) pairs for one cross-validation
# fold, write the fold's generated claims to disk, then score the saved
# predictions of every fold with the fact-check retrieval evaluation.
for i in range(0, 1):
  # Only the first entry of `filenames` is processed.
  file = filenames[i]
  summ_data = pd.read_csv(file, header=0, index_col=0)
  # index.csv supplies the row order applied before the folds are cut.
  index = list(pd.read_csv('index.csv', header=0, index_col=0)['index'])
  t5_summ_data = summ_data[['tweet', 'claim_reviewed']].copy()
  t5_summ_data.columns = ['source', 'target']
  t5_summ_data = t5_summ_data.reindex(index)

  kf = KFold(n_splits=5)
  ct = 0
  hf_model_name = 't5-large'
  # %load_ext tensorboard
  # %tensorboard --logdir output/

  for train_index, val_index in kf.split(t5_summ_data):
    # Only the fold with index 1 is trained in this run; all others are skipped.
    if ct!=1:
      ct = ct+1
      continue
    # `tokenizer` is deliberately a module-level name: t5_summarizer reads it.
    tokenizer = T5TokenizerFast.from_pretrained(hf_model_name)
    model = T5ForConditionalGeneration.from_pretrained(hf_model_name)

    def tokenize(batch):
      """Tokenize a batch of source/target texts, padded to the longest item."""
      tokenized_input = tokenizer(batch['source'], padding='longest')
      tokenized_label = tokenizer(batch['target'], padding='longest')

      # Replace padding in the labels with -100 so the loss ignores those
      # positions instead of training the model to predict pad tokens.
      pad_id = tokenizer.pad_token_id
      tokenized_input['labels'] = [
          [token if token != pad_id else -100 for token in label_ids]
          for label_ids in tokenized_label['input_ids']
      ]

      return tokenized_input

    # batch_size = len(dataset) tokenizes each split as a single batch, so
    # every example in a split is padded to the same length.
    train_dataset = Dataset.from_pandas(t5_summ_data.iloc[train_index])
    train_dataset = train_dataset.map(tokenize, batched=True, batch_size = len(train_dataset))
    val_dataset = Dataset.from_pandas(t5_summ_data.iloc[val_index])
    val_dataset = val_dataset.map(tokenize, batched=True, batch_size = len(val_dataset))


    train_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

    # Per-file, per-fold locations for the saved model, checkpoints and logs.
    model_dir = str(i)+'/model/'+str(ct)+"/"
    output_dir = str(i)+'/output/'+str(ct)+"/"
    now = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
    logging_dir = output_dir+"runs/"+now+"/"


    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the more vRAM used)
        prediction_loss_only=True, # If only the loss is needed and no other metrics, setting this to true will use less RAM
        learning_rate=0.00005,
        evaluation_strategy='steps', # Run evaluation every eval_steps
        save_steps=100, # How often to save a checkpoint
        save_total_limit=1, # Number of maximum checkpoints to save
        remove_unused_columns=True, # Removes useless columns from the dataset
        run_name='train', # Wandb run name
        logging_steps=100, # How often to log loss to wandb
        eval_steps=100, # How often to run evaluation on the val_set
        logging_first_step=False, # Whether to log also the very first training step to wandb
        load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
        metric_for_best_model="loss", # Use loss to evaluate best model.
        greater_is_better=False, # Best model is the one with the lowest loss, not highest.
        generation_max_length = 15,
        generation_num_beams = 6,
        predict_with_generate=True,
        report_to="tensorboard",
        logging_dir=logging_dir
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    trainer.save_model(model_dir)



    # Reload the saved model from disk and generate claims for the held-out fold.
    cur_model = T5ForConditionalGeneration.from_pretrained(model_dir).to(device)

    decoded_sources = val_dataset['source']

    decoded_preds = t5_summarizer(val_dataset['source'],cur_model, 5, 15)
    decoded_labels = val_dataset['target']

    output = pd.DataFrame({'ID': val_dataset['__index_level_0__'], 'Source Text': decoded_sources, 'Target Text': decoded_labels, 'Generated Text': decoded_preds})
    # NOTE: the file is CSV-formatted despite the .xlsx extension; it is read
    # back with pd.read_csv below, so the name is kept as is.
    output.to_csv(output_dir +"/predictions.xlsx")

    # Drop the references to the large objects before the next fold.
    del cur_model
    del decoded_preds
    del model
    del tokenizer

    ct = ct +1

  output_dir = str(i)+'/output/'
  prediction_file_paths = ['0/predictions.xlsx', '1/predictions.xlsx', '2/predictions.xlsx', '3/predictions.xlsx', '4/predictions.xlsx']
  for pred_path in prediction_file_paths:
    # Only one fold is trained per run, so predictions of other folds may not
    # exist yet; report and skip them instead of aborting the evaluation.
    try:
      preds = pd.read_csv(output_dir+pred_path, header=0, index_col=0)
    except FileNotFoundError:
      print("{} not found, skipping".format(output_dir+pred_path))
      continue
    # Look up the gold evidence URL of every prediction by its original row ID.
    gold_urls = [summ_data.loc[row_id]['evidence_url'] for row_id in preds['ID']]

    print("{} T5 FT Retrieval = {}".format(pred_path, retrieval_eval(preds['Generated Text'], gold_urls)))



For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: __index_level_0__, target, source. If __index_level_0__, target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 453
  Num Epochs = 8
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 912
  batch[k] = torch.tensor([f[k] for f in features])


Step,Training Loss,Validation Loss
100,4.8626,0.760083
200,0.7891,0.645172
300,0.6521,0.594348
400,0.5221,0.565447
500,0.474,0.550843
600,0.3982,0.536752
700,0.3739,0.533535
800,0.3341,0.52992
900,0.322,0.529947


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: __index_level_0__, target, source. If __index_level_0__, target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 114
  Batch size = 4
Saving model checkpoint to 0/output/1/checkpoint-100
Configuration saved in 0/output/1/checkpoint-100/config.json
Model weights saved in 0/output/1/checkpoint-100/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: __index_level_0__, target, source. If __index_level_0__, target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 114
  Batch size = 4
Saving model checkpoint to 0/output/1/checkpoint-200
Configur

In [None]:
# Load the TensorBoard notebook extension and open it on the current working
# directory; the training cell writes its logs under
# '<file index>/output/<fold>/runs/<timestamp>/' relative to it.
%load_ext tensorboard
%tensorboard --logdir .

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)