In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# can be read through the local filesystem.
# NOTE(review): `files` is not used in the visible cells — confirm before removing.
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the third-party packages this notebook imports later on.
# NOTE(review): versions are not pinned; presumably sentencepiece is needed by the
# T5Tokenizer loaded below — confirm and pin versions for reproducibility.
!pip install sentencepiece
!pip install transformers



In [None]:
import torch

def encode_data(dataset, tokenizer, max_seq_length=128):
    """Featurizes the dataset into input IDs and attention masks for input into a
    transformer-style model.

    Every value of the 'text' column is converted to a string and prefixed with
    the task prompt 'true or false: ' before it is tokenized.

    Args:
      dataset: A Pandas dataframe containing the data to be encoded in the
        column 'text'.
      tokenizer: A transformers.PreTrainedTokenizerFast object (or any callable
        accepting the same keyword arguments) that is used to tokenize the data.
      max_seq_length: Maximum sequence length to either pad or truncate every
        input example to.

    Returns:
      input_ids: A PyTorch.Tensor (with dimensions [len(dataset), max_seq_length])
        containing token IDs for the data.
      attention_mask: A PyTorch.Tensor (with dimensions [len(dataset), max_seq_length])
        containing attention masks for the data.
    """
    # Convert to str *before* concatenating: the previous order raised a
    # TypeError as soon as one cell was not a string (e.g. NaN or a number).
    message = ['true or false: ' + str(text) for text in dataset['text']]

    inputs = tokenizer(
        text=message,
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        is_split_into_words=False,
        return_tensors='pt'
    )

    # return_tensors='pt' already yields tensors. torch.as_tensor passes them
    # through unchanged, whereas torch.tensor(tensor) copies the data and emits
    # a "copy construct from a tensor" UserWarning.
    input_ids = torch.as_tensor(inputs["input_ids"])
    attention_mask = torch.as_tensor(inputs["attention_mask"])

    return input_ids, attention_mask


def extract_labels(dataset, tokenizer):
    """Converts integer class labels into tokenized target sequences.

    Each label is mapped to a class word (0 -> 'true', 1 -> 'false') and the
    words are tokenized, padded to the longest target in the dataset.

    Args:
      dataset: A Pandas dataframe containing the labels in the column 'label',
        where 1 is Misinformation, 0 is factual.
      tokenizer: A transformers tokenizer (or any callable accepting the same
        keyword arguments) that is used to tokenize the class words.

    Returns:
      labels: A PyTorch.Tensor with one row of target token IDs per example.
      decoder_attention_mask: A PyTorch.Tensor with the attention mask the
        tokenizer produced for those target sequences.
    """
    CLASS_TOKENS = ['true', 'false']

    target = [CLASS_TOKENS[label] for label in dataset['label']]

    target_encodings = tokenizer(
        text=target,
        padding='longest',
        truncation=False,
        is_split_into_words=False,
        return_tensors='pt')

    # return_tensors='pt' already yields tensors. torch.as_tensor passes them
    # through unchanged, whereas torch.tensor(tensor) copies the data and emits
    # a "copy construct from a tensor" UserWarning.
    labels = torch.as_tensor(target_encodings['input_ids'])
    decoder_attention_mask = torch.as_tensor(target_encodings['attention_mask'])

    return labels, decoder_attention_mask


In [None]:
from torch.utils.data import Dataset


class TGDataset(Dataset):
    """
    A torch.utils.data.Dataset serving tokenized inputs together with their
    tokenized target labels, one example per index.
    """

    def __init__(self, dataframe, tokenizer, max_seq_length=256):
        """
        Tokenizes the whole dataframe once, up front.

        Args:
          dataframe: A Pandas dataframe containing the data.
          tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
            tokenize the data.
          max_seq_length: Maximum sequence length to either pad or truncate every
            input example to.
        """
        # (input_ids, attention_mask) pair produced by encode_data.
        self.encoded_data = encode_data(dataframe, tokenizer, max_seq_length)
        # (labels, decoder_attention_mask) pair produced by extract_labels.
        self.label_list = extract_labels(dataframe, tokenizer)

    def __len__(self):
        """Returns the number of examples, i.e. the number of label rows."""
        target_ids, _ = self.label_list
        return len(target_ids)

    def __getitem__(self, i):
        """
        Returns:
          example: A dictionary for the i-th example with the keys 'input_ids',
            'attention_mask', 'labels' and 'decoder_attention_mask', each
            holding the i-th entry of the corresponding stored tensor.
        """
        (source_ids, source_mask), (target_ids, target_mask) = (
            self.encoded_data,
            self.label_list,
        )
        columns = (
            ('input_ids', source_ids),
            ('attention_mask', source_mask),
            ('labels', target_ids),
            ('decoder_attention_mask', target_mask),
        )
        return {key: values[i] for key, values in columns}

In [None]:
def compute_metrics(eval_pred):
    """Scores predictions against labels on the first position of each sequence.

    Args:
      eval_pred: A transformers.trainer_utils.EvalPrediction object. Its
        `label_ids` are indexed as [example, position] and `predictions[0]`
        as [example, position, score per class].

    Returns:
      A dictionary with the keys 'accuracy', 'f1', 'precision' and 'recall';
      the last three are macro-averaged. The dictionary is also printed.
    """
    from sklearn import metrics

    # Only position 0 of every sequence takes part in the comparison.
    gold = eval_pred.label_ids[:, 0]
    scores = eval_pred.predictions[0]
    predicted = np.argmax(scores, axis=2)[:, 0]

    accuracy = metrics.accuracy_score(y_true=gold, y_pred=predicted)
    # Tuple layout: (precision, recall, f-score, support).
    macro = metrics.precision_recall_fscore_support(
        y_true=gold, y_pred=predicted, average='macro'
    )

    result = dict(accuracy=accuracy, f1=macro[2], precision=macro[0], recall=macro[1])

    print("result ", result)
    return result

def model_init():
    """Returns an initialized model for use in a Hugging Face Trainer.

    Returns:
      A T5ForConditionalGeneration instance loaded from the pretrained
      "t5-base" checkpoint.
    """
    from transformers import T5ForConditionalGeneration

    # from_pretrained is a classmethod, so it is called on the class directly.
    # Calling it on T5ForConditionalGeneration(T5Config()) first allocated and
    # randomly initialised a throw-away model that was discarded immediately.
    model = T5ForConditionalGeneration.from_pretrained("t5-base")

    return model

In [None]:
import pandas
import torch
from transformers import T5Tokenizer, Trainer, TrainingArguments
from transformers import T5ForConditionalGeneration
import sklearn
import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score

# Single place to change when the data lives somewhere else.
DATA_DIR = "/content/drive/MyDrive/mumin-classifier/T5/T5_data"

# Pre-split CSV files; TGDataset reads the 'text' and 'label' columns.
train_df = pandas.read_csv(f"{DATA_DIR}/train_en.csv")
val_df = pandas.read_csv(f"{DATA_DIR}/val_en.csv")
test_df = pandas.read_csv(f"{DATA_DIR}/test_en.csv")

# Each split is tokenized once, using TGDataset's default max_seq_length.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
train_data = TGDataset(train_df, tokenizer)
val_data = TGDataset(val_df, tokenizer)
test_data = TGDataset(test_df, tokenizer)


# Checkpoints and the final model are written below this directory.
model_path = "out_t5"
trainingargs = TrainingArguments(
    output_dir=model_path,
    do_train=True,
    do_eval=True,
    disable_tqdm=False,
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    #logging_steps=500,
    logging_first_step=True,
    #save_steps=1000,
    evaluation_strategy = "epoch"  # run compute_metrics on val_data after every epoch
    )

# model_init (rather than a model instance) lets the Trainer build the model itself.
trainer = Trainer(
    args = trainingargs,
    tokenizer = tokenizer,
    train_dataset = train_data,
    eval_dataset = val_data,
    model_init = model_init,
    compute_metrics = compute_metrics
    )

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
loading configuration file https://huggingface.co/t5-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type":

In [None]:
# Fine-tune the model; the prints bracket the long-running call in the cell output.
print("STARTED TRAINING")
trainer.train()
print("TRAINING DONE")

# Called without a path, so the model is saved to the Trainer's output_dir ("out_t5").
trainer.save_model()
print("MODEL SAVED")   

STARTED TRAINING


loading configuration file https://huggingface.co/t5-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngra

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,11.4474,0.099938,0.956923,0.52231,0.517154,0.546145
2,0.195,0.051287,0.970769,0.492584,0.49374,0.491433
3,0.072,0.084682,0.958462,0.523861,0.518245,0.546924


***** Running Evaluation *****
  Num examples = 650
  Batch size = 8


result  {'accuracy': 0.9569230769230769, 'f1': 0.5223097112860893, 'precision': 0.5171540243196294, 'recall': 0.5461448598130841}


Saving model checkpoint to out_t5/checkpoint-500
Configuration saved in out_t5/checkpoint-500/config.json
Model weights saved in out_t5/checkpoint-500/pytorch_model.bin
tokenizer config file saved in out_t5/checkpoint-500/tokenizer_config.json
Special tokens file saved in out_t5/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 650
  Batch size = 8


result  {'accuracy': 0.9707692307692307, 'f1': 0.49258391881342706, 'precision': 0.4937402190923318, 'recall': 0.4914330218068536}


Saving model checkpoint to out_t5/checkpoint-1000
Configuration saved in out_t5/checkpoint-1000/config.json
Model weights saved in out_t5/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in out_t5/checkpoint-1000/tokenizer_config.json
Special tokens file saved in out_t5/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 650
  Batch size = 8


result  {'accuracy': 0.9584615384615385, 'f1': 0.5238612007922081, 'precision': 0.5182451358921947, 'recall': 0.5469236760124611}




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to out_t5
Configuration saved in out_t5/config.json


TRAINING DONE


Model weights saved in out_t5/pytorch_model.bin
tokenizer config file saved in out_t5/tokenizer_config.json
Special tokens file saved in out_t5/special_tokens_map.json


MODEL SAVED


In [None]:
# Metrics on the held-out test split.
# trainer.predict also runs compute_metrics, which prints its own "result" line.
predictions = trainer.predict(test_data)
# predictions.predictions[0] is presumably the decoder logits
# (example, target position, vocabulary) — TODO confirm. argmax over axis 2
# followed by [:, 0] keeps the token predicted at the first target position.
preds = np.argmax(predictions.predictions[0],axis=2)[:,0]
# First target token of every gold label sequence.
labels = predictions.label_ids[:,0]

# average=None yields one F1 score per class, ordered by sorted class value
# (the classes here are token IDs, not 0/1).
test_scores = f1_score(y_true=labels, y_pred=preds, average=None)
# NOTE(review): indices 0 and 1 assume exactly two classes occur and that the
# token ID of 'true' (factual, label 0) sorts before that of 'false'
# (misinformation, label 1) — verify for the tokenizer in use.
print(f'\nMisinformation F1: {100 * test_scores[1]:.2f}%')
print(f'Factual F1: {100 * test_scores[0]:.2f}%')
print(f'macro-average F1: {100 * test_scores.mean():.4f}%\n')

# Per-class precision / recall / F1 table; rows are labelled with the raw token IDs.
report = sklearn.metrics.classification_report(y_pred=preds,y_true=labels)

print(report)

***** Running Prediction *****
  Num examples = 543
  Batch size = 8


result  {'accuracy': 0.9594843462246777, 'f1': 0.5665457184325109, 'precision': 0.7314471243042672, 'recall': 0.5435351596580004}

Misinformation F1: 97.92%
Factual F1: 15.38%
macro-average F1: 56.6546%

              precision    recall  f1-score   support

        1176       0.50      0.09      0.15        22
        6136       0.96      1.00      0.98       521

    accuracy                           0.96       543
   macro avg       0.73      0.54      0.57       543
weighted avg       0.94      0.96      0.95       543



In [None]:
# Per-language evaluation on the combined validation + test examples.
languages = ['en','pt','es','fr','ar']
langs = pandas.concat([val_df, test_df])
# Keep only the rows whose language is evaluated below.
langs = langs[langs['lang'].isin(languages)]
f1 = []  # macro-average F1 (in percent), one entry per language, in `languages` order
randomf1 = []  # NOTE(review): never filled in this cell; kept in case later cells use it
for lang in languages:
  print("\nLANG ", lang)
  lang_data = langs[langs['lang'] == lang]
  # A dedicated name is used so the global `test_data` (the real test split)
  # is not overwritten by the last language subset.
  lang_dataset = TGDataset(lang_data, tokenizer)
  # Predict once per language; the call used to be duplicated, doubling the
  # inference time and the logged output.
  predictions = trainer.predict(lang_dataset)
  # Token predicted / expected at the first target position of every example.
  preds = np.argmax(predictions.predictions[0],axis=2)[:,0]
  labels = predictions.label_ids[:,0]

  # One F1 score per class, ordered by sorted class value (token IDs).
  test_scores = f1_score(labels, preds, average=None)

  print(f'\nMisinformation F1: {100 * test_scores[1]:.2f}%')
  print(f'Factual F1: {100 * test_scores[0]:.2f}%')
  print(f'macro-average F1: {100 * test_scores.mean():.4f}%\n')
  f1.append(100 * test_scores.mean())


LANG  en


***** Running Prediction *****
  Num examples = 680
  Batch size = 8


***** Running Prediction *****
  Num examples = 680
  Batch size = 8


result  {'accuracy': 0.9485294117647058, 'f1': 0.538034511539432, 'precision': 0.5466867469879518, 'recall': 0.5328237707630203}


***** Running Prediction *****
  Num examples = 182
  Batch size = 8


result  {'accuracy': 0.9485294117647058, 'f1': 0.538034511539432, 'precision': 0.5466867469879518, 'recall': 0.5328237707630203}

Misinformation F1: 97.35%
Factual F1: 10.26%
macro-average F1: 53.8035%


LANG  pt


***** Running Prediction *****
  Num examples = 182
  Batch size = 8


result  {'accuracy': 0.967032967032967, 'f1': 0.49162011173184356, 'precision': 0.4971751412429379, 'recall': 0.4861878453038674}


***** Running Prediction *****
  Num examples = 111
  Batch size = 8


result  {'accuracy': 0.967032967032967, 'f1': 0.49162011173184356, 'precision': 0.4971751412429379, 'recall': 0.4861878453038674}

Misinformation F1: 98.32%
Factual F1: 0.00%
macro-average F1: 49.1620%


LANG  es


  _warn_prf(average, modifier, msg_start, len(result))
***** Running Prediction *****
  Num examples = 111
  Batch size = 8


result  {'accuracy': 0.9819819819819819, 'f1': 0.4954545454545454, 'precision': 0.49099099099099097, 'recall': 0.5}


  _warn_prf(average, modifier, msg_start, len(result))
***** Running Prediction *****
  Num examples = 66
  Batch size = 8


result  {'accuracy': 0.9819819819819819, 'f1': 0.4954545454545454, 'precision': 0.49099099099099097, 'recall': 0.5}

Misinformation F1: 99.09%
Factual F1: 0.00%
macro-average F1: 49.5455%


LANG  fr


***** Running Prediction *****
  Num examples = 66
  Batch size = 8


result  {'accuracy': 0.9393939393939394, 'f1': 0.484375, 'precision': 0.49206349206349204, 'recall': 0.47692307692307695}


***** Running Prediction *****
  Num examples = 38
  Batch size = 8


result  {'accuracy': 0.9393939393939394, 'f1': 0.484375, 'precision': 0.49206349206349204, 'recall': 0.47692307692307695}

Misinformation F1: 96.88%
Factual F1: 0.00%
macro-average F1: 48.4375%


LANG  ar


  _warn_prf(average, modifier, msg_start, len(result))
***** Running Prediction *****
  Num examples = 38
  Batch size = 8


result  {'accuracy': 0.9736842105263158, 'f1': 0.4933333333333333, 'precision': 0.4868421052631579, 'recall': 0.5}
result  {'accuracy': 0.9736842105263158, 'f1': 0.4933333333333333, 'precision': 0.4868421052631579, 'recall': 0.5}

Misinformation F1: 98.67%
Factual F1: 0.00%
macro-average F1: 49.3333%



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Macro-average F1 (in percent) per language, in the order of `languages`.
f1

[53.80345115394321,
 49.162011173184354,
 49.54545454545454,
 48.4375,
 49.33333333333333]