In [1]:
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install sentencepiece
!pip install transformers

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 27.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 10.1 MB/s eta 0:00:01[K     |▉                               | 30 kB 8.2 MB/s eta 0:00:01[K     |█                               | 40 kB 4.3 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.1 MB/s eta 0:00:01[K     |█▋                              | 61 kB 4.9 MB/s eta 0:00:01[K     |██                              | 71 kB 5.0 MB/s eta 0:00:01[K     |██▏                             | 81 kB 5.0 MB/s eta 0:00:01[K     |██▍                             | 92 kB 5.6 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.0 MB/s eta 0:00:01[K     |███                             | 112 kB 5.0 MB/s eta 0:00:01[K     |███▎                            | 122 kB 5.0 MB/s eta 0:00:01[K     |███▌        

In [3]:
import torch

def encode_data(dataset, tokenizer, max_seq_length=128):
    """Featurizes the dataset into input IDs and attention masks for input into a
     transformer-style model.
  Args:
    dataset: A Pandas dataframe containing the data to be encoded.
    tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
      tokenize the data.
    max_seq_length: Maximum sequence length to either pad or truncate every
      input example to.
  Returns:
    input_ids: A PyTorch.Tensor (with dimensions [len(dataset), max_seq_length])
      containing token IDs for the data.
    attention_mask: A PyTorch.Tensor (with dimensions [len(dataset), max_seq_length])
      containing attention masks for the data.
  """

    message = dataset['text'].apply(lambda x: 'true or false: ' + x).astype(str).values.tolist()

    inputs = tokenizer(
      text= message,
      padding = 'max_length',
      truncation = True,
      max_length = max_seq_length,
      is_split_into_words = False,
      return_tensors='pt'
      )
    
    input_ids = torch.tensor(inputs["input_ids"])
    attention_mask = torch.tensor(inputs["attention_mask"])

    return input_ids, attention_mask


def extract_labels(dataset, tokenizer):
    """Converts labels into numerical labels.
  Args:
    dataset: A Pandas dataframe containing the labels in the column 'label'.
  Returns:
    labels: A list of integers corresponding to the labels for each example,
      where 1 is Misinformation, 0 is factual. 
  """
    CLASS_TOKENS = ['true','false']

    target = list(dataset.label.apply(lambda x: CLASS_TOKENS[x]).astype(str).values)

    target_encodings = tokenizer(
      text = target,
      padding = 'longest',
      truncation = False,
      is_split_into_words = False
     # return_tensors='pt'
      )

    labels = torch.tensor(target_encodings['input_ids'])
    decoder_attention_mask = torch.tensor(target_encodings['attention_mask'])

    return labels, decoder_attention_mask


In [4]:
from torch.utils.data import Dataset


class TGDataset(Dataset):
    """
    A torch.utils.data.Dataset wrapper for the BoolQ dataset.
    """

    def __init__(self, dataframe, tokenizer, max_seq_length=256):
        """
        Args:
          dataframe: A Pandas dataframe containing the data.
          tokenizer: A transformers.PreTrainedTokenizerFast object that is used to
            tokenize the data.
          max_seq_length: Maximum sequence length to either pad or truncate every
            input example to.
        """
        self.encoded_data = encode_data(dataframe, tokenizer, max_seq_length)

        self.label_list = extract_labels(dataframe, tokenizer)

    def __len__(self):
        label, decoder_attention_mask = self.label_list
        return len(label)

    def __getitem__(self, i):
        """
        Returns:
          example: A dictionary containing the input_ids, attention_mask, and
            label for the i-th example, with the values being numeric tensors
            and the keys being 'input_ids', 'attention_mask', and 'labels'.
        """
    
        input_ids, attention_mask = self.encoded_data
        label, decoder_attention_mask = self.label_list
        example = {
          'input_ids': input_ids[i],
          'attention_mask': attention_mask[i],
          'labels': label[i],
          'decoder_attention_mask': decoder_attention_mask[i]
        }

        return example

In [5]:
def compute_metrics(eval_pred):
    """Computes accuracy, f1, precision, and recall from a 
    transformers.trainer_utils.EvalPrediction object.
    """
    from sklearn import metrics

    labels = eval_pred.label_ids[:,0]
    preds = np.argmax(eval_pred.predictions[0], axis=2)[:,0]
    

    accuracy = metrics.accuracy_score(y_true=labels, y_pred=preds)
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_true=labels, y_pred=preds, average='macro')

    result = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

    print("result ", result)
    return result

def model_init():
    """Returns an initialized model for use in a Hugging Face Trainer."""
    from transformers import MT5Config, MT5ForConditionalGeneration

    configuration = MT5Config()
    model = MT5ForConditionalGeneration(configuration).from_pretrained("google/mt5-small")

    return model

In [6]:
torch.cuda.empty_cache()

In [None]:
import pandas
import torch
from transformers import MT5Tokenizer, Trainer, TrainingArguments
from transformers import MT5ForConditionalGeneration
import sklearn
import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score

train_df = pandas.read_csv("/content/drive/MyDrive/mumin-classifier/data/train_m.csv")
val_df = pandas.read_csv("/content/drive/MyDrive/mumin-classifier/data/val_m.csv")
test_df = pandas.read_csv("/content/drive/MyDrive/mumin-classifier/data/test_m.csv")

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-base")
train_data = TGDataset(train_df, tokenizer)
val_data = TGDataset(val_df, tokenizer)
test_data = TGDataset(test_df, tokenizer)


model_path = "out_mt5"
trainingargs = TrainingArguments(
    output_dir=model_path,
    do_train=True,
    do_eval=False,
    disable_tqdm=False,
    learning_rate=1e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    #logging_steps=500,
    #logging_first_step=True,
    save_steps=5000
    #evaluation_strategy = "epoch"
    )

trainer = Trainer(
    args = trainingargs,
    tokenizer = tokenizer,
    train_dataset = train_data,
    eval_dataset = val_data,
    model_init = model_init,
    compute_metrics = compute_metrics
    ) 

In [8]:
print("STARTED TRAINING")
trainer.train()
print("TRAINING DONE")

trainer.save_model()
print("MODEL SAVED")   

STARTED TRAINING


loading configuration file https://huggingface.co/google/mt5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/97693496c1a0cae463bd18428187f9e9924d2dfbadaa46e4d468634a0fc95a41.dadce13f8f85f4825168354a04675d4b177749f8f11b167e87676777695d4fe4
Model config MT5Config {
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.19.1",
  "use_cache": true,
  "vocab_size": 250112
}

loading weights file https://huggingface.co/google

Step,Training Loss
500,7.9275
1000,1.7563
1500,1.029
2000,0.6755
2500,0.5897




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to out_mt5
Configuration saved in out_mt5/config.json


TRAINING DONE


Model weights saved in out_mt5/pytorch_model.bin
tokenizer config file saved in out_mt5/tokenizer_config.json
Special tokens file saved in out_mt5/special_tokens_map.json


MODEL SAVED


In [9]:
#Metrics
predictions = trainer.predict(test_data)
preds = np.argmax(predictions.predictions[0],axis=2)[:,0]
labels = predictions.label_ids[:,0]

test_scores = f1_score(y_true=labels, y_pred=preds, average=None)
print(f'\nMisinformation F1: {100 * test_scores[1]:.2f}%')
print(f'Factual F1: {100 * test_scores[0]:.2f}%')
print(f'macro-average F1: {100 * test_scores.mean():.4f}%\n')

report = sklearn.metrics.classification_report(y_pred=preds,y_true=labels)

print(report)

***** Running Prediction *****
  Num examples = 543
  Batch size = 4


  _warn_prf(average, modifier, msg_start, len(result))


result  {'accuracy': 0.9594843462246777, 'f1': 0.48966165413533835, 'precision': 0.47974217311233885, 'recall': 0.5}

Misinformation F1: 0.00%
Factual F1: 97.93%
macro-average F1: 48.9662%

              precision    recall  f1-score   support

         259       0.96      1.00      0.98       521
        6274       0.00      0.00      0.00        22

    accuracy                           0.96       543
   macro avg       0.48      0.50      0.49       543
weighted avg       0.92      0.96      0.94       543



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
langs = pd.concat([val_df, test_df])
langs = langs.query("lang=='en' | lang=='pt' | lang=='es'| lang=='fr'| lang=='ar'")
languages = ['en','pt','es','fr','ar']
f1 = []
randomf1 = []
for lang in languages:
  print("\nLANG ", lang)
  lang_data = langs.query(f"lang=='{lang}'")
  test_data = tg_data.TGDataset(lang_data, tokenizer)
  predictions = trainer.predict(test_data)
  predictions = trainer.predict(test_data)
  preds = np.argmax(predictions.predictions[0],axis=2)[:,0]
  labels = predictions.label_ids[:,0]

  test_scores = f1_score(labels, preds, average=None)

  print(f'\nMisinformation F1: {100 * test_scores[1]:.2f}%')
  print(f'Factual F1: {100 * test_scores[0]:.2f}%')
  print(f'macro-average F1: {100 * test_scores.mean():.4f}%\n')
  f1.append(100 * test_scores.mean())


NameError: ignored

In [22]:
import pandas as pd

train_df = pandas.read_csv("/content/test_m.csv")
val_df = pandas.read_csv("/content/val_m.csv")
test_df = pandas.read_csv("/content/test_m.csv")

langs = pd.concat([val_df, test_df])
langs = langs.query("lang=='en' | lang=='pt' | lang=='es'| lang=='fr'| lang=='ar'")
languages = ['en','pt','es','fr','ar']
f1 = []
randomf1 = []
for lang in languages:
  print("\nLANG ", lang)
  lang_data = langs.query(f"lang=='{lang}'")
  test_data = TGDataset(lang_data, tokenizer)
  predictions = trainer.predict(test_data)
  preds = np.argmax(predictions.predictions[0],axis=2)[:,0]
  labels = predictions.label_ids[:,0]

  test_scores = f1_score(labels, preds, average=None)

  print(f'\nMisinformation F1: {100 * test_scores[1]:.2f}%')
  print(f'Factual F1: {100 * test_scores[0]:.2f}%')
  print(f'macro-average F1: {100 * test_scores.mean():.4f}%\n')
  f1.append(100 * test_scores.mean())


LANG  en


***** Running Prediction *****
  Num examples = 680
  Batch size = 4


  _warn_prf(average, modifier, msg_start, len(result))


result  {'accuracy': 0.9661764705882353, 'f1': 0.4913986537023186, 'precision': 0.48308823529411765, 'recall': 0.5}


***** Running Prediction *****
  Num examples = 182
  Batch size = 4



Misinformation F1: 0.00%
Factual F1: 98.28%
macro-average F1: 49.1399%


LANG  pt


  _warn_prf(average, modifier, msg_start, len(result))


result  {'accuracy': 0.9945054945054945, 'f1': 0.4986225895316804, 'precision': 0.49725274725274726, 'recall': 0.5}


***** Running Prediction *****
  Num examples = 111
  Batch size = 4



Misinformation F1: 0.00%
Factual F1: 99.72%
macro-average F1: 49.8623%


LANG  es


  _warn_prf(average, modifier, msg_start, len(result))
***** Running Prediction *****
  Num examples = 66
  Batch size = 4


result  {'accuracy': 0.9819819819819819, 'f1': 0.4954545454545454, 'precision': 0.49099099099099097, 'recall': 0.5}

Misinformation F1: 0.00%
Factual F1: 99.09%
macro-average F1: 49.5455%


LANG  fr


  _warn_prf(average, modifier, msg_start, len(result))
***** Running Prediction *****
  Num examples = 38
  Batch size = 4


result  {'accuracy': 0.9848484848484849, 'f1': 0.4961832061068702, 'precision': 0.49242424242424243, 'recall': 0.5}

Misinformation F1: 0.00%
Factual F1: 99.24%
macro-average F1: 49.6183%


LANG  ar
result  {'accuracy': 0.9736842105263158, 'f1': 0.4933333333333333, 'precision': 0.4868421052631579, 'recall': 0.5}

Misinformation F1: 0.00%
Factual F1: 98.67%
macro-average F1: 49.3333%



  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
f1

[49.13986537023186,
 49.862258953168045,
 49.54545454545454,
 49.61832061068702,
 49.33333333333333]