In [2]:
!pip install torchinfo transformers[torch] datasets sentencepiece



In [3]:
import re
import numpy as np
import pandas as pd

import torch
from torchinfo import summary

from datasets import load_dataset

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, \
                         T5Tokenizer, T5ForSequenceClassification, T5Config, \
                         Trainer, TrainingArguments, TextClassificationPipeline, \
                         EarlyStoppingCallback

In [4]:
# Mount Google Drive (Colab only) so the dataset folders that the loaders
# below read from under /content/drive are available.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
def load_data(event, seed=None, data_root='/content/drive/MyDrive/Datasets'):
    """Load the labelled train/test/validation splits of one event.

    Args:
        event: Event name. It is used both as the sub-folder of ``data_root``
            and as the prefix of the ``<event>_train.tsv``,
            ``<event>_test.tsv`` and ``<event>_dev.tsv`` files.
        seed: Optional seed forwarded to ``shuffle`` so the row order can be
            reproduced. ``None`` (default) keeps the original unseeded
            shuffle.
        data_root: Folder containing one sub-folder per event. Defaults to
            the Google Drive location this notebook was written for.

    Returns:
        The shuffled dataset produced by ``load_dataset`` with the splits
        ``'train'``, ``'test'`` and ``'validation'``.
    """
    data_files = {'train': f'{event}_train.tsv',
                  'test': f'{event}_test.tsv',
                  'validation': f'{event}_dev.tsv'}
    dataset = load_dataset(f'{data_root}/{event}', data_files=data_files)

    return dataset.shuffle(seed=seed)

In [6]:
# Load the labelled dataset used for fine-tuning and evaluation.
# Alternative event names kept for reference:
#   'all_events', 'wildfires', 'california_wildfires_2018', 'kerala_floods_2018'
dataset = load_data('floods')

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [7]:
def load_unlabeled_data(event, seed=None, data_root='/content/drive/MyDrive/Datasets'):
    """Load the single unlabeled ``<event>.tsv`` file of one event.

    The file is exposed under the split name ``'test'``.

    Args:
        event: Event name. It is used both as the sub-folder of ``data_root``
            and as the stem of the ``<event>.tsv`` file.
        seed: Optional seed forwarded to ``shuffle`` so the row order can be
            reproduced. ``None`` (default) keeps the original unseeded
            shuffle.
        data_root: Folder containing one sub-folder per event. Defaults to
            the Google Drive location this notebook was written for.

    Returns:
        The shuffled dataset produced by ``load_dataset`` with the single
        split ``'test'``.
    """
    data_files = {'test': f'{event}.tsv'}
    dataset = load_dataset(f'{data_root}/{event}', data_files=data_files)

    return dataset.shuffle(seed=seed)

In [8]:
# Load the unlabeled Greek dataset that the fine-tuned model is applied to later.
# Alternative event name kept for reference: 'evros_wildfires_2023'
unlabeled_dataset = load_unlabeled_data('thessaly_floods_2023')

In [9]:
def encode_labels(dataset):
    """Encode the ``class_label`` column as integer ids and normalise columns.

    The ``class_label`` column is class-encoded, the id <-> name mappings are
    read from the ``'train'`` split, ``class_label`` / ``tweet_text`` are
    renamed to ``label`` / ``text`` and the ``tweet_id`` column is dropped.

    Args:
        dataset: Mapping of split name to dataset. It must contain a
            ``'train'`` split and the columns ``class_label``,
            ``tweet_text`` and ``tweet_id``.

    Returns:
        A ``(dataset, id2label, label2id)`` tuple: the transformed dataset,
        a dict from integer id to label name, and its inverse.

    Raises:
        ValueError: If a split ends up with a different label vocabulary
            than the ``'train'`` split. The returned mappings come from
            ``'train'`` only, so such a split would otherwise be scored
            against the wrong label ids without any error.
    """
    dataset = dataset.class_encode_column('class_label')

    train_feature = dataset['train'].features['class_label']
    id2label = {idx: train_feature.int2str(idx) for idx in range(train_feature.num_classes)}
    label2id = {label: idx for idx, label in id2label.items()}

    # The integer ids are only comparable across splits if every split uses
    # exactly the same ordered list of label names as the training split.
    expected_names = list(id2label.values())
    for split_name, split in dataset.items():
        split_feature = split.features['class_label']
        split_names = [split_feature.int2str(idx) for idx in range(split_feature.num_classes)]
        if split_names != expected_names:
            raise ValueError(
                f"Label vocabulary of split '{split_name}' {split_names} differs from "
                f"the 'train' split {expected_names}; label ids would be misaligned."
            )

    dataset = dataset.rename_columns({'class_label': 'label', 'tweet_text': 'text'})
    dataset = dataset.remove_columns('tweet_id')

    return dataset, id2label, label2id

In [10]:
# Encode labels: `dataset` is replaced by its encoded version, and the
# id <-> label-name mappings are kept for the model configuration below.
# NOTE: this rebinding makes the cell non-idempotent — reload the data before re-running it.
dataset, id2label, label2id = encode_labels(dataset)

Casting to class labels:   0%|          | 0/7296 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2066 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1062 [00:00<?, ? examples/s]

In [11]:
# Emoji character class, compiled once instead of on every call.
# The ranges are deliberately limited to symbol blocks: the frequently copied
# range U+24C2..U+1F251 also spans CJK, Hangul, kana and other scripts and
# would delete real text from multilingual tweets.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE0F"             # emoji variation selector
    "\U0001F000-\U0001F251"  # mahjong/domino/cards and enclosed characters
    "]+"
)


def clean_text(item):
    """Strip tweet noise from ``item['text']`` and return the same ``item``.

    Removes, in this order: emojis, ``http...`` links, ``www.`` links,
    retweet headers (``RT @user`` with an optional trailing colon), remaining
    ``@user`` mentions and ``#`` signs (the hashtag word itself is kept).
    Finally all whitespace runs are collapsed to single spaces and the text
    is trimmed.

    Args:
        item: Mapping with a string under the key ``'text'``. It is updated
            in place.

    Returns:
        The same ``item`` object with the cleaned ``'text'``.
    """
    text = item['text']

    text = _EMOJI_PATTERN.sub('', text)  # remove emojis

    text = re.sub(r'http\S+', '', text)  # remove links
    text = re.sub(r'www\.\S+', '', text)  # literal dot: do not match words such as "wwwhatever"
    text = re.sub(r'RT @[A-Za-z0-9_]+:?', '', text)  # remove RT mentions incl. the colon of "RT @user:"
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove mentions
    text = text.replace('#', '')  # keep the hashtag word, drop the sign

    item['text'] = ' '.join(text.split())  # remove extra spaces

    return item

In [12]:
# Clean up documents: apply clean_text to every example of every split
dataset = dataset.map(clean_text)

Map:   0%|          | 0/7296 [00:00<?, ? examples/s]

Map:   0%|          | 0/2066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

In [13]:
# Select the pre-trained model checkpoint to fine-tune.
# Alternative checkpoints kept for reference:
#   'bert-base-multilingual-cased', 'distilbert-base-multilingual-cased', 'xlm-roberta-base'
checkpoint = 'google/flan-t5-base'

In [None]:
# Instantiate the tokenizer that matches the selected checkpoint.
# The FLAN-T5 checkpoint is loaded through the explicit T5 tokenizer class
# (with legacy=False); every other checkpoint goes through AutoTokenizer.
tokenizer = None
tokenizer = (
    T5Tokenizer.from_pretrained(checkpoint, legacy=False)
    if checkpoint == 'google/flan-t5-base'
    else AutoTokenizer.from_pretrained(checkpoint)
)

In [15]:
def tokenize_fn(batch):
    """Tokenize the ``'text'`` entries of a batch with the global ``tokenizer``.

    Every sequence is padded or truncated to exactly 128 tokens, special
    tokens are added and token type ids are requested in the output.

    Args:
        batch: Mapping whose ``'text'`` entry holds the documents to encode.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    encode_options = dict(
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_token_type_ids=True,
    )
    return tokenizer(batch['text'], **encode_options)

In [16]:
# Tokenize documents: tokenize_fn is applied batch-wise to every split
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/7296 [00:00<?, ? examples/s]

Map:   0%|          | 0/2066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

In [17]:
# Load default configuration
config = None
if checkpoint == 'google/flan-t5-base':
    config = T5Config.from_pretrained(checkpoint)
else:
    config = AutoConfig.from_pretrained(checkpoint)

# Set ids-labels mapping for the classification layer
config.id2label = id2label
config.label2id = label2id

# Set the dropout of the classification head. DistilBERT configurations call
# this field `seq_classif_dropout`; assigning `classifier_dropout` there would
# only create an attribute the model never reads, so pick the field by name.
dropout_field = 'seq_classif_dropout' if hasattr(config, 'seq_classif_dropout') else 'classifier_dropout'
setattr(config, dropout_field, 0.5)

In [18]:
# Build the model with a sequence classification head, using the customised
# configuration. The FLAN-T5 checkpoint uses the explicit T5 class; every
# other checkpoint is resolved by AutoModelForSequenceClassification.
model = None
model = (
    T5ForSequenceClassification.from_pretrained(checkpoint, config=config)
    if checkpoint == 'google/flan-t5-base'
    else AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)
)

# Last expression of the cell: display the layer / parameter overview
summary(model)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                                            Param #
T5ForSequenceClassification                                       --
├─T5Model: 1-1                                                    --
│    └─Embedding: 2-1                                             24,674,304
│    └─T5Stack: 2-2                                               24,674,304
│    │    └─Embedding: 3-1                                        (recursive)
│    │    └─ModuleList: 3-2                                       84,953,472
│    │    └─T5LayerNorm: 3-3                                      768
│    │    └─Dropout: 3-4                                          --
│    └─T5Stack: 2-3                                               24,674,304
│    │    └─Embedding: 3-5                                        (recursive)
│    │    └─ModuleList: 3-6                                       113,274,240
│    │    └─T5LayerNorm: 3-7                                      768
│    │    └─Dropout: 3-8             

In [19]:
# Configure training arguments

# The FLAN-T5 checkpoint is fine-tuned with a ten times larger learning rate
# than the other checkpoints.
if checkpoint == 'google/flan-t5-base':
    learning_rate = 5e-5
else:
    learning_rate = 5e-6

training_args = TrainingArguments(
    # Output location (the checkpoint name becomes part of the path)
    output_dir=f'finetuned-{checkpoint}',
    overwrite_output_dir=True,
    # Optimisation
    learning_rate=learning_rate,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    # Step-based logging, evaluation and checkpointing
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=1000,
    save_strategy='steps',
    save_steps=1000,
    # Reload the checkpoint with the best validation F1 once training ends
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

In [20]:
# Metric callback handed to the Trainer
def compute_metrics(logits_and_labels):
    """Compute accuracy and weighted F1 from model outputs and gold labels.

    Args:
        logits_and_labels: Pair ``(logits, labels)``. If ``logits`` is a
            tuple, only its first element is used.

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    raw_outputs, references = logits_and_labels
    class_scores = raw_outputs[0] if isinstance(raw_outputs, tuple) else raw_outputs
    predicted_ids = np.argmax(class_scores, axis=-1)

    return {
        'accuracy': accuracy_score(references, predicted_ids),
        'f1': f1_score(references, predicted_ids, average='weighted'),
    }

In [21]:
# Fine-tune the model on the train split; the validation split is used for
# the periodic evaluations configured in the training arguments.
# Early stopping is currently disabled; to enable it pass
#   callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
1000,0.983,0.938017,0.736347,0.718008
2000,0.8795,0.853721,0.760829,0.754621
3000,0.8392,0.876934,0.758945,0.751308
4000,0.8385,1.059357,0.76177,0.758522
5000,0.569,1.080266,0.77307,0.764949
6000,0.6412,1.089734,0.766478,0.759331
7000,0.57,1.183993,0.776836,0.769839
8000,0.3999,1.300845,0.774953,0.769514
9000,0.5691,1.280994,0.76742,0.761885


There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight'].


TrainOutput(global_step=9120, training_loss=0.7070850748241994, metrics={'train_runtime': 3192.0191, 'train_samples_per_second': 11.429, 'train_steps_per_second': 2.857, 'total_flos': 5570436268523520.0, 'train_loss': 0.7070850748241994, 'epoch': 5.0})

In [22]:
# Evaluate the trainer's current model on the validation split given as eval_dataset.
# The training arguments set load_best_model_at_end=True, so this is expected to
# score the best checkpoint rather than the last one — TODO confirm against the log.
trainer.evaluate()

{'eval_loss': 1.1839934587478638,
 'eval_accuracy': 0.7768361581920904,
 'eval_f1': 0.7698387352606907,
 'eval_runtime': 19.9458,
 'eval_samples_per_second': 53.244,
 'eval_steps_per_second': 1.705,
 'epoch': 5.0}

In [23]:
# Instantiate text classification pipeline to make predictions.
# Use the first GPU when one is available and fall back to the CPU (-1)
# otherwise, so the cell also runs on a runtime without an accelerator.
classifier = TextClassificationPipeline(
    model=trainer.model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [24]:
# Define a function for predictions
def predict(tokenized_dataset, split='test', batch_size=8, max_length=128):
    """Classify the ``'text'`` column of one split with the global ``classifier``.

    Args:
        tokenized_dataset: Mapping of split name to dataset; the selected
            split must have a ``'text'`` column.
        split: Name of the split to classify. Defaults to ``'test'``, the
            split this function originally always used.
        batch_size: Number of documents sent through the pipeline at once.
        max_length: Documents are truncated to this many tokens.

    Returns:
        List with one pipeline prediction per document, in dataset order.
    """
    documents = KeyDataset(tokenized_dataset[split], 'text')
    return list(classifier(documents, batch_size=batch_size, truncation=True, max_length=max_length))

In [25]:
# Evaluate the model on the test set
test_predictions = predict(tokenized_dataset)

predicted = [label2id[pred['label']] for pred in test_predictions]
scores = [pred['score'] for pred in test_predictions]

# Read the gold labels once instead of materialising the column per metric
true_labels = tokenized_dataset['test']['label']

# zero_division=0 yields the same value as the default but without the
# undefined-metric warning raised when a class has no predicted or true samples.
print(f"Accuracy: {100 * accuracy_score(true_labels, predicted):.2f}")
print(f"F1 weighted: {100 * f1_score(true_labels, predicted, average='weighted', zero_division=0):.2f}")
print(f"Precision weighted: {100 * precision_score(true_labels, predicted, average='weighted', zero_division=0):.2f}")
print(f"Recall weighted: {100 * recall_score(true_labels, predicted, average='weighted', zero_division=0):.2f}")

Accuracy: 78.22
F1 weighted: 77.40
Precision weighted: 77.21
Recall weighted: 78.22


  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# Classify the unlabeled Greek dataset with the fine-tuned model.
# The tweets go through the same cleaning and tokenization as the training data.
unlabeled_dataset = unlabeled_dataset.map(clean_text)
tokenized_unlabeled_dataset = unlabeled_dataset.map(tokenize_fn, batched=True)

predictions = predict(tokenized_unlabeled_dataset)
label_id = [label2id[prediction['label']] for prediction in predictions]

# One row per tweet: the pipeline output plus the cleaned text and the numeric label id
pred_df = pd.DataFrame(predictions).assign(
    text=tokenized_unlabeled_dataset['test']['text'],
    label_id=label_id,
)

Map:   0%|          | 0/1464 [00:00<?, ? examples/s]

Map:   0%|          | 0/1464 [00:00<?, ? examples/s]

In [27]:
# Delete the checkpoints written during training.
# The output directory is f'finetuned-{checkpoint}'; for 'google/flan-t5-base' the
# slash in the checkpoint name makes 'finetuned-google' the top-level folder to remove.
# Commands for the other checkpoints are kept commented out:
# !rm -rf 'finetuned-bert-base-multilingual-cased'
# !rm -rf 'finetuned-distilbert-base-multilingual-cased'
# !rm -rf 'finetuned-xlm-roberta-base'
!rm -rf 'finetuned-google'