In [5]:
# Notebook title: a bare string literal, so the cell simply echoes it back as its output.
"BERT Model"

'BERT Model'

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel

ModuleNotFoundError: No module named 'pandas'

In [4]:
import math
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss

In [5]:
# Checkpoint identifier shared by the encoder and its tokenizer so the two always match.
model_name = "gklmip/bert-tagalog-base-uncased"
# Module-level encoder instance; return_dict=True asks for an output object with named fields.
BERT_MODEL = AutoModel.from_pretrained(model_name, return_dict=True)
# Tokenizer paired with the checkpoint above; reused by the data module and the trainer cells.
BERT_TOKENIZER = AutoTokenizer.from_pretrained(model_name)

Some weights of BertModel were not initialized from the model checkpoint at gklmip/bert-tagalog-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Locations of the CSV splits, relative to the notebook's working directory.
# NOTE(review): the "-try" suffix on the train/test files looks like a small debugging
# subset -- confirm these are the intended splits before a full training run.
train_path = './dataset/train-try.csv'
val_path = './dataset/val.csv'
test_path = './dataset/test-try.csv'
# Full (unsplit) dataset, loaded separately from the three splits above.
dataset_path = './dataset/mlthsc.csv'

In [7]:
# Load the full dataset and each split into its own DataFrame.
df = pd.read_csv(dataset_path)
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)
# Names of the label columns; MLTHSDataset selects these columns from each row.
LABELS = ['Age', 'Gender', 'Physical', 'Race', 'Religion', 'Others']

In [1]:
# Preview only the first rows; displaying the entire frame bloats the notebook output.
df.head()

NameError: name 'df' is not defined

In [20]:
class MLTHSDataset(Dataset):
    """Torch dataset that tokenizes one text per item for multi-label classification.

    Every row of ``data`` must provide a ``Text`` column plus one column per
    entry in ``labels``.
    """

    def __init__(self, data: pd.DataFrame, tokenizer, labels: list, max_token_len: int = 128):
        """Store the frame, tokenizer and label column names.

        Args:
            data: DataFrame holding the ``Text`` column and the label columns.
            tokenizer: Object exposing ``encode_plus`` (and ``convert_ids_to_tokens``
                for the debugging helper).
            labels: List of label column names to read from each row.
            max_token_len: Length every encoded item is padded/truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_token_len = max_token_len
        self._preprocess_data()

    def _preprocess_data(self):
        """Placeholder hook for text normalisation; currently does nothing and returns 0."""
        # TODO: add normalizer / preprocessor logic here / can be implemented as a class
        return 0

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Encode the row at ``index``.

        Returns:
            dict with flattened ``input_ids`` and ``attention_mask`` tensors and a
            float32 ``labels`` tensor with one value per label column.
        """
        data_row = self.data.iloc[index]

        hate_speech_text = data_row['Text']
        # A row that also holds the text is an object-dtype Series, so convert the
        # label values to float32 explicitly rather than handing the Series to torch.
        label_values = data_row[self.labels].to_numpy(dtype='float32')

        encoding = self.tokenizer.encode_plus(
            hate_speech_text,
            add_special_tokens=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            return_attention_mask=True
        )

        return {
            # return_tensors='pt' yields a leading batch axis of 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_values, dtype=torch.float32)
        }

    def _get_stats(self, _print: bool = False):
        """Return (and optionally print) row count, per-label positives and frame shape."""
        stats = {
            "text_count": len(self.data),
            "instance_per_label": self.data[self.labels].sum(),
            "shape": self.data.shape
        }
        if _print:
            print("\nDATASET STATISTICS:\n")
            print("Number of Text", stats["text_count"])
            print("Instance per Label\n", stats["instance_per_label"])
            print("Shape: ", stats["shape"])
        return stats

    def _print_sample_hate_speech(self, index: int = 0, get_encoding: bool = False):
        """Print the text and labels of one row; optionally also its tokenization.

        Args:
            index: Positional row index to show.
            get_encoding: When True, also print the encoding and its first 20
                input ids, attention-mask values and tokens.
        """
        sample_row = self.data.iloc[index]
        sample_text = sample_row.Text
        sample_labels = sample_row[self.labels]
        print("\nSAMPLE TRAINING HATE SPEECH:")
        print("Index: ", index)
        print("Text: ", sample_text)
        print("Labels: ", sample_labels.to_dict())

        if not get_encoding:
            # Tokenizing is only needed for the optional encoding printout.
            return

        encoding = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=512,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,  # keep over-long texts within max_length
            return_attention_mask=True,
            return_tensors='pt',
        )

        print("Encoding:\n", encoding, "\n")
        print("Input IDs: ", encoding["input_ids"].squeeze()[:20])
        print("Attention Mask: ", encoding["attention_mask"].squeeze()[:20])
        print("Tokens:",  self.tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze())[:20])

    def _get_data_frame(self, _print: bool = False):
        """Return the underlying DataFrame, printing it first when ``_print`` is truthy."""
        if _print:
            print(self.data)
        return self.data

In [18]:
class MLTHSDataModule():
    """Bundles the train/val/test DataFrames and builds datasets and DataLoaders for them."""

    def __init__(self, train_df, val_df, test_df, labels, tokenizer, batch_size=8, max_token_len=128, num_workers=2):
        """Store the splits and loader settings.

        Args:
            train_df, val_df, test_df: DataFrames for the three splits.
            labels: List of label column names.
            tokenizer: Tokenizer handed to every MLTHSDataset.
            batch_size: Batch size used by all three loaders.
            max_token_len: Sequence length forwarded to every MLTHSDataset.
            num_workers: Worker processes per DataLoader (0 loads in the main process).
        """
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.labels = labels
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.num_workers = num_workers

    def _build_dataset(self, frame):
        """Create one MLTHSDataset for ``frame`` with this module's settings."""
        # Keyword arguments: the dataset signature is (data, tokenizer, labels, max_token_len),
        # so passing labels/tokenizer positionally in the other order silently swaps them.
        return MLTHSDataset(
            frame,
            tokenizer=self.tokenizer,
            labels=self.labels,
            max_token_len=self.max_token_len,
        )

    def _build_loader(self, dataset, shuffle):
        """Wrap ``dataset`` in a DataLoader using the configured batch size and workers."""
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=shuffle)

    def setup(self, stage=None):
        """Instantiate the three datasets; ``stage`` is accepted but unused."""
        self.mlths_train_dataset = self._build_dataset(self.train_df)
        self.mlths_val_dataset = self._build_dataset(self.val_df)
        self.mlths_test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset (requires setup() first)."""
        return self._build_loader(self.mlths_train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset (requires setup() first)."""
        return self._build_loader(self.mlths_val_dataset, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset (requires setup() first)."""
        return self._build_loader(self.mlths_test_dataset, shuffle=False)

# To implement: LINEAR CLASSIFIER

In [19]:
class MLTHSClassifier(nn.Module):
    """Encoder plus a linear head producing one logit per label.

    Subclasses ``nn.Module`` so that ``self.parameters()`` exists and the object
    can be used wherever a torch module is expected.
    """

    def __init__(self, config: dict, labels: list = None, data_module: "MLTHSDataModule" = None):
        """Build the encoder, tokenizer, classification head, loss and optimizer.

        Args:
            config: Reads ``model_name``, ``threshold``, ``n_labels``, ``lr`` and
                ``w_decay`` here; ``train_size``, ``bs`` and ``warmup`` are read
                by ``configure_optimizers``.
            labels: Optional list of label names, stored as-is.
            data_module: Optional data module, stored as-is.
        """
        super().__init__()
        self.config = config
        self.threshold = config['threshold']

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.bert = AutoModel.from_pretrained(config['model_name'], return_dict=True)
        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

        self.data_module = data_module
        self.labels = labels

        self.classifier = nn.Linear(self.bert.config.hidden_size, self.config['n_labels'])
        self.criterion = nn.BCEWithLogitsLoss(reduction='mean')
        # Created last so that parameters() already includes the classifier head.
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head.

        Returns:
            ``(loss, logits)``; ``loss`` is the BCE-with-logits loss when ``labels``
            is given and the integer 0 otherwise.
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Embedding at sequence position 0 of the last hidden layer.
        cls_embedding = output.last_hidden_state[:, 0, :]
        # NOTE(review): squashing the embedding with a sigmoid before the linear head
        # is unusual -- confirm it is intended (kept as-is to preserve behaviour).
        cls_embedding = torch.sigmoid(cls_embedding)
        logits = self.classifier(cls_embedding)
        loss = 0
        if labels is not None:
            n_labels = self.config['n_labels']
            loss = self.criterion(logits.view(-1, n_labels), labels.view(-1, n_labels))
        return loss, logits

    def configure_optimizers(self):
        """Return ``([optimizer], [scheduler])`` with a cosine schedule and warm-up.

        The step count is ``config['train_size'] / config['bs']`` rounded up to a
        whole number (at least 1); warm-up is the ``config['warmup']`` fraction of it.
        """
        optimizer = self.optimizer
        # Step counts are whole numbers; round up so a partial batch still counts.
        total_steps = max(1, math.ceil(self.config['train_size'] / self.config['bs']))
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        # NOTE(review): this covers a single pass over the data; if the scheduler is
        # stepped across several epochs the total should probably be scaled -- confirm.
        scheduler = get_cosine_schedule_with_warmup(
                        optimizer,
                        warmup_steps,
                        total_steps
                    )
        return [optimizer], [scheduler]
    

In [39]:
from torchmetrics import MetricCollection, Accuracy, Precision, Recall
from transformers import TrainingArguments, Trainer, EvalPrediction

In [None]:
class MLTHSCTrainer(Trainer):
    """Trainer whose dataloaders come from an MLTHSDataModule."""

    def __init__(self, config, data_module=None, *args, **kwargs):
        """Attach the config and data module, then initialise the base Trainer.

        Args:
            config: Dict read for ``'bs'`` (batch size).
            data_module: Data module to use. When omitted, one is built by
                ``get_data_module`` from the notebook-level frames.
            *args, **kwargs: Forwarded to ``Trainer.__init__``.
        """
        # Plain attributes, set first because get_data_module() reads self.config.
        self.config = config
        self.data_module = data_module if data_module is not None else self.get_data_module()
        if not hasattr(self.data_module, 'mlths_train_dataset'):
            # A caller-supplied module may not have built its datasets yet.
            self.data_module.setup()

        # Hand the datasets to the base class unless the caller already supplied them.
        # Positional order assumed: model, args, data_collator, train_dataset, eval_dataset.
        if len(args) < 4:
            kwargs.setdefault('train_dataset', self.data_module.mlths_train_dataset)
        if len(args) < 5:
            kwargs.setdefault('eval_dataset', self.data_module.mlths_val_dataset)
        super().__init__(*args, **kwargs)

    def get_data_module(self):
        """Build and set up a data module from the notebook-level DataFrames and tokenizer."""
        mlths_data_module = MLTHSDataModule(
            train_df,
            val_df,
            test_df,
            labels=LABELS,
            tokenizer=BERT_TOKENIZER,
            batch_size=self.config['bs'],
            max_token_len=128
        )
        mlths_data_module.setup()
        return mlths_data_module

    def get_train_dataloader(self):
        """Return the data module's shuffled training loader."""
        return self.data_module.train_dataloader()

    def get_eval_dataloader(self, eval_dataset=None):
        """Return a loader for ``eval_dataset``, or the data module's validation loader."""
        if eval_dataset is None:
            return self.data_module.val_dataloader()
        return DataLoader(eval_dataset, batch_size=self.config['bs'], shuffle=False)

    def get_test_dataloader(self, test_dataset=None):
        """Return a loader for ``test_dataset``, or the data module's test loader."""
        if test_dataset is None:
            return self.data_module.test_dataloader()
        return DataLoader(test_dataset, batch_size=self.config['bs'], shuffle=False)


In [11]:
# Training hyper-parameters collected in one place; the `config` dict is built from them.
N_EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
# Stored in config['threshold'] and kept on the classifier as `self.threshold`.
THRESHOLD = 0.5

In [12]:
# Build the data module with every argument named, then materialise its datasets.
mlths_data_module = MLTHSDataModule(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    labels=LABELS,
    tokenizer=BERT_TOKENIZER,
    batch_size=BATCH_SIZE,
    max_token_len=128,
)
mlths_data_module.setup()

# Training loader, kept at notebook level for later cells.
mlths_dl = mlths_data_module.train_dataloader()


In [13]:
# Single source of truth for the classifier and trainer settings.
config = {
    'model_name': model_name,
    'n_labels': len(LABELS),
    # Number of training samples. The classifier divides this by 'bs' to get its
    # step count, so passing the number of batches here would divide by 'bs' twice.
    'train_size': len(train_df),
    'bs': BATCH_SIZE,
    'n_epochs': N_EPOCHS,
    'lr': LEARNING_RATE,
    'warmup': 0.2,
    'w_decay': 0.01,
    'threshold': THRESHOLD
}

In [34]:
# The classifier also takes the label names and the data module; pass all three.
model = MLTHSClassifier(config, LABELS, mlths_data_module)

# Evaluate and checkpoint once per epoch; the best checkpoint by "f1" is restored at the end.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as `eval_strategy` --
# confirm against the installed version.
args = TrainingArguments(
    "checkpoint",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=config['lr'],  # same value as before, read from the shared config
    per_device_train_batch_size=config['bs'],
    per_device_eval_batch_size=config['bs'],
    num_train_epochs=config['n_epochs'],
    weight_decay=config['w_decay'],
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)


Some weights of BertModel were not initialized from the model checkpoint at gklmip/bert-tagalog-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [106]:
import torch
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix
from transformers import EvalPrediction

def multilabel_metrics(predictions, labels, threshold=0.5, classes=None):
    """Compute per-label precision/recall/F1 and the overall Hamming loss.

    Args:
        predictions: Raw classifier scores (logits), 2-D ``(samples, labels)``;
            a nested list, NumPy array or tensor.
        labels: Binary ground truth with the same shape as ``predictions``.
        threshold: Probability at or above which a label counts as predicted.
        classes: Optional label names, one per column. Defaults to the six
            dataset labels; columns beyond the given names are reported as
            ``label_<index>``.

    Returns:
        dict mapping each label name to a dict of ``"Precision"``, ``"Recall"``
        and ``"F1-Score"`` (percent strings with two decimals), plus a
        ``"Hamming Loss"`` entry formatted with four decimals.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    if classes is None:
        classes = ['Age', 'Gender', 'Physical', 'Race', 'Religion', 'Others']

    # Sigmoid turns the raw scores into per-label probabilities.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probabilities = torch.sigmoid(logits).detach().cpu().numpy()

    # Boolean masks make the counts below simple element-wise logic.
    y_pred = probabilities >= threshold
    y_true = np.asarray(labels).astype(bool)

    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            f"predictions and labels must be 2-D with equal shapes, "
            f"got {y_pred.shape} and {y_true.shape}"
        )

    # Per-label counts, summed over the sample axis.
    true_pos = np.sum(y_true & y_pred, axis=0)
    false_pos = np.sum(~y_true & y_pred, axis=0)
    false_neg = np.sum(y_true & ~y_pred, axis=0)

    label_metrics = {}
    for i in range(y_true.shape[1]):
        tp, fp, fn = int(true_pos[i]), int(false_pos[i]), int(false_neg[i])

        # A zero denominator means the metric is undefined; report 0 as before.
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

        label_name = classes[i] if i < len(classes) else f"label_{i}"

        label_metrics[label_name] = {
            "Precision": f"{precision * 100:.2f}%",
            "Recall": f"{recall * 100:.2f}%",
            "F1-Score": f"{f1 * 100:.2f}%"
        }

    # Hamming loss: fraction of (sample, label) cells where prediction and truth differ.
    mismatches = np.logical_xor(y_true, y_pred)
    hamming = float(mismatches.mean()) if mismatches.size else 0.0

    label_metrics['Hamming Loss'] = f"{hamming:.4f}"

    return label_metrics

def compute_metrics(p: EvalPrediction):
    """Metric callback: per-label report plus a scalar micro-averaged ``"f1"``.

    The scalar is needed because the training arguments select the best
    checkpoint with ``metric_for_best_model="f1"``; the per-label report alone
    has no such key.

    Args:
        p: Evaluation output exposing ``predictions`` (array or tuple whose first
            element is the logits) and ``label_ids``.

    Returns:
        The dict from ``multilabel_metrics`` with an added float ``"f1"`` entry.
    """
    threshold = 0.5
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multilabel_metrics(predictions=preds, labels=p.label_ids, threshold=threshold)

    # Micro-averaged F1 over every (sample, label) cell, using the same threshold.
    probabilities = torch.sigmoid(torch.as_tensor(preds, dtype=torch.float32)).numpy()
    y_pred = probabilities >= threshold
    y_true = np.asarray(p.label_ids).astype(bool)
    true_pos = int(np.sum(y_true & y_pred))
    false_pos = int(np.sum(~y_true & y_pred))
    false_neg = int(np.sum(y_true & ~y_pred))
    denominator = 2 * true_pos + false_pos + false_neg
    result["f1"] = (2 * true_pos / denominator) if denominator > 0 else 0.0

    return result

Custom Metrics:
Metrics per label:
Age:
  Precision: 50.00%
  Recall: 100.00%
  F1: 66.67%
Gender:
  Precision: 30.00%
  Recall: 100.00%
  F1: 46.15%
Physical:
  Precision: 60.00%
  Recall: 100.00%
  F1: 75.00%
Race:
  Precision: 50.00%
  Recall: 100.00%
  F1: 66.67%
Religion:
  Precision: 50.00%
  Recall: 100.00%
  F1: 66.67%
Others:
  Precision: 30.00%
  Recall: 100.00%
  F1: 46.15%



In [None]:
# MLTHSCTrainer takes the config and data module in addition to the usual Trainer arguments.
trainer = MLTHSCTrainer(
    config=config,
    data_module=mlths_data_module,
    model=model,
    args=args,
    tokenizer=BERT_TOKENIZER,
    compute_metrics=compute_metrics
)

In [None]:
# Run the training loop configured by `args`, then save the resulting model
# under the given output name.
trainer.train()
trainer.save_model("Multilabel-Tagalog-Hate-Speech-Classifier-Model")

In [108]:
# MLTHSDataset's signature is (data, tokenizer, labels, ...); keywords prevent the
# tokenizer and the label list from being swapped.
mlths_train_dataset = MLTHSDataset(train_df, tokenizer=BERT_TOKENIZER, labels=LABELS)
mlths_val_dataset = MLTHSDataset(val_df, tokenizer=BERT_TOKENIZER, labels=LABELS)
mlths_test_dataset = MLTHSDataset(test_df, tokenizer=BERT_TOKENIZER, labels=LABELS)


     ID                                               Text  Age  Gender  \
0    81  yak tom napaka kadiri neto yak talaga hahaha e...    0       0   
1    82  taena haha kung ano ano pang sinasabi tapos li...    0       0   
2    83  pukinginang mga chingchong mandurugas kayo coo...    0       0   
3    84  PAKYU KAYO ISLAM MGA ISIS ABUSAYAF GAGO LUMAYA...    0       0   
4    85  MuKha mo mukhang kabayo magsama kayo ni quibol...    0       0   
5    86  Yung 2 teenagers na sinigawan ko Dahil umaawra...    1       1   
6    87  TANGINA NI GADGET ADDICT, WALA NANG NAIDULOT K...    0       0   
7    88  tangina talaga ng mga straight boys (and girls...    0       1   
8    89  Sobrang excited ko na sa mundo kung saan patay...    1       0   
9    90  nakakatangina talaga ng mga dating daan. bobo ...    1       1   
10   91  pansin nyo ba napakabobo ng mga INC? kakaurat ...    0       0   
11   92  Putangina niyo, wala na sana kayong eardrums s...    1       0   
12   93  tangina mo lemue

In [109]:
# Run inference on the test dataset and print the shapes of the predictions and label ids.
# NOTE(review): the recorded output of this cell is a TypeError mentioning `LightningModule`,
# which suggests `trainer` was bound to a different trainer object when it was last run --
# re-run from a fresh kernel to confirm.
predictions = trainer.predict(mlths_test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `MLTHSDataset`