In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [4]:
bert_model = "gklmip/bert-tagalog-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_model)

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('./dataset/cleaned_mlthsc.csv', nrows=1000)

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)



In [7]:
LABELS = ["Age", "Gender", "Physical", "Race", "Religion", "Others"]
id2label = {idx:label for idx, label in enumerate(LABELS)}
label2id = {label:idx for idx, label in enumerate(LABELS)}

In [6]:
def preprocess_data(data):
    text = data["Text"]

    encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128,
            return_token_type_ids=False,
            return_attention_mask=True
        )
    
    labels = data[LABELS]
    
    representation = {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.FloatTensor(labels)
    }

    return representation


In [6]:
from datasets import Dataset
import torch

# Create a list of encoded examples for train and test data
encoded_train_data = [preprocess_data(row) for _, row in train_data.iterrows()]
encoded_test_data = [preprocess_data(row) for _, row in test_data.iterrows()]

# Combine the encoded examples into a dictionary
encoded_train_dict = {key: [example[key] for example in encoded_train_data] for key in encoded_train_data[0]}
encoded_test_dict = {key: [example[key] for example in encoded_test_data] for key in encoded_test_data[0]}

# Convert the dictionaries to datasets
train_dataset = Dataset.from_dict(encoded_train_dict)
test_dataset = Dataset.from_dict(encoded_test_dict)

# Print the first few examples to verify the encoding
print(train_dataset)
print(test_dataset)

print(train_dataset[0])
print(test_dataset[0])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 200
})
{'input_ids': [101, 1767, 23941, 110, 160, 43209, 5230, 1779, 1767, 15721, 5489, 1109, 8774, 25301, 1754, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [30]:
model = AutoModelForSequenceClassification.from_pretrained(bert_model,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(LABELS),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at gklmip/bert-tagalog-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
batch_size = 32
metric_name = "hamming_loss"

In [9]:
args = TrainingArguments(
    "checkpoint",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [10]:
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix
from transformers import EvalPrediction
import torch

In [5]:
def multilabel_metrics(predictions, labels, threshold=0.5):

    print("predictions:", predictions)

    # Apply sigmoid activation to logits/raw scores from the classifier 
    sigmoid = torch.nn.Sigmoid()
    probabilities = sigmoid(torch.Tensor(predictions))

    print("probabilities:", probabilities)

    # Filter out labels using the 0.5 threshold
    y_pred = np.zeros(probabilities.shape)
    y_pred[np.where(probabilities >= threshold)] = 1

    y_true = np.zeros(labels.shape)
    y_true[np.where(labels == 1)] = 1

    print("Y PRED:", y_pred)
    print("Y TRUE:", y_true)
    
    confusion_matrix = multilabel_confusion_matrix(y_true, y_pred)
    print(confusion_matrix)
    label_metrics = {}
    
    classes = ['Age', 'Gender', 'Physical', 'Race', 'Religion', 'Others']

    for i in range(confusion_matrix.shape[0]):
        TP = confusion_matrix[i, 1, 1]  # True Positives
        FP = confusion_matrix[i, 0, 1]  # False Positives
        FN = confusion_matrix[i, 1, 0]  # False Negatives
        TN = confusion_matrix[i, 0, 0]  # True Negatives

        # TN FP
        # FN TP 

        precision = TP / (TP + FP) if TP + FP > 0 else 0
        recall = TP / (TP + FN) if TP + FN > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

        label_name = classes[i]

        label_metrics[label_name] = {
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1_score
        }

    # Calculate Hamming Loss
    xor_result = np.logical_xor(y_true, y_pred)
    xor_sum = np.sum(xor_result)
    hamming_loss = xor_sum / (y_true.shape[0] * y_true.shape[1])
    
    label_metrics['Hamming Loss'] = hamming_loss

    return label_metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    print("preds", preds)

    result = multilabel_metrics(predictions=preds, labels=p.label_ids, threshold=0.5)
    return result

NameError: name 'EvalPrediction' is not defined

In [6]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

NameError: name 'Trainer' is not defined

In [3]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]))

tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()
(tn, fp, fn, tp)

[[0 2]
 [1 1]]


(0, 2, 1, 1)

In [14]:
trainer.train()

  0%|          | 0/125 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  2%|▏         | 2/125 [03:03<3:16:26, 95.82s/it]

: 

# Evaluation

In [46]:
model_checkpoint = "model-trial-1"
trained_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(bert_model)

In [40]:
index = 14
row = test_data.iloc[index]
test_sentence = row['Text']
row

ID                                                        939
Text        TANGINA! LAGI NALANG NADADAMAY YUNG LAG LILING...
Age                                                         0
Gender                                                      0
Physical                                                    0
Race                                                        1
Religion                                                    1
Others                                                      0
Name: 938, dtype: object

In [41]:
def preprocess_text(text):
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')
    return encoding

In [42]:
encoded_test_sentence = preprocess_text(test_sentence)

In [43]:
with torch.no_grad():
    model_outputs = trained_model(**encoded_test_sentence)

print(model_outputs)


SequenceClassifierOutput(loss=None, logits=tensor([[-2.9301, -3.2368, -2.9677,  2.2064,  1.7061, -3.6536]]), hidden_states=None, attentions=None)


In [44]:
predictions = model_outputs.logits.sigmoid().tolist()[0]  # Apply sigmoid to get probabilities

print(predictions)

[0.050687409937381744, 0.03780565410852432, 0.04890599846839905, 0.9008221626281738, 0.8463234901428223, 0.02524476684629917]


In [45]:
threshold = 0.5
predicted_labels = [(label, f"{pred*100:.2f}%") for label, pred in zip(LABELS, predictions) if pred >= threshold]
print("Input:", test_sentence)
print("Labels:")
for label, probability in predicted_labels:
    print(f"({label}, {probability})")

Input: TANGINA! LAGI NALANG NADADAMAY YUNG LAG LILINGKOD KO AH? TSK PAKITANG TAO? NAKAKA PUNYETA TALAGA MGA PINAG SASBI MK EH NOH PALIBHASA MUSLIM MUSLIMAN MA KAYA KA GANYAN
Labels:
(Race, 90.08%)
(Religion, 84.63%)


### Inference on test sentence

Palagay ng bagong test_sentence para matry kung tama ang labels

In [103]:
test_sentence = "ambobo ng mga batang katoliko na bisaya"

encoded_test_sentence = preprocess_text(test_sentence)

with torch.no_grad():
    model_outputs = trained_model(**encoded_test_sentence)

predictions = model_outputs.logits.sigmoid().tolist()[0]  # Apply sigmoid to get probabilities
label_probabilities = [(label, f"{prob * 100:.2f}%") for label, prob in zip(LABELS, predictions)]

threshold = 0.5
predicted_labels = [(label, f"{pred*100:.2f}%") for label, pred in zip(LABELS, predictions) if pred >= threshold]
print("Input:", test_sentence)
print("Probabilities: ", label_probabilities)

print("Labels:")
for label, probability in predicted_labels:
    print(f"({label}, {probability})")

Input: ambobo ng mga batang katoliko na bisaya
Probabilities:  [('Age', '50.16%'), ('Gender', '6.47%'), ('Physical', '5.72%'), ('Race', '29.04%'), ('Religion', '89.39%'), ('Others', '1.56%')]
Labels:
(Age, 50.16%)
(Religion, 89.39%)


In [None]:
# 