# Evaluation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix
from datasets import Dataset
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, EvalPrediction

In [3]:
model_checkpoint = "model-trial-1"
trained_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained("gklmip/bert-tagalog-base-uncased")

In [4]:
LABELS = ["Age", "Gender", "Physical", "Race", "Religion", "Others"]
id2label = {idx:label for idx, label in enumerate(LABELS)}
label2id = {label:idx for idx, label in enumerate(LABELS)}

In [5]:
dataset = pd.read_csv('./dataset/cleaned_mlthsc.csv', nrows=1000)
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# We will use only the testing dataset

In [29]:
# Convert text into BERT representations

def encode_data(data):
    text = data["Text"]

    encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128,
            return_token_type_ids=False,
            return_attention_mask=True
        )
    
    labels = data[LABELS]
    
    representation = {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.FloatTensor(labels)
    }

    return representation


### Create encoded testing dataset

In [None]:
# Create a list of encoded examples for test data
encoded_test_data = [encode_data(row) for _, row in test_data.iterrows()]

# Combine the encoded examples into a dictionary
encoded_test_dict = {key: [example[key] for example in encoded_test_data] for key in encoded_test_data[0]}

# Convert the dictionaries to datasets
test_dataset = Dataset.from_dict(encoded_test_dict)

In [14]:
# Print the first few examples to verify the encoding
print(test_dataset)
print(test_dataset[0])
print(test_dataset[0]['labels'])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 200
})
{'input_ids': [101, 18215, 6067, 1116, 28888, 3591, 1863, 3587, 32560, 1738, 14683, 1744, 1894, 12936, 19451, 1741, 5397, 2309, 1111, 1894, 3407, 51114, 1894, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [0.0, 0.0, 0.0, 0.0, 

In [17]:
y_true = test_dataset['labels']
y_true

[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 1.0

In [32]:
def preprocess_text(text):
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')
    return encoding

In [62]:
y_pred = [] # list of lists of predictions

for text in test_data['Text']:

    with torch.no_grad():
        model_outputs = trained_model(**preprocess_text(text))

    probabilities = np.array(model_outputs.logits.sigmoid().tolist()[0])
    
    prediction = np.zeros(len(probabilities))
    prediction[np.where(probabilities >= 0.5)] = 1

    print(prediction)

    y_pred.append(prediction)

y_pred



[0. 0. 0. 0. 0. 1.]
[0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 0. 1. 1. 0. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 1.]
[1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[1. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[1. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 0.]
[0. 0. 1. 0. 0. 0.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 1. 0. 0.]
[1. 1. 0. 0. 0. 0.]
[0. 0. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 1.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 1. 0. 0. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 1.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[1. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0.]
[1. 1. 1. 0. 0. 0.]
[0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 1.]
[1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1.]


[array([0., 0., 0., 0., 0., 1.]),
 array([0., 0., 1., 0., 0., 0.]),
 array([0., 0., 0., 0., 1., 0.]),
 array([0., 0., 1., 1., 0., 0.]),
 array([0., 0., 0., 1., 0., 0.]),
 array([0., 0., 0., 1., 0., 0.]),
 array([0., 0., 0., 0., 0., 1.]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 0., 0.]),
 array([0., 1., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1.]),
 array([0., 1., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 1., 0.]),
 array([0., 0., 0., 0., 1., 0.]),
 array([0., 1., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 1., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 0., 0.]),
 array([0., 0., 0., 0., 1., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 1., 0.]),
 array([0., 0., 1., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 1., 0.]),
 array([0., 0., 0., 1., 0., 0.]),
 array([0., 0.

In [63]:
len(y_pred)

200

In [12]:
def multilabel_metrics(logits, labels, threshold=0.5):

    print("predictions:", logits)

    # Apply sigmoid activation to logits/raw scores from the classifier 
    sigmoid = torch.nn.Sigmoid()
    probabilities = sigmoid(torch.Tensor(logits))

    print("probabilities:", probabilities)

    # Set labels as 0 or 1 with 0.5 threshold
    y_pred = np.zeros(probabilities.shape)                  # Create an array of 0s with size: number of labels (six)
    y_pred[np.where(probabilities >= threshold)] = 1        # Set each label to 1 only if logit is greater than threshold (0.5) 

    y_true = np.zeros(labels.shape)
    y_true[np.where(labels == 1)] = 1

    print("Y PRED:", y_pred)
    print("Y TRUE:", y_true)
    
    confusion_matrix = multilabel_confusion_matrix(y_true, y_pred)
    print(confusion_matrix)
    label_metrics = {}
    
    classes = ['Age', 'Gender', 'Physical', 'Race', 'Religion', 'Others']

    for i in range(confusion_matrix.shape[0]):
        TP = confusion_matrix[i, 1, 1]  # True Positives
        FP = confusion_matrix[i, 0, 1]  # False Positives
        FN = confusion_matrix[i, 1, 0]  # False Negatives
        TN = confusion_matrix[i, 0, 0]  # True Negatives

        # TN FP
        # FN TP 

        precision = TP / (TP + FP) if TP + FP > 0 else 0
        recall = TP / (TP + FN) if TP + FN > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

        label_name = classes[i]

        label_metrics[label_name] = {
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1_score
        }

    # Calculate Hamming Loss
    xor_result = np.logical_xor(y_true, y_pred)
    xor_sum = np.sum(xor_result)
    hamming_loss = xor_sum / (y_true.shape[0] * y_true.shape[1])
    
    label_metrics['Hamming Loss'] = hamming_loss

    return label_metrics

def compute_metrics():
    preds = get_predictions()

    print("preds", preds)

    result = multilabel_metrics(predictions=preds, labels=p.label_ids, threshold=0.5)
    return result

In [None]:
def preprocess_text(text):
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')
    return encoding

In [None]:
def get_y_pred(test_sentence):

    encoded_test_sentence = preprocess_text(test_sentence)

    with torch.no_grad():
        model_outputs = trained_model(**encoded_test_sentence)

    predictions = model_outputs.logits.sigmoid().tolist()[0]  # Apply sigmoid to get probabilities

    # Get all labels
    label_probabilities = [{"name": label, "probability": f"{prob * 100:.2f}%"} for label, prob in zip(LABELS, predictions)]

    # Sort label probabilities in descending order
    label_probabilities = sorted(label_probabilities, key=lambda item: -float(item["probability"][:-1]))
    print(label_probabilities)

    threshold = 0.5

    # Labels greater than 0.5 threshold
    predicted_labels = [(label, f"{pred*100:.2f}%") for label, pred in zip(LABELS, predictions) if pred >= threshold]
    print("Input:", test_sentence)
    print("Probabilities: ", label_probabilities)

    print("Labels:")
    for label, probability in predicted_labels:
        print(f"({label}, {probability})")


    return label_probabilities

In [None]:
import csv 

csv_file = "result.csv"

header = ["ID", "Text", "Age", "Gender", "Physical", "Race", "Religion", "Others"]

with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

def evaluate_dataset(dataset):

    token_id = 1

    with open(dataset, 'r') as f:
        tagged_sentences = f.readlines()

    for text in tagged_sentences: 
    
        sentence = preprocess_sentence(tagged_sentence)
        sentence = sentence.split()

        y_pred_labels = get_predicted_tags(tagged_sentence)
        y_true_labels = get_actual_tags(tagged_sentence)

**Goals:** 
- Get the y_pred which is a list of predictions for all 1000 rows 
- Get the y_true which is a list of true labels for all 1000 rows 

**Pseudocode:** 
- For each row in dataset
    - Get the encpded

