<a href="https://colab.research.google.com/github/yogasgm/prototype_finetuning_pytorch/blob/main/Prototype_Multilabel_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only); the checkpoint paths defined later in this
# notebook live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Importing libraries

In [None]:
!pip install transformers

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import shutil
import sys
from sklearn.model_selection import train_test_split

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.8 MB/s[0m eta [36m0:00:0

# Setting seed for reproducibility

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG, and PyTorch (CPU and
    all CUDA devices), then switches cuDNN to deterministic mode with
    auto-tuning (benchmark) disabled.

    Args:
        seed: integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Fix all RNGs up front; the same value (43) is reused as random_state for the
# train/validation/test split below.
set_seed(43)

# Downloading dataset

In [None]:
from requests import get as rget

# NOTE(review): this URL carries basic-auth credentials inline (shown redacted as
# xxx:xxxxxxxx). Real credentials must never be committed with the notebook;
# prefer reading a token from an environment variable or getpass.
res = rget("https://xxx:xxxxxxxx@raw.githubusercontent.com/yogasgm/indonesian-online-toxicity-detection/main/dataset/targeted-online-toxicity-cat-processed.csv")
# Fail loudly on 4xx/5xx instead of silently writing an error page to file.csv
# and then trying to parse it as CSV.
res.raise_for_status()
# Plain write mode is enough: the file is only written here, never read back
# through this handle.
with open('file.csv', 'wb') as f:
    f.write(res.content)

# The full labelled dataset; it is split into train/validation/test later on.
train_df = pd.read_csv('file.csv')

In [None]:
# Sanity check: column names, dtypes and non-null counts of the raw dataset.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7094 entries, 0 to 7093
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   comment_text              7094 non-null   object
 1   religion_creed            7094 non-null   int64 
 2   race_ethnicity            7094 non-null   int64 
 3   physical_disability       7094 non-null   int64 
 4   gender_sexualorientation  7094 non-null   int64 
 5   other                     7094 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 332.7+ KB


In [None]:
# List the column names (one text column followed by the label columns).
train_df.columns

Index(['comment_text', 'religion_creed', 'race_ethnicity',
       'physical_disability', 'gender_sexualorientation', 'other'],
      dtype='object')

# Selecting required columns

In [None]:
# Keep the comment text plus the five binary label columns, in this order.
train_df = train_df[['comment_text', 'religion_creed', 'race_ethnicity', 'physical_disability', 'gender_sexualorientation', 'other',]]

In [None]:
# Label columns, in the order used for the target vectors built by CustomDataset.
target_list = ['religion_creed', 'race_ethnicity', 'physical_disability', 'gender_sexualorientation', 'other',]

# Preparing the tokenizer

In [None]:
# Maximum sequence length in tokens passed to the tokenizer (BERT allows at most 512).
MAX_LEN = 128

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
# Download the tokenizer that matches the pretrained encoder loaded in BERTClass
# (same 'indolem/indobertweet-base-uncased' checkpoint name).
tokenizer = BertTokenizer.from_pretrained('indolem/indobertweet-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes comments on access and pairs them with label vectors.

    Args:
        df: DataFrame with a 'comment_text' column and one column per label.
        tokenizer: object exposing ``encode_plus`` (here a BertTokenizer).
        max_len: sequence length every sample is padded/truncated to.
        target_cols: optional list of label column names. When omitted, the
            notebook-level ``target_list`` is used, as before.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['comment_text']
        # Fall back to the global target_list only when no columns are passed,
        # so existing three-argument calls keep working unchanged.
        label_columns = target_list if target_cols is None else list(target_cols)
        self.targets = self.df[label_columns].values
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the tokenized sample at position ``index``.

        Returns:
            dict with 1-D tensors 'input_ids', 'attention_mask' and
            'token_type_ids' (flattened tokenizer output) and a float tensor
            'targets' holding the label row.
        """
        # Positional lookup: the label-based self.title[index] raises KeyError
        # whenever the frame's index is not exactly 0..n-1 (e.g. straight after
        # train_test_split without reset_index). self.targets is already a
        # positional NumPy array, so both now agree.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace into a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            # Float targets, as consumed by the BCE-with-logits loss.
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Splitting & Tokenizing Dataset

In [None]:
# 80/10/10 split: carve off 20% first, then halve that hold-out into
# validation and test. The same random_state keeps the split reproducible.
train_df, temp_df = train_test_split(train_df, test_size=0.2, random_state=43)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=43)

# Give every split a fresh 0..n-1 index (the old index is dropped).
train_df, val_df, test_df = [
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
]

In [None]:
# Number of positive samples per label in each split.
train_counts = train_df[target_list].sum(axis=0)
val_counts = val_df[target_list].sum(axis=0)
test_counts = test_df[target_list].sum(axis=0)

# Report the three splits in order: training, validation, test.
print("Label distribution in the training set:\n", train_counts)
print("\nLabel distribution in the validation set:\n", val_counts)
print("\nLabel distribution in the test set:\n", test_counts)

Label distribution in the training set:
 religion_creed              1135
race_ethnicity               964
physical_disability         1321
gender_sexualorientation    1005
other                       1706
dtype: int64

Label distribution in the validation set:
 religion_creed              145
race_ethnicity              113
physical_disability         165
gender_sexualorientation    130
other                       216
dtype: int64

Label distribution in the test set:
 religion_creed              139
race_ethnicity              124
physical_disability         173
gender_sexualorientation    137
other                       205
dtype: int64


In [None]:
# Share of samples (in percent) that carry each label, per split.
train_counts_percentage = (train_df[target_list].sum(axis=0) / len(train_df)) * 100
val_counts_percentage = (val_df[target_list].sum(axis=0) / len(val_df)) * 100
test_counts_percentage = (test_df[target_list].sum(axis=0) / len(test_df)) * 100

# Report the three splits in order: training, validation, test.
print("Label distribution in the training set:\n", train_counts_percentage)
print("\nLabel distribution in the validation set:\n", val_counts_percentage)
print("\nLabel distribution in the test set:\n", test_counts_percentage)

Label distribution in the training set:
 religion_creed              20.000000
race_ethnicity              16.986784
physical_disability         23.277533
gender_sexualorientation    17.709251
other                       30.061674
dtype: float64

Label distribution in the validation set:
 religion_creed              20.451340
race_ethnicity              15.937941
physical_disability         23.272214
gender_sexualorientation    18.335684
other                       30.465444
dtype: float64

Label distribution in the test set:
 religion_creed              19.577465
race_ethnicity              17.464789
physical_disability         24.366197
gender_sexualorientation    19.295775
other                       28.873239
dtype: float64


In [None]:
# (rows, columns) of the training split.
train_df.shape

(5675, 6)

In [None]:
# (rows, columns) of the validation split.
val_df.shape

(709, 6)

In [None]:
# Show the validation split (the rendered output is truncated by pandas).
val_df

Unnamed: 0,comment_text,religion_creed,race_ethnicity,physical_disability,gender_sexualorientation,other
0,ju ju semakin banyak yg hate nyebut kalian tua...,0,0,1,0,0
1,si wts tua bangka jelek gembrot dan bau terasi...,0,0,1,0,0
2,justru islam makin keliatan jelek sebab komen ...,1,0,0,1,0
3,kesel bgt bgstttt punya kelompok pd ga bisa di...,0,0,0,0,1
4,israel itu cuma gelar yang di beri jibril ke y...,1,0,0,0,0
...,...,...,...,...,...,...
704,lgbt goblok homo biadab ini hukum mati udh pantes,0,0,0,1,0
705,bagian dari melestarikan dungu biadab islam m...,1,0,0,0,0
706,goblok lu yg namanya bisnis ya harus sesuai ke...,0,1,0,0,0
707,najis cina sipit galak maksudnya setuju itu mu...,0,1,1,0,0


In [None]:
# Show the test split (the rendered output is truncated by pandas).
test_df

Unnamed: 0,comment_text,religion_creed,race_ethnicity,physical_disability,gender_sexualorientation,other
0,kalo fetish paha paha sembarang orang boleh di...,0,0,0,1,0
1,halah otak mesum yang ada dipikirannya kesetar...,0,0,1,1,0
2,dasar lu jawa norak,0,1,0,0,0
3,eksekusi aja nih orang,0,0,0,0,1
4,mampus haha rt nangis dipojokan omah omah gasa...,0,0,1,0,0
...,...,...,...,...,...,...
705,ada org sange sama suara org tolol,0,0,0,1,0
706,jangankan bapaknya gua yang baca aja muak ama lu,0,0,0,0,1
707,pgn gue gebukin tapi terlalu kasar untuk gue y...,0,0,0,0,1
708,anjeeeerrr inget pas mantan gw tipes gw ajak n...,0,0,0,1,0


In [None]:
# Wrap each split in a CustomDataset sharing the same tokenizer and MAX_LEN.
train_dataset, valid_dataset, test_dataset = [
    CustomDataset(split_frame, tokenizer, MAX_LEN)
    for split_frame in (train_df, val_df, test_df)
]

In [None]:
# Number of training samples; should equal the row count of train_df.
len(train_dataset)

5675

# Setting hyperparameters

In [None]:
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 16
# Batch size of the validation DataLoader (also reused for the test DataLoader).
VALID_BATCH_SIZE = 16
# Maximum number of epochs handed to train_model (early stopping may end sooner).
EPOCHS = 5
# Learning rate of the AdamW optimizer.
LEARNING_RATE = 5e-5

In [None]:
# Preparing the DataLoaders
# Reshuffle the training samples every epoch: with shuffle=False each epoch saw
# exactly the same batches in the same order. The order stays reproducible
# because set_seed() seeds torch's global RNG.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Evaluation gains nothing from shuffling, so the validation order stays fixed.
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Show which device was selected.
device

device(type='cuda')

# Additional functions for loading and saving checkpoints

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    The file must hold a dict with the keys 'state_dict', 'optimizer',
    'epoch' and 'valid_loss_min'.

    Args:
        checkpoint_fpath: path of the checkpoint file to load.
        model: model that the stored parameters are loaded into (in place).
        optimizer: optimizer that the stored optimizer state is loaded into
            (in place).
        map_location: optional device (or any value ``torch.load`` accepts)
            onto which stored tensors are remapped. When omitted, the device of
            the model's first parameter is used, so a checkpoint written on a
            GPU can still be restored on a CPU-only machine.

    Returns:
        Tuple ``(model, optimizer, epoch, valid_loss_min)`` where the last two
        are the values stored in the checkpoint.
    """
    if map_location is None:
        # Default to wherever the model currently lives; a model without
        # parameters leaves map_location as None (torch.load's own default).
        first_param = next(model.parameters(), None)
        if first_param is not None:
            map_location = first_param.device

    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    # Copy the stored weights and optimizer state into the given objects.
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['valid_loss_min']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint to disk and optionally copy it as the best model.

    Args:
        state: checkpoint object to serialize with ``torch.save``.
        is_best: when true, the freshly written file is also copied to
            ``best_model_path``.
        checkpoint_path: destination of the checkpoint file.
        best_model_path: destination of the best-model copy.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Duplicate the file just written rather than serializing a second time.
    shutil.copyfile(checkpoint_path, best_model_path)

# Training the Model

Defining and Initializing the BERT Classification Model

In [None]:
class BERTClass(torch.nn.Module):
    """Pretrained IndoBERTweet encoder with a dropout + linear multi-label head.

    Args:
        num_labels: number of output logits, one per label. Defaults to 5,
            the number of label columns used in this notebook.
        dropout: dropout probability applied to the pooled encoder output.
            Defaults to 0.3.
    """

    def __init__(self, num_labels=5, dropout=0.3):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('indolem/indobertweet-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Take the encoder width from its config instead of hard-coding 768,
        # so the head always matches the loaded checkpoint.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return raw logits (one per label); no sigmoid is applied here.

        Args:
            input_ids: token id tensor passed to the encoder.
            attn_mask: attention mask passed to the encoder as ``attention_mask``.
            token_type_ids: segment id tensor passed to the encoder.
        """
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        # Classify from the encoder's pooled output.
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

# Instantiate the classifier and move it to the selected device; as the last
# expression of the cell this also displays the module structure.
model = BERTClass()
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

Setting Up the Loss Function and Optimizer

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and float 0/1 targets.

    Args:
        outputs: raw (pre-sigmoid) logits.
        targets: float tensor of the same shape as ``outputs``.

    Returns:
        Scalar loss tensor (mean over all elements).
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    batch_loss = criterion(outputs, targets)
    return batch_loss

# AdamW over all model parameters (encoder and classification head alike).
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

Initialization of Validation Target and Output Lists

In [None]:
# Module-level accumulators that train_model appends to during validation.
# They are never cleared inside train_model, so they grow across epochs (and
# across repeated calls) unless this cell is re-run.
val_targets=[]
val_outputs=[]

Training and Validation Loop with Early Stopping

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, patience):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Relies on the notebook-level names ``device``, ``loss_fn``, ``save_ckp``,
    ``val_targets`` and ``val_outputs``.

    Args:
        n_epochs: maximum number of epochs to run.
        training_loader: iterable of batch dicts with the keys 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets'.
        validation_loader: iterable of batch dicts with the same keys.
        model: called as ``model(ids, mask, token_type_ids)``.
        optimizer: optimizer updating ``model``'s parameters.
        checkpoint_path: file the checkpoint is written to whenever the
            validation loss does not get worse.
        best_model_path: file that checkpoint is copied to as the best model.
        patience: number of consecutive epochs without improvement after which
            training stops early.

    Returns:
        The same ``model`` object in its state after the last executed epoch.
        It is NOT reloaded from the best checkpoint.
    """

    def _to_device(batch):
        # Move one batch onto the global device with the dtypes used throughout.
        return (
            batch['input_ids'].to(device, dtype=torch.long),
            batch['attention_mask'].to(device, dtype=torch.long),
            batch['token_type_ids'].to(device, dtype=torch.long),
            batch['targets'].to(device, dtype=torch.float),
        )

    # float('inf') instead of np.Inf: that alias no longer exists in NumPy 2.0.
    valid_loss_min = float('inf')
    no_improve = 0

    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids, mask, token_type_ids, targets = _to_device(data)

            # Clear gradients once per step (the original cleared them twice).
            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean of the per-batch losses seen so far this epoch.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################
        # validate the model #
        ######################

        model.eval()

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids, mask, token_type_ids, targets = _to_device(data)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                # Appended to the module-level lists; they are not reset per
                # epoch, so they accumulate the outputs of every epoch.
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # train_loss / valid_loss are already means over the batches (running
        # average above). The original divided them by the number of batches a
        # second time, which shrank the reported and checkpointed losses by
        # that factor; that extra division is removed here.
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # create checkpoint variable and add important data
        checkpoint = {
            'epoch': epoch + 1,  # epoch to resume from
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # Save only when the validation loss did not get worse; otherwise count
        # towards early stopping.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                valid_loss
            ))
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping due to no improvement in validation loss")
                break

    return model

In [None]:
# Checkpoint locations on the mounted Google Drive. The file names encode the
# run configuration (batch size 16, learning rate 5e-5).

# ckpt_path: latest saved checkpoint; best_model_path: copy kept as best model.
ckpt_path = "/content/gdrive/MyDrive/curr_ckpt_16_5e-5_IndoBT_TargetedCat"
best_model_path = "/content/gdrive/MyDrive/best_model_16_5e-5_IndoBT_TargetedCat.pt"

# Start Training

In [None]:
# Train for at most EPOCHS epochs; stop early after 2 consecutive epochs
# without an improvement in validation loss.
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, patience=2)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000776 	Average Validation Loss: 0.004673
Validation loss decreased (inf --> 0.004673).  Saving model ...
############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000478 	Average Validation Loss: 0.004419
Validation loss decreased (0.004673 --> 0.004419).  Saving model ...
############# Epoch 3: Training Start   #############
############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############
############# Epoch 3: Validation End     #############
Epoch: 3 	Avgerage Training Loss: 0.000356 	Average

In [None]:
# Load the saved checkpoint
# Restores the best weights into `model` (and the optimizer state), replacing
# the last-epoch weights left behind by train_model.
model, optimizer, start_epoch, valid_loss_min = load_ckp(best_model_path, model, optimizer)

print(f'The validation loss of the best saved model is: {valid_loss_min}')

The validation loss of the best saved model is: 0.004418710245762342


# Evaluation on the Test Set

In [None]:
# Evaluate the best checkpoint on the held-out test split.
# To score a different DataFrame instead, build a CustomDataset from it:
#new_dataset = CustomDataset(new_df, tokenizer, MAX_LEN)
new_dataset = test_dataset

# Create DataLoader (fixed order, so outputs line up with the rows of the split)
new_data_loader = torch.utils.data.DataLoader(new_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

# Load the best model weights (makes this cell independent of the previous one)
model, optimizer, start_epoch, valid_loss_min = load_ckp(best_model_path, model, optimizer)

# Switch model to the evaluation mode
model.eval()

# Sigmoid probabilities and ground-truth rows, collected batch by batch
new_outputs = []
new_targets = []
test_loss = 0.0

# Define loss function
# NOTE(review): this rebinds the module-level name `loss_fn` (previously a
# function) to a BCEWithLogitsLoss instance; both are called the same way.
loss_fn = torch.nn.BCEWithLogitsLoss()

# Pass new data through the model
with torch.no_grad():
    for batch_idx, data in enumerate(new_data_loader):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        # Calculate loss, weighted by batch size so the final average is per sample
        loss = loss_fn(outputs, targets)
        test_loss += loss.item() * data['input_ids'].size(0)

        new_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        new_targets.extend(targets.cpu().detach().numpy().tolist())

# Average the test loss over all samples in the dataset
test_loss = test_loss / len(new_data_loader.dataset)

print(f'Test Loss: {test_loss:.6f}')

Test Loss: 0.184774


In [None]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Convert the outputs and targets to numpy arrays (one row per sample, one column per label)
new_outputs_np = np.array(new_outputs)
new_targets_np = np.array(new_targets)

# Threshold the sigmoid probabilities into binary predictions
# (0.5 is used as an example; tune it to your requirements)
new_outputs_bin = (new_outputs_np > 0.5)

# Per-label precision/recall/F1; rows 0-4 follow the column order of the targets
# NOTE(review): the recorded run emitted sklearn `_warn_prf` warnings here,
# presumably from samples without any predicted or true label; consider setting
# `zero_division` explicitly.
print(classification_report(new_targets_np, new_outputs_bin))

# Calculate macro (unweighted mean over labels) and micro (pooled over all
# label decisions) metrics
precision_macro = precision_score(new_targets_np, new_outputs_bin, average='macro')
recall_macro = recall_score(new_targets_np, new_outputs_bin, average='macro')
f1_macro = f1_score(new_targets_np, new_outputs_bin, average='macro')

precision_micro = precision_score(new_targets_np, new_outputs_bin, average='micro')
recall_micro = recall_score(new_targets_np, new_outputs_bin, average='micro')
f1_micro = f1_score(new_targets_np, new_outputs_bin, average='micro')

print(f'Macro Precision: {precision_macro} Macro Recall: {recall_macro} Macro F1: {f1_macro}')
print(f'Micro Precision: {precision_micro} Micro Recall: {recall_micro} Micro F1: {f1_micro}')

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       139
           1       1.00      0.86      0.93       124
           2       0.97      0.88      0.92       173
           3       0.75      0.64      0.69       137
           4       0.81      0.80      0.80       205

   micro avg       0.85      0.81      0.83       778
   macro avg       0.86      0.81      0.83       778
weighted avg       0.86      0.81      0.83       778
 samples avg       0.79      0.77      0.77       778

Macro Precision: 0.8568463033634496 Macro Recall: 0.8070177536251993 Macro F1: 0.8289835258528886
Micro Precision: 0.85 Micro Recall: 0.8084832904884319 Micro F1: 0.828722002635046


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
# NOTE(review): both inputs are 2-D label-indicator arrays; for that input
# sklearn is expected to report subset (exact-match) accuracy, i.e. a sample
# counts as correct only when all five labels match — confirm against the docs.
accuracy = accuracy_score(new_targets_np, new_outputs_bin)

print(f'Accuracy: {accuracy}')

Accuracy: 0.7352112676056338


# Test with New Input Text

In [None]:
def classify_text(model, text, tokenizer, max_len, threshold=0.5):
    """Classify one text and return per-label probabilities and predicted labels.

    Args:
        model: module called as ``model(input_ids, attention_mask, token_type_ids)``
            and returning one logit per label.
        text: the text to classify.
        tokenizer: object exposing ``encode_plus`` (here a BertTokenizer).
        max_len: sequence length the text is padded/truncated to.
        threshold: a label is predicted when its probability is strictly
            greater than this value.

    Returns:
        Tuple ``(probabilities, predicted_labels)``: ``probabilities`` is a
        nested list with one row of sigmoid probabilities for the single input;
        ``predicted_labels`` lists the label names above the threshold.
    """
    # Define the class labels in the same order that the model was trained on
    class_labels = ['religion_creed', 'race_ethnicity', 'physical_disability', 'gender_sexualorientation', 'other']

    # Run on the device the model lives on rather than relying on a global
    # `device`; a model without parameters falls back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # Prepare the text
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(model_device)
    attention_mask = inputs['attention_mask'].to(model_device)
    token_type_ids = inputs["token_type_ids"].to(model_device)

    # Inference must run in eval mode: with dropout active the probabilities
    # would be random. The previous train/eval mode is restored afterwards so
    # the call has no lasting effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask, token_type_ids)
    finally:
        model.train(was_training)

    # Convert to probabilities
    probabilities = torch.sigmoid(outputs).cpu().tolist()

    # Convert the probabilities to labels
    predicted_labels = [class_labels[i] for i, prob in enumerate(probabilities[0]) if prob > threshold]

    return probabilities, predicted_labels


In [None]:
# Example 1: classify a single text with the default 0.5 threshold.
# NOTE(review): the input string is empty in the saved notebook although the
# recorded outputs differ between the example cells — presumably the example
# text was removed before sharing; fill it in before running.
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

Probabilities: [[0.034384358674287796, 0.9888610243797302, 0.37569937109947205, 0.005026870407164097, 0.005398421548306942]]
Predicted labels: ['race_ethnicity']


In [None]:
# Example 2: same call as the first example, meant for a different input text.
# NOTE(review): the input string is empty in the saved notebook — presumably
# removed before sharing; fill it in before running.
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

Probabilities: [[0.04150913283228874, 0.01595042273402214, 0.009111431427299976, 0.021590616554021835, 0.8473755121231079]]
Predicted labels: ['other']


In [None]:
# Example 3: in the recorded output no probability exceeds the 0.5 threshold,
# so the list of predicted labels is empty.
# NOTE(review): the input string is empty in the saved notebook — presumably
# removed before sharing; fill it in before running.
text = ""
probabilities, predicted_labels = classify_text(model, text, tokenizer, MAX_LEN)
print("Probabilities:", probabilities)
print("Predicted labels:", predicted_labels)

Probabilities: [[0.010650143958628178, 0.009747701697051525, 0.01013413816690445, 0.35011836886405945, 0.04104405641555786]]
Predicted labels: []
