In [6]:
from collections import Counter, defaultdict

import torch
from torch.nn import functional as F
from transformers import BertTokenizer, BertForSequenceClassification

# Classify every report sentence into one of five classes (E, S, G, N, I)
# with a BERT sequence classifier and print how many sentences fell into
# each class.

# Checkpoint to load for both the tokenizer and the classifier.
# NOTE: 'bert-base-multilingual-cased' is the plain pre-trained encoder and
# ships no classification head, so BertForSequenceClassification creates
# 'classifier.weight' / 'classifier.bias' with fresh random values (the
# library prints a "newly initialized ... You should probably TRAIN this
# model" warning when this happens). Until MODEL_NAME points at a checkpoint
# that was fine-tuned on labelled E/S/G/N/I data, the counts printed below
# are arbitrary and must not be interpreted.
MODEL_NAME = 'bert-base-multilingual-cased'

# Number of sentences sent through the model per forward pass.
BATCH_SIZE = 32

# Map the predicted index to the corresponding class label
label_map = {0: 'E', 1: 'S', 2: 'G', 3: 'N', 4: 'I'}

# Initialize the BERT tokenizer and model. The size of the classification
# head is derived from label_map so the two cannot drift apart.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_map))
# Inference only: make sure dropout is disabled. (The transformers docs state
# that from_pretrained already returns the model in evaluation mode; the call
# is repeated here so the intent is explicit.)
model.eval()

text_list = []  # Report data

# Frequency of each predicted class, keyed by class label
label_count = Counter()

# Classify the sentences in mini-batches instead of one forward pass each
for start in range(0, len(text_list), BATCH_SIZE):
    batch = list(text_list[start:start + BATCH_SIZE])

    # Tokenize the batch; padding=True pads to the longest sentence in the
    # batch and truncation=True cuts over-long sentences
    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax is monotonic, so the argmax of the raw logits is the same index
    # the argmax of the probabilities would give.
    predicted_indices = logits.argmax(dim=-1).tolist()

    # Update the label count
    label_count.update(label_map[index] for index in predicted_indices)

# Print the frequency of each class (only classes that were predicted at
# least once, in order of first occurrence)
for label, count in label_count.items():
    print(f"Class {label}: {count} texts")



Downloading (…)solve/main/vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 3.68MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 4.88kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 625/625 [00:00<00:00, 477kB/s]
Downloading model.safetensors: 100%|██████████| 714M/714M [01:01<00:00, 11.6MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class N: 623 texts
Class G: 25 texts
Class I: 22 texts
Class E: 4 texts
Class S: 2 texts
