In [None]:
# Import the Libraries
import torch
from transformers import AdamW, get_linear_schedule_with_warmup, DistilBertForSequenceClassification, DistilBertTokenizer
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np

In [None]:
# Global configuration: reproducibility seed, maximum tokenised sequence length and batch size.
RANDOM_SEED = 42
MAX_LEN = 256
BATCH_SIZE = 8

# Seed both PyTorch and NumPy from the same constant.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


### Load the sentiment-analysis (SA) dataset from Hugging Face

In [None]:
# Authenticate with the Hugging Face Hub without hard-coding the access token.
import os
from getpass import getpass

import huggingface_hub
from datasets import load_dataset

# Read the token from the HF_TOKEN environment variable; when it is unset, prompt for it
# with hidden input so the secret is never saved inside the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
huggingface_hub.login(token=hf_token)

In [None]:
# Fetch the dataset and expose its "train" split as a pandas DataFrame.
SA_dataset = load_dataset("ulinnuha/sentiment_analysis_ladin_italian")
train_split = SA_dataset["train"]
SA_df = pd.DataFrame(train_split)
SA_df.head()

In [None]:
# Number of target classes handed to the classifier head.
NCLASSES = 2

In [None]:
# Integer encoding of the sentiment labels: 'pos' -> 0, 'neg' -> 1.
label_map = {'pos': 0, 'neg': 1}

# Encode the 'label' column only while it still holds the raw string labels. Re-running this
# cell is then a no-op instead of mapping the already-encoded integers to NaN.
if not pd.api.types.is_numeric_dtype(SA_df['label']):
    # Fail loudly on labels missing from the mapping; `.map` would silently turn them into NaN.
    unknown_labels = set(SA_df['label'].dropna().unique()) - set(label_map)
    if unknown_labels:
        raise ValueError(
            "Unexpected values in the 'label' column: {}".format(sorted(map(str, unknown_labels)))
        )
    SA_df['label'] = SA_df['label'].map(label_map)

#### Split the data into training and test sets (Ladin entries)
To run on the Italian entries instead, change the `language` column selected below.

In [None]:
from sklearn.model_selection import train_test_split

# Name of the text column to train on.
language = 'ladin'

# Hold out 20% of the rows as the test set, stratified on the label.
# The split is seeded from RANDOM_SEED so it follows the notebook-wide seed
# instead of a second hard-coded copy of it.
X_train, X_test, y_train, y_test = train_test_split(
    SA_df[language],
    SA_df['label'],
    test_size=0.20,
    random_state=RANDOM_SEED,
    stratify=SA_df['label'],
)

### Split the training data into training and validation sets

In [6]:
# Carve 10% of the training rows out as a validation set, stratified on the label and
# seeded from RANDOM_SEED rather than a second hard-coded copy of the seed.
# NOTE: this rebinds X_train / y_train, so running this cell again without first re-running
# the train/test split above it shrinks the training set a second time.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.10,
    random_state=RANDOM_SEED,
    stratify=y_train,
)

### Build the DataLoaders

In [None]:
# Tokenizer loaded from the multilingual, cased DistilBERT checkpoint.
TOKENIZER_CHECKPOINT = 'distilbert-base-multilingual-cased'
tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [8]:
from torch.utils.data import Dataset


class ReviewDataset(Dataset):
    """Map-style dataset that tokenises one review per item.

    Args:
        reviews: Indexable collection of review texts; each item is passed through ``str``.
        labels: Indexable collection of integer class labels, aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded sequence is truncated / padded to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, encoded inputs and label tensor for position ``item``."""
        review = str(self.reviews[item])
        label = self.labels[item]

        # Use the tokenizer handed to this instance, not whatever global happens to be
        # called `tokenizer` in the notebook namespace.
        encoding = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review': review,
            # `return_tensors='pt'` yields a leading batch dimension; flatten it away
            # so the DataLoader can stack the items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }



In [None]:
def data_loader(review, label, tokenizer, max_len, BATCH_SIZE, shuffle=False, num_workers=2):
    """Wrap review texts and labels in a ``ReviewDataset`` and return a ``DataLoader`` over it.

    Args:
        review: Object exposing ``to_numpy()`` (e.g. a pandas Series) holding the review texts.
        label: Object exposing ``to_numpy()`` holding the labels, aligned with ``review``.
        tokenizer: Tokenizer forwarded to ``ReviewDataset``.
        max_len: Sequence length forwarded to ``ReviewDataset``.
        BATCH_SIZE: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every pass. Defaults to
            ``False``, which matches the previous behaviour.
        num_workers: Number of worker processes used for loading. Defaults to ``2``,
            which matches the previous behaviour.

    Returns:
        torch.utils.data.DataLoader: Loader over the tokenised reviews.
    """
    # Convert to plain arrays so the dataset indexes by position rather than by the
    # original index labels carried by `review` / `label`.
    dataset = ReviewDataset(
        reviews=review.to_numpy(),
        labels=label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [None]:
# Build one DataLoader per split (train, validation, test) with shared tokenizer settings.
train_data_loader, validation_data_loader, test_data_loader = (
    data_loader(texts, targets, tokenizer, MAX_LEN, BATCH_SIZE)
    for texts, targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

## Multilingual DistilBERT model

In [None]:
import torch.nn as nn


class MultilingualDistilBert(nn.Module):
    """Thin wrapper around ``DistilBertForSequenceClassification`` that returns raw logits.

    The wrapped model already contains its own classification head, so no extra layers
    are added here. (A previously declared ``nn.Dropout(0.5)`` attribute was never applied
    in ``forward`` and has been removed; it held no parameters, so saved state dicts are
    unaffected.)

    Args:
        n_class: Number of output labels for the classification head.
    """

    def __init__(self, n_class):
        super().__init__()
        self.bert = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-multilingual-cased',
            num_labels=n_class,
        )

    def forward(self, input_ids, attention_mask):
        """Return the pre-softmax logits for a batch of encoded inputs."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits


In [None]:
# Instantiate the classifier and move it to the selected device in one step.
model = MultilingualDistilBert(NCLASSES).to(device)

In [13]:
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        trainable_total += param.numel()
    return trainable_total

# Display the number of trainable parameters of the instantiated model.
count_parameters(model)

135326210

## Training process

In [None]:
# Optimisation set-up: epoch count, optimiser, learning-rate schedule and loss.
# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of `torch.optim.AdamW`
# (which has no `correct_bias` argument) — confirm against the installed transformers version.
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The training loop steps the scheduler once per batch, so the schedule has to span
# every batch of every epoch.
steps_per_epoch = len(train_data_loader)
total_steps = steps_per_epoch * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)



In [15]:
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` returning logits.
        data_loader: Sized iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        optimizer: Optimiser stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimiser.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor and
        ``mean_loss`` is the mean of the per-batch losses (NaN if there were no batches).
    """
    model = model.train()
    losses = []
    # Start from a tensor (not a Python int) so the return value is a tensor even when
    # the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Take the progress total from the loader itself instead of a global batch size.
    n_batches = len(data_loader)

    for step, batch in enumerate(data_loader, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients before the backward pass so that any gradients left on the
        # parameters from earlier work cannot leak into this update.
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimiser step.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        print('Sample {}/{} , Training: Loss: {}'.format(step, n_batches, loss.item()))

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

#### Define the evaluation function

In [16]:
def eval_model(model, data_loader, loss_fn, device, n_examples, modo):
    """Evaluate ``model`` over ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` returning logits.
        data_loader: Sized iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.
        n_examples: Number of examples used as the accuracy denominator.
        modo: Label inserted in the progress line (e.g. ``'Validation'``).

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor and
        ``mean_loss`` is the mean of the per-batch losses (NaN if there were no batches).
    """
    model = model.eval()
    losses = []
    # Start from a tensor (not a Python int) so the return value is a tensor even when
    # the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Take the progress total from the loader itself instead of a global batch size.
    n_batches = len(data_loader)

    with torch.no_grad():
        for step, batch in enumerate(data_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            print('Sample {}/{} , {}: Loss: {}'.format(step, n_batches, modo, loss.item()))

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss


#### Start the training process

In [None]:
# Fine-tune for EPOCHS passes over the training loader, evaluating on the validation
# loader after every pass. Only the metrics of the last epoch remain bound afterwards.
for epoch in range(EPOCHS):
    print('Epoch {} from {}'.format(epoch + 1, EPOCHS))
    print('------------------')

    train_acc, train_loss = train_model(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(X_train),
    )
    validation_acc, validation_loss = eval_model(
        model=model,
        data_loader=validation_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(X_val),
        modo='Validation',
    )

    # Per-epoch checkpointing is currently switched off; uncomment to enable it.
    # checkpoint = {'epoch': epoch + 1, 'state_dict': model.state_dict(),
    #               'optimizer': optimizer.state_dict()}
    # torch.save(checkpoint, f'checkpoint_{epoch+1}.pth')

In [None]:
# Report the loss and accuracy of the final epoch for the training and validation sets.
training_summary = 'Training: Loss: {}, accuracy: {}'.format(train_loss, train_acc)
validation_summary = 'Validatión: Loss: {}, accuracy: {}'.format(validation_loss, validation_acc)
print(training_summary, validation_summary, '', sep='\n')

Training: Loss: 0.09478573292531764, accuracy: 0.9762407016764738
Validatión: Loss: 0.20257187141188687, accuracy: 0.9560439560439561



## Testing stage

In [19]:
from sklearn.metrics import balanced_accuracy_score, f1_score

def testing_model(model, data_loader, loss_fn, device, n_examples, modo):
    """Evaluate ``model`` on held-out data and compute classification metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` returning logits.
        data_loader: Sized iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.
        n_examples: Number of examples used as the accuracy denominator.
        modo: Label inserted in the progress line (e.g. ``'Testing'``).

    Returns:
        tuple: ``(accuracy, mean_loss, balanced_accuracy, weighted_f1)``; ``accuracy`` is a
        0-dim double tensor, the remaining values come from NumPy / scikit-learn.
    """
    model.eval()  # inference mode: disables dropout
    losses = []
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    all_preds = []
    all_labels = []
    # Take the progress total from the loader itself instead of a global batch size.
    n_batches = len(data_loader)

    with torch.no_grad():
        for step, batch in enumerate(data_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            # Keep every prediction / label on the host for the metrics computed below.
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

            print(f'Sample {step}/{n_batches} , {modo}: Loss: {loss.item()}')

    balanced_acc = balanced_accuracy_score(all_labels, all_preds)
    # 'weighted' averages the per-class F1 scores by class support.
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return correct_predictions.double() / n_examples, np.mean(losses), balanced_acc, f1

In [None]:
# Evaluate the fine-tuned model on the held-out test split.
test_metrics = testing_model(
    model=model,
    data_loader=test_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(X_test),
    modo='Testing',
)
testing_acc, testing_loss, balanced_acc, f1 = test_metrics

#### Print the evaluation metrics

In [21]:
# Report the four test-set metrics, one per line.
print(
    f"Testing Accuracy: {testing_acc}",
    f"Testing Loss: {testing_loss}",
    f"Balanced Accuracy: {balanced_acc}",
    f"F1 Score: {f1}",
    sep="\n",
)

Testing Accuracy: 0.950459448661606
Testing Loss: 0.23714174882607564
Balanced Accuracy: 0.922791565139817
F1 Score: 0.9502880663994667
