# BERT for ICLE dataset
This notebook fine-tunes BERT (`bert-base-uncased`) on the ICLE-NLI dataset and evaluates it with stratified five-fold cross-validation.

In [None]:
import os
from collections import OrderedDict

# Set before torch is imported.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

# BERT ICLE 5CV

In [None]:
# Mapping between integer class ids used by the model and the label strings
# found in the dataset's label column.
id2label = {0: "BUL", 1: "CHI", 2: "CZE", 3: "FRE", 4: "JPN", 5: "RUS", 6: "SPA"}
label2id = {"BUL": 0, "CHI": 1, "CZE": 2, "FRE": 3, "JPN": 4, "RUS": 5, "SPA": 6}


class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label id.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of label strings, one per text; each must be a key
            of ``label2id``.
        tokenizer: Optional callable following the Hugging Face tokenizer call
            convention (``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')``). When omitted, the
            module-level name ``tokenizer`` is looked up each time an item is
            accessed, so existing ``TextDataset(texts, labels)`` calls keep
            working as long as that global exists by then.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=510):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one example."""
        # Prefer the injected tokenizer; otherwise fall back to the notebook-level one.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = tok(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) removes only the leading batch dimension; a bare squeeze()
        # would also drop the sequence dimension if it happened to be 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label2id[self.labels[idx]]),
        }

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
icle = "/content/drive/MyDrive/thesis_NLI/ICLE-NLI-results.csv"
MODEL_NAME = 'bert-base-uncased'
SEED = 1
NUM_EPOCHS = 12
BATCH_SIZE = 12
LEARNING_RATE = 2e-5
k_folds = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer does not change between folds, so load it once. It is created
# before the dataset because TextDataset reads the module-level `tokenizer`
# when an item is accessed.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# convert dataframe to dataset
df = pd.read_csv(icle)
dataset = TextDataset(df['text'].tolist(), df['language'].tolist())

# 5-fold cv
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# per-fold accuracies and out-of-fold predictions keyed by row position in df
fold_accuracies = []
predictions = {}

# perform 5-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(df['text'], df['language'])):
    # Seed torch per fold so classifier initialisation and batch shuffling are
    # repeatable (GPU kernels may still introduce some nondeterminism).
    torch.manual_seed(SEED + fold)

    # Start every fold from the pretrained checkpoint with a fresh classifier head.
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(id2label), id2label=id2label, label2id=label2id
    )
    print(f"Training Fold {fold+1}/{k_folds}")

    # train and test sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    val_dataset = torch.utils.data.Subset(dataset, val_indices)

    # create data loaders; validation is not shuffled so predictions stay in
    # the same order as val_indices
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # training
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    model.to(device)
    model.train()
    for epoch in range(NUM_EPOCHS):
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            # Passing `labels` makes the model compute the loss itself, so no
            # separate criterion is needed.
            outputs.loss.backward()
            optimizer.step()

    # evaluation
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            val_predictions.extend(logits.argmax(dim=1).tolist())  # predicted class ids
            val_labels.extend(batch['labels'].tolist())

    # One prediction per validation row, in val_indices order.
    assert len(val_predictions) == len(val_indices), "prediction count does not match fold size"
    for predicted_label, test_index in zip(val_predictions, val_indices):
        predictions[int(test_index)] = id2label[predicted_label]  # store label string
    print(f"Num predictions: {len(predictions)}")

    model.save_pretrained(f"/content/drive/MyDrive/thesis_NLI/ICLE/finetuned_bert_{fold}") # Local saving
    tokenizer.save_pretrained(f"/content/drive/MyDrive/thesis_NLI/ICLE/finetuned_bert_{fold}")

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")

# Every row is in exactly one validation fold, so all rows must have a prediction.
assert len(predictions) == len(df), "not every row received an out-of-fold prediction"

# calculate average accuracy across all folds
average_accuracy = sum(fold_accuracies) / len(fold_accuracies)
print(f"Average Accuracy: {average_accuracy}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Fold 1/5
Num predictions: 154
Accuracy for Fold 1: 0.8116883116883117


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Fold 2/5
Num predictions: 308
Accuracy for Fold 2: 0.7467532467532467


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Fold 3/5
Num predictions: 462
Accuracy for Fold 3: 0.7987012987012987


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Fold 4/5
Num predictions: 616
Accuracy for Fold 4: 0.7857142857142857


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Fold 5/5
Num predictions: 770
Accuracy for Fold 5: 0.7727272727272727
Average Accuracy: 0.7831168831168831


# Add results to CSV

In [None]:
# Order the out-of-fold predictions by row position so they line up with the CSV rows.
sorted_dict = OrderedDict(sorted(predictions.items()))
sorted_predictions = list(sorted_dict.values())
# Show a short preview instead of dumping every prediction into the cell output.
print(f"{len(sorted_predictions)} predictions; first 5: {list(sorted_dict.items())[:5]}")

icle = "/content/drive/MyDrive/thesis_NLI/ICLE-NLI-results.csv"
df = pd.read_csv(icle)

# Positional alignment is only valid if there is exactly one prediction per row.
if list(sorted_dict.keys()) != list(range(len(df))):
    raise ValueError(
        f"predictions do not cover rows 0..{len(df) - 1} exactly "
        f"(got {len(sorted_dict)} predictions for {len(df)} rows)"
    )

column_name = 'preds_finetuned_bert'
# Plain assignment appends the column on the first run and overwrites it on a
# re-run; df.insert would raise ValueError once the column exists in the CSV.
df[column_name] = sorted_predictions
df.to_csv(icle, index=False)
df.head()

OrderedDict([(0, 'RUS'), (1, 'RUS'), (2, 'CHI'), (3, 'BUL'), (4, 'BUL'), (5, 'BUL'), (6, 'BUL'), (7, 'BUL'), (8, 'FRE'), (9, 'BUL'), (10, 'BUL'), (11, 'BUL'), (12, 'FRE'), (13, 'FRE'), (14, 'RUS'), (15, 'BUL'), (16, 'RUS'), (17, 'CZE'), (18, 'BUL'), (19, 'BUL'), (20, 'BUL'), (21, 'BUL'), (22, 'BUL'), (23, 'CZE'), (24, 'BUL'), (25, 'FRE'), (26, 'BUL'), (27, 'BUL'), (28, 'BUL'), (29, 'BUL'), (30, 'BUL'), (31, 'FRE'), (32, 'BUL'), (33, 'BUL'), (34, 'BUL'), (35, 'BUL'), (36, 'BUL'), (37, 'BUL'), (38, 'CZE'), (39, 'BUL'), (40, 'BUL'), (41, 'BUL'), (42, 'BUL'), (43, 'BUL'), (44, 'BUL'), (45, 'BUL'), (46, 'BUL'), (47, 'RUS'), (48, 'BUL'), (49, 'FRE'), (50, 'BUL'), (51, 'BUL'), (52, 'RUS'), (53, 'BUL'), (54, 'BUL'), (55, 'JPN'), (56, 'BUL'), (57, 'BUL'), (58, 'FRE'), (59, 'BUL'), (60, 'BUL'), (61, 'BUL'), (62, 'BUL'), (63, 'RUS'), (64, 'RUS'), (65, 'BUL'), (66, 'BUL'), (67, 'BUL'), (68, 'BUL'), (69, 'BUL'), (70, 'BUL'), (71, 'BUL'), (72, 'BUL'), (73, 'BUL'), (74, 'BUL'), (75, 'BUL'), (76, 'BUL