In [98]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import accuracy_score

In [99]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [100]:
df = pd.read_json("/Users/kras/Documents/Conferences/2024 | SERP Conf. /labelled-data-varied.json")

In [102]:
def extract_label(annotation):
    try:
        return annotation[0]['result'][0]['value']['choices'][0]
    except (IndexError, KeyError):
        return None

df['label'] = df['annotations'].apply(extract_label)

In [103]:
label_map = {"Entailment": 0, "Contradiction": 1, "Neutral": 2}
df['label'] = df['label'].map(label_map)

In [104]:
train_df, val_df = train_test_split(df, test_size=0.1)

In [97]:
class NLIDataset(Dataset):
    """
    A dataset class for natural language inference tasks.
    """

    def __init__(self, encodings, labels=None):
        """
        Initializes the NLIDataset object.

        Args:
            encodings (dict): The input encodings for the dataset.
            labels (list, optional): The labels for the dataset. Defaults to None.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns the item at the specified index in the dataset.

        Args:
            idx (int): The index of the item to retrieve.

        Returns:
            dict: The item at the specified index, including input encodings and labels if available.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Returns the length of the dataset.

        Returns:
            int: The length of the dataset.
        """
        return len(self.encodings['input_ids'])


In [105]:
tokenizer = AutoTokenizer.from_pretrained("cmarkea/distilcamembert-base-nli")

def tokenize_function(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], padding="max_length", truncation=True, max_length=128)

train_encodings = tokenize_function(train_df.to_dict('list'))
val_encodings = tokenize_function(val_df.to_dict('list'))

train_dataset = NLIDataset(train_encodings, train_df['label'].tolist())
val_dataset = NLIDataset(val_encodings, val_df['label'].tolist())


In [106]:
model = AutoModelForSequenceClassification.from_pretrained("cmarkea/distilcamembert-base-nli", num_labels=3)
model = model.to(device)

In [107]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    report_to="none" 
)

In [108]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

In [109]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [110]:
# Train the model
trainer.train()

 22%|██▏       | 11/51 [00:03<00:08,  4.85it/s]

{'loss': 1.3674, 'grad_norm': 28.725780487060547, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.59}


 41%|████      | 21/51 [00:05<00:05,  5.19it/s]

{'loss': 1.2864, 'grad_norm': 25.32940101623535, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.18}


 61%|██████    | 31/51 [00:07<00:03,  5.20it/s]

{'loss': 1.1456, 'grad_norm': 22.48459815979004, 'learning_rate': 3e-06, 'epoch': 1.76}


 80%|████████  | 41/51 [00:08<00:01,  5.16it/s]

{'loss': 1.0369, 'grad_norm': 21.445453643798828, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.35}


100%|██████████| 51/51 [00:11<00:00,  4.53it/s]

{'loss': 0.7965, 'grad_norm': 17.739839553833008, 'learning_rate': 5e-06, 'epoch': 2.94}
{'train_runtime': 11.2675, 'train_samples_per_second': 35.944, 'train_steps_per_second': 4.526, 'train_loss': 1.1186417948965932, 'epoch': 3.0}





TrainOutput(global_step=51, training_loss=1.1186417948965932, metrics={'train_runtime': 11.2675, 'train_samples_per_second': 35.944, 'train_steps_per_second': 4.526, 'train_loss': 1.1186417948965932, 'epoch': 3.0})

In [94]:
# Evaluate the model
evaluation_results = trainer.evaluate()
print(evaluation_results)

100%|██████████| 2/2 [00:00<00:00, 25.36it/s]

{'eval_loss': 0.43464216589927673, 'eval_accuracy': 1.0, 'eval_runtime': 0.7804, 'eval_samples_per_second': 19.222, 'eval_steps_per_second': 2.563, 'epoch': 3.0}





In [95]:
save_directory = "./distilcamembert-nli-finetuned-01042024"


In [96]:
trainer.save_model(save_directory)