In [1]:
%%capture
!pip install -q datasets evaluate #requirements.txt

In [2]:
import torch
import pandas as pd
import numpy as np
from evaluate import load
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline

# Loading Data

## Utils

In [3]:
def prepare_dataset(file_path):
    """
    Prepares the dataset by splitting it into training and testing sets, and converts the 'text' and 'text_en' columns
    into a format that can be used for fine-tuning, with the same split for both columns.

    Arguments:
    file_path : str : The file path to the dataset in TSV format.

    Returns:
    datasets.DatasetDict : A dictionary containing the train and test datasets formatted for fine-tuning,
                            with both 'text' and 'text_en' columns.
    """
    # Load the dataset from the given TSV file path using the datasets library
    raw_dataset = load_dataset('csv', data_files=file_path, delimiter='\t')

    # Convert the dataset to a pandas DataFrame for easier manipulation
    raw_dataframe = raw_dataset['train'].to_pandas()

    # Extract the text and label columns as lists
    X = raw_dataframe[['text', 'text_en']]  # Use 'text' and 'text_en' columns
    y = raw_dataframe['label'].tolist()  # 'label' is assumed to be the column for target labels

    # Split the data into training and testing sets (90% training, 10% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

    # Convert the training and testing data into Hugging Face dataset format (DatasetDict)
    dataset = DatasetDict({
        'train': Dataset.from_dict({
            'text': X_train['text'].tolist(),
            'text_en': X_train['text_en'].tolist(),
            'label': y_train
        }),
        'test': Dataset.from_dict({
            'text': X_test['text'].tolist(),
            'text_en': X_test['text_en'].tolist(),
            'label': y_test
        })
    })

    # Return the prepared dataset
    return dataset


In [4]:
def select_dataset_text_column(dataset, text_column):
    """
    Converts the dataset to use a specific text column ('text' or 'text_en') and returns the train and test splits
    with the specified column and 'label' column.

    Arguments:
    dataset : datasets.DatasetDict : The prepared dataset in Hugging Face format (train and test sets).
    text_column : str : The column to use for text data ('text' or 'text_en').

    Returns:
    datasets.DatasetDict : A dictionary containing the train and test datasets with the specified text column.
    """
    # Convert the training and testing data into Hugging Face dataset format (DatasetDict)
    dataset_with_text_column = DatasetDict({
        'train': Dataset.from_dict({
            'text': dataset['train'][text_column],  # Select the specified text column
            'label': dataset['train']['label']  # Keep the 'label' column
        }),
        'test': Dataset.from_dict({
            'text': dataset['test'][text_column],  # Select the specified text column
            'label': dataset['test']['label']  # Keep the 'label' column
        })
    })

    # Return the modified dataset with the selected text column
    return dataset_with_text_column

In [5]:
# Tokenizes the input data using the provided tokenizer.
def tokenize_function(data, tokenizer):
    """
    Arguments:
    data : dict : A dictionary containing the text data.
    tokenizer : transformers.PreTrainedTokenizer : The tokenizer to use for tokenizing the text.

    Returns:
    dict : Tokenized data with 'input_ids' and 'attention_mask'.
    """
    return tokenizer(data['text'], padding="max_length", truncation=True)


## Main

In [6]:
# File path for the dataset
file_path = "power-it-train.tsv"

In [7]:
dataset = prepare_dataset(file_path)

Generating train split: 0 examples [00:00, ? examples/s]

# Fine-tuning

## Loading Model

In [8]:
# fine-tuning model
model_name = "FacebookAI/xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenizing data for training

In [9]:
# For power identification -> text is used
dataset_text = select_dataset_text_column(dataset, 'text')

# Tokenize the data
tokenized_datasets = dataset_text.map(lambda x: tokenize_function(x, tokenizer), batched=True)

Map:   0%|          | 0/7063 [00:00<?, ? examples/s]

Map:   0%|          | 0/785 [00:00<?, ? examples/s]

## Fine-tuning Utils

In [10]:
# Returns the appropriate training arguments based on the dataset name.
def get_training_arguments():
    """
    Arguments:
    dataset_name : str : The name of the dataset (or any identifier to switch between datasets)

    Returns:
    transformers.TrainingArguments : Training arguments configured based on the dataset.
    """
    return TrainingArguments(
        output_dir='./results_power',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-6,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_dir="./results_power/logs",
        save_total_limit=2,
        fp16=True,
        report_to=[]
    )


In [11]:
# Computes accuracy as the metric.
def compute_metrics(eval_pred):
    """
    Arguments:
    eval_pred : tuple : Contains logits and true labels for evaluation.

    Returns:
    dict : A dictionary containing the accuracy of the predictions.
    """
    metric = load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [12]:
# Fine-tunes a sequence classification model on the given dataset.
def fine_tune_model(dataset, model_name, tokenizer):
    """
    Arguments:
    dataset : datasets.DatasetDict : A dictionary containing the train and test datasets.
    model_name : str : The name of the pre-trained model to use.
    tokenizer : transformers.PreTrainedTokenizer : The tokenizer for tokenizing the text.
    dataset_name : str : The name of the dataset (e.g., 'orientation', 'power').

    Returns:
    transformers.Trainer : The trained model.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Get dataset-specific training arguments
    training_args = get_training_arguments()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,  # This function is defined above
        #callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Add early stopping
    )

    trainer.train()
    return trainer


## Main

In [13]:
# Clear CUDA cache (ensure no memory leakage)
torch.cuda.empty_cache()

# Fine-tune the model on 'power' dataset
trainer = fine_tune_model(tokenized_datasets, model_name, tokenizer)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.532464,0.747771
2,0.614900,0.506943,0.761783
3,0.535300,0.502022,0.770701


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
# Get the predictions and labels
predictions = trainer.predict(tokenized_datasets['test'])
predicted_labels = predictions.predictions.argmax(axis=-1)  # Get the predicted class labels
true_labels = predictions.label_ids

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(true_labels, predicted_labels))


Accuracy: 77.07%
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       491
           1       0.71      0.65      0.68       294

    accuracy                           0.77       785
   macro avg       0.76      0.75      0.75       785
weighted avg       0.77      0.77      0.77       785



# Inference

## Loading Model

In [30]:
torch.cuda.empty_cache()


In [47]:
# LLM model for inference
model_name = "facebook/bart-large-mnli"
pipe = pipeline("zero-shot-classification", model=model_name, device="cuda")

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda


## Utils

In [48]:
metric = load("accuracy")

In [49]:
# Perform zero-shot inference on a dataset using a pre-defined pipeline.
def perform_inference(dataset, candidate_labels):
    """
    Arguments:
    dataset : DatasetDict : The dataset object containing a 'test' split with a 'text' field.
    candidate_labels : list : A list of candidate labels for zero-shot classification.

    Returns:
    list : A list of indices corresponding to the highest-confidence label for each prediction.
    """
    # Perform zero-shot classification on the test dataset's 'text' field
    zeroshot_preds = pipe(
        dataset['test']['text'],  # Input text data from the test split
        batch_size=16,           # Process the data in batches of size 16 for efficiency
        candidate_labels=candidate_labels  # Specify the candidate labels for classification
    )

    # Print the raw predictions for debugging or inspection
    print(zeroshot_preds)

    # Convert predictions to indices of the highest-confidence label for each instance
    return [candidate_labels.index(pred["labels"][0]) for pred in zeroshot_preds]


## On English Text

In [50]:
dataset_text_en = select_dataset_text_column(dataset, 'text_en')

In [51]:
torch.cuda.empty_cache()

# Inference on "text_en" (English)
inference_text_en = perform_inference(dataset_text_en, [0, 1])
#metric = load("accuracy") # defined above already
accuracy_text_en = metric.compute(predictions=inference_text_en, references=dataset_text_en["test"]["label"])

print("Accuracy on English text:", accuracy_text_en)

Accuracy on English text: {'accuracy': 0.4089171974522293}


In [52]:
# Compute classification report
classification_report_text_en = classification_report(dataset_text_en["test"]["label"], inference_text_en)
print("Classification Report on English text:")
print(classification_report_text_en)

Classification Report on English text:
              precision    recall  f1-score   support

           0       0.64      0.13      0.21       491
           1       0.38      0.88      0.53       294

    accuracy                           0.41       785
   macro avg       0.51      0.50      0.37       785
weighted avg       0.54      0.41      0.33       785



## On Original Text

In [53]:
dataset_text = select_dataset_text_column(dataset, 'text')

In [54]:
# Inference on "text" (Original Language)
inference_text = perform_inference(dataset_text, [0, 1])
#metric = load("accuracy") # defined above already
accuracy_text = metric.compute(predictions=inference_text, references=dataset_text["test"]["label"])

print("Accuracy on Original text:", accuracy_text)

Accuracy on Original text: {'accuracy': 0.4356687898089172}


In [55]:
# Compute classification report
classification_report_text = classification_report(dataset_text["test"]["label"], inference_text)
print("Classification Report on English text:")
print(classification_report_text)

Classification Report on English text:
              precision    recall  f1-score   support

           0       0.64      0.22      0.33       491
           1       0.38      0.79      0.51       294

    accuracy                           0.44       785
   macro avg       0.51      0.51      0.42       785
weighted avg       0.54      0.44      0.40       785

