# 0. Imports, libraries and reusable functions

In [18]:
# Standard Library Imports
import ast
import copy
import csv
import json
import math
import os
import re
import time
import warnings
import logging
import random
import collections
from collections import Counter, defaultdict
from typing import List, Tuple, Optional
from IPython.display import HTML, display
import math
import time
from unidecode import unidecode
import string
import multiprocessing as mp



# Data Handling Libraries
import numpy as np
import pandas as pd
import csv
from torch.utils.data import random_split
import datasets
from datasets import ClassLabel, Sequence, Dataset, DatasetDict, load_dataset, load_metric, concatenate_datasets, load_from_disk


# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
# import scikitplot as skplt  # Uncomment if scikit-plot is installed and needed

# Machine Learning: Model Preparation
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, cross_validate, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

# Machine Learning: Models and Frameworks
import tensorflow as tf
import torch
from torch.utils.data import DataLoader
import evaluate
import xgboost
import wandb
from xgboost import plot_importance  # Uncomment if xgboost importance plot is required


# NLP and Transformers
import spacy
import transformers
from transformers import (AdamW, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForMultipleChoice,
                          AutoTokenizer, CamembertForSequenceClassification, DistilBertConfig,
                          DistilBertForSequenceClassification, DistilBertModel, EarlyStoppingCallback,
                          get_linear_schedule_with_warmup, RobertaForSequenceClassification, EvalPrediction,
                          Trainer, TrainerCallback, TrainingArguments, XLMRobertaForSequenceClassification,
                         DefaultDataCollator, BertForQuestionAnswering, DataCollatorWithPadding, PreTrainedTokenizerFast,
                         default_data_collator, is_torch_xla_available, pipeline)
from transformers.trainer_utils import PredictionOutput, speed_metrics

# Experiment Tracking and Optimization Utilities
import optuna
from optuna.trial import TrialState
# import wandb  # Uncomment if using Weights & Biases for experiment tracking

# Progress Bar Utilities
from tqdm.auto import tqdm


In [19]:
class LoggingCallback(TrainerCallback):
    """Trainer callback that appends each logged metrics dict to a JSON-lines file.

    Args:
        log_path: Path of the file that receives one JSON object per log
            event. The file is opened in append mode for every write.
    """

    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write `logs` (without "total_flos") as one JSON line.

        Nothing is written when `logs` is None or when this is not the local
        main process. The incoming `logs` dict is not modified.
        """
        if logs is None:
            # `logs` defaults to None; there is nothing to record in that case.
            return
        if not state.is_local_process_zero:
            return
        # Filter into a new dict instead of popping, so the dict object that
        # is passed to this callback stays intact for whoever else reads it.
        record = {key: value for key, value in logs.items() if key != "total_flos"}
        with open(self.log_path, "a") as f:
            f.write(json.dumps(record) + "\n")

# Pick the compute device once, then report what was found.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

print(device)

GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
cuda


# 1. Global Variables

In [20]:
## Arguments and global variables

# Checkpoint to load; the normalized form replaces "/" with "-" so it can be
# embedded in directory and run names.
pretrained_model_name = "microsoft/deberta-v3-base"
normalized_model_name = "-".join(pretrained_model_name.split("/"))

# Tokenizer for the checkpoint; the rest of the notebook requires the fast variant.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
data_collator = DefaultDataCollator()

# The maximum length of a feature (question and context).
max_length = 512
# The authorized overlap between two parts of the context when splitting it is needed.
doc_stride = 128

# True when the tokenizer pads on the right; kept under both names.
right_padding = tokenizer.padding_side == 'right'
pad_on_right = right_padding

# Mutable notebook-wide state.
global_counter = 0
traing_answer_mismatches = []
logger = logging.getLogger(__name__)



In [21]:
# Training configuration. Checkpoints are evaluated and saved once per epoch,
# and the checkpoint with the highest 'f1' is reloaded at the end of training.
training_args = TrainingArguments(
    output_dir=f"./{normalized_model_name}-best_model",
    overwrite_output_dir = True,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
    save_total_limit=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",  # Enable logging to Weights & Biases
    run_name=f"{normalized_model_name}-best_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    num_train_epochs=3,
    lr_scheduler_type = 'linear',
    # Mixed-precision training needs a CUDA device; enable it only when one is
    # present so this cell also runs on the CPU-only path handled earlier.
    fp16=torch.cuda.is_available(),
)

# 2. Dataset Exploration and Preprocessing

## 2.1 Explore Datasets

In [31]:
# Display name -> Hugging Face hub identifier (a tuple adds a config name)
dataset_names = {
    "Test" : "metaeval/reclor",
}

# Try to load every listed dataset; show split sizes and the first two train examples
for name, dataset_info in dataset_names.items():
    try:
        # Normalize to a tuple so both forms go through one load_dataset call.
        load_args = dataset_info if isinstance(dataset_info, tuple) else (dataset_info,)
        dataset = load_dataset(*load_args, trust_remote_code=True)

        banner = "--------------------------------------------"
        print(banner)
        print(f"--    {name} Dataset Examples:")
        print(banner)
        for split in dataset.keys():
            print(f"Size of {split} set: {len(dataset[split])}")
        for i, example in enumerate(dataset['train'].take(2)):
            print(f"Example {i + 1}: {example}")
    except Exception as e:
        # Exploration is best-effort: report the failure and move on.
        print(f"Failed to load {name}: {str(e)}")

    print()


Downloading readme:   0%|          | 0.00/407 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.96M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/546k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4638 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

--------------------------------------------
--    Test Dataset Examples:
--------------------------------------------
Size of train set: 4638
Size of validation set: 500
Example 1: {'answers': ['Unlike aspirin and other medications that reduce pain and swelling and that are currently available, the new medication would repair existing cell damage that had been caused by rheumatoid arthritis.', 'A patient treated with the new medication for rheumatoid arthritis could sustain a joint injury without becoming aware of it.', 'Joint diseases other than rheumatoid arthritis would not be affected by the new medication.', "The benefits to rheumatoid arthritis sufferers of the new medication would outweigh the medication's possible harmful side effects."], 'context': "In rheumatoid arthritis, the body' s immune system misfunctions by attacking healthy cells in the joints causing the release of a hormone that in turn causes pain and swelling. This hormone is normally activated only in reaction t

In [34]:
# Display name -> Hugging Face hub identifier (a tuple adds a config name)
dataset_names = {
    "AR-LSAT (Zhong et al., 2021)": "olegbask/AR-LSAT",
    "ReClor (Yu et al., 2020)": "metaeval/reclor",
    "LogiQA 2.0 (Liu et al., 2023)": "baber/logiqa2",
    "RTE (Dagan et al., 2005; Haim et al., 2006; Giampiccolo et al., 2007, 2008; Bentivogli et al., 2009)": "SetFit/rte",
    "FOLIO (Han et al., 2022)" : "tasksource/folio",
    "PrOntoQA (Saparov and He, 2023)" : "longface/prontoqa-train",
    #"PrOntoQA (Saparov and He, 2023)" : "longface/pronto-qa-flanT5",
    "TellMeWhy (Lal et al., 2021)" : "StonyBrookNLP/tellmewhy",
    "HotpotQA (Yang et al., 2018)" : ("hotpotqa/hotpot_qa", "fullwiki"),  # 'distractor' or 'fullwiki'
    "GSM8K (Cobbe et al., 2021)" : ("openai/gsm8k", "main"), #"socratic" or "main"
    "MRPC (Dolan & Brockett, 2015)" : ("glue", "mrpc"), #"SetFit/mrpc",
    "AbductionRules (Young et al., 2022)" : "tasksource/AbductionRules",
    "ProofWriter (Tafjord et al., 2021)" : "tasksource/proofwriter",
    "SQuAD 2.0 (Rajpurkar et al., 2018)" : "squad_v2",
    "MultiNLI (Williams, Nangia, and Bowman 2018)" : "nyu-mll/multi_nli",
    "Adversarial NLI (Nie et al. 2020)" : "Aivalf/NLI_adversarial_dataset",
    "ConTRoL (Liu et al., 2021a)" : "tasksource/ConTRoL-nli",
    "RACE (lai et al., 2017)" : ('ehovy/race', 'all'),

}

# Try to load every listed dataset; show split sizes and the first two train examples
for name, dataset_info in dataset_names.items():
    try:
        # Normalize to a tuple so both forms go through one load_dataset call.
        load_args = dataset_info if isinstance(dataset_info, tuple) else (dataset_info,)
        dataset = load_dataset(*load_args, trust_remote_code=True)

        banner = "--------------------------------------------"
        print(banner)
        print(f"--    {name} Dataset Examples:")
        print(banner)
        for split in dataset.keys():
            print(f"Size of {split} set: {len(dataset[split])}")
        for i, example in enumerate(dataset['train'].take(2)):
            print(f"Example {i + 1}: {example}")
    except Exception as e:
        # Exploration is best-effort: report the failure and move on.
        print(f"Failed to load {name}: {str(e)}")

    print()


--------------------------------------------
--    AR-LSAT (Zhong et al., 2021) Dataset Examples:
--------------------------------------------
Size of train set: 1585
Size of validation set: 231
Size of test set: 230
Example 1: {'context': 'Exactly six trade representatives negotiate a treaty: Klosnik, Londi, Manley, Neri, Osata, Poirier. There are exactly six chairs evenly spaced around a circular table. The chairs are numbered 1 through 6, with successively numbered chairs next to each other and chair number 1 next to chair number 6. Each chair is occupied by exactly one of the representatives. The following conditions apply: Poirier sits immediately next to Neri. Londi sits immediately next to Manley, Neri, or both. Klosnik does not sit immediately next to Manley. If Osata sits immediately next to Poirier, Osata does not sit immediately next to Manley.', 'question': 'Which one of the following seating arrangements of the six representatives in chairs 1 through 6 would NOT violate th

Repo card metadata block was not found. Setting CardData to empty.


--------------------------------------------
--    RTE (Dagan et al., 2005; Haim et al., 2006; Giampiccolo et al., 2007, 2008; Bentivogli et al., 2009) Dataset Examples:
--------------------------------------------
Size of train set: 2490
Size of validation set: 277
Size of test set: 3000
Example 1: {'text1': 'No Weapons of Mass Destruction Found in Iraq Yet.', 'text2': 'Weapons of Mass Destruction Found in Iraq.', 'label': 1, 'idx': 0, 'label_text': 'not entailment'}
Example 2: {'text1': 'A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI.', 'text2': 'Pope Benedict XVI is the new leader of the Roman Catholic Church.', 'label': 0, 'idx': 1, 'label_text': 'entailment'}

--------------------------------------------
--    FOLIO (Han et al., 2022) Dataset Examples:
--------------------------------------------
Size of train set: 1001
Size of validation set: 

In [24]:
# Load the ProofWriter dataset from the Hugging Face hub.
# NOTE: this rebinds the notebook-level `dataset` variable used by earlier cells.
dataset_name = "tasksource/proofwriter"
dataset = load_dataset(dataset_name, trust_remote_code=True)

def get_labels_and_size(data_split):
    """Return the distinct 'answer' values of a split together with its length.

    Args:
        data_split: A sized iterable of mappings that each have an 'answer' key.

    Returns:
        A `(unique_labels, size)` tuple: the set of 'answer' values seen and
        `len(data_split)`.
    """
    unique_labels = {example['answer'] for example in data_split}
    return unique_labels, len(data_split)

# Gather (unique labels, size) for every split of the loaded dataset
split_info = {}
for split in dataset.keys():
    split_info[split] = get_labels_and_size(dataset[split])

# Report the label set and the example count per split
for split, (labels, size) in split_info.items():
    print(f"Unique labels in {split} split: {labels}")
    print(f"Size of {split} split: {size}\n")


Unique labels in train split: {'True', 'Unknown', 'False'}
Size of train split: 585552

Unique labels in test split: {'True', 'Unknown', 'False'}
Size of test split: 174476

Unique labels in validation split: {'True', 'Unknown', 'False'}
Size of validation split: 85468



In [25]:
# Load the PrOntoQA dataset
dataset_name = "longface/prontoqa-train"
dataset = load_dataset(dataset_name, trust_remote_code=True)

# Text that precedes the answer inside each prompt
answer_marker = "###The answer is:"

# Initialize a set to store unique answers
unique_answers = set()

# Iterate through the training dataset to extract answers
for example in dataset['train']:
    response_text = example['prompt']
    marker_pos = response_text.find(answer_marker)
    if marker_pos == -1:
        # str.find returns -1 when the marker is absent; slicing from
        # -1 + len(marker) would record an arbitrary fragment as an "answer".
        continue
    # Extract the text after "The answer is:"
    answer_start = marker_pos + len(answer_marker)
    answer = response_text[answer_start:].strip()
    unique_answers.add(answer)

# Display all unique answers (sorted so the printout does not depend on set order)
print("Unique Answers in PrOntoQA Dataset:")
for answer in sorted(unique_answers):
    print(answer)


Unique Answers in PrOntoQA Dataset:
True
False


## 2.2 Combine Datasets

In [38]:
def process_ar_lsat():
    """Load AR-LSAT and rewrite its splits into the unified MCQA record layout.

    Returns:
        The object returned by `load_dataset("olegbask/AR-LSAT")`, with its
        'train', 'validation' and 'test' splits replaced by mapped versions
        holding the columns Context, Question, Options, Label_Text, Label,
        Type and Source Dataset.
    """
    lsat = load_dataset("olegbask/AR-LSAT")

    def to_record(row):
        # 'label' is used as an index into 'answers' (the list of option texts).
        gold = row['label']
        choices = row['answers']
        return {
            'Context': row['context'],
            'Question': row['question'],
            'Options': choices,
            'Label_Text': choices[gold],
            'Label': gold,
            'Type': "MCQA",
            'Source Dataset': "AR-LSAT"
        }

    for split_name in ('train', 'validation', 'test'):
        original = lsat[split_name]
        # Drop every original column so only the unified fields remain.
        lsat[split_name] = original.map(to_record, remove_columns=original.column_names)
    return lsat

def process_logiqa2():
    """Load LogiQA 2.0 and rewrite its splits into the unified MCQA record layout.

    Returns:
        The object returned by `load_dataset("baber/logiqa2")`, with its
        'train', 'validation' and 'test' splits replaced by mapped versions.
    """
    logiqa = load_dataset("baber/logiqa2")

    def to_record(row):
        # 'answer' is used as an index into 'options'.
        gold = row['answer']
        choices = row['options']
        return {
            'Context': row['text'],
            'Question': row['question'],
            'Options': choices,
            'Label_Text': choices[gold],
            'Label': gold,
            'Type': "MCQA",
            'Source Dataset': "LogiQA 2.0"
        }

    for split_name in ('train', 'validation', 'test'):
        original = logiqa[split_name]
        logiqa[split_name] = original.map(to_record, remove_columns=original.column_names)
    return logiqa



def process_control():
    """Load ConTRoL-nli and rewrite its splits into the unified classification layout.

    The string label of each example is looked up in
    ['neutral', 'entailment', 'contradiction'] to obtain the integer Label;
    a label outside that list raises ValueError.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test' splits.
    """
    control = load_dataset("tasksource/ConTRoL-nli")

    options = ['neutral', 'entailment', 'contradiction']

    def to_record(row):
        label_text = row['label']
        label_index = options.index(label_text)
        return {
            'Context': row['premise'],
            'Question': f'Does the following statement classify as "neutral," "entailment," or "contradiction"? {row["hypothesis"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "ConTRoL"
        }

    for split_name in ('train', 'validation', 'test'):
        original = control[split_name]
        control[split_name] = original.map(to_record, remove_columns=original.column_names)
    return control




def process_rte():
    """Load SetFit/rte, replace its test split with held-out train data, and unify the schema.

    The shipped 'test' split is removed (the original author notes it is
    unlabeled). A seed-42 shuffle of the train indices then supplies 277
    examples for a new 'test' split; the rest stay in 'train'.

    Note:
        Calls `random.seed(42)`, which reseeds Python's module-level RNG.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test' splits.
    """
    rte = load_dataset("SetFit/rte")

    # Discard the shipped test split; a labelled one is built below.
    rte.pop('test')

    full_train = rte['train']
    held_out = 277  # size of the replacement test split

    random.seed(42)  # fixed seed so the split is reproducible
    order = list(range(len(full_train)))
    random.shuffle(order)

    rte['test'] = full_train.select(order[:held_out])
    rte['train'] = full_train.select(order[held_out:])

    options = ['entailment', 'not entailment']

    def to_record(row):
        label_text = row['label_text']
        label_index = options.index(label_text)
        return {
            'Context': row['text1'],
            'Question': f'Does the following statement classify as "entailment" or "not entailment"? {row["text2"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "RTE"
        }

    for split_name in ('train', 'validation', 'test'):
        original = rte[split_name]
        rte[split_name] = original.map(to_record, remove_columns=original.column_names)
    return rte





def process_folio():
    """Load FOLIO, carve a test split out of train, and unify the schema.

    A seed-42 shuffle of the train indices supplies 203 examples for a new
    'test' split; the rest stay in 'train'.

    Note:
        Calls `random.seed(42)`, which reseeds Python's module-level RNG.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test' splits.
    """
    folio = load_dataset("tasksource/folio")

    full_train = folio['train']
    held_out = 203  # chosen by the original author to match the validation split size

    random.seed(42)  # fixed seed so the split is reproducible
    order = list(range(len(full_train)))
    random.shuffle(order)

    folio['test'] = full_train.select(order[:held_out])
    folio['train'] = full_train.select(order[held_out:])

    options = ['True', 'False', 'Uncertain']

    def to_record(row):
        label_text = row['label']
        label_index = options.index(label_text)
        return {
            'Context': row['premises'],
            'Question': f'Does the following conclusion classify as "True", "False" or "Uncertain"? {row["conclusion"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "FOLIO"
        }

    for split_name in ('train', 'validation', 'test'):
        original = folio[split_name]
        folio[split_name] = original.map(to_record, remove_columns=original.column_names)
    return folio






def process_prontoqa():
    """Load PrOntoQA, derive validation/test splits from train, and unify the schema.

    A seed-42 shuffle of the train indices supplies `n // 10` examples each
    for 'validation' and 'test'; the rest stay in 'train'. Context, statement
    and answer are parsed out of the single 'prompt' field using the
    "###Context:", "###Response:" and "###The answer is:" markers.

    Note:
        Calls `random.seed(42)`, which reseeds Python's module-level RNG.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test' splits.
    """
    pronto = load_dataset("longface/prontoqa-train")

    everything = pronto['train']
    tenth = len(everything) // 10  # validation and test each take 10% of the data

    random.seed(42)
    order = list(range(len(everything)))
    random.shuffle(order)

    pronto['validation'] = everything.select(order[:tenth])
    pronto['test'] = everything.select(order[tenth:2 * tenth])
    pronto['train'] = everything.select(order[2 * tenth:])

    options = ['True', 'False']
    question_marker = "Is the following statement true or false?"

    def to_record(row):
        prompt = row['prompt']

        # Everything before the response section holds context + statement.
        before_response = prompt.partition("###Response:")[0].strip()

        # Context: text before the first question marker, header removed.
        context = before_response.replace("###Context:", "").partition(question_marker)[0].strip()

        # Statement: text after the last question marker.
        statement = before_response.rpartition(question_marker)[2].strip()

        # Answer: text after the last answer marker, normalized to "True"/"False".
        label_text = prompt.rpartition("###The answer is:")[2].strip().capitalize()
        label_index = options.index(label_text)

        return {
            'Context': context,
            'Question': f'Is the following statement "True" or "False"? {statement}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "PrOntoQA"
        }

    for split_name in ('train', 'validation', 'test'):
        original = pronto[split_name]
        pronto[split_name] = original.map(to_record, remove_columns=original.column_names)
    return pronto


def process_tellmewhy():
    """Load TellMeWhy and rewrite its splits into the unified classification layout.

    The 'is_ques_answerable' string of each example is looked up in
    ['Answerable', 'Not Answerable'] to obtain the integer Label.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test' splits.
    """
    tellmewhy = load_dataset("StonyBrookNLP/tellmewhy")

    options = ['Answerable', 'Not Answerable']

    def to_record(row):
        label_text = row['is_ques_answerable']
        label_index = options.index(label_text)
        return {
            'Context': row['narrative'],
            'Question': f'Does the following question classify as "Answerable" or "Not Answerable"? {row["question"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "TellMeWhy"
        }

    for split_name in ('train', 'validation', 'test'):
        original = tellmewhy[split_name]
        tellmewhy[split_name] = original.map(to_record, remove_columns=original.column_names)
    return tellmewhy


def process_mrpc():
    """Load GLUE MRPC, carve a test split out of train, and unify the schema.

    A seed-42 shuffle of the train indices supplies 1725 examples for a new
    'test' split; the rest stay in 'train'. Integer labels are turned into
    text via their string form ('0' / '1'); any other label value falls back
    to 'not equivalent'.

    Note:
        Calls `random.seed(42)`, which reseeds Python's module-level RNG.

    Returns:
        The loaded dataset dict; 'train', 'validation' and 'test' are mapped
        when present.
    """
    mrpc = load_dataset("glue", "mrpc")

    full_train = mrpc['train']
    held_out = 1725  # test size specified by the original author

    random.seed(42)  # fixed seed so the split is reproducible
    order = list(range(len(full_train)))
    random.shuffle(order)

    mrpc['test'] = full_train.select(order[:held_out])
    mrpc['train'] = full_train.select(order[held_out:])

    # Keys are the string form of the integer label.
    label_mapping = {'0': 'not equivalent', '1': 'equivalent'}
    options = ['not equivalent', 'equivalent']

    def to_record(row):
        label_text = label_mapping.get(str(row['label']), 'not equivalent')
        label_index = options.index(label_text)
        return {
            'Context': row['sentence1'],
            'Question': f'Does the following statement classify as "not equivalent" or "equivalent"? {row["sentence2"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "MRPC"
        }

    for split_name in ('train', 'validation', 'test'):
        if split_name not in mrpc:
            continue
        original = mrpc[split_name]
        mrpc[split_name] = original.map(to_record, remove_columns=original.column_names)
    return mrpc



def process_proofwriter():
    """Load ProofWriter and rewrite its splits into the unified classification layout.

    The 'answer' string of each example is looked up in
    ['True', 'False', 'Unknown'] to obtain the integer Label.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test' splits.
    """
    proofwriter = load_dataset("tasksource/proofwriter")

    options = ['True', 'False', 'Unknown']

    def to_record(row):
        label_text = row['answer']
        label_index = options.index(label_text)
        return {
            'Context': row['theory'],
            'Question': f'Is the following statement "True", "False" or "Unknown"? {row["question"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "ProofWriter"
        }

    for split_name in ('train', 'validation', 'test'):
        original = proofwriter[split_name]
        proofwriter[split_name] = original.map(to_record, remove_columns=original.column_names)
    return proofwriter



def process_multi_nli():
    """Load MultiNLI, build test/validation splits, and unify the schema.

    A seed-42 shuffle of the train indices supplies 9832 examples for a new
    'test' split; the rest stay in 'train'. 'validation' is the concatenation
    of 'validation_matched' followed by 'validation_mismatched'. Label ids
    0/1/2 map to entailment/neutral/contradiction; any other id falls back
    to 'neutral'.

    Note:
        Calls `random.seed(42)`, which reseeds Python's module-level RNG.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test'
        splits (the matched/mismatched validation splits are left as loaded).
    """
    mnli = load_dataset("multi_nli")

    full_train = mnli['train']
    held_out = 9832  # test size specified by the original author

    random.seed(42)
    order = list(range(len(full_train)))
    random.shuffle(order)

    mnli['test'] = full_train.select(order[:held_out])
    mnli['train'] = full_train.select(order[held_out:])

    # Single validation split: matched first, then mismatched.
    mnli['validation'] = concatenate_datasets(
        [mnli['validation_matched'], mnli['validation_mismatched']]
    )

    label_mapping = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
    options = ['entailment', 'neutral', 'contradiction']

    def to_record(row):
        label_text = label_mapping.get(row['label'], 'neutral')
        label_index = options.index(label_text)
        return {
            'Context': row['premise'],
            'Question': f'Does the following statement classify as "neutral," "entailment," or "contradiction"? {row["hypothesis"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "MultiNLI"
        }

    for split_name in ('train', 'validation', 'test'):
        original = mnli[split_name]
        mnli[split_name] = original.map(to_record, remove_columns=original.column_names)
    return mnli



def process_anli():
    """Load the Adversarial NLI dataset, fill in missing splits, and unify the schema.

    Only the splits that are actually missing ('validation' and/or 'test')
    are generated; a split that is already present is kept as loaded. Each
    generated split takes `n // 10` examples from a seed-42 shuffle of the
    train indices, and 'train' keeps the remainder. When both splits are
    missing, validation gets the first tenth and test the second tenth.

    Note:
        Calls `random.seed(42)` when a split has to be generated, which
        reseeds Python's module-level RNG.

    Returns:
        The loaded dataset dict; 'train', 'validation' and 'test' are mapped
        when present.

    Raises:
        ValueError: If an example's label is not one of
            'ENTAILMENT', 'CONTRADICTION', 'NEUTRAL'.
    """
    dataset = load_dataset("Aivalf/NLI_adversarial_dataset")

    # Generate only what is absent so an existing split is never overwritten.
    missing = [name for name in ('validation', 'test') if name not in dataset]
    if missing:
        train_data = dataset['train']
        n = len(train_data)
        chunk = n // 10  # each generated split uses 10% of the original train data
        indices = list(range(n))
        random.seed(42)
        random.shuffle(indices)
        start = 0
        for name in missing:
            dataset[name] = train_data.select(indices[start:start + chunk])
            start += chunk
        dataset['train'] = train_data.select(indices[start:])

    options = ['ENTAILMENT', 'CONTRADICTION', 'NEUTRAL']

    def map_example(example):
        label_text = example['label']
        label_index = options.index(label_text)
        return {
            'Context': example['premise'],
            'Question': f'Does the following statement classify as "ENTAILMENT", "CONTRADICTION", or "NEUTRAL"? {example["hypothesis"]}',
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "Classification",
            'Source Dataset': "Adversarial NLI"
        }

    for split in ['train', 'validation', 'test']:
        if split in dataset:
            dataset[split] = dataset[split].map(
                map_example,
                remove_columns=dataset[split].column_names
            )
    return dataset


def process_race():
    """Load RACE ("all" config) and rewrite every split into the unified MCQA layout.

    The answer letter stored in 'answer' is converted to a 0-based index
    (A=0, B=1, ...). Label_Text holds the text of the correct option, the
    same convention the AR-LSAT, LogiQA 2.0 and ReClor processors use.

    Returns:
        The loaded dataset dict with every split replaced by its mapped version.
    """
    dataset = load_dataset("ehovy/race", "all")

    def map_example(example):
        options = example['options']
        # Convert the letter answer to an index (A=0, B=1, C=2, D=3)
        label_index = ord(example['answer']) - ord('A')
        # Store the option text (not the letter) so Label_Text means the same
        # thing for every MCQA source in the combined dataset.
        label_text = options[label_index]
        return {
            'Context': example['article'],
            'Question': example['question'],
            'Options': options,
            'Label_Text': label_text,
            'Label': label_index,
            'Type': "MCQA",
            'Source Dataset': "RACE"
        }

    # Apply the mapping function to each split
    for split in dataset.keys():
        dataset[split] = dataset[split].map(
            map_example,
            remove_columns=dataset[split].column_names
        )
    return dataset

def process_reclor():
    """Load ReClor, carve a test split out of train, and unify the schema.

    A seed-42 shuffle of the train indices supplies 500 examples for a new
    'test' split; the rest stay in 'train'.

    Note:
        Calls `random.seed(42)`, which reseeds Python's module-level RNG.

    Returns:
        The loaded dataset dict with mapped 'train', 'validation' and 'test' splits.
    """
    reclor = load_dataset("metaeval/reclor")

    full_train = reclor['train']
    held_out = 500  # chosen by the original author to match the validation split size

    random.seed(42)
    order = list(range(len(full_train)))
    random.shuffle(order)

    reclor['test'] = full_train.select(order[:held_out])
    reclor['train'] = full_train.select(order[held_out:])

    def to_record(row):
        # 'label' is used as an index into 'answers' (the list of option texts).
        gold = row['label']
        choices = row['answers']
        return {
            'Context': row['context'],
            'Question': row['question'],
            'Options': choices,
            'Label_Text': choices[gold],
            'Label': gold,
            'Type': "MCQA",
            'Source Dataset': "ReClor"
        }

    for split_name in ('train', 'validation', 'test'):
        original = reclor[split_name]
        reclor[split_name] = original.map(to_record, remove_columns=original.column_names)
    return reclor



def combineDatasets():
    """Build one DatasetDict by concatenating all processed source datasets.

    Each ``process_*`` function is called once, in the order listed below.
    For every split name in ('train', 'validation', 'test') the matching
    split of each source that has it is concatenated, in that same order.

    Returns:
        A DatasetDict with 'train', 'validation' and 'test' splits. Nothing is
        written to disk here; saving is left to the caller.
    """
    # The list order fixes the row order inside each combined split.
    source_datasets = [
        process_ar_lsat(),
        process_reclor(),
        process_logiqa2(),
        process_rte(),
        process_folio(),
        process_prontoqa(),
        process_tellmewhy(),
        process_mrpc(),
        process_proofwriter(),
        process_multi_nli(),
        process_anli(),
        process_control(),
        process_race(),
    ]

    combined_dataset = DatasetDict()
    for split in ('train', 'validation', 'test'):
        # Sources that lack this split are simply left out of it
        combined_dataset[split] = concatenate_datasets(
            [source[split] for source in source_datasets if split in source]
        )

    return combined_dataset


# Build the combined DatasetDict by running every per-source processing function
combined_dataset = combineDatasets()

Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/4934 [00:00<?, ? examples/s]

Map:   0%|          | 0/87866 [00:00<?, ? examples/s]

Map:   0%|          | 0/4887 [00:00<?, ? examples/s]

In [39]:
# Display the splits, their feature names and their row counts
combined_dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 1163327
    })
    validation: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 123613
    })
    test: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 205891
    })
})

In [41]:
# Persist every split of the combined dataset, capping each shard at 1 GB.
# NOTE(review): this writes to the absolute path 'C:/combined_dataset-2', while the
# following cells reload from the relative path 'combined_dataset-2' — confirm both
# resolve to the same directory (i.e. the kernel's working directory is C:/).
combined_dataset.save_to_disk('C:/combined_dataset-2', max_shard_size="1GB")

Saving the dataset (0/1 shards):   0%|          | 0/1163327 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/123613 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/205891 [00:00<?, ? examples/s]

## 2.3 Explore The Combined Dataset

In [43]:
# Load the combined dataset
combined_dataset = load_from_disk('combined_dataset-2')

# Get the splits
splits = combined_dataset.keys()

# Read the 'Source Dataset' column once per split; every statistic below reuses
# these columns instead of materialising them again for each computation.
source_columns = {split: combined_dataset[split]['Source Dataset'] for split in splits}

# Print number of examples in each split
print("Number of examples in each split:")
for split in splits:
    num_examples = len(combined_dataset[split])
    print(f"{split.capitalize()}: {num_examples}")

# Calculate total number of examples
total_examples = sum(len(combined_dataset[split]) for split in splits)
print(f"\nTotal number of examples: {total_examples}")

# Print source dataset statistics for each split, accumulating the overall
# per-source totals in the same pass
print("\nSource Dataset counts in each split:")
total_source_counter = Counter()
for split in splits:
    source_counter = Counter(source_columns[split])
    total_source_counter.update(source_counter)
    print(f"\n{split.capitalize()} split:")
    for source, count in source_counter.items():
        print(f"{source}: {count}")

# Print total counts per source dataset across all splits
print("\nTotal counts per Source Dataset:")
for source, count in total_source_counter.items():
    print(f"{source}: {count}")

# Get all unique source datasets
all_sources = set(total_source_counter.keys())

# Locate, in a single pass, the first (split, row index) at which each source
# appears. Splits are visited in order, so this is the same example a
# per-source scan would find, without rescanning every split for every source.
first_occurrence = {}
for split in splits:
    if len(first_occurrence) == len(all_sources):
        break  # every source already located
    for idx, source in enumerate(source_columns[split]):
        if source not in first_occurrence:
            first_occurrence[source] = (split, idx)

# Display an example from each source dataset. Iterating the counter (first-seen
# order) rather than the set keeps the output order stable between runs.
print("\nExamples from each Source Dataset:")
for source in total_source_counter:
    split, idx = first_occurrence[source]
    example = combined_dataset[split][idx]
    print(f"\nSource: {source} (from {split} split)")
    print(f"Context: {example['Context']}\n")
    print(f"Question: {example['Question']}\n")
    print(f"Options: {example['Options']}\n")
    print(f"Label_Text: {example['Label_Text']}\n")
    print(f"Label_Index: {example['Label']}\n")


Number of examples in each split:
Train: 1163327
Validation: 123613
Test: 205891

Total number of examples: 1492831

Source Dataset counts in each split:

Train split:
AR-LSAT: 1585
ReClor: 4138
LogiQA 2.0: 12567
RTE: 2213
FOLIO: 798
PrOntoQA: 2304
TellMeWhy: 71892
MRPC: 1943
ProofWriter: 585552
MultiNLI: 382870
Adversarial NLI: 2880
ConTRoL: 6719
RACE: 87866

Validation split:
AR-LSAT: 231
ReClor: 500
LogiQA 2.0: 1569
RTE: 277
FOLIO: 203
PrOntoQA: 288
TellMeWhy: 8976
MRPC: 408
ProofWriter: 85468
MultiNLI: 19647
Adversarial NLI: 360
ConTRoL: 799
RACE: 4887

Test split:
AR-LSAT: 230
ReClor: 500
LogiQA 2.0: 1572
RTE: 277
FOLIO: 203
PrOntoQA: 288
TellMeWhy: 10689
MRPC: 1725
ProofWriter: 174476
MultiNLI: 9832
Adversarial NLI: 360
ConTRoL: 805
RACE: 4934

Total counts per Source Dataset:
AR-LSAT: 2046
ReClor: 5138
LogiQA 2.0: 15708
RTE: 2767
FOLIO: 1204
PrOntoQA: 2880
TellMeWhy: 91557
MRPC: 4076
ProofWriter: 845496
MultiNLI: 412349
Adversarial NLI: 3600
ConTRoL: 8323
RACE: 97687

Examples f

## 2.4 Clean The Combined Dataset

In [44]:
# Reload the combined dataset from disk (relative path, resolved against the
# kernel's working directory) and display its splits, features and row counts
combined_dataset = load_from_disk('combined_dataset-2')
combined_dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 1163327
    })
    validation: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 123613
    })
    test: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 205891
    })
})

In [45]:
def find_max_length(data):
    """Return the largest token count found among the examples in ``data``.

    Each example's 'Context' and 'Question' are joined with a single space and
    encoded, without truncation and with special tokens, by the notebook-level
    ``tokenizer``. Options are not part of the measured text.

    Args:
        data: Iterable of examples exposing 'Context' and 'Question'.

    Returns:
        The maximum number of token ids over all examples, or 0 if ``data``
        yields nothing.
    """
    token_counts = (
        len(tokenizer.encode(row['Context'] + " " + row['Question'], add_special_tokens=True))
        for row in data
    )
    return max(token_counts, default=0)

# Measure, for every source present in the test split, the longest tokenized
# "Context + Question" input
test_split = combined_dataset["test"]
unique_sources = set(test_split["Source Dataset"])
max_lengths = {}

for source in unique_sources:
    # Keep only the test rows that belong to the current source
    source_data = test_split.filter(lambda row: row['Source Dataset'] == source)
    # Longest tokenized input among those rows
    max_length = find_max_length(source_data)
    max_lengths[source] = max_length
    print(f"Maximum length of tokenized input for {source}: {max_length}")

Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for RACE: 1110


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for Adversarial NLI: 324


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for RTE: 303


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for PrOntoQA: 156


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for TellMeWhy: 113


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for AR-LSAT: 206


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for LogiQA 2.0: 270


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for FOLIO: 229


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for MRPC: 116


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for MultiNLI: 341


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for ProofWriter: 281


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for ConTRoL: 1702


Filter:   0%|          | 0/205891 [00:00<?, ? examples/s]

Maximum length of tokenized input for ReClor: 184


In [48]:
# Define the callable class for computing token lengths
class ComputeTokenLengths:
    """Batched ``Dataset.map`` callable that adds a 'token_length' column.

    The tokenizer is not created in ``__init__``; it is loaded from
    ``tokenizer_name`` the first time the instance is called.
    """

    def __init__(self, tokenizer_name):
        """Store the name of the tokenizer to load on first use.

        Args:
            tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        """
        # Honour the argument rather than reading the notebook-level
        # `pretrained_model_name`, so the instance uses what the caller asked for.
        self.tokenizer_name = tokenizer_name
        self.tokenizer = None

    def __call__(self, examples):
        """Compute the token count of every example in a batch.

        Args:
            examples: Batch with 'Context', 'Question' and 'Options' columns,
                where each 'Options' entry is a sequence of strings.

        Returns:
            The same batch with an added 'token_length' list holding, per
            example, the number of token ids (special tokens included, no
            truncation) of "context question option1 option2 ...".
        """
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

        # Concatenate context, question, and options into full texts
        full_texts = [
            f"{ctx} {ques} {' '.join(opts)}" for ctx, ques, opts in zip(
                examples['Context'], examples['Question'], examples['Options'])
        ]
        # Tokenize
        tokenized_inputs = self.tokenizer(
            full_texts, truncation=False, add_special_tokens=True
        )
        # Get lengths
        examples['token_length'] = [len(ids) for ids in tokenized_inputs['input_ids']]
        return examples

# Create the callable handed to Dataset.map below; the tokenizer itself is only
# loaded the first time the instance is called
compute_token_lengths = ComputeTokenLengths(pretrained_model_name)

# Function to compute the number of examples with fewer than 500 tokens by source
def count_examples_fewer_than_500_tokens(dataset):
    """Count, per source dataset, the examples shorter than 500 tokens.

    Args:
        dataset: A dataset split; it is mapped through the notebook-level
            ``compute_token_lengths`` callable to obtain 'token_length'.

    Returns:
        A ``defaultdict(int)`` mapping each 'Source Dataset' value to the number
        of its examples whose 'token_length' is strictly below 500. Sources with
        no such example do not appear as keys.
    """
    # Attach a 'token_length' value to every example
    with_lengths = dataset.map(
        compute_token_lengths, batched=True, batch_size=1000, num_proc=1
    )
    tally = defaultdict(int)
    for row in with_lengths:
        if row['token_length'] >= 500:
            continue  # at or above the limit: not counted
        tally[row['Source Dataset']] += 1
    return tally

# For every split, report how many examples per source stay under 500 tokens
splits = combined_dataset.keys()
for split in splits:
    split_data = combined_dataset[split]
    print(f"\nCounting examples with fewer than 500 tokens in the {split} split...")
    count_under_500 = count_examples_fewer_than_500_tokens(split_data)
    print(f"Number of examples in {split} split with fewer than 500 tokens:")
    # One "source: count" line per source that has at least one such example
    for source, count in count_under_500.items():
        print(f"{source}: {count}")


Counting examples with fewer than 500 tokens in the train split...


Map:   0%|          | 0/1163327 [00:00<?, ? examples/s]



Number of examples in train split with fewer than 500 tokens:
AR-LSAT: 1585
ReClor: 4138
LogiQA 2.0: 12567
RTE: 2213
FOLIO: 798
PrOntoQA: 2304
TellMeWhy: 71892
MRPC: 1943
ProofWriter: 585552
MultiNLI: 382870
Adversarial NLI: 2880
ConTRoL: 3768
RACE: 78867

Counting examples with fewer than 500 tokens in the validation split...


Map:   0%|          | 0/123613 [00:00<?, ? examples/s]

Number of examples in validation split with fewer than 500 tokens:
AR-LSAT: 231
ReClor: 499
LogiQA 2.0: 1569
RTE: 277
FOLIO: 203
PrOntoQA: 288
TellMeWhy: 8976
MRPC: 408
ProofWriter: 85468
MultiNLI: 19647
Adversarial NLI: 360
ConTRoL: 595
RACE: 4428

Counting examples with fewer than 500 tokens in the test split...


Map:   0%|          | 0/205891 [00:00<?, ? examples/s]

Number of examples in test split with fewer than 500 tokens:
AR-LSAT: 230
ReClor: 500
LogiQA 2.0: 1571
RTE: 277
FOLIO: 203
PrOntoQA: 288
TellMeWhy: 10689
MRPC: 1725
ProofWriter: 174476
MultiNLI: 9832
Adversarial NLI: 360
ConTRoL: 415
RACE: 4473


In [49]:
# Define the callable class for computing and filtering token lengths
class FilterTokenLengths:
    """Batched ``Dataset.map`` callable that drops over-long examples.

    The tokenizer is not created in ``__init__``; it is loaded from
    ``tokenizer_name`` the first time the instance is called.
    """

    def __init__(self, tokenizer_name, max_tokens=500):
        """Store the tokenizer name and the length limit.

        Args:
            tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            max_tokens: Largest token count an example may have and still be
                kept. Defaults to 500.
        """
        # Honour the argument rather than reading the notebook-level
        # `pretrained_model_name`, so the instance uses what the caller asked for.
        self.tokenizer_name = tokenizer_name
        self.max_tokens = max_tokens
        self.tokenizer = None

    def __call__(self, examples):
        """Return the batch restricted to examples within the token limit.

        Args:
            examples: Batch (mapping of column name to list) with 'Context',
                'Question' and 'Options' columns, where each 'Options' entry is
                a sequence of strings.

        Returns:
            A dict with the same columns in which only the rows whose
            "context question option1 option2 ..." text encodes to at most
            ``max_tokens`` token ids (special tokens included) remain.
        """
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

        # Concatenate context, question, and options into full texts
        full_texts = [
            f"{ctx} {ques} {' '.join(opts)}" for ctx, ques, opts in zip(
                examples['Context'], examples['Question'], examples['Options'])
        ]
        # Tokenize
        tokenized_inputs = self.tokenizer(
            full_texts, truncation=False, add_special_tokens=True
        )
        # One keep/drop flag per row of the batch
        keep = [len(ids) <= self.max_tokens for ids in tokenized_inputs['input_ids']]
        # Apply the same row selection to every column
        return {
            column: [value for value, kept in zip(values, keep) if kept]
            for column, values in examples.items()
        }

# Create the callable handed to Dataset.map below; the tokenizer itself is only
# loaded the first time the instance is called
filter_token_lengths = FilterTokenLengths(pretrained_model_name)

# Define function to filter out long examples in each dataset split
def filter_long_examples(dataset):
    """Run the notebook-level ``filter_token_lengths`` callable over ``dataset``.

    The map is batched (1000 rows per batch, single process) and bypasses the
    on-disk cache so the filter is always recomputed.

    Args:
        dataset: The dataset split to filter.

    Returns:
        Whatever ``dataset.map`` returns for the filtering callable.
    """
    map_options = dict(
        batched=True,
        batch_size=1000,
        num_proc=1,
        load_from_cache_file=False,
    )
    return dataset.map(filter_token_lengths, **map_options)

# Build a new DatasetDict holding, for each split, only the examples that pass
# the token-length filter; the combined dataset itself is left untouched
splits = combined_dataset.keys()
filtered_dataset = DatasetDict()
for split in splits:
    print(f"\nFiltering examples longer than 500 tokens in the {split} split...")
    split_data = combined_dataset[split]
    filtered_dataset[split] = filter_long_examples(split_data)



Filtering examples longer than 500 tokens in the train split...


Map:   0%|          | 0/1163327 [00:00<?, ? examples/s]


Filtering examples longer than 500 tokens in the validation split...


Map:   0%|          | 0/123613 [00:00<?, ? examples/s]


Filtering examples longer than 500 tokens in the test split...


Map:   0%|          | 0/205891 [00:00<?, ? examples/s]

In [50]:
# Get the splits
splits = filtered_dataset.keys()

# Read the 'Source Dataset' column once per split; every statistic below reuses
# these columns instead of materialising them again for each computation.
source_columns = {split: filtered_dataset[split]['Source Dataset'] for split in splits}

# Print number of examples in each split
print("Number of examples in each split:")
for split in splits:
    num_examples = len(filtered_dataset[split])
    print(f"{split.capitalize()}: {num_examples}")

# Calculate total number of examples
total_examples = sum(len(filtered_dataset[split]) for split in splits)
print(f"\nTotal number of examples: {total_examples}")

# Print source dataset statistics for each split, accumulating the overall
# per-source totals in the same pass
print("\nSource Dataset counts in each split:")
total_source_counter = Counter()
for split in splits:
    source_counter = Counter(source_columns[split])
    total_source_counter.update(source_counter)
    print(f"\n{split.capitalize()} split:")
    for source, count in source_counter.items():
        print(f"{source}: {count}")

# Print total counts per source dataset across all splits
print("\nTotal counts per Source Dataset:")
for source, count in total_source_counter.items():
    print(f"{source}: {count}")

# Get all unique source datasets
all_sources = set(total_source_counter.keys())

# Locate, in a single pass, the first (split, row index) at which each source
# appears. Splits are visited in order, so this is the same example a
# per-source scan would find, without rescanning every split for every source.
first_occurrence = {}
for split in splits:
    if len(first_occurrence) == len(all_sources):
        break  # every source already located
    for idx, source in enumerate(source_columns[split]):
        if source not in first_occurrence:
            first_occurrence[source] = (split, idx)

# Display an example from each source dataset. Iterating the counter (first-seen
# order) rather than the set keeps the output order stable between runs.
print("\nExamples from each Source Dataset:")
for source in total_source_counter:
    split, idx = first_occurrence[source]
    example = filtered_dataset[split][idx]
    print(f"\nSource: {source} (from {split} split)")
    print(f"Context: {example['Context']}\n")
    print(f"Question: {example['Question']}\n")
    print(f"Options: {example['Options']}\n")
    print(f"Label_Text: {example['Label_Text']}\n")
    print(f"Label_Index: {example['Label']}\n")


Number of examples in each split:
Train: 1151512
Validation: 122956
Test: 205044

Total number of examples: 1479512

Source Dataset counts in each split:

Train split:
AR-LSAT: 1585
ReClor: 4138
LogiQA 2.0: 12567
RTE: 2213
FOLIO: 798
PrOntoQA: 2304
TellMeWhy: 71892
MRPC: 1943
ProofWriter: 585552
MultiNLI: 382870
Adversarial NLI: 2880
ConTRoL: 3772
RACE: 78998

Validation split:
AR-LSAT: 231
ReClor: 499
LogiQA 2.0: 1569
RTE: 277
FOLIO: 203
PrOntoQA: 288
TellMeWhy: 8976
MRPC: 408
ProofWriter: 85468
MultiNLI: 19647
Adversarial NLI: 360
ConTRoL: 595
RACE: 4435

Test split:
AR-LSAT: 230
ReClor: 500
LogiQA 2.0: 1571
RTE: 277
FOLIO: 203
PrOntoQA: 288
TellMeWhy: 10689
MRPC: 1725
ProofWriter: 174476
MultiNLI: 9832
Adversarial NLI: 360
ConTRoL: 415
RACE: 4478

Total counts per Source Dataset:
AR-LSAT: 2046
ReClor: 5137
LogiQA 2.0: 15707
RTE: 2767
FOLIO: 1204
PrOntoQA: 2880
TellMeWhy: 91557
MRPC: 4076
ProofWriter: 845496
MultiNLI: 412349
Adversarial NLI: 3600
ConTRoL: 4782
RACE: 87911

Examples f

In [51]:
# Define the callable class
class ComputeTokenLengths:
    """Batched ``Dataset.map`` callable that adds a 'token_length' column.

    The tokenizer named by ``tokenizer_name`` is loaded on the first call
    rather than at construction time.
    """

    def __init__(self, tokenizer_name):
        """Remember which tokenizer to load; nothing is loaded yet."""
        self.tokenizer = None
        self.tokenizer_name = tokenizer_name

    def __call__(self, examples):
        """Attach per-example token counts to a batch and return the batch.

        Args:
            examples: Batch with 'Context', 'Question' and 'Options' columns.
                If the first 'Options' entry is a list, every entry is joined
                with newlines; otherwise the entries are used as they are.

        Returns:
            The same batch with 'token_length' set to the number of token ids
            (special tokens included, no truncation) of
            "context question options" for each example.
        """
        if self.tokenizer is None:
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

        # Normalise the options column to one string per example
        raw_options = examples['Options']
        if isinstance(raw_options[0], list):
            options_texts = ['\n'.join(choice_list) for choice_list in raw_options]
        else:
            options_texts = raw_options

        # Build the text that is measured for each example
        full_texts = []
        for ctx, ques, opts in zip(examples['Context'], examples['Question'], options_texts):
            full_texts.append(f"{ctx} {ques} {opts}")

        encoded = self.tokenizer(
            full_texts, truncation=False, add_special_tokens=True
        )
        examples['token_length'] = list(map(len, encoded['input_ids']))
        return examples

# Create the callable handed to Dataset.map below; the tokenizer itself is only
# loaded the first time the instance is called
compute_token_lengths = ComputeTokenLengths(pretrained_model_name)

# Function to compute max token lengths by source
def max_token_length_by_source(dataset):
    """Find the longest tokenized example of each source in ``dataset``.

    Args:
        dataset: A dataset split; it is mapped through the notebook-level
            ``compute_token_lengths`` callable (batched, 6 processes) to obtain
            'token_length'.

    Returns:
        A ``defaultdict(int)`` mapping each 'Source Dataset' value to the
        largest 'token_length' seen for it.
    """
    with_lengths = dataset.map(
        compute_token_lengths, batched=True, batch_size=1000, num_proc=6
    )
    longest = defaultdict(int)
    for row in with_lengths:
        src = row['Source Dataset']
        # Missing sources start at 0, so the first row always registers the key
        longest[src] = max(longest[src], row['token_length'])
    return longest

# Report the per-source maximum token length of every filtered split and keep
# a running maximum across all splits
splits = filtered_dataset.keys()
global_max_lengths = defaultdict(int)

for split in splits:
    print(f"\nProcessing {split} split...")
    split_data = filtered_dataset[split]
    max_lengths = max_token_length_by_source(split_data)
    print(f"Max token lengths in {split.capitalize()} split:")
    for source, max_len in max_lengths.items():
        print(f"{source}: {max_len}")
        # Fold this split's maximum into the cross-split maximum
        global_max_lengths[source] = max(global_max_lengths[source], max_len)

print("\nGlobal maximum token lengths across all splits:")
for source, max_len in global_max_lengths.items():
    print(f"{source}: {max_len}")



Processing train split...


Map (num_proc=6):   0%|          | 0/1151512 [00:00<?, ? examples/s]

Max token lengths in Train split:
AR-LSAT: 367
ReClor: 393
LogiQA 2.0: 496
RTE: 270
FOLIO: 290
PrOntoQA: 158
TellMeWhy: 120
MRPC: 113
ProofWriter: 284
MultiNLI: 464
Adversarial NLI: 446
ConTRoL: 500
RACE: 500

Processing validation split...


Map (num_proc=6):   0%|          | 0/122956 [00:00<?, ? examples/s]

Max token lengths in Validation split:
AR-LSAT: 414
ReClor: 366
LogiQA 2.0: 447
RTE: 265
FOLIO: 240
PrOntoQA: 155
TellMeWhy: 113
MRPC: 98
ProofWriter: 276
MultiNLI: 257
Adversarial NLI: 333
ConTRoL: 403
RACE: 500

Processing test split...


Map (num_proc=6):   0%|          | 0/205044 [00:00<?, ? examples/s]

Max token lengths in Test split:
AR-LSAT: 421
ReClor: 363
LogiQA 2.0: 390
RTE: 308
FOLIO: 232
PrOntoQA: 158
TellMeWhy: 118
MRPC: 119
ProofWriter: 284
MultiNLI: 345
Adversarial NLI: 333
ConTRoL: 478
RACE: 500

Global maximum token lengths across all splits:
AR-LSAT: 421
ReClor: 393
LogiQA 2.0: 496
RTE: 308
FOLIO: 290
PrOntoQA: 158
TellMeWhy: 120
MRPC: 119
ProofWriter: 284
MultiNLI: 464
Adversarial NLI: 446
ConTRoL: 500
RACE: 500


In [52]:
# Save the filtered dataset, capping each shard at 1 GB.
# NOTE(review): 'C:/cleaned_dataset-2' is a hard-coded absolute Windows path —
# confirm it is the intended location on the machine running the notebook.
filtered_dataset.save_to_disk('C:/cleaned_dataset-2', max_shard_size="1GB")

Saving the dataset (0/1 shards):   0%|          | 0/1151512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/122956 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/205044 [00:00<?, ? examples/s]

# End of Notebook