In [19]:
from datasets import load_dataset
import re
import numpy as np

In [20]:
label_names_mapping = {
    # Maps a dataset config name to its list of label names, where the list
    # position is the integer class index. None marks configs for which no
    # label-name list is defined here.
    # Label orderings were copied from:
    # glue: https://github.com/tensorflow/datasets/blob/1e12611a0be5f4753d271d7eb1dde15eb8f0185c/docs/community_catalog/huggingface/glue.md
    # super_glue: https://github.com/tensorflow/datasets/blob/1e12611a0be5f4753d271d7eb1dde15eb8f0185c/docs/community_catalog/huggingface/super_glue.md
    "cola": ["unacceptable", "acceptable"],
    "mnli": ["entailment", "neutral", "contradiction"],
    "mrpc": ["not_equivalent", "equivalent"],
    "qnli": ["entailment", "not_entailment"],
    "qqp": ["not_duplicate", "duplicate"],
    "rte": ["entailment", "not_entailment"],
    "sst2": ["negative", "positive"],
    "stsb": None,
    "ax": ["entailment", "neutral", "contradiction"],
    "wnli": ["not_entailment", "entailment"],
    "boolq": ["False", "True"],
    "cb": ["entailment", "contradiction", "neutral"],
    "copa": ["choice1", "choice2"],
    "multirc": ["False", "True"],
    "record": None,
    "wic": ["False", "True"],
    "wsc": ["False", "True"],
    "wsc.fixed": ["False", "True"],
    "axb": ["entailment", "not_entailment"],
    "axg": ["entailment", "not_entailment"],
}

In [36]:
def preprocess_glue_one_example(x, benchmark_name, label_names, feature_names=None, id_key='idx'):
    """
    Convert one GLUE / SuperGLUE example into a text-to-text example.

    CODE SOURCE: https://github.com/google-research/text-to-text-transfer-transformer/blob/24d9d3b89b129e586bbfe35cffbc5926d88adc5e/t5/data/preprocessors.py#L734C1-L812C12

    The feature names of the example are used to unpack it into a format
    amenable for a text2text problem. For example, consider the Quora
    Question Pairs (QQP) benchmark, which would suggest
    benchmark_name="qqp"
    label_names=['not_duplicate', 'duplicate']
    For QQP, a typical example might look like
    {
        "question1": "Why do I easily get bored of my friends?",
        "question2": "Why do I get bored of my friends so quickly?",
        "label": 1,
        "idx": 10,
    }

    This example would be transformed to
    {
        "inputs": (
            "qqp question1: Why do I easily get bored of my friends? question2: "
            "Why do I get bored of my friends so quickly?"
        ),
        "targets": "duplicate",
        "idx": 10,
    }

    Args:
    x: an example to process (a mapping that contains at least 'label').
    benchmark_name: the name of the GLUE benchmark for this dataset. It is
        prepended to the input text.
    label_names: a list of label names corresponding to class index.
    feature_names: an optional ordered list of feature names. If provided,
        features will be ordered in this way in the output. If not provided
        (or empty), all features except 'label', 'idx' and `id_key` will be
        used, sorted by name.
    id_key: str, key for id in the dataset. Defaults to 'idx'. Its value is
        always stored under the output key 'idx'. If None, no id will be
        added to the output. The 'multirc' benchmark ignores this argument
        and always reads the nested x['idx'].

    Returns:
    A preprocessed example: a dict with 'inputs', 'targets' and the id
    field(s). 'targets' is "<unk>" when the label is -1.

    Raises:
    TypeError: if the example is labelled but `label_names` is None.
    """
    if feature_names:
        feature_keys = feature_names
    else:
        # If an ordering is not provided, sort feature keys to ensure a
        # consistent order. A custom id column is excluded as well, so that
        # the example id never ends up inside the model input text.
        excluded_keys = {'label', 'idx'}
        if id_key:
            excluded_keys.add(id_key)
        feature_keys = sorted(set(x.keys()) - excluded_keys)

    # Benchmark name first, then "key:" followed by the feature text.
    strs_to_join = [benchmark_name]
    for key in feature_keys:
        strs_to_join.append('{}:'.format(key))
        strs_to_join.append(str(x[key]))

    # label name; -1 is mapped to a placeholder target instead of being looked up
    if x['label'] == -1:
        label_name = "<unk>"
    elif label_names is None:
        # Same exception type as indexing None would raise, but with a
        # message that says what is actually wrong.
        raise TypeError(
            "label_names is None for benchmark '{}'; a list of label names "
            "is required to convert labelled examples.".format(benchmark_name))
    else:
        label_name = label_names[x["label"]]

    ex = {}
    joined = " ".join(strs_to_join)
    if benchmark_name == 'multirc':
        # Remove HTML markup.
        joined = re.sub(r"<br>", " ", joined)
        joined = re.sub(r"<(/)?b>", "", joined)

        # Store the data index in the returned example (used by eval)
        ex['idx/paragraph'] = x['idx']['paragraph']
        ex['idx/question'] = x['idx']['question']
        ex['idx/answer'] = x['idx']['answer']
    elif id_key:
        # Store the data index in the returned example (used by eval)
        ex['idx'] = x[id_key]

    ex['inputs'] = joined
    ex['targets'] = label_name

    return ex

In [50]:
# Convert every split of one benchmark to text-to-text examples, then report
# length percentiles (in space-separated tokens) of the inputs and targets.
import datasets

# Benchmark to process; must be a key of label_names_mapping.
dataset_config_name = "rte"
raw_datasets = load_dataset("super_glue", dataset_config_name)
label_names = label_names_mapping[dataset_config_name]


def preprocess_split(raw_split, benchmark_name, label_names):
    """Return `raw_split` converted example by example into a text-to-text Dataset.

    Args:
        raw_split: an iterable of raw examples (one split of the loaded dataset).
        benchmark_name: name prepended to every input text.
        label_names: list of label names indexed by class id.

    Returns:
        A `datasets.Dataset` built from the preprocessed examples, in the same
        order as `raw_split`.
    """
    return datasets.Dataset.from_list([
        preprocess_glue_one_example(
            x=instance,
            benchmark_name=benchmark_name,
            label_names=label_names,
        )
        for instance in raw_split
    ])


train_dataset = preprocess_split(raw_datasets["train"], dataset_config_name, label_names)
eval_dataset = preprocess_split(raw_datasets["validation"], dataset_config_name, label_names)
test_dataset = preprocess_split(raw_datasets["test"], dataset_config_name, label_names)

# Token counts pooled over train, validation and test (in that order).
all_splits = (train_dataset, eval_dataset, test_dataset)
len_inputs = [len(ex["inputs"].split(" ")) for split in all_splits for ex in split]
len_targets = [len(ex["targets"].split(" ")) for split in all_splits for ex in split]

# Min, quartiles, 90th percentile and max of the token counts.
PERCENTILES = [0, 25, 50, 75, 90, 100]
print(np.percentile(len_inputs, PERCENTILES))
print(np.percentile(len_targets, PERCENTILES))

[ 10.  33.  43.  63. 106. 247.]
[1. 1. 1. 1. 1. 1.]


In [52]:
# Spot-check one preprocessed training example (index 4) to eyeball the
# generated 'inputs' / 'targets' text.
train_dataset[4]

{'idx': 4,
 'inputs': "rte hypothesis: Paul Stewart Hutchinson is accused of having stabbed a girl. premise: A man is due in court later charged with the murder 26 years ago of a teenager whose case was the first to be featured on BBC One's Crimewatch. Colette Aram, 16, was walking to her boyfriend's house in Keyworth, Nottinghamshire, on 30 October 1983 when she disappeared. Her body was later found in a field close to her home. Paul Stewart Hutchinson, 50, has been charged with murder and is due before Nottingham magistrates later.",
 'targets': 'not_entailment'}