In [1]:
from data_utils import load_glue_datasets,\
    load_hans_dataset,\
    load_mnli_mismatched_dataset,\
    load_paws_qqp_dataset,\
    load_cola_ood_dataset,\
    task_to_keys

import os
import numpy as np
import pandas as pd


In [2]:
import numpy as np
import pandas as pd

from data_utils import task_to_keys

def get_sample_ids(path):
    """Read the sample ids stored in a saved examples file.

    Args:
        path: Location of a comma-separated file. Its first column is used
            as the frame index and it must contain an "idx" column.

    Returns:
        numpy.ndarray: The values of the "idx" column, in file order.

    Raises:
        KeyError: If the file has no "idx" column.
    """
    examples = pd.read_csv(path, sep=",", index_col=0)
    sample_ids = examples["idx"]
    return sample_ids.values

def _select_subset_by_ids(dataset, indices):
    subset = dataset.select(indices)
    return subset

def _select_subset_by_idx(dataset, indices):
    dataset = dataset.filter(
        lambda s: s["idx"] in indices)
    return dataset

def get_balanced_subsets(dataset):
    """Split ``dataset`` into one filtered subset per label index.

    Args:
        dataset: Object exposing ``features["label"].names`` (one entry per
            label) and ``filter(predicate)``; samples carry a "label" field.

    Returns:
        dict: Maps each label index ``0 .. len(names) - 1`` to the result of
        filtering ``dataset`` down to the samples with that label.
    """
    def _has_label(label):
        # Bind ``label`` per predicate. A lambda that closes over the loop
        # variable would see only its final value if ``filter`` evaluates
        # the predicate lazily.
        return lambda s: s["label"] == label

    num_labels = len(dataset.features["label"].names)
    return {
        label_idx: dataset.filter(_has_label(label_idx))
        for label_idx in range(num_labels)
    }

def _select_random_subset(dataset, num_shots, balanced=False, seed=123):
    # fix seed
    np.random.seed(seed)

    if num_shots < 1:
        return [], []

    if balanced:
        assert num_shots % 2 == 0, "a balanced context requires at least one demonstartion per label"
        # select the same number of samples from every label
        indices = []  # we collect all indices here
        subset_per_label = get_balanced_subsets(dataset)

        for _, samples in subset_per_label.items():
            subset_indices = samples["idx"]
            # select num_shots // 2 samples
            subset_indices = np.random.choice(
                subset_indices, size=num_shots // 2, replace=False)
            indices += list(subset_indices)
        assert len(indices) == num_shots
    else:
        # just select a random subset of samples
        indices = np.random.choice(
            dataset['idx'], size=num_shots, replace=False)

    # return _select_subset_by_ids(dataset, indices), indices
    return _select_subset_by_idx(dataset, indices), indices


def create_few_shot_context(
    dataset_name,
    dataset,
    num_shots,
    description="",
    remove_label=False,
    from_indices=None,
    balanced=False,
    shuffle=False,
    seed=123,
    separate_description_by="\n\n",
    separate_shots_by="\n\n",
):
    """Build a textual few-shot context (prompt prefix) from dataset samples.

    Each demonstration is rendered as ``<Key1>: <text1>`` (plus a second
    ``<Key2>: <text2>`` line when the task has two text fields), followed by
    a ``Label: <verbalized label>`` line and ``separate_shots_by``.

    Args:
        dataset_name: Key into ``task_to_keys`` naming the text field(s).
        dataset: Source dataset; must expose ``features['label']`` with an
            ``int2str`` method.
        num_shots: Number of demonstrations to sample. Ignored when
            ``from_indices`` is given.
        description: Optional text placed before the demonstrations.
        remove_label: If True, every label is rendered as an empty string.
        from_indices: If given, these are passed to ``_select_subset_by_ids``
            instead of sampling randomly.
        balanced: Forwarded to ``_select_random_subset``.
        shuffle: If True, shuffle the selected demonstrations with ``seed``.
        seed: Seed for random selection and shuffling.
        separate_description_by: Text inserted after a non-empty
            ``description`` (defaults to a blank line).
        separate_shots_by: Text appended after every demonstration
            (defaults to a blank line).

    Returns:
        tuple: ``(context, indices)`` - the assembled string and the indices
        of the demonstrations (``np.array(from_indices)`` or whatever
        ``_select_random_subset`` returned).
    """
    # Select the samples from which the context will be constructed.
    if from_indices is not None:
        # NOTE(review): this path goes through ``dataset.select`` while the
        # random path filters on the "idx" column; confirm that indices
        # returned by one call can be fed back in as ``from_indices``.
        demonstrations = _select_subset_by_ids(dataset, from_indices)
        indices = np.array(from_indices)
    else:
        demonstrations, indices = _select_random_subset(
            dataset, num_shots, balanced, seed)

    # An empty selection is a plain list without ``shuffle``, hence the guard.
    if shuffle and len(demonstrations) > 0:
        demonstrations = demonstrations.shuffle(seed)

    int_to_label_converter = dataset.features['label']

    # The text field names are the same for every sample: look them up once.
    text_keys = task_to_keys[dataset_name]
    first_key, second_key = text_keys[0], text_keys[1]
    first_prefix = first_key.capitalize()
    second_prefix = second_key.capitalize() if second_key is not None else None

    rendered_shots = []
    for sample in demonstrations:
        formatted_sample = f"{first_prefix}: {sample[first_key]}"
        if second_key is not None:
            formatted_sample += f"\n{second_prefix}: {sample[second_key]}"

        # Unlabeled samples (label == -1) and remove_label give an empty label.
        if sample["label"] == -1 or remove_label:
            verbalized_label = ""
        else:
            verbalized_label = int_to_label_converter.int2str(sample["label"])

        rendered_shots.append(
            f"{formatted_sample}\nLabel: {verbalized_label}{separate_shots_by}")

    header = "" if description == "" else f"{description}{separate_description_by}"
    context = header + "".join(rendered_shots)

    return context, indices

In [3]:
### Example few-shot contexts: GLUE tasks (MNLI, RTE, QQP), followed by HANS and PAWS-QQP

In [4]:
# 3-shot context sampled from the MNLI 'train' split (function defaults otherwise).
data_set_used = 'mnli'
datasets, labels, num_labels = load_glue_datasets(data_set_used)
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=datasets['train'],
    num_shots=3,
)
print(context)

Premise: The Collection and Analysis of Qualitative Data in Evaluation Research.
Hypothesis: Only quantitative data can be used in evaluation research.
Label: contradiction

Premise: The total producer costs estimated by EPA including the costs of certification, addization of the detergents, recordkeeping and enforcement through the year 2000 is almost $704 million.
Hypothesis: The costs of certification are not included in EPA's total producer costs estimates.
Label: contradiction

Premise: If you watch the action closely, you can learn a lot about Indian people by what makes them cheer, laugh, or weep.
Hypothesis: You can learn a lot about Indian people by watching their actions.
Label: entailment




In [5]:
# 3-shot context sampled from the MNLI 'validation_matched' split.
data_set_used = 'mnli'
datasets, labels, num_labels = load_glue_datasets(data_set_used)
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=datasets['validation_matched'],
    num_shots=3,
)
print(context)

Premise: A 1997 Henry J. Kaiser Family Foundation survey found that Americans in managed care plans are basically content with their own care.
Hypothesis: the Henry Kaiser foundation shows that people like their healthcare
Label: entailment

Premise: In DOD's current acquisition environment, the customer is willing to trade time and money for the highest performing weapon system possible.
Hypothesis: Having the highest performing weapon system is of paramount importance being prioritized above time and money.
Label: entailment

Premise: Eh! Monsieur Lawrence, called Poirot. 
Hypothesis: Poirot requested the attention of Monsieur Lawrence.
Label: entailment




In [6]:
# 3-shot context sampled from the RTE 'train' split.
data_set_used = 'rte'
datasets, labels, num_labels = load_glue_datasets(data_set_used)
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=datasets['train'],
    num_shots=3,
)
print(context)

Sentence1: Britain agreed to lift by March 31 a 150-mile military protection zone enforced around the islands since Argentina invaded them in 1982.
Sentence2: The military protection zone around Falklands was lifted.
Label: entailment

Sentence1: Clonaid, which claims to have produced 13 cloned babies worldwide, told the Streats daily newspaper two Singaporean couples had signed deals agreeing to pay $200,000 to conceive children through cloning.
Sentence2: Clonaid has cloned 13 babies.
Label: entailment

Sentence1: In 1927 Harnold Lamb wrote a biography of Genghis Khan, and following on its success turned more and more to the writing of non-fiction, penning numerous biographies and history books until his death in 1962.
Sentence2: Harnold Lamb authored many biographies.
Label: entailment




In [7]:
# 3-shot context sampled from the RTE 'validation' split.
data_set_used = 'rte'
datasets, labels, num_labels = load_glue_datasets(data_set_used)
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=datasets['validation'],
    num_shots=3,
)
print(context)

Sentence1: Italian film-maker, Fellini was awarded an honorary Oscar for lifetime achievement. He died on October 31, 1993.
Sentence2: An Italian director is awarded an honorary Oscar.
Label: entailment

Sentence1: The Daily Telegraph, most prized asset in Lord Conrad Black's crumbling media empire, has been sold to Britain's Barclay twins.
Sentence2: Daily telegraph is sold.
Label: entailment

Sentence1: Monica Meadows, a 22-year-old model from Atlanta, was shot in the shoulder on a subway car in New York City.
Sentence2: Monica Meadows, 23, was shot in shoulder while riding a subway car in New York City
Label: not_entailment




In [8]:
# 3-shot context sampled from the QQP 'train' split.
data_set_used = 'qqp'
datasets, labels, num_labels = load_glue_datasets(data_set_used)
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=datasets['train'],
    num_shots=3,
)
print(context)

Question1: How soon one can learn guitar?
Question2: What is the easiest way to learn guitar?
Label: not_duplicate

Question1: Does global warming exist?
Question2: Is Global warming real or a hoax?
Label: duplicate

Question1: Which laptop configuration is better? I3-5th gen with 8gb ram or i5-6th gen with 4gb ram.?
Question2: Which laptop configuration is better? I3-5th gen with 8gb ram or i5-6th gen with 4gb ram.
Label: duplicate




In [9]:
# 3-shot context sampled from the QQP 'validation' split.
data_set_used = 'qqp'
datasets, labels, num_labels = load_glue_datasets(data_set_used)
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=datasets['validation'],
    num_shots=3,
)
print(context)

Question1: Where can I find a date picker similar to the one in Google Analytics? Not Good
Question2: Is it a good idea to block Google Analytics?
Label: not_duplicate

Question1: What is Fiscal deficite?
Question2: What is Fiscal Deficit?
Label: duplicate

Question1: Why do noses never stop growing?
Question2: Is it true your nose never stops growing?
Label: duplicate




In [10]:
# 3-shot context sampled from HANS, loaded with the 'lexical_overlap' heuristic.
data_set_used = 'hans'
datasets, subset = load_hans_dataset(heuristic='lexical_overlap')
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=datasets,
    num_shots=3,
)
print(context)

Premise: The senators admired the lawyers .
Hypothesis: The lawyers admired the senators .
Label: non-entailment

Premise: The doctors advised the artists who the presidents contacted .
Hypothesis: The presidents advised the doctors .
Label: non-entailment

Premise: The athlete was thanked by the banker .
Hypothesis: The banker thanked the athlete .
Label: entailment




In [11]:
# 3-shot context sampled from the local PAWS-QQP file under ./data/paws_qqp/.
data_set_used = 'paws-qqp'
data_path = os.path.join(os.getcwd(), 'data')
paws_qqp_file = os.path.join(data_path, 'paws_qqp', 'dev_and_test.tsv')
dataset, dataset_name = load_paws_qqp_dataset(path=paws_qqp_file)
context, indices = create_few_shot_context(
    dataset_name=data_set_used,
    dataset=dataset,
    num_shots=3,
)
print(context)

Sentence1: Is LinkedIn a good acquisition for Microsoft ? Does it make strategic sense ?
Sentence2: Is LinkedIn a strategic acquisition for Microsoft ? Does it make good sense ?
Label: non_duplicate

Sentence1: What are some of the good so-called Indian `Art-films ' ?
Sentence2: What are some of the Indian so-called good `Art-films ' ?
Label: non_duplicate

Sentence1: I 'm 16 years old girl . I am 5 ' 3 '' tall and weigh 80 kg . How do I lose weight ?
Sentence2: I am 16 years old girl . I 'm 5 3 `` tall and weigh 80 kg . How do I lose weight ?
Label: duplicate


