## Code to filter different eval datasets based on the AOChildes vocabulary

In [None]:
import re
import os

import datasets
from datasets import load_dataset, load_from_disk
from datasets import DatasetDict
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from datasets import Dataset

#### Filtering used in minibert

In [None]:
# Characters stripped from every token before the vocabulary lookup:
# digits, ASCII/typographic quotes, dashes and common punctuation.
_STRIP_CHARS_RE = re.compile(r'[0-9!:&“”—\-_,@#$?;’.\'()"]')

# Tasks whose text is just the space-joined value of top-level string fields.
_TASK_FIELDS = {
    'anli': ('premise', 'hypothesis'),
    'boolq': ('question', 'passage'),
    'piqa': ('goal', 'sol1', 'sol2'),
    'rte': ('sentence1', 'sentence2'),
    'wic': ('sentence1', 'sentence2'),
    'winogrande': ('sentence',),
    'blimp': ('sentence_good', 'sentence_bad'),
    'copa': ('premise', 'choice1', 'choice2'),
    'sst': ('sentence',),
    'qqp': ('question1', 'question2'),
    'mrpc': ('sentence1', 'sentence2'),
    'mnli': ('premise', 'hypothesis'),
}


def _example_text(example, task):
    """Return all text fields of `example` relevant to `task`, joined by spaces."""
    if task in _TASK_FIELDS:
        parts = [example[field] for field in _TASK_FIELDS[task]]
    elif task == 'arc':
        # Multiple choice: the question followed by every answer option.
        parts = [example['question'], *example['choices']['text']]
    elif task == 'hellaswag':
        parts = [example['activity_label'], example['ctx'], *example['endings']]
    elif task == 'openbookqa':
        parts = [example['question_stem'], *example['choices']['text']]
    elif task == 'truthfulqa_mc1':
        parts = [example['question'], *example['mc1_targets']['choices']]
    elif task == 'truthfulqa_mc2':
        parts = [example['question'], *example['mc2_targets']['choices']]
    else:
        raise ValueError(f'Unsupported task for vocabulary filtering: {task!r}')
    return ' '.join(parts)


def filter_dataset_mini(example, task, aochildes_vocab, contractions):
    """Flag whether every checked word of `example` is in `aochildes_vocab`.

    The text fields used depend on `task` (see `_TASK_FIELDS` and
    `_example_text`). The text is split on single spaces; tokens containing
    an uppercase character are ignored unless they are the very first token.
    Remaining tokens are lowercased, stripped of the characters matched by
    `_STRIP_CHARS_RE`, and dropped if they end up empty or are listed in
    `contractions`.

    Args:
        example: One dataset row (mapping of field name to value). It is
            modified in place.
        task: Task identifier selecting which fields of `example` are read.
        aochildes_vocab: Container of allowed (lowercase) words.
        contractions: Container of cleaned tokens to ignore (e.g. 's', 'nt').

    Returns:
        The same `example`, with two added keys: 'mini_cleaned_text' (list of
        cleaned tokens that were checked) and 'ao_filter' (True when all of
        them are in `aochildes_vocab`; also True when the list is empty).

    Raises:
        ValueError: If `task` is not one of the supported task names.
    """
    tokens = _example_text(example, task).split(' ')

    # Tokens with any uppercase character are skipped rather than checked;
    # only the first token of the text is exempt from this rule.
    kept = [w for i, w in enumerate(tokens) if i == 0 or w == w.lower()]

    stripped = (_STRIP_CHARS_RE.sub('', w.lower()) for w in kept)
    cleaned_text = [w for w in stripped if w != '' and w not in contractions]
    example['mini_cleaned_text'] = cleaned_text

    example['ao_filter'] = all(w in aochildes_vocab for w in cleaned_text)
    return example

In [None]:
# Load the AOChildes word-frequency table and collect its `word` column into
# a set for fast membership tests.
# keep_default_na=False stops pandas from parsing literal words such as
# "na", "nan" or "null" as float NaN, which would silently remove them from
# the vocabulary. NOTE(review): this disables NaN parsing for every column of
# the CSV, not only `word` — confirm nothing else relies on NaN here.
aochildes = pd.read_csv('../data/AOChildes/AOChildes_word_frequency.csv',
                        keep_default_na=False)
aochildes_vocab = set(aochildes.word)

### Filtering on the BLIMP datasets

In [None]:
# BLiMP 'anaphor_gender_agreement': keep the rows that filter_dataset_mini
# flags with ao_filter and store them under ../eval_datasets/filtered.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'anaphor_gender_agreement', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_anaphor_gender_agreement')
fil_ds.save_to_disk(save_path)

In [None]:
# Reload the filtered dataset from disk and save it again wrapped in a
# DatasetDict under the "train" key.
eval_path = "../eval_datasets/filtered/blimp_anaphor_gender_agreement"
save_path = "../eval_datasets/filtered_with_keys/blimp_anaphor_gender_agreement_train"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'anaphor_number_agreement': keep the rows that filter_dataset_mini
# flags with ao_filter and store them under ../eval_datasets/filtered.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'anaphor_number_agreement', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_anaphor_number_agreement')
fil_ds.save_to_disk(save_path)

In [None]:
# Reload the filtered dataset from disk and save it again wrapped in a
# DatasetDict under the "train" key.
eval_path = "../eval_datasets/filtered/blimp_anaphor_number_agreement"
save_path = "../eval_datasets/filtered_with_keys/blimp_anaphor_number_agreement_train"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'ellipsis_n_bar_1': keep the rows that filter_dataset_mini flags with
# ao_filter, store them, then store them again wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'ellipsis_n_bar_1', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_ellipsis_n_bar_1')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_ellipsis_n_bar_1"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_ellipsis_n_bar_1_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'ellipsis_n_bar_2': keep the rows that filter_dataset_mini flags with
# ao_filter, store them, then store them again wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'ellipsis_n_bar_2', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_ellipsis_n_bar_2')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_ellipsis_n_bar_2"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_ellipsis_n_bar_2_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'irregular_past_participle_adjectives': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'irregular_past_participle_adjectives', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_irregular_past_participle_adjectives')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_irregular_past_participle_adjectives"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_irregular_past_participle_adjectives_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'irregular_past_participle_verbs': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'irregular_past_participle_verbs', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_irregular_past_participle_verbs')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_irregular_past_participle_verbs"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_irregular_past_participle_verbs_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'existential_there_quantifiers_1': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'existential_there_quantifiers_1', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_existential_there_quantifiers_1')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_existential_there_quantifiers_1"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_existential_there_quantifiers_1_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'existential_there_quantifiers_2': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'existential_there_quantifiers_2', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_existential_there_quantifiers_2')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_existential_there_quantifiers_2"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_existential_there_quantifiers_2_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'superlative_quantifiers_1': keep the rows that filter_dataset_mini
# flags with ao_filter, store them, then store them again wrapped in a
# DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'superlative_quantifiers_1', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_superlative_quantifiers_1')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_superlative_quantifiers_1"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_superlative_quantifiers_1_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'superlative_quantifiers_2': keep the rows that filter_dataset_mini
# flags with ao_filter, store them, then store them again wrapped in a
# DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'superlative_quantifiers_2', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_superlative_quantifiers_2')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_superlative_quantifiers_2"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_superlative_quantifiers_2_train"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'existential_there_object_raising': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'existential_there_object_raising', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_existential_there_object_raising')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_existential_there_object_raising"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_existential_there_object_raising"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'existential_there_subject_raising': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'existential_there_subject_raising', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_existential_there_subject_raising')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_existential_there_subject_raising"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_existential_there_subject_raising"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'expletive_it_object_raising': keep the rows that filter_dataset_mini
# flags with ao_filter, store them, then store them again wrapped in a
# DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'expletive_it_object_raising', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_expletive_it_object_raising')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_expletive_it_object_raising"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_expletive_it_object_raising"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'tough_vs_raising_1': keep the rows that filter_dataset_mini flags
# with ao_filter, store them, then store them again wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'tough_vs_raising_1', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_tough_vs_raising_1')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_tough_vs_raising_1"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_tough_vs_raising_1"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'distractor_agreement_relative_clause': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'distractor_agreement_relative_clause', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_distractor_agreement_relative_clause')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_distractor_agreement_relative_clause"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_distractor_agreement_relative_clause"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'irregular_plural_subject_verb_agreement_1': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'irregular_plural_subject_verb_agreement_1', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_irregular_plural_subject_verb_agreement_1')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_irregular_plural_subject_verb_agreement_1"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_irregular_plural_subject_verb_agreement_1"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'irregular_plural_subject_verb_agreement_2': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'irregular_plural_subject_verb_agreement_2', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_irregular_plural_subject_verb_agreement_2')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_irregular_plural_subject_verb_agreement_2"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_irregular_plural_subject_verb_agreement_2"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'regular_plural_subject_verb_agreement_1': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'regular_plural_subject_verb_agreement_1', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_regular_plural_subject_verb_agreement_1')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_regular_plural_subject_verb_agreement_1"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_regular_plural_subject_verb_agreement_1"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'regular_plural_subject_verb_agreement_2': keep the rows that
# filter_dataset_mini flags with ao_filter, store them, then store them again
# wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'regular_plural_subject_verb_agreement_2', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_regular_plural_subject_verb_agreement_2')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_regular_plural_subject_verb_agreement_2"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_regular_plural_subject_verb_agreement_2"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'adjunct_island': keep the rows that filter_dataset_mini flags with
# ao_filter, store them, then store them again wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'adjunct_island', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_adjunct_island')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_adjunct_island"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_adjunct_island"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP 'complex_NP_island': keep the rows that filter_dataset_mini flags
# with ao_filter, store them, then store them again wrapped in a DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', 'complex_NP_island', split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# Plain filtered dataset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'blimp_complex_NP_island')
fil_ds.save_to_disk(save_path)

# Reload it and save it again under an explicit "train" key.
eval_path = "../eval_datasets/filtered/blimp_complex_NP_island"
dataset_train = load_from_disk(eval_path)
dataset_new = DatasetDict({"train": dataset_train})
save_path = "../eval_datasets/filtered_with_keys/blimp_complex_NP_island"
dataset_new.save_to_disk(save_path)

In [None]:
# BLiMP subset named by `dataset`: keep the rows that filter_dataset_mini
# flags with ao_filter and store them wrapped in a DatasetDict.
dataset = 'principle_A_reconstruction'
contractions = {'nt', 's', 're', 't', 'd', 'll'}
ds = load_dataset('blimp', dataset, split='train')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'blimp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

# The standalone copy under ../eval_datasets/filtered is not written in this
# cell (no save/reload round trip); only its path is kept in eval_path.
eval_path = os.path.join('../eval_datasets/filtered', dataset)

# The in-memory filtered split is wrapped directly.
dataset_train = fil_ds
dataset_new = DatasetDict({"train": dataset_train})
save_path_common = "../eval_datasets/filtered_with_keys"
save_path = os.path.join(save_path_common, dataset)
dataset_new.save_to_disk(save_path)

### Filtering on other datasets

In [None]:
# SuperGLUE COPA: filter the validation and train splits with
# filter_dataset_mini, save each one, then save both as one DatasetDict.
contractions = {'nt', 's', 're', 't', 'd', 'll'}

# --- validation split ---
ds = load_dataset('super_glue', 'copa', split='validation')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'copa',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'copa')
fil_ds.save_to_disk(save_path)
dataset_test = fil_ds

# --- train split ---
ds = load_dataset('super_glue', 'copa', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'copa',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])

save_path = os.path.join(eval_path, 'copa_train')
fil_ds.save_to_disk(save_path)
dataset_train = fil_ds

# --- both splits under explicit keys ---
dataset_new = DatasetDict({
    "train": dataset_train,
    "validation": dataset_test,
})
save_path = "../eval_datasets/filtered_with_keys/copa"
dataset_new.save_to_disk(save_path)

In [None]:
# Figure out which split is used for evaluation in Eleuther

# GLUE SST-2: filter the validation and train splits with
# filter_dataset_mini, save each one, then save both as one DatasetDict.
# TODO: figure out which split is used for evaluation in Eleuther.
contractions = {'nt', 's', 're', 't', 'd', 'll'}

# --- validation split ---
ds = load_dataset('glue', 'sst2', split='validation')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'sst',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'sst')
fil_ds.save_to_disk(save_path)
dataset_test = fil_ds

# --- train split ---
ds = load_dataset('glue', 'sst2', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'sst',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])

save_path = os.path.join(eval_path, 'sst_train')
fil_ds.save_to_disk(save_path)
dataset_train = fil_ds

# --- both splits under explicit keys ---
dataset_new = DatasetDict({
    "train": dataset_train,
    "validation": dataset_test,
})
save_path = "../eval_datasets/filtered_with_keys/sst"
dataset_new.save_to_disk(save_path)

In [None]:
# Figure out which split is used for evaluation in Eleuther

# GLUE QQP: filter the validation and train splits with filter_dataset_mini,
# save each one, then save both as one DatasetDict.
# TODO: figure out which split is used for evaluation in Eleuther.
contractions = {'nt', 's', 're', 't', 'd', 'll'}

# --- validation split ---
ds = load_dataset('glue', 'qqp', split='validation')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'qqp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'qqp')
fil_ds.save_to_disk(save_path)
dataset_test = fil_ds

# --- train split ---
ds = load_dataset('glue', 'qqp', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'qqp',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])

save_path = os.path.join(eval_path, 'qqp_train')
fil_ds.save_to_disk(save_path)
dataset_train = fil_ds

# --- both splits under explicit keys ---
dataset_new = DatasetDict({
    "train": dataset_train,
    "validation": dataset_test,
})
save_path = "../eval_datasets/filtered_with_keys/qqp"
dataset_new.save_to_disk(save_path)

In [None]:
# Figure out which split is used for evaluation in Eleuther

# GLUE MRPC: filter the validation and train splits with filter_dataset_mini,
# save each one, then save both as one DatasetDict.
# TODO: figure out which split is used for evaluation in Eleuther.
contractions = {'nt', 's', 're', 't', 'd', 'll'}

# --- validation split ---
ds = load_dataset('glue', 'mrpc', split='validation')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'mrpc',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'mrpc')
fil_ds.save_to_disk(save_path)
dataset_test = fil_ds

# --- train split ---
ds = load_dataset('glue', 'mrpc', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'mrpc',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])

save_path = os.path.join(eval_path, 'mrpc_train')
fil_ds.save_to_disk(save_path)
dataset_train = fil_ds

# --- both splits under explicit keys ---
dataset_new = DatasetDict({
    "train": dataset_train,
    "validation": dataset_test,
})
save_path = "../eval_datasets/filtered_with_keys/mrpc"
dataset_new.save_to_disk(save_path)

In [None]:
# Figure out which split is used for evaluation in Eleuther

# PIQA: filter the validation and train splits with filter_dataset_mini,
# save each one, then save both as one DatasetDict.
# TODO: figure out which split is used for evaluation in Eleuther.
contractions = {'nt', 's', 're', 't', 'd', 'll'}

# --- validation split ---
ds = load_dataset('piqa', split='validation')
print(ds.num_rows)

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'piqa',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])
print(fil_ds.num_rows)

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'piqa')
fil_ds.save_to_disk(save_path)
dataset_test = fil_ds

# --- train split ---
ds = load_dataset('piqa', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs={
        'task': 'piqa',
        'aochildes_vocab': aochildes_vocab,
        'contractions': contractions,
    },
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'])

save_path = os.path.join(eval_path, 'piqa_train')
fil_ds.save_to_disk(save_path)
dataset_train = fil_ds

# --- both splits under explicit keys ---
dataset_new = DatasetDict({
    "train": dataset_train,
    "validation": dataset_test,
})
save_path = "../eval_datasets/filtered_with_keys/piqa"
dataset_new.save_to_disk(save_path)

In [None]:
# ---- ARC-Easy test split ----
ds = load_dataset('ai2_arc', 'ARC-Easy', split='test')
contractions = {'nt', 's', 're', 't', 'd', 'll'}
print(ds.num_rows)  # size before filtering

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,  # non-batched: the function receives one example per call
    fn_kwargs=dict(
        task='arc',
        aochildes_vocab=aochildes_vocab,
        contractions=contractions,
    ),
)
# Keep only the rows whose 'ao_filter' field equals True.
fil_ds = ds_map.filter(lambda row: row['ao_filter'] == True)
print(fil_ds.num_rows)  # size after filtering
dataset_test = fil_ds

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'arc_easy')
fil_ds.save_to_disk(save_path)

# ---- ARC-Easy train split: same map -> filter -> save pipeline ----
ds = load_dataset('ai2_arc', 'ARC-Easy', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs=dict(
        task='arc',
        aochildes_vocab=aochildes_vocab,
        contractions=contractions,
    ),
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'] == True)
dataset_train = fil_ds

# NOTE(review): the train split is written to 'arc_train' while the test split
# uses 'arc_easy' -- confirm this name cannot collide with another ARC subset.
eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'arc_train')
fil_ds.save_to_disk(save_path)

# ---- Bundle both filtered splits; the eval split is keyed "test" here ----
dataset_new = DatasetDict({"train": dataset_train, "test": dataset_test})
save_path = "../eval_datasets/filtered_with_keys/arc_easy"
dataset_new.save_to_disk(save_path)

In [None]:
# ---- RTE validation split ----
ds = load_dataset('glue', 'rte', split='validation')
contractions = {'nt', 's', 're', 't', 'd', 'll'}
print(ds.num_rows)  # size before filtering

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,  # non-batched: the function receives one example per call
    fn_kwargs=dict(
        task='rte',
        aochildes_vocab=aochildes_vocab,
        contractions=contractions,
    ),
)
# Keep only the rows whose 'ao_filter' field equals True.
fil_ds = ds_map.filter(lambda row: row['ao_filter'] == True)
print(fil_ds.num_rows)  # size after filtering
dataset_test = fil_ds

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'rte')
fil_ds.save_to_disk(save_path)

# ---- RTE train split: same map -> filter -> save pipeline ----
ds = load_dataset('glue', 'rte', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs=dict(
        task='rte',
        aochildes_vocab=aochildes_vocab,
        contractions=contractions,
    ),
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'] == True)
dataset_train = fil_ds

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'rte_train')
fil_ds.save_to_disk(save_path)

# ---- Bundle both filtered splits under their split names ----
dataset_new = DatasetDict({"train": dataset_train, "validation": dataset_test})
save_path = "../eval_datasets/filtered_with_keys/rte"
dataset_new.save_to_disk(save_path)

In [None]:
# ---- MNLI matched validation split ----
ds = load_dataset('glue', 'mnli', split='validation_matched')
contractions = {'nt', 's', 're', 't', 'd', 'll'}
print(ds.num_rows)  # size before filtering

ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,  # non-batched: the function receives one example per call
    fn_kwargs=dict(
        task='mnli',
        aochildes_vocab=aochildes_vocab,
        contractions=contractions,
    ),
)
# Keep only the rows whose 'ao_filter' field equals True.
fil_ds = ds_map.filter(lambda row: row['ao_filter'] == True)
print(fil_ds.num_rows)  # size after filtering
dataset_test = fil_ds

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'mnli')
fil_ds.save_to_disk(save_path)

# ---- MNLI train split: same map -> filter -> save pipeline ----
ds = load_dataset('glue', 'mnli', split='train')
ds_map = ds.map(
    filter_dataset_mini,
    num_proc=8,
    fn_kwargs=dict(
        task='mnli',
        aochildes_vocab=aochildes_vocab,
        contractions=contractions,
    ),
)
fil_ds = ds_map.filter(lambda row: row['ao_filter'] == True)
dataset_train = fil_ds

eval_path = '../eval_datasets/filtered'
save_path = os.path.join(eval_path, 'mnli_train')
fil_ds.save_to_disk(save_path)

# ---- Bundle both filtered splits; the eval split keeps its original name ----
dataset_new = DatasetDict({"train": dataset_train, "validation_matched": dataset_test})
save_path = "../eval_datasets/filtered_with_keys/mnli"
dataset_new.save_to_disk(save_path)