## Make a WNLI Multiple Choice dataset

In [130]:
import numpy as np
import pandas as pd
import tqdm
import spacy
from sklearn.metrics import confusion_matrix
# spaCy pipeline used by get_noun_chunks() to parse sentences into noun chunks.
nlp = spacy.load('en_core_web_lg')
# Let pandas display long sentence cells (up to 400 characters) without truncation.
pd.set_option("display.max_colwidth", 400)
# Number of candidate continuations per multiple-choice row; passed below both as
# generate_dataset(reg_size=...) and as num_choices of the CSV builders.
NUM_CHOICES = 4

In [201]:
# Lower-cased pronouns: a noun chunk whose whole text is one of these is ignored by
# get_noun_chunks(), so pronouns are neither replaced nor used as replacements.
BLACKLIST = ["he", "she", "it", "they", "who", "her", "we", "them",
             "him", "his", "their", "hers", "theirs", "i", "me", "you",
             "us"]

def get_noun_chunks(sentence):
    """
    Parse `sentence` with the module-level spaCy pipeline and return its noun chunks
    as a list, leaving out any chunk whose lower-cased text is in BLACKLIST.
    The chunks go through a set, so the order of the returned list is arbitrary.
    """
    kept = set()
    for chunk in nlp(sentence).noun_chunks:
        if chunk.text.lower() in BLACKLIST:
            continue
        kept.add(chunk)
    return list(kept)

def generate_alternates(sent1, sent2, exclude_existing=False, exclude_trivial=False,
                        replace_root_mode="both"):
    """
    Build false variants of sent2 by swapping one of its noun chunks for a noun
    chunk taken from sent1.

    sent1: sentence supplying the replacement noun chunks
    sent2: sentence whose noun chunks get replaced
    exclude_existing: if the replacement is already in the sentence, do not repeat
    exclude_trivial: if the root word for sent2 is not in sent1, exclude it. This has
    the side effect of removing reformulations and synonyms, which is not ideal...
    replace_root_mode: "yes" (swap only the root word of the chunk), "no" (swap the
    whole chunk) or "both" (generate both variants)

    Returns a sorted list of distinct, lower-cased sentences, none of which equals
    sent2.lower(). Raises ValueError for an unknown replace_root_mode.
    """
    # Validate up front so a bad mode is reported even when no pair is generated.
    if replace_root_mode not in ("yes", "no", "both"):
        raise ValueError(
            "replace_root_mode must be 'yes', 'no' or 'both', got {!r}".format(replace_root_mode))

    sent1_lower = sent1.lower()
    sent2_lower = sent2.lower()
    replacements = get_noun_chunks(sent1)
    noun_chunks = get_noun_chunks(sent2)
    if exclude_trivial:
        noun_chunks = [nc for nc in noun_chunks if nc.root.text.lower() in sent1_lower]

    def _splice(start, end, new_text):
        # Replace exactly the characters [start, end) of sent2. str.replace(..., 1)
        # would instead hit the first textual match, which can be an earlier
        # substring (e.g. root "art" inside "party") rather than the chunk itself.
        return (sent2[:start] + new_text + sent2[end:]).lower()

    candidates = set()
    for nc in noun_chunks:
        nc_text = nc.text.lower()
        nc_root = nc.root.text.lower()
        # Character offsets of the chunk's root token within sent2.
        root_start = nc.root.idx
        root_end = root_start + len(nc.root.text)
        for repl in replacements:
            repl_text = repl.text.lower()
            repl_root = repl.root.text.lower()
            # Skip swaps between chunks that (loosely, by substring) share a root.
            if repl_root in nc_text or nc_root in repl_text:
                continue
            if exclude_existing and (repl_text in sent2_lower or repl_root in sent2_lower):
                continue
            if replace_root_mode in ("no", "both"):
                candidates.add(_splice(nc.start_char, nc.end_char, repl.text))
            if replace_root_mode in ("yes", "both"):
                candidates.add(_splice(root_start, root_end, repl.root.text))

    candidates.discard(sent2_lower)
    # Sorted so the result does not depend on set iteration order.
    return sorted(candidates)

In [370]:
def generate_dataset(sent1s, sent2s, reg_size=None,
                     exclude_existing=False, exclude_trivial=False, replace_root_mode="both",
                     skip_empty=False, rng=None):
    """
    Given true sentences sent1s and sent2s, constructs a dataset of True and False examples.

    For every (sent1, sent2) pair the original pair is emitted first with label 1,
    followed by the generated alternates for sent2 with label 0.

    reg_size: if given, every pair gets exactly reg_size - 1 alternates, either by
        sampling a subset of the generated ones or by padding with nonsense filler
        sentences.
    exclude_existing, exclude_trivial, replace_root_mode: forwarded to generate_alternates.
    skip_empty: drop pairs for which no alternate could be generated.
    rng: optional object exposing choice(a, size, replace=...) (e.g. a
        numpy.random.RandomState or Generator) for reproducible sampling; the global
        numpy.random state is used when None.

    Returns (all_sent1, all_sent2, label, fake_created); the first three are flat
    lists of equal length, fake_created holds the number of alternates per kept pair.
    Raises ValueError when reg_size asks for more padding than there are fillers.
    """
    weird_sentences = ["Glue Glue Glue", "Leaderboard Leaderboard Leaderboard", "Meuh Meuh Meuh",
                       "Beh Beh Beh", "LOL LOL LOL", "DOIU DOIU DOIU", "RRR RRR RRR"]
    chooser = np.random if rng is None else rng

    all_sent1 = []
    all_sent2 = []
    label = []
    fake_created = []
    cnt = 0  # number of pairs that needed the "Glue Glue Glue" fallback
    for sent1, sent2 in tqdm.tqdm_notebook(zip(sent1s, sent2s), total=len(sent1s)):
        sent2s_false = generate_alternates(sent1, sent2, exclude_existing=exclude_existing,
                                           exclude_trivial=exclude_trivial, replace_root_mode=replace_root_mode)
        if skip_empty and not sent2s_false:
            continue
        all_sent1.append(sent1)
        all_sent2.append(sent2)
        label.append(1)

        if not sent2s_false:
            cnt += 1
            sent2s_false = ["Glue Glue Glue"]

        if reg_size is not None:
            n_wanted = reg_size - 1
            if len(sent2s_false) > n_wanted:
                sent2s_false = chooser.choice(sent2s_false, n_wanted, replace=False).tolist()
            elif len(sent2s_false) < n_wanted:
                # Pad only with fillers not already present, so the fallback
                # "Glue Glue Glue" can never appear twice among the choices.
                fillers = [w for w in weird_sentences if w not in sent2s_false]
                n_missing = n_wanted - len(sent2s_false)
                if n_missing > len(fillers):
                    raise ValueError(
                        "reg_size={} needs {} filler sentences but only {} are available".format(
                            reg_size, n_missing, len(fillers)))
                # replace=False keeps the padded choices distinct.
                sent2s_false = list(sent2s_false) + chooser.choice(
                    fillers, n_missing, replace=False).tolist()

        n_alternates = len(sent2s_false)
        fake_created.append(n_alternates)
        all_sent1.extend([sent1] * n_alternates)
        all_sent2.extend(sent2s_false)
        label.extend([0] * n_alternates)

    # Guard against an empty input (or everything skipped) to avoid dividing by zero.
    mean_fakes = sum(fake_created) / len(fake_created) if fake_created else 0.0
    print("Number of sentence 1s:", len(fake_created))
    print("Mean number of fakes:", mean_fakes)
    print("Counter of glu", cnt)
    return all_sent1, all_sent2, label, fake_created

In [203]:
def _group_choices(s1s, s2s, num_choices):
    """
    Collapse flat, parallel (s1, s2) lists into one row per example:
    [start, cont_0, ..., cont_{num_choices - 1}], where `start` is the s1 of the
    first pair of each consecutive group of num_choices pairs.
    """
    s1s = list(s1s)
    s2s = list(s2s)
    if len(s1s) != len(s2s):
        raise ValueError("s1s and s2s must have the same length, got {} and {}".format(
            len(s1s), len(s2s)))
    if len(s1s) % num_choices:
        # A ragged tail would otherwise produce a silently malformed last row.
        raise ValueError("number of pairs ({}) is not a multiple of num_choices ({})".format(
            len(s1s), num_choices))
    rows = [[s1s[i]] + s2s[i:i + num_choices] for i in range(0, len(s1s), num_choices)]
    names = ["start"] + [f"cont_{i}" for i in range(num_choices)]
    return pd.DataFrame(rows, columns=names)


def generate_train_for_csv(s1s, s2s, labels, num_choices=NUM_CHOICES):
    """
    In training every first sentence (0th) is true, so the "label" column is 0 for
    every row. `labels` is accepted for signature parity with generate_val_for_csv
    but is not used.
    """
    data = _group_choices(s1s, s2s, num_choices)
    data["label"] = 0
    return data


def generate_val_for_csv(s1s, s2s, labels, num_choices=NUM_CHOICES):
    """
    Labels has len s1s // num_choices: one label per grouped row, assigned by position.
    """
    assert len(labels) == len(s1s) // num_choices, "expected one label per group of num_choices pairs"
    data = _group_choices(s1s, s2s, num_choices)
    # np.asarray drops any pandas index so labels are assigned positionally; a Series
    # with a non-default index would otherwise be aligned on index and yield NaNs.
    data["label"] = np.asarray(labels)
    return data

### Create WNLI Recast data

In [208]:
WNLI_TRAIN_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/train.tsv"
WNLI_DEV_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/dev.tsv"
WNLI_TEST_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/test.tsv"

train = pd.read_csv(WNLI_TRAIN_PATH, sep='\t')
val = pd.read_csv(WNLI_DEV_PATH, sep='\t')
test = pd.read_csv(WNLI_TEST_PATH, sep='\t')

# Train: keep only pairs labelled 1, so the original sentence2 (always emitted first,
# i.e. cont_0) is the correct choice for every row.
train_true = train[train.label == 1]
all_s1_train, all_s2_train, labels_train, _ = generate_dataset(train_true['sentence1'],
                                                               train_true['sentence2'],
                                                               reg_size=NUM_CHOICES)

train_recast = generate_train_for_csv(all_s1_train, all_s2_train, labels_train, NUM_CHOICES)
train_recast.to_csv('/scratch/tjf324/data/glue_auto_dl/WNLI_RECAST/train.tsv', sep='\t')

# Dev: every example is kept (no skip_empty), so recast rows line up one-to-one with
# `val`. The recast label is 0 (cont_0) when the WNLI label is 1, and 1 otherwise.
all_s1_val, all_s2_val, labels_val, _ = generate_dataset(val['sentence1'], val['sentence2'],
                                                         reg_size=NUM_CHOICES)
val_recast = generate_val_for_csv(all_s1_val, all_s2_val, (val.label == 0).astype(int), NUM_CHOICES)
val_recast.to_csv('/scratch/tjf324/data/glue_auto_dl/WNLI_RECAST/dev.tsv', sep='\t')

# Test: written with explicit all-zero placeholder labels (previously obtained by
# comparing every sentence to the string "0").
all_s1_test, all_s2_test, labels_test, _ = generate_dataset(test['sentence1'], test['sentence2'],
                                                            reg_size=NUM_CHOICES)
test_recast = generate_val_for_csv(all_s1_test, all_s2_test, np.zeros(len(test), dtype=int), NUM_CHOICES)
test_recast.to_csv('/scratch/tjf324/data/glue_auto_dl/WNLI_RECAST/test.tsv', sep='\t')

HBox(children=(IntProgress(value=0, max=312), HTML(value='')))

Number of sentence 1s: 312
Mean number of fakes: 3.0
Counter of glu 0


HBox(children=(IntProgress(value=0, max=71), HTML(value='')))

No alternatives, setting sent2s_false to 'Glue Glue Glue'
Number of sentence 1s: 71
Mean number of fakes: 3.0
Counter of glu 1


HBox(children=(IntProgress(value=0, max=146), HTML(value='')))

Number of sentence 1s: 146
Mean number of fakes: 3.0
Counter of glu 0


### Create MNLI Recast data

In [359]:
MNLI_TRAIN_PATH = "/scratch/tjf324/data/glue_auto_dl/MNLI/train.tsv"
MNLI_DEV_PATH = "/scratch/tjf324/data/glue_auto_dl/MNLI/dev_matched.tsv"

# Read both splits, silently dropping malformed rows (lazy), then keep only the
# first 2000 / 500 rows whose gold label is "entailment".
train_mnli = pd.read_csv(MNLI_TRAIN_PATH, sep='\t', error_bad_lines=False)
train_mnli = train_mnli[train_mnli.gold_label == "entailment"].head(2000)

val_mnli = pd.read_csv(MNLI_DEV_PATH, sep='\t', error_bad_lines=False)
val_mnli = val_mnli[val_mnli.gold_label == "entailment"].head(500)

b'Skipping line 24810: expected 12 fields, saw 13\nSkipping line 33961: expected 12 fields, saw 13\n'
b'Skipping line 75911: expected 12 fields, saw 13\nSkipping line 100114: expected 12 fields, saw 13\n'
b'Skipping line 150638: expected 12 fields, saw 13\nSkipping line 158834: expected 12 fields, saw 13\nSkipping line 173104: expected 12 fields, saw 13\nSkipping line 178252: expected 12 fields, saw 13\n'
b'Skipping line 221951: expected 12 fields, saw 13\n'
b'Skipping line 286845: expected 12 fields, saw 13\nSkipping line 314110: expected 12 fields, saw 13\n'


In [371]:
# Recast the MNLI entailment pairs: stricter alternate filtering than for WNLI, and
# pairs for which no alternate can be generated are dropped (skip_empty).
mnli_all_s1_train, mnli_all_s2_train, mnli_labels_train, _ = generate_dataset(
    train_mnli['sentence1'],
    train_mnli['sentence2'],
    reg_size=NUM_CHOICES,
    exclude_existing=True,
    exclude_trivial=True,
    skip_empty=True,
)

mnli_train_recast = generate_train_for_csv(
    mnli_all_s1_train, mnli_all_s2_train, mnli_labels_train, num_choices=NUM_CHOICES
)
mnli_train_recast.to_csv('/scratch/tjf324/data/glue_auto_dl/MNLI_RECAST/train.tsv', sep='\t')

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))

Number of sentence 1s: 1254
Mean number of fakes: 3.0
Counter of glu 0


In [372]:


# Same settings as the MNLI train split. Every kept pair is an entailment, so the
# train-style builder (label 0 for every row) is used for the dev file as well.
mnli_all_s1_val, mnli_all_s2_val, mnli_labels_val, _ = generate_dataset(
    val_mnli['sentence1'],
    val_mnli['sentence2'],
    reg_size=NUM_CHOICES,
    exclude_existing=True,
    exclude_trivial=True,
    skip_empty=True,
)
mnli_val_recast = generate_train_for_csv(
    mnli_all_s1_val, mnli_all_s2_val, mnli_labels_val, num_choices=NUM_CHOICES
)
mnli_val_recast.to_csv('/scratch/tjf324/data/glue_auto_dl/MNLI_RECAST/dev.tsv', sep='\t')

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

Number of sentence 1s: 302
Mean number of fakes: 3.0
Counter of glu 0


In [374]:
# Stage 1: deletes any previous mnli_recast/ output directory, then runs glue/train.py
# on the mnli_recast task with bert-large-uncased, loading weights from stilts/mnli.p
# (presumably an MNLI-fine-tuned checkpoint -- confirm) and saving into mnli_recast/.
!cd /scratch/tjf324/pytorch-pretrained-BERT/ && rm -rf /scratch/tjf324/pytorch-pretrained-BERT/mnli_recast/ \
&& GLUE_DIR=/scratch/tjf324/data/glue_auto_dl/  PYTORCH_PRETRAINED_BERT_CACHE=/scratch/tjf324/models/bert \
BERT_ALL_DIR=/scratch/tjf324/models/bert/ python glue/train.py \
    --task_name mnli_recast \
    --do_train --do_val --do_test --do_val_history \
    --do_save \
    --do_lower_case \
    --bert_model bert-large-uncased \
    --bert_load_path /scratch/tjf324/models/bert/stilts/mnli.p \
    --bert_load_mode model_only \
    --bert_save_mode model_all \
    --train_batch_size 8 \
    --learning_rate 2e-5 \
    --output_dir mnli_recast

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
  data_dir: None
  bert_model: bert-large-uncased
  task_name: mnli_recast
  output_dir: mnli_recast
  bert_load_path: /scratch/tjf324/models/bert/stilts/mnli.p
  bert_load_mode: model_only
  bert_load_args: None
  bert_config_json_path: None
  bert_vocab_path: None
  bert_save_mode: model_all
  max_seq_length: 128
  do_save: True
  do_train: True
  do_val: True
  do_test: True
  do_val_history: True
  train_save_every: None
  do_lower_case: True
  train_batch_size: 8
  eval_batch_size: 32
  learning_rate: 2e-05
  num_train_epochs: 3.0
  warmup_proportion: 0.1
  no_cuda: False
  local_rank: -1
  seed: -1
  gradient_accumulation_steps: 1
  fp16: False
  loss_scale: 0
  print_trainable_params: False
  not_verbose: False
  force_overwrite: False
03/07/2019 13:15:15 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False
03/07/2019 13:15:15 - INFO - __main__ -

Training:   4%|█▎                               | 6/157 [00:09<04:06,  1.64s/it][A
Training:   4%|█▍                               | 7/157 [00:11<04:05,  1.64s/it][A
Training:   5%|█▋                               | 8/157 [00:13<04:03,  1.64s/it][A
Training:   6%|█▉                               | 9/157 [00:14<04:01,  1.63s/it][A
Training:   6%|██                              | 10/157 [00:16<04:00,  1.64s/it][A
Training:   7%|██▏                             | 11/157 [00:17<03:59,  1.64s/it][A
Training:   8%|██▍                             | 12/157 [00:19<03:57,  1.64s/it][A
Training:   8%|██▋                             | 13/157 [00:21<03:55,  1.64s/it][A
Training:   9%|██▊                             | 14/157 [00:22<03:54,  1.64s/it][A
Training:  10%|███                             | 15/157 [00:24<03:52,  1.64s/it][A
Training:  10%|███▎                            | 16/157 [00:26<03:50,  1.64s/it][A
Training:  11%|███▍                            | 17/157 [00:27<03:49,  1.64s

Training:  20%|██████▎                         | 31/157 [00:50<03:26,  1.64s/it][A
Training:  20%|██████▌                         | 32/157 [00:52<03:24,  1.64s/it][A
Training:  21%|██████▋                         | 33/157 [00:54<03:23,  1.64s/it][A
Training:  22%|██████▉                         | 34/157 [00:55<03:21,  1.64s/it][A
Training:  22%|███████▏                        | 35/157 [00:57<03:20,  1.64s/it][A
Training:  23%|███████▎                        | 36/157 [00:59<03:18,  1.64s/it][A
Training:  24%|███████▌                        | 37/157 [01:00<03:16,  1.64s/it][A
Training:  24%|███████▋                        | 38/157 [01:02<03:15,  1.64s/it][A
Training:  25%|███████▉                        | 39/157 [01:03<03:13,  1.64s/it][A
Training:  25%|████████▏                       | 40/157 [01:05<03:11,  1.64s/it][A
Training:  26%|████████▎                       | 41/157 [01:07<03:10,  1.64s/it][A
Training:  27%|████████▌                       | 42/157 [01:08<03:08,  1.64s

In [375]:
# Stage 2: deletes any previous wnli_recast/ output directory, then runs glue/train.py
# on the wnli_recast task, loading weights from mnli_recast/all_state.p (presumably
# written by the mnli_recast training run -- confirm) and saving into wnli_recast/.
!cd /scratch/tjf324/pytorch-pretrained-BERT/ && rm -rf /scratch/tjf324/pytorch-pretrained-BERT/wnli_recast/ \
&& GLUE_DIR=/scratch/tjf324/data/glue_auto_dl/  PYTORCH_PRETRAINED_BERT_CACHE=/scratch/tjf324/models/bert \
BERT_ALL_DIR=/scratch/tjf324/models/bert/ python glue/train.py \
    --task_name wnli_recast \
    --do_train --do_val --do_test --do_val_history \
    --do_save \
    --do_lower_case \
    --bert_model bert-large-uncased \
    --bert_load_path /scratch/tjf324/pytorch-pretrained-BERT/mnli_recast/all_state.p \
    --bert_load_mode state_model_only \
    --bert_save_mode model_all \
    --train_batch_size 8 \
    --learning_rate 2e-5 \
    --output_dir wnli_recast

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
  data_dir: None
  bert_model: bert-large-uncased
  task_name: wnli_recast
  output_dir: wnli_recast
  bert_load_path: /scratch/tjf324/pytorch-pretrained-BERT/mnli_recast/all_state.p
  bert_load_mode: state_model_only
  bert_load_args: None
  bert_config_json_path: None
  bert_vocab_path: None
  bert_save_mode: model_all
  max_seq_length: 128
  do_save: True
  do_train: True
  do_val: True
  do_test: True
  do_val_history: True
  train_save_every: None
  do_lower_case: True
  train_batch_size: 8
  eval_batch_size: 32
  learning_rate: 2e-05
  num_train_epochs: 3.0
  warmup_proportion: 0.1
  no_cuda: False
  local_rank: -1
  seed: -1
  gradient_accumulation_steps: 1
  fp16: False
  loss_scale: 0
  print_trainable_params: False
  not_verbose: False
  force_overwrite: False
03/07/2019 13:34:38 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False
03/07/2019 

### Interpretation, results

In [376]:
VAL_PREDS = '/scratch/tjf324/pytorch-pretrained-BERT/wnli_recast/val_preds.csv'

# Assumes one row per dev example and one score column per choice -- TODO confirm
# against the trainer's output format. argmax picks the index of the chosen choice.
eval_preds = pd.read_csv(VAL_PREDS, header=None)
# .values replaces DataFrame.as_matrix(), which is deprecated and removed in pandas 1.0.
eval_preds = eval_preds.values.argmax(axis=1)


# Choosing cont_0 (the original sentence2) is mapped back to WNLI label 1, anything
# else to label 0.
preds = (eval_preds == 0).astype(int)
precise_preds = eval_preds

print("Mean pred", preds.mean()*100)
print("Accuracy", (preds == val.label).mean()*100)

val["predicted"] = preds
val["precise_preds"] = precise_preds

Mean pred 69.01408450704226
Accuracy 43.66197183098591


  after removing the cwd from sys.path.


In [328]:
# Rows are the gold WNLI labels, columns the labels derived from the chosen choice.
confusion_matrix(y_true=val.label, y_pred=preds)

array([[12, 28],
       [11, 20]])

In [292]:
# for _, (ix, s1, s2, l, p, pp) in val.iterrows():
#     if l != p and l==1:
#         print(f"Orig: {s1}, prop: {s2}, y={l}, pred={p}, chosen={val_recast.iloc[_]['cont_{}'.format(pp)]}")
# #         print(val_recast.iloc[_][['cont_{}'.format(i) for i in range(3)]])
#         print("@@@")