In [1]:
import os

import spacy
import tqdm
import pandas as pd
import numpy as np
nlp = spacy.load('en_core_web_lg')

In [2]:
BLACKLIST = ["he", "she", "it", "they", "who", "her", "we", "them",
             "him", "his", 'their', "hers", "his", "theirs", "i", "me", "you",
             "us"]

def get_noun_chunks(sentence):
    doc = nlp(sentence)
    nc = list(set(filter(lambda n: n.text.lower() not in BLACKLIST, doc.noun_chunks)))
    return nc

def generate_alternates(sent1, sent2, exclude_existing=True, exclude_trivial=True):
    """
    exclude_existing: if the replacement is already in the sentence, do not repeat
    exclude_trivial: if the root word for sent2 is not in sent1, exclude it. This has
    the side effect of removing reformulations and synonyms, which is not ideal...
    """
    sent2s = []
    replacements = get_noun_chunks(sent1)
    noun_chunks = get_noun_chunks(sent2)
    if exclude_trivial:
        noun_chunks = list(filter(lambda n: n.root.text.lower() in sent1.lower(),
                                  noun_chunks))
    for nc in noun_chunks:
        for repl in replacements:
            if repl.root.text.lower() not in nc.text.lower() and nc.root.text.lower() not in repl.text.lower():
                if exclude_existing:
                    if repl.text.lower() not in sent2.lower():
                        sent2s.append(sent2.replace(nc.text, repl.text))
                else:
                    sent2s.append(sent2.replace(nc.text, repl.text))
    return sent2s

def generate_dataset(sent1s, sent2s):
    """
    Given true sentences sent1s and sent2s, constructs a dataset of True and False examples
    """
    all_sent1 = []
    all_sent2 = []
    label = []
    fake_created = []
    for sent1, sent2 in tqdm.tqdm_notebook(zip(sent1s, sent2s), total=len(sent1s)):
        all_sent1.append(sent1)
        all_sent2.append(sent2)
        label.append(1)
        sent2s_false = generate_alternates(sent1, sent2)
        n_alternates = len(sent2s_false)
        fake_created.append(n_alternates)
        all_sent1.extend([sent1] * n_alternates)
        all_sent2.extend(sent2s_false)
        label.extend([0] * n_alternates)
    print(len(fake_created))
    print(sum(fake_created)/len(fake_created))
    return all_sent1, all_sent2, label, fake_created

In [101]:
def construct_sent1(df):
    sent1s = []
    for sents in zip(df.InputSentence1, df.InputSentence2, df.InputSentence3, df.InputSentence4):
        sent1s.append(" ".join(sents))
    return sent1s

def construct_sent2(df):
    sent2s = []
    for ans1, ans2, right in zip(df.RandomFifthSentenceQuiz1, df.RandomFifthSentenceQuiz2, df.AnswerRightEnding):
        if right == 1:
            sent2s.append(ans1)
        else:
            sent2s.append(ans2)
    return sent2s

def generate_split(df, save_path):
    df['sent1'] = construct_sent1(df)
    df['sent2'] = construct_sent2(df)
    all_s1, all_s2, labels, _ = generate_dataset(df['sent1'], df['sent2'])
    data = pd.DataFrame({'sentence1': all_s1, 'sentence2': all_s2, 'label': labels})
    # Shuffle rows to avoid weird training artefacts
    data = data.sample(frac=1)
    print('Data shape', data.shape)
    print('Saving to ', save_path)
#     data.to_csv(save_path, sep='\t', index=False)
    return data

    
FOLDER = '/scratch/tjf324/data/glue_auto_dl/cloze/'
TRAIN_FILE = os.path.join(FOLDER, 'train.tsv')
VAL_FILE = os.path.join(FOLDER, 'valid.tsv')
TEST_FILE = os.path.join(FOLDER, 'test.tsv')

ORIG_VAL = '/scratch/tjf324/data/glue_auto_dl/cloze/cloze_test_val__spring2016 - cloze_test_ALL_val.tsv'
ORIG_TEST = '/scratch/tjf324/data/glue_auto_dl/cloze/cloze_test_test__spring2016 - cloze_test_ALL_test.tsv'

train = pd.read_csv(ORIG_VAL, sep='\t')
val_test = pd.read_csv(ORIG_TEST, sep='\t')

end_val = len(val_test) // 2

gen_train = generate_split(train, TRAIN_FILE)
gen_val = generate_split(val_test.iloc[:end_val].copy(), VAL_FILE)
gen_test = generate_split(val_test.iloc[end_val:].copy(), TEST_FILE)

HBox(children=(IntProgress(value=0, max=1871), HTML(value='')))

1871
6.183324425440941
Data shape (13440, 3)
Saving to  /scratch/tjf324/data/glue_auto_dl/cloze/train.tsv


HBox(children=(IntProgress(value=0, max=935), HTML(value='')))

935
6.096256684491979
Data shape (6635, 3)
Saving to  /scratch/tjf324/data/glue_auto_dl/cloze/valid.tsv


HBox(children=(IntProgress(value=0, max=936), HTML(value='')))

936
6.25
Data shape (6786, 3)
Saving to  /scratch/tjf324/data/glue_auto_dl/cloze/test.tsv


## Investigate preds on WNLI

In [8]:
VAL_WNLI_PREDS = "/scratch/tjf324/pytorch-pretrained-BERT/cloze__wnli/val_preds.csv"
TEST_WNLI_PREDS = "/scratch/tjf324/pytorch-pretrained-BERT/cloze__wnli/test_preds.csv"
val_wnli_preds = pd.read_csv(VAL_WNLI_PREDS, header=None)
test_wnli_preds = pd.read_csv(TEST_WNLI_PREDS, header=None)

print((val_wnli_preds.iloc[:, 1] > val_wnli_preds.iloc[:, 0]).mean())
print((test_wnli_preds.iloc[:, 1] > test_wnli_preds.iloc[:, 0]).mean())

0.2535211267605634
0.4178082191780822


In [13]:
VAL_CLOZE_PREDS = "/scratch/tjf324/pytorch-pretrained-BERT/cloze/val_preds.csv"
val_cloze_preds = pd.read_csv(VAL_CLOZE_PREDS, header=None)
print((val_cloze_preds.iloc[:, 1] > val_cloze_preds.iloc[:, 0]).mean())

0.14061793519216279


In [9]:
CLOZE_VAL = "/scratch/tjf324/data/glue_auto_dl/cloze/valid.tsv"
val_csv = pd.read_csv(CLOZE_VAL, sep='\t')

In [19]:
val_csv['pred'] = (val_cloze_preds.iloc[:, 1] > val_cloze_preds.iloc[:, 0]).astype(int)

In [23]:
mistakes = val_csv[val_csv.label != val_csv.pred]
for _, (s1, s2, l, p) in mistakes.iterrows():
    if l == 0:
        print(s1, s2, l, p)
        print()

Amy and Abby were identical twins. When they were 8, they decided to play a trick on their mom. They started responding to each other's names. They wore each other's clothes. Abby could still tell them apart, though. 0 1

I was really excited about the Jimmy Buffett concert. My friend Jeff was going to come pick me up at 6 in the evening. But I was feeling sleepy, and took a little nap about 4 o'clock. When I finally woke up it was nearly 8 o'clock at night! I was so disappointed that I missed My friend. 0 1

Lee was hiking in the woods all day. When he came home, he felt a weird itch on his arm. Looking down, he saw a tick stuck to his forearm! Lee quickly pulled it off of him. Lee's arm started to bleed where his arm had been. 0 1

Katie saw a woman begging on the street. Others passed without paying any attention, but Katie stopped. She felt very bad for the beggar woman. She fumbled in her purse and handed the woman a $10 bill. the beggar woman walked away smiling after helping the

In [24]:
len(mistakes)


288

In [26]:
val_csv[val_csv.sentence1 == "Emma had been working as a dishwasher. Her hands cracked and bled from the hot soapy water. Then her mom noticed and concocted a special salve for her. Emma used the salve every night before bed."]

Unnamed: 0,sentence1,sentence2,label,pred
104,Emma had been working as a dishwasher. Her han...,bed got better.,0,0
852,Emma had been working as a dishwasher. Her han...,Her hands got better.,1,0
984,Emma had been working as a dishwasher. Her han...,Emma got better.,0,1
1336,Emma had been working as a dishwasher. Her han...,her mom got better.,0,0
2215,Emma had been working as a dishwasher. Her han...,Emma got better.,0,1
3206,Emma had been working as a dishwasher. Her han...,the salve got better.,0,0
5948,Emma had been working as a dishwasher. Her han...,the hot soapy water got better.,0,0
6256,Emma had been working as a dishwasher. Her han...,a dishwasher got better.,0,0
6584,Emma had been working as a dishwasher. Her han...,a special salve got better.,0,0
