In [1]:
import random
import sys
from matplotlib import pyplot as plt
sys.path.append("/scratch/tjf324/pytorch-pretrained-BERT/")
from pytorch_pretrained_bert import modeling, tokenization
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
import logging
import torch
import copy
import numpy as np
import pandas as pd
import tqdm
import spacy
nlp = spacy.load('en_core_web_lg')


from language_modeling.runners import (
    tokenize_example, InputExample,
    convert_example_to_features, features_to_data,
)

WNLI_TRAIN_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/train.tsv"
WNLI_DEV_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/dev.tsv"
WNLI_TEST_PATH = "/scratch/tjf324/data/glue_auto_dl/WNLI/test.tsv"



def get_pos(sent):
    return [token.pos_ for token in nlp(sent)]

def is_noun(pos):
    return pos in ["PRON", "PROPN", "NOUN"]

def get_pos_dict(sent):
    return {
        token.text.lower(): token.pos_
        for token in nlp(sent)
    }

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
device = torch.device("cuda:0")
bert_model_name = "bert-large-uncased"
max_sequence_length = 128
MASK = "[MASK]"

In [3]:
model = modeling.BertForPreTraining.from_pretrained(bert_model_name)
model.to(device);
model.eval();

In [4]:
tokenizer = tokenization.BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)

In [5]:
train_df = pd.read_csv(WNLI_TRAIN_PATH, sep="\t")
val_df = pd.read_csv(WNLI_DEV_PATH, sep="\t")

test_df = pd.read_csv(WNLI_TEST_PATH, sep="\t")

In [6]:
print((train_df["label"]==0).mean())
print((val_df["label"]==0).mean())

0.5086614173228347
0.5633802816901409


In [7]:
def filter_pos_dict(pos_dict):
    BLACKLIST = ["he", "she", "it", "they", "who", "her", "we", "them",
                 "him", "his", 'their', "hers", "his", "theirs", "i", "me", "you",
                 "us", ]
    return {noun: pos 
            for noun, pos in pos_dict.items()
            if is_noun(pos) and noun not in BLACKLIST}

def get_token_groups(words):
    return [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w)) for w in words]

def get_pos_dict_and_token_groups(text):
    pos_dict = filter_pos_dict(get_pos_dict(text))
    tok_groups = get_token_groups(pos_dict.keys())
    return pos_dict, tok_groups

def get_length_info(tok_a, tok_b):
    sent_a_lengths = set(len(a_i) for a_i in tok_a)
    sent_b_lengths = set(len(b_i) for b_i in tok_b)
    sent_a_one_length = len(sent_a_lengths) == 1
    sent_b_one_length = len(sent_b_lengths) == 1
    same_length = sent_a_lengths == sent_b_lengths
    return sent_a_one_length, sent_b_one_length, same_length

def mask_predict_one_example(tokenizer, model, row):
    alt_ls = []
    pred_result = True
    example = InputExample(guid=0, text_a=row["sentence1"], text_b=row["sentence2"], is_next=True)

    tokens_a_pos_dict, tokens_a_token_groups = get_pos_dict_and_token_groups(example.text_a)
    tokens_b_pos_dict, tokens_b_token_groups = get_pos_dict_and_token_groups(example.text_b)

    tokenized_example = tokenize_example(example, tokenizer)

    b_ids = tokenizer.convert_tokens_to_ids(tokenized_example.tokens_b)

    sent_a_one_length, sent_b_one_length, same_length = get_length_info(tokens_a_token_groups, 
                                                                        tokens_b_token_groups)
#         if sent_a_one_length and sent_b_one_length and same_length:
#             pred_ls.append(False)
#             all_alt_ls.append(alt_ls)
#             continue 

    for i, tok_id in enumerate(b_ids):
        if not any(tok_id == tok_group[0] for tok_group in tokens_b_token_groups):
            continue
        else:
            to_mask = max(len(tok_group) 
                          for tok_group in tokens_b_token_groups 
                          if tok_id == tok_group[0])

        tokenized_example_changed = copy.deepcopy(tokenized_example)
        for idx in range(i, i+to_mask):
            tokenized_example_changed.tokens_b[idx] = MASK

        features = convert_example_to_features(tokenized_example_changed, tokenizer, 
                                               max_sequence_length, select_prob=0.0)

        batch = features_to_data([features]).to(device)
        with torch.no_grad():
            result = model(batch.input_ids, batch.segment_ids, batch.input_mask)

        masked_indices = np.arange(batch.input_ids.shape[1])[batch.input_ids[0].cpu().numpy() == 103]
        first_masked_index = masked_indices[0]


        preds_first_token = result[0][0, first_masked_index, :].cpu().numpy()
        possible_first_tokens = [tok_id] 
        # Possible first tokens for b:
        # Tokens in a that are not in b once masked (no repeats)
        # Though this does not seem to be beneficial so disabling for now
        # We're already overpredicting True...
        remaining_b_tok_ids = tokenizer.convert_tokens_to_ids(tokenized_example_changed.tokens_b)
        possible_first_tokens += [tok_id_a[0] 
                                  for tok_id_a, tok_a in zip(tokens_a_token_groups, tokens_a_pos_dict)
                                  if tok_a not in tokenized_example_changed.tokens_b 
                                  and tok_a[0] != tok_id]   # This handles weird cases with plurals...


        kept_preds_first_token = preds_first_token[possible_first_tokens]



        most_predicted = np.argmax(kept_preds_first_token)
        # If different than 0, than we're predicting something else....
        actually_predicted = tokenizer.ids_to_tokens[possible_first_tokens[most_predicted]]
        replaced = tokenizer.ids_to_tokens[tok_id]
        alt_ls.append(example.text_b.lower().replace(replaced, actually_predicted.upper()))

        if most_predicted:
            pred_result = False
    return pred_result, alt_ls, tokens_a_pos_dict, tokens_a_token_groups, tokens_b_pos_dict, tokens_b_token_groups

    

def masking_predictor(df, tokenizer, model):
    all_pos_dicts_sent1 = []
    all_pos_dicts_sent2 = []
    all_tok_groups_sent1 = []
    all_tok_groups_sent2 = []
    pred_ls = []
    all_alt_ls = []
    for _, row in tqdm.tqdm_notebook(df.iterrows(), total=len(df)):
        pred, alt_ls, pos_dict_a, tok_group_a, pos_dict_b, tok_group_b = mask_predict_one_example(tokenizer, 
                                                                                                  model, row)
        all_pos_dicts_sent1.append(pos_dict_a)
        all_tok_groups_sent1.append(tok_group_a)
        all_pos_dicts_sent2.append(pos_dict_b)
        all_tok_groups_sent2.append(tok_group_b)
        pred_ls.append(pred)
        all_alt_ls.append(alt_ls)
        
    pred_arr = np.array(pred_ls)
    return pred_arr, all_alt_ls, all_pos_dicts_sent1, all_pos_dicts_sent2, all_tok_groups_sent1, all_tok_groups_sent2 

In [8]:
train_pred_arr, train_all_alt_ls, pos_a, pos_b, tok_a, tok_b = masking_predictor(train_df, tokenizer, model)

# val_pred_arr, val_all_alt_ls, *_ = masking_predictor(val_df, tokenizer, model)

print("Train acc: ", (train_pred_arr==train_df["label"]).mean())
# print("Val acc: ", (val_pred_arr==val_df["label"]).mean())

print("mean train pred:", train_pred_arr.mean())
# print("mean val pred:", val_pred_arr.mean())

HBox(children=(IntProgress(value=0, max=635), HTML(value='')))


Train acc:  0.5748031496062992
mean train pred: 0.6047244094488189


In [38]:
BLACKLIST = ["he", "she", "it", "they", "who", "her", "we", "them",
             "him", "his", 'their', "hers", "his", "theirs", "i", "me", "you",
             "us"]

def get_noun_chunks(sentence):
    doc = nlp(sentence)
    nc = list(set(filter(lambda n: n.text.lower() not in BLACKLIST, doc.noun_chunks)))
    return nc

def get_masked_examples(sent1, begin, end, tokenizer):
    """
    Generate the masked examples
    """
    base_example = InputExample(guid=0, text_a=sent1, text_b=begin + end, is_next=True)
    tokenized_example = tokenize_example(base_example, tokenizer)
    
    begin_tok = tokenizer.tokenize(begin)
    end_tok = tokenizer.tokenize(end)
    print(end_tok)
    masked_examples = []
    right = tokenizer.convert_tokens_to_ids(end_tok)
    for i in range(len(end_tok)):
        masked_end_tok = end_tok.copy()
        masked_end_tok[i] = MASK
        tokenized_example_changed = copy.deepcopy(tokenized_example)
        tokenized_example_changed.tokens_a = tokenized_example_changed.tokens_a + begin_tok + masked_end_tok 
        tokenized_example_changed.tokens_b = ["glue"]
#         tokenized_example_changed.tokens_b = begin_tok + masked_end_tok 
        masked_examples.append(tokenized_example_changed)
    return masked_examples, right
    

def get_mean_predictions(model, masked_examples, right):
    """
    Once we have a sentence, see how likely the end is when removing words one by one
    """
    features = [convert_example_to_features(ex, tokenizer, max_sequence_length, select_prob=0.0)
                for ex in masked_examples]
    batch = features_to_data(features).to(device)
    with torch.no_grad():
        result = model(batch.input_ids, batch.segment_ids, batch.input_mask)
    ids = np.arange(batch.input_ids.shape[1])
    probs = []
    for i, right_idx in enumerate(right):
        masked_idx = ids[batch.input_ids[i].cpu().numpy() == 103]
        assert(len(masked_idx) == 1)
        masked_idx = masked_idx[0]
        pred_token = result[0][0, masked_idx, :].cpu().numpy()
        prob = np.exp(pred_token[right_idx])/np.exp(pred_token).sum()
        probs.append(prob)
    probs = np.array(probs)
    return probs.mean()

def filling_predictor(df, tokenizer, model):
    """
    The approach here is closer to `A simple method for commonsense reasoning`
    e.g:
        text_a = "the yellow duck liked the fish because it was beautiful"
        text_b = "the fish was beautiful"
    We produce:
        text_b_alt_1 = mean("the fish was [...]" -> beautiful ; "the fish [...] beautiful" -> was)
        text_b_alt_2 = mean("the yellow duck was [...]" -> beautiful ; "the yellow duck [...] beautiful" -> was)
    If it agrees with initial sentence, then keep that one, otherwise discard. 
    The nice thing is we can operate at the noun chunk level
    """
    text_a = "the trophy didn't fit in the bag because it was too big. "
    masked_examples, right = get_masked_examples(text_a, "the trophy", "was too big", tokenizer)
    print(get_mean_predictions(model, masked_examples, right))
    masked_examples, right = get_masked_examples(text_a, "the bag", "was too big", tokenizer)
    print(get_mean_predictions(model, masked_examples, right))

In [39]:
filling_predictor(None, tokenizer, model)

['was', 'too', 'big']
0.80300575
['was', 'too', 'big']
0.69981575


## Visualize errors

In [31]:
for i, (pred, true, sent1, sent2, faked) in enumerate(zip(train_pred_arr, train_df["label"], 
                                                   train_df['sentence1'], train_df['sentence2'], train_all_alt_ls)):
    if not (true == pred):
        print(sent1, sent2, faked[-1], bool(true), pred)
        print()
    if i > 80:    
        break

John couldn't see the stage with Billy in front of him because he is so short. John is so short. BILLY is so short. True False

When Tatyana reached the cabin, her mother was sleeping. She was careful not to disturb her, undressing and climbing back into her berth. mother was careful not to disturb her, undressing and climbing back into her berth. mother was careful not to disturb her, undressing and climbing back into her BERTH. False True

John was jogging through the park when he saw a man juggling watermelons. He was very impressive. John was very impressive. JOHN was very impressive. False True

I took the water bottle out of the backpack so that it would be handy. I took the water bottle out of the backpack so that the backpack would be handy. i took the water bottle out of the BACKPACK so that the BACKPACK would be handy. False True

The firemen arrived after the police because they were coming from so far away. The police were coming from so far away. the POLICE were coming fro

In [50]:
pd.set_option("display.max_colwidth", 400)

In [41]:
tokenizer.tokenize('trophy')

['trophy']

## Not restricting

In [None]:
# If not restricting / cancelling
# print((train_pred_arr==train_df["label"]).mean())
# print((val_pred_arr==val_df["label"]).mean())

# print("mean train pred:", train_pred_arr.mean())
# print("mean val pred:", val_pred_arr.mean())

In [None]:
# def pred_v1(df, tokenizer, model):
#     pred_ls = []
#     all_alt_ls = []
#     tokenized_examples = []
#     for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
#         result_ls = [ ]
#         alt_ls = []
#         example = InputExample(
#             guid=0,
#             text_a=row["sentence1"],
#             text_b=row["sentence2"],
#             is_next=True,
#         )
#         tokenized_example = tokenize_example(example, tokenizer)
#         tokenized_examples.append(tokenized_example)
#         tokens_a_pos = get_pos(" ".join(tokenized_example.tokens_a))
#         tokens_b_pos = get_pos(" ".join(tokenized_example.tokens_b))
#         tokens_a_nouns = {
#             tokenizer.vocab[word]
#             for word, pos in zip(tokenized_example.tokens_a, tokens_a_pos)
#             if is_noun(pos)
#         }
#         tokens_a_ids = np.array(list(tokens_a_nouns))
#         pred_result = True
#         for i in range(len(tokenized_example.tokens_b)):
#             if not is_noun(tokens_b_pos[i]):
#                 continue
#             b_token = tokenized_example.tokens_b[i]
#             tokenized_example = tokenize_example(example, tokenizer)
#             tokenized_example.tokens_b[i] = MASK
#             features = convert_example_to_features(tokenized_example, tokenizer, max_sequence_length, select_prob=0.0)
#             batch = features_to_data([features]).to(device)
#             with torch.no_grad():
#                 result = model(
#                     batch.input_ids, 
#                     batch.segment_ids, 
#                     batch.input_mask, 
#                 )
#             masked_indices = np.arange(batch.input_ids.shape[1])[batch.input_ids[0].cpu().numpy()==103]
#             assert len(masked_indices) == 1
#             masked_index = masked_indices[0]
#             srs = pd.Series(
#                 result[0][0][masked_index].cpu().numpy()[tokens_a_ids],
#                 index=[tokenizer.ids_to_tokens[i] for i in tokens_a_ids],
#             ).sort_values()
#             result_ls.append(srs.index[-1])
#             if not srs.index[-1]==b_token:
#                 pred_result = False
#             alt_ls.append(" ".join(tokenized_example.tokens_b).replace(
#                 MASK, srs.index[-1].upper(),
#             ))

#         pred_ls.append(pred_result)
#         all_alt_ls.append(alt_ls)
#     pred_arr = np.array(pred_ls)
#     return pred_arr, all_alt_ls