## Noun Compounds 
### Literality

We use the dataset from [Reddy et al. (2011)](http://www.aclweb.org/anthology/I11-1024). The dataset contains human judgements of the literality of 90 noun compounds, at both the compound and the constituent level. For example, in _sacred cow_, _sacred_ is literal while _cow_ is not. If a constituent is literal, then a sentence containing the noun compound is also affected by its meaning. For example, a sentence discussing _olive oil_ is affected by the meanings of both _olive_ and _oil_. In contrast, a sentence containing _sacred cow_ is not affected by the meaning of _cow_. The scores in the dataset are on a scale of 0-5, 0 being non-literal and 5 being literal. We consider scores of at least 4 as literal and scores of at most 2 as non-literal, and we ignore scores strictly between 2 and 4. For each such noun compound, we look at all of its occurrences in the corpus, and sample some short sentences (between 15 and 25 words; see `MIN_SENT_LEN` and `MAX_SENT_LEN` below). We create examples such as: `[sentence] [w1] [literal/non-literal]` and `[sentence] [w2] [literal/non-literal]`. We then make sure that no word `w1` or `w2` consistently appears only in negative or only in positive examples.

In [None]:
import random
random.seed(133)

import os
import re
import csv
import json
import spacy
import shutil
import codecs
import random
import fileinput

from nltk import agreement
from itertools import count
from collections import Counter, defaultdict
from statsmodels.stats.inter_rater import fleiss_kappa

In [None]:
# Download the Reddy et al. (2011) compositionality data.
# The existence of the `nc_literality` directory is used as the marker that
# the download already happened, so this cell is skipped on re-runs.
if not os.path.exists('nc_literality'):
    !mkdir -p nc_literality
    !wget http://sivareddy.in/papers/files/ijcnlp_compositionality_data.tgz 
    !tar -zxvf ijcnlp_compositionality_data.tgz 
    # Only the mean/deviation file is kept; the extracted directory and the
    # archive are deleted afterwards.
    !mv ijcnlp_compositionality_data/MeanAndDeviations.clean.txt nc_literality
    !rm -r ijcnlp_compositionality_data
    !rm -r ijcnlp_compositionality_data.tgz

In [None]:
# Score thresholds: a constituent scored >= MIN_LITERAL is labelled LITERAL,
# one scored <= MAX_NON_LITERAL is labelled NON-LITERAL, anything in between
# is left out.
MIN_LITERAL = 4
MAX_NON_LITERAL = 2

with codecs.open('nc_literality/MeanAndDeviations.clean.txt', 'r', 'utf-8') as f_in:
    lines = [line.strip().split('\t') for line in f_in]

out_of_context_examples = []

# The first row is the header, so iteration starts at the second row.
for nc, data in lines[1:]:
    fields = data.split()
    # Fields 0 and 2 hold the mean scores of the first and second constituent.
    w1_mean = float(fields[0])
    w2_mean = float(fields[2])

    # Each constituent carries a two-character POS suffix; strip it.
    w1, w2 = (token[:-2] for token in nc.split())
    nc = f'{w1}_{w2}'

    for constituent, score in ((w1, w1_mean), (w2, w2_mean)):
        if score >= MIN_LITERAL:
            label = 'LITERAL'
        elif score <= MAX_NON_LITERAL:
            label = 'NON-LITERAL'
        else:
            # Scores between the two thresholds are ignored.
            continue
        out_of_context_examples.append((nc, constituent, label))

ncs = {nc for nc, _, _ in out_of_context_examples}
print(f'Number of examples from Reddy et al.: {len(out_of_context_examples)}')
non_literal = sum(1 for _, _, label in out_of_context_examples if label == 'NON-LITERAL')
print(f'Number of non-literal examples: {non_literal}')

To increase the size of the dataset, we will use additional noun compounds from the Tratz (2011) dataset. 
We can only trust that in *compositional* compounds, both constituents are literal (while in *non-compositional* compounds, one of the constituents may still be literal). 

In [None]:
if not os.path.exists('tratz.tsv'):
    !wget https://vered1986.github.io/papers/Tratz2011_Dataset.tar.gz
    !tar -zxvf Tratz2011_Dataset.tar.gz
    !cat Data/tratz2011_fine_grained_random/*.tsv > nc_literality/tratz.tsv
    !rm -r Data
    !rm -r Tratz2011_Dataset.tar.gz

In [None]:
# Vocabulary of all the constituents that appear in the Reddy et al. compounds.
w1s, w2s = zip(*[nc.split('_') for nc in ncs])
reddy_vocab = set(w1s).union(set(w2s))

with codecs.open('nc_literality/tratz.tsv', 'r', 'utf-8') as f_in:
    tratz_dataset = [line.strip().split('\t') for line in f_in]

# Keep only well-formed (w1, w2, label) rows and remove expressions with
# more than two words. Malformed rows (e.g. blank lines left by the file
# concatenation) are skipped instead of crashing the unpacking below.
tratz_dataset = [tuple(fields)
                 for fields in tratz_dataset
                 if len(fields) == 3 and ' ' not in fields[0] and ' ' not in fields[1]]

# Relations whose compounds cannot be trusted to be compositional.
labels_to_remove = {'LEXICALIZED', 'PERSONAL_NAME', 'PERSONAL_TITLE'}

# (nc, constituent) pairs that Reddy et al. annotated as non-literal; a Tratz
# compound must never override them with a LITERAL label.
known_non_literal = {(nc, constituent)
                     for nc, constituent, label in out_of_context_examples
                     if label == 'NON-LITERAL'}

# First all the examples targeting the modifier (position 0), then all the
# examples targeting the head (position 1).
tratz_out_of_context_examples = []
for position in (0, 1):
    for w1, w2, label in tratz_dataset:
        # The relation label itself must be tested: a bare `labels_to_remove`
        # is always truthy and would filter nothing.
        if label in labels_to_remove:
            continue

        target = (w1, w2)[position]
        nc = '_'.join((w1, w2))
        if target in reddy_vocab and (nc, target) not in known_non_literal:
            tratz_out_of_context_examples.append((nc, target, 'LITERAL'))

# Make sure each NC-target pair has only one label
label_per_nc_target = defaultdict(set)
for nc, target, label in out_of_context_examples:
    label_per_nc_target[(nc, target)].add(label)

# `all(...)` is required here: asserting a non-empty list is always true.
assert all(len(labels) == 1 for labels in label_per_nc_target.values())

out_of_context_examples += tratz_out_of_context_examples
print('Number of out of context examples: {}'.format(len(out_of_context_examples)))
ncs = set([nc for nc, constituent, label in out_of_context_examples])
print('Number of noun compounds: {}'.format(len(ncs)))
literal = [label for _, _, label in out_of_context_examples if label == 'LITERAL']
non_literal = [label for _, _, label in out_of_context_examples if label == 'NON-LITERAL']
print(f'Literal: {len(literal)}, non-literal: {len(non_literal)}')

Now, let's extract all the corpus sentences in which the noun compound appears. We will generate a bash script that runs 60 `grep` commands in parallel. Change it as you wish.

In [None]:
# Number of `grep` processes that run concurrently.
NUM_PARALLEL = 60

corpus = '~/corpora/text/en_corpus_tokenized' # change to your corpus path
out_dir = 'nc_literality/sentences'
os.makedirs(out_dir, exist_ok=True)

# Generate a bash script with one background `grep` per noun compound.
# NOTE(review): the output paths are relative (`sentences/<nc>`), so the
# script is presumably meant to be run from inside `nc_literality` - confirm.
with codecs.open(os.path.join('nc_literality', 'commands.sh'), 'w', 'utf-8') as f_out:
    f_out.write('#!/bin/bash\n')
    for i, nc in enumerate(ncs):
        f_out.write('grep -i "{}" {} > sentences/{} &\n'.format(nc.replace('_', ' '), corpus, nc))

        # Block after every NUM_PARALLEL commands, so that no batch
        # (including the first one) exceeds NUM_PARALLEL processes.
        if (i + 1) % NUM_PARALLEL == 0:
            f_out.write('wait\n')

    # Wait for the last (possibly partial) batch; otherwise the script
    # returns while some greps are still writing their output files.
    f_out.write('wait\n')

Filter out sentences which are too long or too short.

In [None]:
# Sentence length bounds, in whitespace-separated tokens (inclusive).
MIN_SENT_LEN = 15
MAX_SENT_LEN = 25

nc_sentences_filtered = {}
unreadable_ncs = []

for nc in ncs:
    sentence_file = os.path.join(out_dir, nc)

    # A noun compound may have no (readable) sentence file; skip it, but
    # only for I/O and decoding problems - anything else is a real bug and
    # should surface.
    try:
        with codecs.open(sentence_file, 'r', 'utf-8') as f_in:
            nc_sentences = [line.strip() for line in f_in]
    except (OSError, UnicodeError):
        unreadable_ncs.append(nc)
        continue

    nc_sentences_filtered[nc] = [s for s in nc_sentences
                                 if MIN_SENT_LEN <= len(s.split()) <= MAX_SENT_LEN]

    # Overwrite the file with the filtered sentences. This is best-effort:
    # the filtered sentences are kept in memory even if the write fails.
    try:
        with codecs.open(sentence_file, 'w', 'utf-8') as f_out:
            for s in nc_sentences_filtered[nc]:
                f_out.write(s + '\n')
    except OSError as err:
        print(f'Warning: could not rewrite "{sentence_file}": {err}')

if unreadable_ncs:
    print(f'Skipped {len(unreadable_ncs)} noun compounds without a readable sentence file')

print('Number of noun compounds: {}'.format(len(nc_sentences_filtered)))

Sample up to 10 sentences for each out-of-context example.

In [None]:
# Build the in-context dataset: (sentence, nc, target_index, label) tuples,
# with up to 10 sampled sentences per out-of-context example.
dataset = []

for nc, constituent, label in out_of_context_examples:
    w1, w2 = nc.split('_')

    # Keep the sentences in which w1 is immediately followed by w2 (exact,
    # case-sensitive token match), together with the index of the first
    # such occurrence.
    valid_sentences = []
    for sentence in nc_sentences_filtered.get(nc, []):
        tokens = sentence.split()
        nc_indices = [i for i in range(len(tokens) - 1)
                      if tokens[i] == w1 and tokens[i + 1] == w2]
        if nc_indices:
            valid_sentences.append((sentence, nc_indices[0]))

    if valid_sentences:
        # The target is the compound's first token if the constituent is the
        # modifier, and the following token otherwise.
        offset = 0 if constituent == w1 else 1

        for sentence, nc_index in random.sample(valid_sentences, min(10, len(valid_sentences))):
            dataset.append((sentence, nc, nc_index + offset, label))

print(f'Number of examples: {len(dataset)}')
ctr = Counter([label for (sentence, nc, target_index, label) in dataset])
print(f'Literal: {ctr["LITERAL"]}, Non-literal: {ctr["NON-LITERAL"]}')

Let's make sure the dataset is not biased with respect to the labels.

In [None]:
# Downsample the literal examples when they outnumber the non-literal ones
# by more than 4 to 1. The `ctr['LITERAL'] > 0` guard avoids a division by
# zero when there are no literal examples (nothing to downsample then).
if ctr['LITERAL'] > 0 and ctr['NON-LITERAL'] / ctr['LITERAL'] < 0.25:
    literal = [example for example in dataset if example[3] == 'LITERAL']
    non_literal = [example for example in dataset if example[3] == 'NON-LITERAL']

    # Inside this branch there are more than 4 literal examples per
    # non-literal one, so the sample size never exceeds the population.
    dataset = non_literal + random.sample(literal, len(non_literal) * 4)

print('Dataset size: {}'.format(len(dataset)))
ctr = Counter([label for (sentence, nc, target_index, label) in dataset])
print(f'Literal: {ctr["LITERAL"]}, Non-literal: {ctr["NON-LITERAL"]}')

Split the dataset lexically by head (`word_index=1`); pass `word_index=0` to split by modifier instead.

In [None]:
def _label_ratio(examples):
    """
    Compute the NON-LITERAL to LITERAL ratio of a list of examples.
    :param examples: a list of (sentence, nc, target_index, label) tuples
    :return: the ratio; infinity if there are only NON-LITERAL examples,
    and 0 if the list has no NON-LITERAL examples (including an empty list)
    """
    counts = Counter(label for _, _, _, label in examples)
    if counts['LITERAL'] == 0:
        return float('inf') if counts['NON-LITERAL'] > 0 else 0.0
    return counts['NON-LITERAL'] / counts['LITERAL']


def split_lexically(dataset, word_index=0):
    """
    Split the dataset to train, test, and validation, such that
    the word in word_index (0 = modifier, 1 = head) doesn't
    repeat across sets.
    :param dataset: a list of (sentence, nc, target_index, label) tuples,
    where nc is of the form `modifier_head`
    :param word_index: the constituent to split by (0 = modifier, 1 = head)
    :return: the train, test and validation sets (in this order)
    """
    # Group the examples by the split word, separately for each label
    literal_instances_per_w = defaultdict(list)
    non_literal_instances_per_w = defaultdict(list)

    for sentence, nc, target_index, label in dataset:
        word = nc.split('_')[word_index]
        if label == 'LITERAL':
            literal_instances_per_w[word].append((sentence, nc, target_index, label))
        elif label == 'NON-LITERAL':
            non_literal_instances_per_w[word].append((sentence, nc, target_index, label))

    # First, split the non literal examples (80% / 10% / rest of the words).
    # The slices are contiguous so that every word lands in exactly one set.
    words_in_non_literal = list(non_literal_instances_per_w.keys())
    train_size = 8 * len(words_in_non_literal) // 10
    val_size = len(words_in_non_literal) // 10
    words_for_train = words_in_non_literal[:train_size]
    words_for_val = words_in_non_literal[train_size:train_size + val_size]
    words_for_test = words_in_non_literal[train_size + val_size:]

    train = [ex for w in words_for_train for ex in non_literal_instances_per_w[w]]
    val = [ex for w in words_for_val for ex in non_literal_instances_per_w[w]]
    test = [ex for w in words_for_test for ex in non_literal_instances_per_w[w]]

    # Then add the literal ones of the same words, to keep the split lexical
    train += [ex for w in words_for_train for ex in literal_instances_per_w.get(w, [])]
    val += [ex for w in words_for_val for ex in literal_instances_per_w.get(w, [])]
    test += [ex for w in words_for_test for ex in literal_instances_per_w.get(w, [])]

    # The remaining words have only literal examples
    non_literal_words = set(words_in_non_literal)
    literal_instances_per_w = {w: examples
                               for w, examples in literal_instances_per_w.items()
                               if w not in non_literal_words}

    train_size = 8 * len(dataset) // 10
    val_size = test_size = len(dataset) // 10

    # Fill the test and validation sets with the rarest words first
    words = sorted(literal_instances_per_w, key=lambda w: len(literal_instances_per_w[w]))
    w_index = 0

    # The bound on w_index stops the loops when the words run out
    while len(test) < test_size and w_index < len(words):
        test += literal_instances_per_w[words[w_index]]
        w_index += 1

    print('Test set size: {} (needed: {})'.format(len(test), test_size))

    while len(val) < val_size and w_index < len(words):
        val += literal_instances_per_w[words[w_index]]
        w_index += 1

    print('Validation set size: {} (needed: {})'.format(len(val), val_size))

    # All the remaining literal-only words go to the train set
    train += [example for w in words[w_index:]
              for example in literal_instances_per_w[w]]
    print('Train set size: {} (needed: {})'.format(len(train), train_size))

    # Check the label distribution in the test set, and downsample the
    # literal examples if they are more than 4 times the non-literal ones
    if _label_ratio(test) < 0.25:
        literal = [ex for ex in test if ex[3] == 'LITERAL']
        non_literal = [ex for ex in test if ex[3] == 'NON-LITERAL']
        test = non_literal + random.sample(literal, len(non_literal) * 2)

    test_ratio = _label_ratio(test)
    print(f'Test ratio: {test_ratio}')
    assert 0.25 <= test_ratio <= 4, 'The test set labels are too imbalanced'

    print(f'Train ratio: {_label_ratio(train)}')
    print(f'Val ratio: {_label_ratio(val)}')

    # Make sure the split is lexical with respect to the split word
    test_words = {nc.split('_')[word_index] for sentence, nc, target_index, label in test}
    train_words = {nc.split('_')[word_index] for sentence, nc, target_index, label in train}
    val_words = {nc.split('_')[word_index] for sentence, nc, target_index, label in val}
    assert len(train_words.intersection(val_words)) == 0
    assert len(train_words.intersection(test_words)) == 0
    assert len(test_words.intersection(val_words)) == 0

    print(f'Sizes: train = {len(train)}, test = {len(test)}, validation = {len(val)}')
    return train, test, val
    

# Output directory of the final train/test/val files. The path is nested,
# so the missing parent directories are created as well.
data_dir = '../diagnostic_classifiers/data/nc_literality'
os.makedirs(data_dir, exist_ok=True)

# Split lexically by head (word_index=1).
train, test, val = split_lexically(dataset, word_index=1)

# train_size = 8 * len(dataset) // 10
# val_size = test_size = len(dataset) // 10
# train = dataset[:train_size]
# val = dataset[train_size+1:train_size+val_size]
# test = dataset[train_size+val_size+1:]
# print(f'Sizes: train = {len(train)}, test = {len(test)}, validation = {len(val)}')

Make sure that the majority baseline by head is not too strong.

In [None]:
from sklearn.metrics import accuracy_score

def compute_majority_baseline(word_index=1):
    """
    Print the accuracy of a baseline that predicts, for each test example,
    the most frequent train label of the word in word_index
    (0 = modifier, 1 = head). Reads the global `train` and `test` sets.
    :param word_index: the constituent to group the train labels by
    """
    def split_word(nc):
        return nc.split('_')[word_index]

    # Count the train labels of each word
    label_counts_per_word = defaultdict(Counter)
    for _, nc, _, label in train:
        label_counts_per_word[split_word(nc)][label] += 1

    majority_per_word = {}
    for w, label_counts in label_counts_per_word.items():
        majority_per_word[w] = label_counts.most_common(1)[0][0]

    gold_labels = [label for _, _, _, label in test]

    # NOTE(review): the fallback label is the majority label of the *test*
    # set, not of the train set - confirm that this is intended.
    overall_majority = Counter(gold_labels).most_common(1)[0][0]
    print(f'Overall majority: {overall_majority}')

    # Predict. Words that were not observed in the train set get the
    # overall majority label.
    test_predictions = [majority_per_word.get(split_word(nc), overall_majority)
                        for _, nc, _, _ in test]

    # Evaluate
    acc = accuracy_score(gold_labels, test_predictions)
    word = ('modifier', 'head')[word_index]
    print('Majority by {}: {:2.1f}%'.format(word, acc * 100.0))
    
    
# Report the baseline when grouping by the modifier (0) and by the head (1).
compute_majority_baseline(0)
compute_majority_baseline(1)

In [None]:
# Write each split to `<data_dir>/<split name>.jsonl`, one JSON object per line.
for split_name, examples in (('train', train), ('test', test), ('val', val)):
    out_path = os.path.join(data_dir, '{}.jsonl'.format(split_name))

    with codecs.open(out_path, 'w', 'utf-8') as f_out:
        for sentence, nc, target_index, label in examples:
            # The target word is the sentence token at the target index.
            record = {
                'sentence': sentence,
                'nc': nc,
                'target_index': target_index,
                'target_word': sentence.split()[target_index],
                'label': label,
            }
            f_out.write(json.dumps(record) + '\n')

We re-annotated a sample from the test set to compute human performance. 
We assume the annotation results are found under `preprocessing/annotation/nc_literality/batch_results`.

In [None]:
def load_batch_results(result_file, remove_bad_workers=False):
    """
    Load the batch results from the CSV
    :param result_file: the batch results CSV file from MTurk
    :param remove_bad_workers: whether to discard the answers of workers
    that disagreed with the original label in more than 33% of their answers
    :return: the set of workers, the answers by worker (worker ID -> HIT ID
    -> answer), the answers by HIT (HIT ID -> worker ID -> answer), the set of
    HITs that were marked as incorrect, and the HIT ID -> original label dictionary
    """
    answer_by_worker, answer_by_hit = defaultdict(dict), defaultdict(dict)
    workers = set()
    incorrect = set()
    workers_wrong_answers = defaultdict(int)
    hit_id_to_orig_label = {}

    with codecs.open(result_file, 'r', 'utf-8') as f_in:
        reader = csv.DictReader(f_in)
        for row in reader:
            hit_id = row['HITId']
            worker_id = row['WorkerId']

            # Input fields
            sent = row['Input.sent']
            orig_label = row['Input.orig_label']

            # The noun compound must be marked in the sentence with an
            # opening and a closing <mark> tag; skip the row otherwise.
            tokens = sent.split()
            has_opening = any(t.startswith('<mark>') for t in tokens)
            has_closing = any(t.endswith('</mark>') for t in tokens)

            if not (has_opening and has_closing):
                print(f'Warning: skipped "{sent}"')
                continue

            hit_id_to_orig_label[hit_id] = orig_label

            # Answer fields
            if row['Answer.label.literal'].lower() == 'true':
                answer = 'LITERAL'
            elif row['Answer.label.non_literal'].lower() == 'true':
                answer = 'NON-LITERAL'
            # Incorrect
            else:
                incorrect.add(hit_id)
                continue

            if orig_label != answer:
                workers_wrong_answers[worker_id] += 1

            workers.add(worker_id)
            answer_by_worker[worker_id][hit_id] = answer
            answer_by_hit[hit_id][worker_id] = answer

    # Remove HITs that were annotated as incorrect by at least one worker
    answer_by_hit = {hit_id: answers_by_hit_id
                     for hit_id, answers_by_hit_id in answer_by_hit.items()
                     if hit_id not in incorrect}

    answer_by_worker = {worker_id: {hit_id: answer
                                    for hit_id, answer in curr_answers.items()
                                    if hit_id not in incorrect}
                        for worker_id, curr_answers in answer_by_worker.items()}

    num_answers = sum(len(answers_by_worker_id)
                      for answers_by_worker_id in answer_by_worker.values())

    if remove_bad_workers:
        # Percent of wrong answers per worker. Workers with no remaining
        # answers (all their HITs were discarded) are left out: there is
        # nothing to remove for them, and the division would fail.
        wrong_percent = {}
        for worker_id, n in workers_wrong_answers.items():
            num_remaining = len(answer_by_worker.get(worker_id, {}))
            if num_remaining > 0:
                wrong_percent[worker_id] = n * 100.0 / num_remaining

        # Remove bad workers: workers that disagreed with many of the previous annotation
        bad_workers = {worker_id
                       for worker_id, per in wrong_percent.items() if per > 33}
        print(f'Removing {len(bad_workers)} bad workers:\n{bad_workers}')

        answer_by_worker = {worker_id: answers_by_worker_id
                            for worker_id, answers_by_worker_id in answer_by_worker.items()
                            if worker_id not in bad_workers}

        answer_by_hit = {hit_id: {worker_id: answer
                                  for worker_id, answer in answers_by_hit_id.items()
                                  if worker_id not in bad_workers}
                         for hit_id, answers_by_hit_id in answer_by_hit.items()}

        num_answers_after_filtering = sum(len(answers_by_worker_id)
                                          for answers_by_worker_id in answer_by_worker.values())
        print('Final: {} answers, removed {}.'.format(
            num_answers_after_filtering,
            num_answers - num_answers_after_filtering))

    return workers, answer_by_worker, answer_by_hit, incorrect, hit_id_to_orig_label


# Load the annotation results, discarding the answers of workers that
# disagreed too often with the original labels.
results_file = 'nc_literality/batch_results.csv'
workers, answer_by_worker, answer_by_hit, incorrect, hit_id_to_orig_label = load_batch_results(
    results_file, remove_bad_workers=True)
print(f'Loaded results from {results_file}, loaded {len(answer_by_hit)} HITs')

Compute Fleiss' kappa and the percentage of agreement between the workers.

In [None]:
def compute_agreement(answer_by_hit):
    """
    Compute workers' agreement (Fleiss Kappa and percent).
    Only HITs with exactly 3 answers are taken into account.
    :param answer_by_hit: HIT ID to worker answers dictionary
    :return: the Fleiss Kappa and the percent of agreement
    """
    vote_table = []
    agreement_points = 0

    for worker_answers in answer_by_hit.values():
        num_literal = sum(1 for answer in worker_answers.values() if answer == 'LITERAL')
        num_other = len(worker_answers) - num_literal

        if num_other + num_literal != 3:
            continue

        # One row per HIT: [non-LITERAL votes, LITERAL votes]
        votes = [num_other, num_literal]
        vote_table.append(votes)

        # Every vote beyond the first one for the same label counts as
        # one agreement (at most 2 per HIT).
        agreement_points += sum(max(0, v - 1) for v in votes)

    kappa = fleiss_kappa(vote_table)
    percent = agreement_points * 100.0 / (len(vote_table) * 2)
    return kappa, percent


# Report the inter-annotator agreement over the loaded HITs.
kappa, percent = compute_agreement(answer_by_hit)
print('Fleiss Kappa={:.3f}, Percent={:.2f}%'.format(kappa, percent))

Compute the workers majority which we will use to estimate human performance.

In [None]:
def compute_majority(results):
    """
    Compute the majority label from the worker answers.
    HITs whose most frequent answer was given by fewer than 2 workers
    (including HITs with no answers) are left out.
    :param results: HIT ID to worker answers dictionary
    :return: HIT ID to majority label dictionary
    """
    majority_labels = {}

    for hit_id, sent_results in results.items():
        votes = Counter(sent_results.values())
        if not votes:
            continue

        top_label, top_count = votes.most_common(1)[0]
        if top_count >= 2:
            majority_labels[hit_id] = top_label

    return majority_labels

# HIT ID -> majority label of the workers.
human_annotations = compute_majority(answer_by_hit)

Compute the human performance on the test set.

In [None]:
# 1 for every HIT whose majority label matches the original label
# (case-insensitively), 0 otherwise.
items_compared = []
for hit_id in human_annotations:
    human_label = human_annotations[hit_id].lower()
    orig_label = hit_id_to_orig_label[hit_id].lower()
    items_compared.append(1 if human_label == orig_label else 0)

human_accuracy = sum(items_compared) * 100.0 / len(items_compared)

print('Number of examples: {}, accuracy: {:.3f}'.format(len(items_compared), human_accuracy))