## Adjective-Noun Compositions
### Attribute Selection

We use the HeiPLAS data set [(Hartung, 2015)](https://pub.uni-bielefeld.de/record/2900430), which contains adjective-attribute-noun triples that were heuristically extracted from WordNet and manually filtered by linguists. For example, "_hot summer_" (temperature) vs. "_hot debate_" (manner). 

In [None]:
import random
random.seed(133)

import os
import csv
import json
import spacy
import codecs
import random

import numpy as np

from nltk.corpus import wordnet as wn
from sklearn.metrics import accuracy_score
from collections import Counter, defaultdict
from statsmodels.stats.inter_rater import fleiss_kappa

First, download the dataset:

In [None]:
# Fetch and unpack the HeiPLAS release only when the dev file is not already
# present locally. The lines starting with `!` are IPython shell escapes, so
# this cell only runs inside a notebook/IPython session.
# Note: only the extracted directory is removed afterwards; the downloaded
# HeiPLAS-release.tgz archive is left in the working directory.
if not os.path.exists('an_classification/HeiPLAS-dev.txt'):
    !mkdir -p an_classification
    !wget http://www.cl.uni-heidelberg.de/~hartung/data/HeiPLAS-release.tgz
    !tar -zvxf HeiPLAS-release.tgz
    !mv HeiPLAS-release/HeiPLAS-* an_classification
    !rm -r HeiPLAS-release

In [None]:
# Read the dev and test files into one list of lowercased, whitespace-split
# tuples. Later cells unpack each tuple as (label, adjective, noun).
original_dataset = []

for s in ['test', 'dev']:
    with codecs.open('an_classification/HeiPLAS-{}.txt'.format(s), 'r', 'utf-8') as f_in:
        # Skip blank lines: they would become empty tuples and break the
        # three-way unpacking done by the cells below.
        original_dataset += [tuple(line.strip().lower().split())
                             for line in f_in if line.strip()]

print('Dataset size: {}'.format(len(original_dataset)))

Instead of using the dataset as is (it has over 200 labels, many of which are words similar to those in the examples), 
we turn it into a binary task: for each _(AN, label)_ example in the dataset we create one positive example _(AN, label, TRUE)_ and up to three negative examples _(AN, another label, FALSE)_, where the other label is one that appears with either the adjective or the noun and is not similar to the true label.

What we need to do:

* Create the new dataset.
* Extract context sentences.
* Filter out too long or too short sentences.
* Sample a sentence for each example.
* Split to train, test, and validation (ignoring the current split).

In [None]:
# Map every adjective and every noun to the attribute labels it occurs with
# (and how often), so that negative labels can be drawn from labels that are
# attested for at least one of the two words.
label_by_word = defaultdict(list)
for label, a, n in original_dataset:
    label_by_word[a].append(label)
    label_by_word[n].append(label)

label_by_word = {w: Counter(labels) for w, labels in label_by_word.items()}

new_dataset = []

for label, a, n in original_dataset:
    # Candidate negative labels, most frequent first. Duplicates are removed
    # with dict.fromkeys (order-preserving) rather than set(): the iteration
    # order of a set of strings changes between interpreter runs, which would
    # make the seeded random.sample below irreproducible.
    candidates = sorted(list(label_by_word[a].items()) + list(label_by_word[n].items()),
                        key=lambda x: x[1], reverse=True)
    other_labels = list(dict.fromkeys(l for l, c in candidates if l != label))

    if len(other_labels) == 0:
        continue

    # Pick a label which is not similar to the true label.
    # The labels are originally from WordNet, so we will use WordNet similarity.
    curr_syn = wn.synsets(label)

    if len(curr_syn) == 0:
        continue

    curr_syn = curr_syn[0]

    dissimilar_labels = []
    for other_label in other_labels:
        syns = wn.synsets(other_label)
        if len(syns) == 0:
            continue

        similarity = curr_syn.wup_similarity(syns[0])

        # wup_similarity may return None when no connecting path exists
        # between the two synsets; comparing None with a float raises a
        # TypeError, and "no path" is treated here as "not similar".
        if similarity is None or similarity < 0.4:
            dissimilar_labels.append(other_label)

    other_labels = dissimilar_labels

    if len(other_labels) > 0:
        new_dataset += [(a, n, label, 'True')]
        num_samples = min(3, len(other_labels))
        for other_label in random.sample(other_labels, num_samples):
            new_dataset += [(a, n, other_label, 'False')]

print('Number of examples: {}'.format(len(new_dataset)))

Add context examples from Wikipedia.

In [None]:
# Write a shell script that greps the corpus once per adjective-noun pair and
# stores the matching sentences in one file per pair.
data_dir = 'an_classification'
os.makedirs(data_dir, exist_ok=True)

ans = {'_'.join((w1, w2)) for w1, w2, relation, label in new_dataset}
print('Number of adjective-noun compositions: {}'.format(len(ans)))

corpus = '~/corpora/text/en_corpus_tokenized' # change to your corpus path
out_dir = os.path.join(data_dir, 'sentences')
os.makedirs(out_dir, exist_ok=True)

NUM_PARALLEL = 60

with codecs.open(os.path.join(data_dir, 'commands.sh'), 'w', 'utf-8') as f_out:
    f_out.write('#!/bin/bash\n')
    commands_written = 0

    # Iterate in sorted order so the generated script is identical across runs
    # (the iteration order of a set of strings is not).
    for an in sorted(ans):
        # The output path is relative: the script is expected to be run from
        # inside data_dir, where the 'sentences' directory was created above.
        f_out.write('grep -i "{}" {} > sentences/{} &\n'.format(an.replace('_', ' '), corpus, an))
        commands_written += 1

        # Run at most NUM_PARALLEL greps at a time.
        if commands_written % NUM_PARALLEL == 0:
            f_out.write('wait\n')

    # Wait for the last, partially filled batch as well; otherwise the script
    # returns while greps are still writing their output files.
    if commands_written % NUM_PARALLEL != 0:
        f_out.write('wait\n')

In [None]:
# Keep only sentences whose token count is within [MIN_SENT_LEN, MAX_SENT_LEN]
# and overwrite each per-pair sentence file with the filtered sentences.
MIN_SENT_LEN = 15
MAX_SENT_LEN = 25

out_dir = os.path.expanduser('~/git/lexical_composition/preprocessing/an_classification/sentences')

an_sentences_filtered = {}
skipped_ans = []

for an in ans:
    try:
        with codecs.open(os.path.join(out_dir, an), 'r', 'utf-8') as f_in:
            an_sentences = [line.strip() for line in f_in]

        an_sentences_filtered[an] = [s for s in an_sentences
                                     if MIN_SENT_LEN <= len(s.split()) <= MAX_SENT_LEN]

        with codecs.open(os.path.join(out_dir, an), 'w', 'utf-8') as f_out:
            for s in an_sentences_filtered[an]:
                f_out.write(s + '\n')
    except (OSError, UnicodeError) as err:
        # Best effort: a missing or unreadable sentence file only means this
        # pair gets no context sentences. Unlike a bare `except`, programming
        # errors and KeyboardInterrupt still propagate.
        skipped_ans.append((an, err))

if skipped_ans:
    print('Warning: could not process the sentence files of {} compositions, e.g. {}: {}'.format(
        len(skipped_ans), skipped_ans[0][0], skipped_ans[0][1]))

In [None]:
# Attach one randomly chosen context sentence to every example for which
# suitable sentences were found.
context_dataset = []

for adj, noun, relation, label in new_dataset:
    an = '_'.join((adj, noun))
    candidates = []

    for sentence in an_sentences_filtered.get(an, []):
        # The corpus was already tokenized, so whitespace splitting is enough.
        tokens = sentence.split()

        # Index of the first position where the adjective is immediately
        # followed by the noun (exact, case-sensitive token match).
        start = next((i for i in range(len(tokens) - 1)
                      if tokens[i] == adj and tokens[i + 1] == noun), None)

        if start is not None:
            candidates.append((sentence, an, start))

    # NOTE(review): examples with exactly one usable sentence are dropped by
    # this `> 1` condition - confirm this is intended rather than `> 0`.
    if len(candidates) > 1:
        sentence, an, start = random.choice(candidates)
        context_dataset.append((sentence, an, start, relation, label))

print(f'Dataset size: {len(context_dataset)}')

Split the dataset into train/validation/test sets. The split is lexical by adjective (the adjective is often similar to the attribute), which makes the task more difficult.

In [None]:
def split_lexically(dataset, word_index=0):
    """
    Split the dataset to train, test, and validation, such that
    the word in word_index (0 = adjective, 1 = noun) doesn't
    repeat across sets.

    Words are assigned to the test set first, then to the validation set,
    starting from the words with the fewest examples, until each set holds
    at least 10% of the dataset. All remaining words go to the train set.

    :param dataset: list of (sentence, an, start, relation, label) tuples,
    where an is the adjective and the noun joined by an underscore
    :param word_index: which word of the composition to split by
    :return: the train, test and validation lists (in this order)
    :raises AssertionError: if the test set labels ('True' / 'False') are
    imbalanced by more than 4:1, or if a word is shared between two sets
    """
    instances_per_w = defaultdict(list)
    for sentence, an, start, relation, label in dataset:
        instances_per_w[an.split('_')[word_index]].append(
            (sentence, an, start, relation, label))

    train, test, val = [], [], []
    train_size = 8 * len(dataset) // 10
    val_size = test_size = len(dataset) // 10

    # Rare words first, so the small sets are filled in small steps.
    words = [w for w, examples in sorted(instances_per_w.items(), key=lambda x: len(x[1]))]
    w_index = 0

    # The bound on w_index keeps the loops from running past the last word
    # when a single word already holds (almost) all of the examples.
    while len(test) < test_size and w_index < len(words):
        test += instances_per_w[words[w_index]]
        w_index += 1

    print('Test set size: {} (needed: {})'.format(len(test), test_size))

    while len(val) < val_size and w_index < len(words):
        val += instances_per_w[words[w_index]]
        w_index += 1

    print('Validation set size: {} (needed: {})'.format(len(val), val_size))

    train = [example for w in words[w_index:] for example in instances_per_w[w]]
    print('Train set size: {} (needed: {})'.format(len(train), train_size))

    # Check the label distribution in the test set. The ratio test is written
    # with multiplications so that a missing class fails the check instead of
    # raising ZeroDivisionError, and it is raised explicitly so that it is not
    # stripped when Python runs with -O.
    ctr = Counter([label for (sentence, an, start, relation, label) in test])
    if not (ctr['False'] <= 4 * ctr['True'] and ctr['True'] <= 4 * ctr['False']):
        raise AssertionError(
            'Imbalanced test set labels: {}'.format(dict(ctr)))

    # Make sure the split is lexical: no split word is shared between sets
    test_words = {an.split('_')[word_index] for sentence, an, start, relation, label in test}
    train_words = {an.split('_')[word_index] for sentence, an, start, relation, label in train}
    val_words = {an.split('_')[word_index] for sentence, an, start, relation, label in val}

    if not (train_words.isdisjoint(val_words) and
            train_words.isdisjoint(test_words) and
            test_words.isdisjoint(val_words)):
        raise AssertionError('The split is not lexical: a word appears in more than one set')

    print(f'Sizes: train = {len(train)}, test = {len(test)}, validation = {len(val)}')
    return train, test, val
    

# Split the context dataset lexically and write each split as a JSON-lines file.
data_dir = '../diagnostic_classifiers/data/an_attribute_selection'
os.makedirs(data_dir, exist_ok=True)

# NOTE(review): word_index=1 splits by noun according to the docstring of
# split_lexically, while the notebook text says the split is by adjective -
# confirm which one is intended.
train, test, val = split_lexically(context_dataset, word_index=1)

for s, filename in zip([train, test, val], ['train', 'test', 'val']):
    with codecs.open(os.path.join(data_dir, f'{filename}.jsonl'), 'w', 'utf-8') as f_out:
        for sentence, an, start, relation, label in s:
            a, n = an.split('_')
            paraphrase = f'{a} refers to the {relation} of {n}'
            # 'start' / 'end' are the token indices of the adjective and the
            # noun. 'relation' is stored explicitly because the cell that
            # reloads these files reads ex['relation'].
            example_dict = {'sentence' : sentence, 'start': start, 'end': start + 1,
                            'label': label, 'paraphrase': paraphrase,
                            'relation': relation}
            f_out.write(json.dumps(example_dict) + '\n')

Sanity check: majority baseline is not too strong.

In [None]:
def get_majority_label_per_word(train_set, word_index=0):
    """
    Compute, for every word found at word_index of the compositions in the
    train set, the label that occurs most often with that word.
    Ties go to the label that was seen first.

    :param train_set: list of (sentence, an, start, relation, label) tuples
    :param word_index: position of the word within the underscore-joined an
    :return: dictionary from word to its most frequent label
    """
    label_counts = defaultdict(Counter)

    for _sentence, an, _start, _relation, label in train_set:
        word = an.split('_')[word_index]
        label_counts[word][label] += 1

    majority = {}
    for word, counts in label_counts.items():
        (top_label, _count), = counts.most_common(1)
        majority[word] = top_label

    return majority


# Gold labels of the test set, shared by the three baselines below.
test_labels = [label for _, _, _, _, label in test]

# Baseline 1: always predict the most frequent label of the train set.
train_label_counts = Counter([label for _, _, _, _, label in train])
overall_majority_label = train_label_counts.most_common(1)[0][0]
test_predictions_all = [overall_majority_label] * len(test)
overall_accuracy = accuracy_score(test_labels, test_predictions_all)
print(f'Majority overall: {100.0 * overall_accuracy:.2f}%')

# Baseline 2: predict the most frequent train label of the adjective,
# backing off to the overall majority for adjectives unseen in training.
per_adj_majority_label = get_majority_label_per_word(train, 0)
test_adj = [an.split('_')[0] for _, an, _, _, _ in test]
test_predictions_adj = []
for a in test_adj:
    test_predictions_adj.append(per_adj_majority_label.get(a, overall_majority_label))
adj_accuracy = accuracy_score(test_labels, test_predictions_adj)
print(f'Majority by adjective: {100.0 * adj_accuracy:.2f}%')

# Baseline 3: the same, keyed by the noun.
per_noun_majority_label = get_majority_label_per_word(train, 1)
test_nouns = [an.split('_')[1] for _, an, _, _, _ in test]
test_predictions_nouns = []
for n in test_nouns:
    test_predictions_nouns.append(per_noun_majority_label.get(n, overall_majority_label))
noun_accuracy = accuracy_score(test_labels, test_predictions_nouns)
print(f'Majority by noun: {100.0 * noun_accuracy:.2f}%')

In [None]:
# Reload the three splits from disk as (sentence, "adjective noun", relation,
# label) tuples.
data_dir = '../diagnostic_classifiers/data/an_attribute_selection'

sets = []
for filename in ['train', 'test', 'val']:
    curr = []
    with codecs.open(os.path.join(data_dir, f'{filename}.jsonl'), 'r', 'utf-8') as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                continue

            ex = json.loads(line)
            tokens = ex['sentence'].split()
            an = ' '.join((tokens[ex['start']], tokens[ex['end']]))

            # The writer cell stores 'sentence', 'start', 'end', 'label' and
            # 'paraphrase' but no 'relation' field, so ex['relation'] raised
            # KeyError. Fall back to recovering the relation from the
            # paraphrase, which has the form
            # "<adjective> refers to the <relation> of <noun>".
            relation = ex.get('relation')
            if relation is None:
                relation = ex['paraphrase'].split(' refers to the ', 1)[1].rsplit(' of ', 1)[0]

            curr.append((ex['sentence'], an, relation, ex['label']))

    sets.append(curr)

train, test, val = sets

We re-annotated the dataset to compute human performance. 
We assume the annotation results are found under `preprocessing/annotation/an_classification/batch_results`.

In [None]:
def load_batch_results(result_file, remove_bad_workers=False):
    """
    Load the batch results from the CSV
    :param result_file: the batch results CSV file from MTurk
    :param remove_bad_workers: if True, drop all the answers of workers who
    disagreed with the original label in more than 33% of their answers
    :return: a tuple of
    workers - the IDs of all workers that gave a yes / no answer,
    answer_by_worker - worker ID -> {HIT ID -> 'true' / 'false'},
    answer_by_hit - HIT ID -> {worker ID -> 'true' / 'false'},
    incorrect - the IDs of HITs for which some worker chose neither yes nor no,
    hit_id_to_instance - HIT ID -> (sentence, adjective, noun, attribute)
    """
    answer_by_worker, answer_by_hit = defaultdict(dict), defaultdict(dict)
    workers = set()
    incorrect = set()
    workers_wrong_answers = defaultdict(int)
    hit_id_to_instance = {}

    with codecs.open(result_file, 'r', 'utf-8') as f_in:
        reader = csv.DictReader(f_in)
        for row in reader:
            hit_id = row['HITId']
            worker_id = row['WorkerId']

            # Input fields
            sent = row['Input.sent']
            orig_label = row['Input.orig_label']
            attribute = row['Input.attribute']

            tokens = sent.split()

            # The adjective-noun pair is wrapped as "<mark>adj noun</mark>":
            # the adjective is the token opening the mark and the noun is the
            # token closing it.
            opening = [t for t in tokens if t.startswith('<mark>')]
            closing = [t for t in tokens if t.endswith('</mark>')]

            if not opening or not closing:
                print(f'Warning: skipped "{sent}"')
                continue

            a = opening[0].replace('<mark>', '')
            n = closing[0].replace('</mark>', '')
            sent = sent.replace('<mark>', '').replace('</mark>', '').strip()

            hit_id_to_instance[hit_id] = (sent, a, n, attribute)

            # Answer fields
            if row['Answer.label.yes'].lower() == 'true':
                answer = 'true'
            elif row['Answer.label.no'].lower() == 'true':
                answer = 'false'
            # Incorrect
            else:
                incorrect.add(hit_id)
                continue

            if orig_label.lower() != answer:
                workers_wrong_answers[worker_id] += 1

            workers.add(worker_id)
            answer_by_worker[worker_id][hit_id] = answer
            answer_by_hit[hit_id][worker_id] = answer

    # Remove HITs that were annotated as incorrect by at least one worker
    answer_by_hit = {hit_id: answers_by_hit_id
                     for hit_id, answers_by_hit_id in answer_by_hit.items()
                     if hit_id not in incorrect}

    answer_by_worker = {
        worker_id: {hit_id: answer
                    for hit_id, answer in curr_answers.items()
                    if hit_id not in incorrect}
        for worker_id, curr_answers in answer_by_worker.items()}

    num_answers = sum([len(answers_by_worker_id)
                       for answers_by_worker_id in answer_by_worker.values()])

    if remove_bad_workers:
        # Percent of answers disagreeing with the original label, per worker.
        # Workers left without any answer after the filtering above are
        # skipped: they would otherwise cause a division by zero.
        # NOTE(review): the numerator also counts answers given for HITs that
        # were discarded as incorrect, the denominator does not - confirm
        # whether this is intended.
        workers_wrong_answers = {worker_id: n * 100.0 / len(answer_by_worker[worker_id])
                                 for worker_id, n in workers_wrong_answers.items()
                                 if answer_by_worker.get(worker_id)}

        # Remove bad workers: workers that disagreed with many of the previous annotation
        bad_workers = {worker_id
                       for worker_id, per in workers_wrong_answers.items() if per > 33}
        print(f'Removing {len(bad_workers)} bad workers:\n{bad_workers}')

        answer_by_worker = {worker_id: answers_by_worker_id
                            for worker_id, answers_by_worker_id in answer_by_worker.items()
                            if worker_id not in bad_workers}

        answer_by_hit = {hit_id: {worker_id: answer
                                  for worker_id, answer in answers_by_hit_id.items()
                                  if worker_id not in bad_workers}
                         for hit_id, answers_by_hit_id in answer_by_hit.items()}

        num_answers_after_filtering = sum([len(answers_by_worker_id)
                                           for answers_by_worker_id in answer_by_worker.values()])
        print('Final: {} answers, removed {}.'.format(
            num_answers_after_filtering,
            num_answers - num_answers_after_filtering))

    return workers, answer_by_worker, answer_by_hit, incorrect, hit_id_to_instance


# Load the MTurk batch, dropping the answers of unreliable workers.
results_file = 'an_classification/batch_results.csv'
batch = load_batch_results(results_file, remove_bad_workers=True)
workers, answer_by_worker, answer_by_hit, incorrect, hit_id_to_instance = batch
print(f'Loaded results from {results_file}, loaded {len(answer_by_hit)} answers')

Compute Fleiss' kappa and the percentage of agreement between the workers.

In [None]:
def compute_agreement(answer_by_hit, num_annotators=3):
    """
    Compute workers' agreement (Fleiss Kappa and percent).
    Only HITs that have exactly num_annotators answers are taken into account.

    :param answer_by_hit: HIT ID to {worker ID: 'true' / 'false'} dictionary
    :param num_annotators: the number of answers a HIT must have to be counted
    :return: the Fleiss Kappa and the percent of agreement
    :raises ValueError: if num_annotators is smaller than 2 or if no HIT has
    exactly num_annotators answers
    """
    if num_annotators < 2:
        raise ValueError('Agreement requires at least 2 annotators per HIT')

    data = []
    agreeing = 0

    for hit_id, worker_answers in answer_by_hit.items():
        # Number of 'false' (index 0) and 'true' (index 1) answers for the HIT
        curr = [0, 0]

        for answer in worker_answers.values():
            label = 1 if answer == 'true' else 0
            curr[label] += 1

        if sum(curr) == num_annotators:
            data.append(curr)
            # Each label contributes its count minus one, so a unanimous HIT
            # scores num_annotators - 1 (which is the normalizer used below).
            agreeing += sum([max(0, a - 1) for a in curr])

    # Fail with a clear message instead of an opaque error from fleiss_kappa
    # or a division by zero when nothing is left to compare.
    if not data:
        raise ValueError(
            'No HIT has exactly {} answers, cannot compute agreement'.format(num_annotators))

    kappa = fleiss_kappa(data)
    percent = agreeing * 100.0 / (len(data) * (num_annotators - 1))
    return kappa, percent


# Report the inter-annotator agreement over the loaded batch.
agreement = compute_agreement(answer_by_hit)
kappa, percent = agreement
print(f'Fleiss Kappa={kappa:.3f}, Percent={percent:.2f}%')

Compute the workers' majority vote, which we will use to estimate human performance.

In [None]:
def compute_majority(results, instances=None):
    """
    Compute the majority label from the worker answers.
    HITs whose most frequent answer was given by fewer than 2 workers are left out.

    :param results: HIT ID to worker answers dictionary
    :param instances: HIT ID to (sentence, adjective, noun, relation)
    dictionary. Defaults to the module-level hit_id_to_instance.
    :return: dictionary from (sentence, "adjective noun", relation) to the
    majority answer, where the sentence is lowercased and stripped of spaces
    """
    if instances is None:
        instances = hit_id_to_instance

    dataset = {}
    for hit_id, sent_results in results.items():
        dist = Counter(sent_results.values())
        if len(dist) == 0:
            continue

        label, votes = dist.most_common(1)[0]
        if votes < 2:
            continue

        sentence, a, n, relation = instances[hit_id]
        # Normalize the sentence so that lookups do not depend on casing
        # or spacing.
        sentence = sentence.lower().replace(' ', '')
        dataset[(sentence, ' '.join((a, n)), relation)] = label

    return dataset

# Majority answer per annotated instance, keyed by
# (sentence without spaces and lowercased, "adjective noun", relation).
human_annotations = compute_majority(answer_by_hit)

Compute the human performance on the test set.

In [None]:
# Compare the workers' majority answer with the test label for every test
# example that was re-annotated (1 = the two agree, 0 = they disagree).
items_compared = []
for sentence, an, relation, label in test:
    # Same normalization as the keys of human_annotations.
    key = (sentence.lower().replace(' ', ''), an, relation)
    if key in human_annotations:
        items_compared.append(1 if human_annotations[key] == label.lower() else 0)

if items_compared:
    human_accuracy = sum(items_compared) * 100.0 / len(items_compared)
    print('Number of examples: {}, accuracy: {:.3f}'.format(len(items_compared), human_accuracy))
else:
    # Nothing to average: report it instead of failing with ZeroDivisionError.
    human_accuracy = float('nan')
    print('Number of examples: 0, none of the test examples was found in the human annotations')