## Noun Compounds 
### Paraphrasing

We use the dataset from [SemEval 2013 Task 4: Free Paraphrases of Noun Compounds](https://www.cs.york.ac.uk/semeval-2013/task4/index.php). In the original task, given a two-word noun compound, the participating system was asked to produce an explicitly ranked list of its free-form paraphrases. The list was automatically compared and evaluated against a similarly ranked list of paraphrases proposed by human annotators. We cast this as a classification problem whose input is a noun compound and a paraphrase and whose output is a binary value indicating whether the paraphrase is a correct explication of the compound.

In [None]:
import random
random.seed(133)

import os
import re
import csv
import json
import spacy
import shutil
import codecs
import random
import fileinput

from nltk import agreement
from itertools import count
from collections import Counter, defaultdict
from statsmodels.stats.inter_rater import fleiss_kappa

In [None]:
# Fetch the SemEval 2013 gold files once. Only the directory is checked, so an
# existing `nc_relations` directory (even an empty one) skips the download.
if not os.path.exists('nc_relations'):
    # Lines starting with `!` are IPython shell escapes: this cell only runs
    # inside a notebook / IPython session, not as plain Python.
    !mkdir -p nc_relations
    !wget https://raw.githubusercontent.com/vered1986/panic/master/paraphrasing/data/semeval_2013/test_gold.txt
    !wget https://raw.githubusercontent.com/vered1986/panic/master/paraphrasing/data/semeval_2013/train_gold.txt
    # wget saves into the current directory; move both files into place.
    !mv test_gold.txt nc_relations
    !mv train_gold.txt nc_relations

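Load the dataset, keeping only paraphrases that contain a specific (non-generic) verb. The cap on the number of paraphrases per noun compound is applied later, when the negative examples are added.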

In [None]:
def load_paraphrasing_dataset(file_name):
    """
    Read a SemEval 2013 Task 4 gold file and keep the verbal paraphrases.

    Each line is expected to hold four tab-separated fields (modifier,
    head, paraphrase, score); lines with a different number of fields are
    ignored. A paraphrase is kept when it has at least 4 tokens and contains
    at least one verb whose lemma is not in a fixed list of generic verbs.
    Parsing relies on the module-level spaCy pipeline `nlp`.

    :param file_name: path of the gold file to read
    :return: a pair (examples, verbs_by_nc). `examples` is a list of
        (noun compound, paraphrase) tuples with the compound written as
        "modifier_head"; `verbs_by_nc` maps a compound to the lemmas of the
        non-generic verbs found in its paraphrases
    """
    generic_verbs = {'involve', 'concern', 'regard', 'discuss',
                     'happen', 'deal', 'relate', 'refer'}
    examples = []
    verbs_by_nc = defaultdict(list)

    with codecs.open(file_name, 'r', 'utf-8') as f_in:
        for line in f_in:
            fields = line.strip().split('\t')
            if len(fields) != 4:
                continue

            modifier, head, paraphrase, _score = fields
            nc = modifier + '_' + head

            # Very short paraphrases are dropped before looking at the verbs
            doc = nlp(paraphrase)
            if len(doc) < 4:
                continue

            # Lemmas of the verbs that are specific enough to be informative
            specific_verbs = [token.lemma_ for token in doc
                              if token.pos_ == 'VERB'
                              and token.lemma_ not in generic_verbs]

            if specific_verbs:
                verbs_by_nc[nc].extend(specific_verbs)
                examples.append((nc, paraphrase))

    return examples, verbs_by_nc
            
    
curr_dir = 'nc_relations'
# NOTE(review): 'en' is a spaCy model shortcut name; confirm that the installed
# spaCy version still resolves it (otherwise load the full model name).
nlp = spacy.load('en')

# Load both gold files; each call returns the (compound, paraphrase) pairs
# and the verb lemmas found per compound.
train_paraphrases, verbs_by_nc = load_paraphrasing_dataset(
    os.path.join(curr_dir, 'train_gold.txt'))
test_paraphrases, verbs_by_nc2 = load_paraphrasing_dataset(
    os.path.join(curr_dir, 'test_gold.txt'))
dataset = train_paraphrases + test_paraphrases

# Merge the verb lemmas of both files. Iterate over the union of the keys:
# looping over the train keys only would silently drop the verbs of every
# compound that occurs only in the test file.
verbs_by_nc = {nc: verbs_by_nc.get(nc, []) + verbs_by_nc2.get(nc, [])
               for nc in sorted(set(verbs_by_nc) | set(verbs_by_nc2))}

Add sentences from Wikipedia - find sentences for each noun compound.

In [None]:
# Get the noun compounds in the dataset
all_ncs = {nc for nc, _ in dataset}
print('Number of noun compounds: {}'.format(len(all_ncs)))

corpus = os.path.expanduser('~/corpora/text/en_corpus_tokenized') # change to your corpus path
out_dir = 'nc_relations/sentences'
# Creates missing parent directories and tolerates an existing directory
os.makedirs(out_dir, exist_ok=True)

# Number of grep processes started before the script waits for them
NUM_PARALLEL = 60

# Write a bash script with one background `grep` per noun compound that has no
# sentence file yet. The grep output paths are relative (`sentences/<nc>`), so
# the script is meant to be run from inside the `nc_relations` directory.
with codecs.open(os.path.join('nc_relations', 'commands.sh'), 'w', 'utf-8') as f_out:
    f_out.write('#!/bin/bash\n')
    commands_written = 0
    for nc in all_ncs:
        # File names (and grep patterns) never contain apostrophes
        nc = nc.replace("'", '')

        # Already have sentences from the previous dataset
        if os.path.exists('nc_classification/sentences/{}'.format(nc)):
            shutil.copyfile('nc_classification/sentences/{}'.format(nc),
                            'nc_relations/sentences/{}'.format(nc))
        else:
            f_out.write('grep -i "{}" {} > sentences/{} &\n'.format(nc.replace('_', ' '), corpus, nc))
            commands_written += 1

            # Let each full batch of background jobs finish before starting more
            if commands_written % NUM_PARALLEL == 0:
                f_out.write('wait\n')

    # Wait for the last, partially filled batch as well; otherwise the script
    # returns while some greps are still writing their output files.
    if commands_written % NUM_PARALLEL != 0:
        f_out.write('wait\n')

Filter by length.

In [None]:
# Sentence length bounds, in whitespace-separated tokens (both inclusive)
MIN_SENT_LEN = 15
MAX_SENT_LEN = 25


# Maps each noun compound to its sentences whose length is within the bounds.
# Compounds whose sentence file cannot be read get no entry.
nc_sentences_filtered = {}
for nc in all_ncs:
    # The sentence files are named after the compound with apostrophes removed
    # (that is how they were created), so look them up under that name.
    sentences_path = os.path.join(out_dir, nc.replace("'", ''))

    try:
        with codecs.open(sentences_path, 'r', 'utf-8') as f_in:
            nc_sentences = [line.strip() for line in f_in]

        nc_sentences_filtered[nc] = [
            s for s in nc_sentences
            if MIN_SENT_LEN <= len(s.split()) <= MAX_SENT_LEN]

        # Overwrite the file in place with the filtered sentences
        with codecs.open(sentences_path, 'w', 'utf-8') as f_out:
            for s in nc_sentences_filtered[nc]:
                f_out.write(s + '\n')
    except (OSError, UnicodeError) as err:
        # Best effort: a missing or unreadable file only skips this compound,
        # but the reason is reported instead of being swallowed silently.
        print('Skipping {}: {}'.format(nc, err))

Add negative examples. For each noun compound we select as many negative examples as positive ones. To make the negative examples more difficult, we take them from paraphrases that appeared with noun compounds sharing either the same head or the same modifier, and we make sure that the negative paraphrases don't share any verb lemmas with the positive ones. We keep a maximum of 5 positive and 5 negative paraphrases for each noun compound.

In [None]:
# Group the gold (positive) paraphrases by noun compound
positive_paraphrases = defaultdict(list)
for nc, paraphrase in dataset:
    positive_paraphrases[nc].append(paraphrase)

# Maximum number of positive (and hence negative) paraphrases per compound
MAX_PARAPHRASES_PER_LABEL = 5

# Indefinite articles as whole words only: a plain substring replacement of
# 'a ' / 'an ' would also eat the end of words such as "pizza " or "human ".
_INDEFINITE_ARTICLE = re.compile(r'\b(?:a|an) ')

# Each paraphrase is a candidate negative for many compounds; parse it once.
_lemma_cache = {}


def _strip_indefinite_articles(text):
    """Remove the indefinite articles "a" / "an" (with the following space)."""
    return _INDEFINITE_ARTICLE.sub('', text)


def _paraphrase_lemmas(paraphrase):
    """Return the set of token lemmas of `paraphrase`, parsing it only once."""
    if paraphrase not in _lemma_cache:
        _lemma_cache[paraphrase] = {t.lemma_ for t in nlp(paraphrase)}
    return _lemma_cache[paraphrase]


dataset_with_neg = []

# Sets of strings have no stable iteration order across runs, so iterate and
# sample from sorted sequences to make the seeded sampling reproducible.
sorted_ncs = sorted(all_ncs)

for nc in sorted_ncs:
    w1, w2 = nc.split('_')
    curr_positive_paraphrases = sorted({_strip_indefinite_articles(par)
                                        for par in positive_paraphrases[nc]})

    # Other compounds with the same modifier or the same head
    ncs_with_shared_word = [other for other in sorted_ncs
                            if other != nc and (w1 == other.split('_')[0]
                                                or w2 == other.split('_')[1])]

    if not ncs_with_shared_word:
        continue

    # Candidate negatives: paraphrases of the related compounds that share no
    # lemma with the verbs of this compound, rewritten to mention w1 and w2.
    curr_verbs = set(verbs_by_nc.get(nc, []))
    negative_candidates = set()
    for nc2 in ncs_with_shared_word:
        other_w1, other_w2 = nc2.split('_')[0], nc2.split('_')[1]
        for par in positive_paraphrases[nc2]:
            if _paraphrase_lemmas(par) & curr_verbs:
                continue
            negative_candidates.add(_strip_indefinite_articles(
                par.replace(other_w1, w1).replace(other_w2, w2)))

    # Keep only the candidates that mention both nouns of the compound
    negative_paraphrases = sorted(paraphrase for paraphrase in negative_candidates
                                  if w1 in paraphrase and w2 in paraphrase)

    # Both labels are needed for this compound (check the current compound's
    # positives, not the dictionary of all compounds)
    if not negative_paraphrases or not curr_positive_paraphrases:
        continue

    if len(curr_positive_paraphrases) > MAX_PARAPHRASES_PER_LABEL:
        curr_positive_paraphrases = random.sample(
            curr_positive_paraphrases, MAX_PARAPHRASES_PER_LABEL)

    # Same number of positive and negative paraphrases
    if len(negative_paraphrases) > len(curr_positive_paraphrases):
        negative_paraphrases = random.sample(
            negative_paraphrases, len(curr_positive_paraphrases))

    elif len(curr_positive_paraphrases) > len(negative_paraphrases):
        curr_positive_paraphrases = random.sample(
            curr_positive_paraphrases, len(negative_paraphrases))

    print(nc)
    print('Positive:\n=========')
    print('\n'.join(curr_positive_paraphrases) + '\n')
    print('Negative:\n=========')
    print('\n'.join(negative_paraphrases) + '\n')

    dataset_with_neg += [(nc, paraphrase, 'True') for paraphrase in curr_positive_paraphrases]
    dataset_with_neg += [(nc, paraphrase, 'False') for paraphrase in negative_paraphrases]


print(f'Dataset size: {len(dataset_with_neg)}')

Split the examples into train, test, and validation sets lexically by the head noun, so that no head noun appears in more than one set.

In [None]:
def split_lexically(dataset, word_index=0):
    """
    Partition the examples into train, test and validation sets so that the
    noun at `word_index` of the compound (0 = modifier, 1 = head) never
    occurs in more than one set.

    Examples are grouped by that noun. Whole groups, smallest first, go to
    the test set until it holds a tenth of the data, then to the validation
    set until it holds a tenth; the remaining groups form the train set.
    The sizes and the outcome of the sanity checks are printed / asserted.

    :param dataset: list of (noun compound, paraphrase, label) tuples, the
        compound written as "modifier_head", the label 'True' or 'False'
    :param word_index: which noun of the compound to split on
    :return: the train, test and validation lists, in that order
    """
    # Group the examples by the noun that must not leak across the sets
    grouped = defaultdict(list)
    for nc, paraphrase, label in dataset:
        grouped[nc.split('_')[word_index]].append((nc, paraphrase, label))

    total = len(dataset)
    train_size = 8 * total // 10
    val_size = test_size = total // 10

    # Nouns with the fewest examples first (stable for equally sized groups)
    words = sorted(grouped, key=lambda w: len(grouped[w]))
    cursor = 0

    test = []
    while len(test) < test_size:
        test.extend(grouped[words[cursor]])
        cursor += 1

    print('Test set size: {} (needed: {})'.format(len(test), test_size))

    val = []
    while len(val) < val_size:
        val.extend(grouped[words[cursor]])
        cursor += 1

    print('Validation set size: {} (needed: {})'.format(len(val), val_size))

    # Everything that was not consumed above is training data
    train = []
    for word in words[cursor:]:
        train.extend(grouped[word])
    print('Train set size: {} (needed: {})'.format(len(train), train_size))

    # The test labels must not be skewed by more than 4:1 in either direction
    label_counts = Counter(label for (_, _, label) in test)
    n_true, n_false = label_counts['True'], label_counts['False']
    assert(n_false / n_true <= 4 and n_true / n_false <= 4)

    # The split noun must be disjoint between every pair of sets
    train_words = {nc.split('_')[word_index] for nc, _, _ in train}
    test_words = {nc.split('_')[word_index] for nc, _, _ in test}
    val_words = {nc.split('_')[word_index] for nc, _, _ in val}
    assert(len(train_words & val_words) == 0)
    assert(len(train_words & test_words) == 0)
    assert(len(test_words & val_words) == 0)

    print(f'Sizes: train = {len(train)}, test = {len(test)}, validation = {len(val)}')
    return train, test, val
    

data_dir = '../diagnostic_classifiers/data/nc_relations'
# makedirs also creates missing parent directories (os.mkdir would fail on
# them) and tolerates a directory that already exists.
os.makedirs(data_dir, exist_ok=True)

# Split on the second noun of each compound (word_index=1, i.e. the head)
train, test, val = split_lexically(dataset_with_neg, word_index=1)

Randomly select a sentence for each example and write it to file.

In [None]:
# The three sets must not share any example
assert(len(set(train).intersection(set(val))) == 0)
assert(len(set(train).intersection(set(test))) == 0)
assert(len(set(test).intersection(set(val))) == 0)

data_dir = '../diagnostic_classifiers/data/nc_relations'
# Creates missing parent directories and tolerates an existing directory
os.makedirs(data_dir, exist_ok=True)

# Compounds for which no usable sentence exists; their examples are skipped
no_sentences = set()
# Number of examples actually written per set
instances = {'train': 0, 'test': 0, 'val': 0}


def _find_nc_start(sentence, nc):
    """
    Return the index of the first token of `sentence` (lower-cased,
    whitespace-tokenized) that starts an adjacent token pair containing
    `nc` ("modifier_head"), or None when there is no such pair.
    """
    tokens = sentence.lower().split()
    for i, pair in enumerate(zip(tokens, tokens[1:])):
        if nc in '_'.join(pair):
            return i
    return None


for s, filename in zip([train, test, val], ['train', 'test', 'val']):

    # Assert label distribution is relatively balanced
    ctr = Counter([item[-1] for item in s])
    sorted_ctr = ctr.most_common()
    most_common, least_common = sorted_ctr[0], sorted_ctr[-1]
    assert(most_common[1] // least_common[1] < 2)

    with codecs.open(os.path.join(data_dir, '{}.jsonl'.format(filename)), 'w', 'utf-8') as f_out:
        for nc, paraphrase, label in s:
            if nc in no_sentences:
                continue

            curr_sentences = nc_sentences_filtered.get(nc, [])

            # Try the sentences in random order and take the first one in
            # which the compound is found as two adjacent tokens. Unlike
            # re-drawing until a match turns up, this always terminates,
            # even when none of the sentences contains the compound.
            sentence, start = None, None
            for candidate in random.sample(curr_sentences, len(curr_sentences)):
                # NOTE(review): this replaces the literal text " +", it is not
                # a regex that collapses repeated spaces - confirm the intent.
                candidate = candidate.replace(' +', ' ')
                start = _find_nc_start(candidate, nc)
                if start is not None:
                    sentence = candidate
                    break

            if sentence is None:
                print('No sentences found for {}'.format(nc))
                no_sentences.add(nc)
                continue

            # `start` / `end` are the token indices of the two nouns
            example_dict = {'sentence' : sentence, 'start': start, 'end': start + 1,
                            'span': nc.replace('_', ' '), 'paraphrase': paraphrase, 'label': label}
            f_out.write(json.dumps(example_dict) + '\n')
            instances[filename] += 1

print(instances)

We re-annotated the dataset to compute human performance. 
We assume the annotation results are found under `preprocessing/annotation/nc_relations/batch_results`.

In [None]:
def load_batch_results(result_file, remove_bad_workers=False):
    """
    Load the batch results from the CSV

    Rows whose sentence has no `<mark>...</mark>` span are skipped with a
    warning. A row in which neither the "yes" nor the "no" answer is checked
    marks its HIT as incorrect; such HITs are removed from the answers.

    :param result_file: the batch results CSV file from MTurk
    :param remove_bad_workers: when True, drop the answers of workers who
        disagreed with the original label in more than 33% of their
        (remaining) answers
    :return: a tuple (workers, answer_by_worker, answer_by_hit, incorrect,
        hit_id_to_orig_label): the set of all workers that gave an answer
        (bad workers included), the answers ('true' / 'false') indexed by
        worker then HIT and by HIT then worker, the set of HITs marked as
        incorrect, and the original label of every HIT
    """
    answer_by_worker, answer_by_hit = defaultdict(dict), defaultdict(dict)
    workers = set()
    incorrect = set()
    # HITs on which each worker disagreed with the original label
    wrong_hits_by_worker = defaultdict(set)
    hit_id_to_orig_label = {}

    # newline='' lets the csv module handle line breaks inside quoted fields
    with open(result_file, 'r', encoding='utf-8', newline='') as f_in:
        for row in csv.DictReader(f_in):
            hit_id = row['HITId']
            worker_id = row['WorkerId']

            # Input fields
            sent = row['Input.sent']
            orig_label = row['Input.orig_label']

            # The noun compound must be marked in the sentence
            tokens = sent.split()
            has_span_start = any(t.startswith('<mark>') for t in tokens)
            has_span_end = any(t.endswith('</mark>') for t in tokens)
            if not (has_span_start and has_span_end):
                print(f'Warning: skipped "{sent}"')
                continue

            hit_id_to_orig_label[hit_id] = orig_label

            # Answer fields
            if row['Answer.label.yes'].lower() == 'true':
                answer = 'true'
            elif row['Answer.label.no'].lower() == 'true':
                answer = 'false'
            # Incorrect
            else:
                incorrect.add(hit_id)
                continue

            if orig_label.lower() != answer:
                wrong_hits_by_worker[worker_id].add(hit_id)

            workers.add(worker_id)
            answer_by_worker[worker_id][hit_id] = answer
            answer_by_hit[hit_id][worker_id] = answer

    # Remove HITs that were annotated as incorrect by at least one worker
    answer_by_hit = {hit_id: answers_by_hit_id
                     for hit_id, answers_by_hit_id in answer_by_hit.items()
                     if hit_id not in incorrect}
    answer_by_worker = {worker_id: {hit_id: answer
                                    for hit_id, answer in curr_answers.items()
                                    if hit_id not in incorrect}
                        for worker_id, curr_answers in answer_by_worker.items()}

    num_answers = sum(len(answers_by_worker_id)
                      for answers_by_worker_id in answer_by_worker.values())

    if remove_bad_workers:
        # Percentage of wrong answers per worker, counted over the HITs that
        # were kept so that numerator and denominator refer to the same
        # answers. Workers left without any answer are not rated (this also
        # avoids a division by zero).
        wrong_percent = {}
        for worker_id, curr_answers in answer_by_worker.items():
            if not curr_answers:
                continue
            num_wrong = len(wrong_hits_by_worker.get(worker_id, set()) - incorrect)
            wrong_percent[worker_id] = num_wrong * 100.0 / len(curr_answers)

        # Remove bad workers: workers that disagreed with many of the previous annotation
        bad_workers = {worker_id
                       for worker_id, per in wrong_percent.items() if per > 33}
        print(f'Removing {len(bad_workers)} bad workers:\n{bad_workers}')

        answer_by_worker = {worker_id: answers_by_worker_id
                            for worker_id, answers_by_worker_id in answer_by_worker.items()
                            if worker_id not in bad_workers}

        answer_by_hit = {hit_id: {worker_id: answer
                                  for worker_id, answer in answers_by_hit_id.items()
                                  if worker_id not in bad_workers}
                         for hit_id, answers_by_hit_id in answer_by_hit.items()}

        num_answers_after_filtering = sum(len(answers_by_worker_id)
                                          for answers_by_worker_id in answer_by_worker.values())
        print('Final: {} answers, removed {}.'.format(
            num_answers_after_filtering,
            num_answers - num_answers_after_filtering))

    return workers, answer_by_worker, answer_by_hit, incorrect, hit_id_to_orig_label


# Load the MTurk answers from a path relative to the working directory and
# drop the workers who disagreed too often with the original labels.
results_file = 'nc_relations/batch_results.csv'
workers, answer_by_worker, answer_by_hit, incorrect, hit_id_to_orig_label = load_batch_results(
    results_file, remove_bad_workers=True)
# answer_by_hit no longer contains the HITs that were marked as incorrect
print(f'Loaded results from {results_file}, loaded {len(answer_by_hit)} HITs')

Compute Fleiss' kappa and the percentage of agreement between the workers.

In [None]:
def compute_agreement(answer_by_hit):
    """
    Measure how much the workers agree with each other.

    Only HITs with exactly 3 answers are taken into account. Each of them
    contributes a [number of non-'true' answers, number of 'true' answers]
    row to the table passed to `fleiss_kappa`, and max(0, count - 1) points
    per label to the agreement score: 2 points when unanimous, 1 point for
    a 2-1 split. The score is normalised by 2 points per HIT.

    :param answer_by_hit: HIT ID to {worker ID: answer} dictionary
    :return: the Fleiss Kappa and the agreement percentage
    """
    vote_table = []
    agreement_points = 0

    for worker_answers in answer_by_hit.values():
        answers = list(worker_answers.values())
        num_true = sum(1 for answer in answers if answer == 'true')
        counts = [len(answers) - num_true, num_true]

        # HITs that do not have exactly 3 answers are ignored
        if sum(counts) != 3:
            continue

        vote_table.append(counts)
        agreement_points += sum(max(0, count - 1) for count in counts)

    kappa = fleiss_kappa(vote_table)
    percent = agreement_points * 100.0 / (len(vote_table) * 2)
    return kappa, percent


# Agreement over the HITs that kept exactly 3 answers after filtering
kappa, percent = compute_agreement(answer_by_hit)
print('Fleiss Kappa={:.3f}, Percent={:.2f}%'.format(kappa, percent))

Compute the workers' majority label, which we will use to estimate human performance.

In [None]:
def compute_majority(results):
    """
    Pick the most frequent answer of every HIT.

    A HIT is only included when its most frequent answer was given by at
    least 2 workers; HITs without answers are left out.

    :param results: HIT ID to worker answers dictionary
    :return: HIT ID to majority label dictionary
    """
    majority = {}

    for hit_id, sent_results in results.items():
        votes = Counter(sent_results.values())
        if not votes:
            continue

        top_label, top_count = votes.most_common(1)[0]
        if top_count >= 2:
            majority[hit_id] = top_label

    return majority

# HIT ID -> majority answer; HITs without a label given by 2+ workers are absent
human_annotations = compute_majority(answer_by_hit)

Compute the human performance on the test set.

In [None]:
# One entry per HIT with a majority label: 1 when that label equals the
# original label (ignoring case), 0 otherwise.
items_compared = [int(majority_label.lower() == hit_id_to_orig_label[hit_id].lower())
                  for hit_id, majority_label in human_annotations.items()]

# Share of HITs on which the workers' majority reproduces the original label
human_accuracy = 100.0 * sum(items_compared) / len(items_compared)

print('Number of examples: {}, accuracy: {:.3f}'.format(len(items_compared), human_accuracy))