## Light Verb Constructions
### Distinguishing LVCs from verb + argument

We use the dataset in [Tu and Roth (2011)](https://dl.acm.org/citation.cfm?id=2021131) which contains 2,162 sentences from the BNC in which a potential light verb construction was found (with verbs from among the 6 most frequently used English light verbs: _do, get, give, have, make_ and _take_), each annotated as to whether it is an LVC in this context or not.

In [None]:
import random
random.seed(133)

import os
import re
import csv
import json
import codecs
import random

import numpy as np

from fuzzywuzzy import fuzz
from nltk.corpus import wordnet as wn
from sklearn.metrics import accuracy_score
from collections import Counter, defaultdict
from statsmodels.stats.inter_rater import fleiss_kappa

First, download the dataset:

In [None]:
# Download the dataset archive only when the extracted file is not already present.
if not os.path.exists('lvc/lvc_BNC.txt'):
    # Create the target directory (-p: no error if it already exists).
    !mkdir -p lvc
    # Fetch and unpack the archive.
    !wget http://cogcomp.org/software/tools/MWE_LVC_token.tar.gz
    !tar -zxvf MWE_LVC_token.tar.gz
    # Keep only lvc_BNC.txt, then delete the extracted directory and the archive.
    !mv tokenLVC/lvc_BNC.txt lvc
    !rm -r tokenLVC
    !rm -r MWE_LVC_token.tar.gz

In [None]:
# Read the raw rows. Each line holds three tab-separated fields:
# bnc_id, span text, and label (+/-).
with codecs.open('lvc/lvc_BNC.txt', 'r', 'utf-8') as f_in:
    dataset = [raw_line.strip().split('\t') for raw_line in f_in]

# Re-encode the +/- label as the strings 'true' / 'false'.
dataset = [(row_id, row_span, 'true' if row_label == '+' else 'false')
           for row_id, row_span, row_label in dataset]

# Report the size and show the first ten examples, one per line.
print('Dataset size: {}'.format(len(dataset)))
print(*('\t'.join(item) for item in dataset[:10]), sep='\n')

Split the dataset into train/validation/test sets. The split is lexical by auxiliary verb (no verb appears in more than one set), to make it more difficult.

In [None]:
def split_lexically(dataset, word_index=0):
    """
    Split the dataset to train, test, and validation, such that
    the word in index 0 (auxiliary verb) or -1 (noun) doesn't repeat across sets.

    Words are assigned whole, from the least to the most frequent: first to the
    test set until it holds at least 10% of the examples, then to the validation
    set until it holds at least 10%; all remaining words go to the train set.
    If there are too few distinct words, the later sets are simply smaller
    (possibly empty); the printed sizes show the shortfall.

    :param dataset: list of (bnc_id, span_text, label) triples, where span_text
        is '_'-separated and label is 'true' or 'false'
    :param word_index: index of the word to split by within the span
    :return: the tuple (train, test, val), each a list of example tuples
    :raises AssertionError: if one test label is more than 4 times as frequent
        as the other, or if a word appears in more than one set
    """
    instances_per_w = defaultdict(list)
    for bnc_id, span_text, label in dataset:
        instances_per_w[span_text.split('_')[word_index]].append(
            (bnc_id, span_text, label))

    train_size = 8 * len(dataset) // 10
    val_size = test_size = len(dataset) // 10

    # Least frequent words first. The sort is stable, so words with the same
    # number of examples keep their first-seen order.
    words = sorted(instances_per_w, key=lambda w: len(instances_per_w[w]))
    w_index = 0
    test, val = [], []

    # Both loops are bounded by the number of words, so running out of words
    # yields a smaller set instead of an IndexError.
    while len(test) < test_size and w_index < len(words):
        test += instances_per_w[words[w_index]]
        w_index += 1

    print('Test set size: {} (needed: {})'.format(len(test), test_size))

    while len(val) < val_size and w_index < len(words):
        val += instances_per_w[words[w_index]]
        w_index += 1

    print('Validation set size: {} (needed: {})'.format(len(val), val_size))

    train = [example for w in words[w_index:]
             for example in instances_per_w[w]]
    print('Train set size: {} (needed: {})'.format(len(train), train_size))

    # Check the label distribution in the test set. Written with multiplication
    # so that a missing label fails the check instead of dividing by zero.
    ctr = Counter(label for _, _, label in test)
    assert ctr['false'] <= 4 * ctr['true'] and ctr['true'] <= 4 * ctr['false']

    # Make sure the split is lexical among the words at word_index
    test_words = {span_text.split('_')[word_index] for _, span_text, _ in test}
    train_words = {span_text.split('_')[word_index] for _, span_text, _ in train}
    val_words = {span_text.split('_')[word_index] for _, span_text, _ in val}
    assert not train_words & val_words
    assert not train_words & test_words
    assert not test_words & val_words

    print(f'Sizes: train = {len(train)}, test = {len(test)}, validation = {len(val)}')
    return train, test, val
    

data_dir = '../diagnostic_classifiers/data/lvc_classification'
# makedirs also creates missing parent directories of this nested path,
# and exist_ok makes re-running the cell harmless.
os.makedirs(data_dir, exist_ok=True)

train, test, val = split_lexically(dataset)

# Write each split as JSON lines (ids_train.jsonl, ids_test.jsonl, ids_val.jsonl),
# one example per line, with the span's '_' separators turned into spaces.
for s, filename in zip([train, test, val], ['train', 'test', 'val']):
    with codecs.open(os.path.join(data_dir, 'ids_{}.jsonl'.format(filename)), 'w', 'utf-8') as f_out:
        for bnc_id, span_text, label in s:
            example = {'bnc_id': bnc_id, 'span_text': span_text.replace('_', ' '), 'label': label}
            f_out.write(json.dumps(example) + '\n')

Sanity check: majority baseline is not too strong.

In [None]:
def get_majority_label_per_word(train_set, word_index=0):
    """
    Map every word to the label it carries most often in the given set
    :train_set: iterable of (bnc_id, span_text, label) examples
    :word_index: 0 for verb, -1 for noun
    :return: dictionary from word to its most frequent label
    """
    # Tally the labels of each word as the examples stream by.
    label_counts = defaultdict(Counter)
    for _, span_text, label in train_set:
        label_counts[span_text.split('_')[word_index]][label] += 1

    majority = {}
    for word, counts in label_counts.items():
        (top_label, _), = counts.most_common(1)
        majority[word] = top_label
    return majority


# Gold test labels, shared by all three baselines below.
test_labels = [gold for _, _, gold in test]

# Baseline 1: always predict the most frequent label of the train set.
overall_majority_label = Counter(gold for _, _, gold in train).most_common(1)[0][0]
test_predictions_all = len(test) * [overall_majority_label]
print('Majority overall: {:.2f}%'.format(
    100.0 * accuracy_score(test_labels, test_predictions_all)))

# Baseline 2: predict the most frequent train label of the example's verb,
# falling back to the overall majority for verbs unseen in train.
per_verb_majority_label = get_majority_label_per_word(train, word_index=0)
test_verbs = [span.split('_')[0] for _, span, _ in test]
test_predictions_verb = [
    per_verb_majority_label[verb] if verb in per_verb_majority_label else overall_majority_label
    for verb in test_verbs]
print('Majority by verb: {:.2f}%'.format(
    100.0 * accuracy_score(test_labels, test_predictions_verb)))

# Baseline 3: the same, keyed by the example's noun.
per_noun_majority_label = get_majority_label_per_word(train, word_index=-1)
test_nouns = [span.split('_')[-1] for _, span, _ in test]
test_predictions_noun = [
    per_noun_majority_label[noun] if noun in per_noun_majority_label else overall_majority_label
    for noun in test_nouns]
print('Majority by noun: {:.2f}%'.format(
    100.0 * accuracy_score(test_labels, test_predictions_noun)))

Notice that the dataset is given with the sentence IDs and without the sentences themselves, to comply with the BNC corpus license. To get the sentences, follow the instructions in the repository README file.

We re-annotated a sample from the test set to compute human performance. 

First, let's create the batch instances. We show the annotators the candidate LVC (e.g. *make a difficult decision*) in a sentence, and ask them to mark all that apply:

1. It describes an action of *make decision*.
2. It describes an action of *making something*, in the common meaning of *make*.
3. The essence of the action is described by *decision*.
4. The span could be rephrased without *make* but with a verb like *decide* without changing the meaning of the sentence.
5. The sentence does not make sense or is ungrammatical.

We consider something an LVC if: 1) the answer to 2 is no and the answer to 3 is yes; or 2) the answer to 4 is yes. 

In [None]:
# Build the CSV of instances shown to the annotators: one row per test example
# for which WordNet yields a verb derivationally related to the span's noun.
field_names = ['sent_id', 'sent', 'aux', 'noun', 'span_text',
               'simple_lvc', 'substitute_verb', 'original_label']

with codecs.open('../diagnostic_classifiers/data/lvc_classification/test.jsonl', 'r', 'utf-8') as f_in, \
        codecs.open('lvc/batch_instances.csv', 'w', 'utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=field_names)
    writer.writeheader()

    for line in f_in:
        example = json.loads(line.strip())

        # Create the highlighted sentence: capitalize it and wrap the tokens
        # from index `start` to index `end` (whitespace tokens) in <mark> tags.
        start, end = example['start'], example['end']
        sent = example['sentence']
        sent = sent[0].upper() + sent[1:]
        tokens = sent.split()
        tokens[start] = '<mark>' + tokens[start]
        tokens[end] += '</mark>'
        new_sent = ' '.join(tokens)

        # The span's first word is the verb, its last word is the noun.
        span_tokens = example['span_text'].split()
        aux = span_tokens[0]
        noun = span_tokens[-1]
        simple_lvc = ' '.join((aux, noun))

        syns = wn.synsets(noun, 'n')

        # Candidate substitute verbs: verbs derivationally related to a lemma
        # of one of the noun's synsets, where that lemma resembles the noun.
        potential_lemmas = {verb.name()
                            for syn in syns
                            for lemma in syn.lemmas()
                            for verb in lemma.derivationally_related_forms()
                            if (fuzz.ratio(lemma.name(), noun) >= 50 or
                                fuzz.partial_ratio(lemma.name(), noun) >= 50)
                            and verb.synset().pos() == 'v'}

        if not potential_lemmas:
            continue

        # Rank the candidates by string similarity to the noun. The verb itself
        # is the secondary sort key: potential_lemmas is a set of strings, so
        # without it the winner among equally-scored verbs would depend on the
        # set's iteration order, which can differ from run to run.
        sorted_verbs = sorted(
            [(verb, fuzz.ratio(verb, noun) * fuzz.partial_ratio(verb, noun))
             for verb in potential_lemmas],
            key=lambda x: (x[1], x[0]))

        best_verb = sorted_verbs[-1]

        # NOTE(review): best_verb[1] is a product of two similarity scores, yet
        # it is compared with the same threshold (50) as a single score above;
        # confirm this is the intended scale.
        if best_verb[1] >= 50 or fuzz.partial_ratio(best_verb[0], noun) >= 50:
            new_instance = {'sent_id': example['bnc_id'],
                            'sent': new_sent,
                            'aux': aux,
                            'noun': noun,
                            'span_text': example['span_text'],
                            'original_label': example['label'],
                            'simple_lvc': simple_lvc,
                            'substitute_verb': best_verb[0]}

            writer.writerow(new_instance)

We assume the annotation results are found under `preprocessing/annotation/lvc/batch_results`.

In [None]:
def load_batch_results(result_file, remove_bad_workers=False, max_disagreement_percent=32):
    """
    Load the batch results from the CSV
    :param result_file: the batch results CSV file from MTurk
    :param remove_bad_workers: whether to discard all the answers of workers
        whose aggregated label disagrees with the original label too often
    :param max_disagreement_percent: a worker is "bad" when more than this
        percent of their remaining answers disagree with the original label
    :return: the tuple (workers, answer_by_worker, answer_by_hit, incorrect,
        inputs_by_sent_id):
        workers - IDs of all the workers that gave at least one valid answer
            (bad workers included);
        answer_by_worker - worker ID to {sentence ID: answer};
        answer_by_hit - sentence ID to {worker ID: answer};
        incorrect - IDs of the sentences marked as incorrect by some worker;
        inputs_by_sent_id - sentence ID to the input fields (sent, aux, noun,
            span_text, simple_lvc, substitute_verb, original_label).
        An answer is the tuple (is_simple_lvc, meaning_noun, meaning_verb,
        is_substitutable, lvc) of 'true' / 'false' strings.
    """
    answer_by_worker, answer_by_hit = defaultdict(dict), defaultdict(dict)
    workers = set()
    incorrect = set()
    inputs_by_sent_id = {}

    with codecs.open(result_file, 'r', 'utf-8') as f_in:
        for row in csv.DictReader(f_in):
            worker_id = row['WorkerId']

            # Input fields
            sent_id = row['Input.sent_id']
            sent = row['Input.sent'].replace('<mark>', '').replace('</mark>', '')
            inputs_by_sent_id[sent_id] = (sent,
                                          row['Input.aux'],
                                          row['Input.noun'],
                                          row['Input.span_text'],
                                          row['Input.simple_lvc'],
                                          row['Input.substitute_verb'],
                                          row['Input.original_label'])

            # Answer fields
            is_simple_lvc = row['Answer.answer.simple_lvc'].lower()
            meaning_noun = row['Answer.answer.meaning_noun'].lower()
            meaning_verb = row['Answer.answer.meaning_verb'].lower()
            is_substitutable = row['Answer.answer.substitute'].lower()
            is_incorrect = row['Answer.answer.incorrect'].lower()

            # Incorrect
            if is_incorrect == 'true':
                incorrect.add(sent_id)
                continue

            # Compute aggregated label. The answers are the strings 'true' /
            # 'false', so they are compared explicitly: any non-empty string is
            # truthy, and combining them directly with and/or/not would always
            # yield is_substitutable.
            is_lvc = ((meaning_noun == 'true' and meaning_verb != 'true')
                      or is_substitutable == 'true')
            lvc = 'true' if is_lvc else 'false'
            answer = (is_simple_lvc, meaning_noun, meaning_verb, is_substitutable, lvc)

            workers.add(worker_id)
            answer_by_worker[worker_id][sent_id] = answer
            answer_by_hit[sent_id][worker_id] = answer

    # Remove HITs that were annotated as incorrect by at least one worker
    answer_by_hit = {sent_id: answers_by_sent_id
                     for sent_id, answers_by_sent_id in answer_by_hit.items()
                     if sent_id not in incorrect}

    answer_by_worker = {worker_id: {sent_id: answer
                                    for sent_id, answer in curr_answers.items()
                                    if sent_id not in incorrect}
                        for worker_id, curr_answers in answer_by_worker.items()}

    num_answers = sum(len(curr_answers) for curr_answers in answer_by_worker.values())

    if remove_bad_workers:
        # Remove bad workers: workers that disagreed with many of the previous
        # annotation. Both the disagreement count and the total are taken over
        # the answers that survived the filtering above, so the percentage is
        # well defined (at most 100, and never divided by zero).
        bad_workers = set()
        for worker_id, curr_answers in answer_by_worker.items():
            if not curr_answers:
                continue

            num_wrong = sum(1 for sent_id, answer in curr_answers.items()
                            if answer[-1] != inputs_by_sent_id[sent_id][-1])

            if num_wrong * 100.0 / len(curr_answers) > max_disagreement_percent:
                bad_workers.add(worker_id)

        print(f'Removing {len(bad_workers)} bad workers:\n{bad_workers}')

        answer_by_worker = {worker_id: curr_answers
                            for worker_id, curr_answers in answer_by_worker.items()
                            if worker_id not in bad_workers}

        answer_by_hit = {sent_id: {worker_id: answer
                                   for worker_id, answer in answers_by_sent_id.items()
                                   if worker_id not in bad_workers}
                         for sent_id, answers_by_sent_id in answer_by_hit.items()}

        num_answers_after_filtering = sum(len(curr_answers)
                                          for curr_answers in answer_by_worker.values())
        print('Final: {} answers, removed {}.'.format(
            num_answers_after_filtering,
            num_answers - num_answers_after_filtering))

    return workers, answer_by_worker, answer_by_hit, incorrect, inputs_by_sent_id


# Load the MTurk answers, discarding workers who disagree too often with the
# original labels, and report how many instances were flagged as incorrect.
results_file = 'lvc/batch_results.csv'
(workers,
 answer_by_worker,
 answer_by_hit,
 incorrect,
 inputs_by_sent_id) = load_batch_results(results_file, remove_bad_workers=True)

print(f'Loaded results from {results_file}')
print(f'Removed {len(incorrect)} incorrect instances.')

Compute Fleiss' kappa and the percentage of agreement between the workers.

In [None]:
def compute_agreement(answer_by_hit, field_index):
    """
    Compute workers' agreement (Fleiss Kappa and percent)
    Only HITs with exactly 3 answers are taken into account.
    :param answer_by_hit: sentence ID to {worker ID: answer tuple} dictionary
    :param field_index: which answer field to compute agreement on
    :return: the tuple (kappa, percent)
    :raises ValueError: if no HIT has exactly 3 answers
    """
    data = []
    percent = 0

    for worker_answers in answer_by_hit.values():
        # curr[0] counts the 'false' (or any other) answers, curr[1] the 'true' ones
        curr = [0, 0]

        for answer in worker_answers.values():
            label = 1 if answer[field_index] == 'true' else 0
            curr[label] += 1

        if sum(curr) == 3:
            data.append(curr)
            # 2 when all three workers agree, 1 when they are split 2-1
            percent += sum(max(0, a - 1) for a in curr)

    # Fail with a clear message instead of dividing by zero or passing an
    # empty table to fleiss_kappa.
    if not data:
        raise ValueError('Cannot compute agreement: no HIT has exactly 3 answers.')

    kappa = fleiss_kappa(data)
    percent = percent * 100.0 / (len(data) * 2)
    return kappa, percent


# Report the agreement separately for every field of the answer tuple,
# in the order in which the fields are stored.
answer_fields = ['is_simple_lvc', 'meaning_noun', 'meaning_verb', 'is_substitutable', 'lvc']

for field_index, field_name in enumerate(answer_fields):
    kappa, percent = compute_agreement(answer_by_hit, field_index)
    print('Field: {}, Fleiss Kappa={:.3f}, Percent={:.2f}%'.format(field_name, kappa, percent))

Compute the workers' majority, which we will use to estimate human performance.

In [None]:
def compute_majority(results, inputs=None):
    """
    Compute the majority label from the worker answers
    A sentence is kept only if its most common label was given by at least
    2 workers.
    :param results: sentence ID to worker answers dictionary; the label is the
        last item of each answer
    :param inputs: sentence ID to input fields, where the item in index 3 is
        the span text. Defaults to the module-level `inputs_by_sent_id`.
    :return: a list of dictionaries with the keys 'sent_id', 'span_text' and 'label'
    """
    if inputs is None:
        # Backward-compatible default: the inputs loaded with the batch results
        inputs = inputs_by_sent_id

    majority = []
    for sent_id, sent_results in results.items():
        votes = Counter(answer[-1] for answer in sent_results.values())

        # Sentences left without any answer have nothing to vote on
        if not votes:
            continue

        label, count = votes.most_common(1)[0]
        if count >= 2:
            majority.append({'sent_id': sent_id,
                             'span_text': inputs[sent_id][3],
                             'label': label})

    return majority

# Majority label per sentence over the loaded worker answers; compute_majority
# drops the sentences whose most common label has fewer than two votes.
human_annotations = compute_majority(answer_by_hit)

Compute the human performance on the test set.

In [None]:
# The workers' majority label per sentence, and the dataset's own label for
# every test sentence.
gold_by_sent_id = {annotation['sent_id']: annotation['label']
                   for annotation in human_annotations}
test_annotations = {example_id: example_label for example_id, _, example_label in test}

# Accuracy: the percent of majority labels that match the dataset's label.
num_matches = sum(human_label == test_annotations[annotated_id]
                  for annotated_id, human_label in gold_by_sent_id.items())
human_accuracy = num_matches * 100.0 / len(gold_by_sent_id)

print('Number of examples: {}, accuracy: {:2.2f}'.format(
    len(gold_by_sent_id), human_accuracy))