In [12]:
# convert SemEval data to the standard format

from glob import glob 
import codecs
from xml.etree import ElementTree as et
from traceback import format_exc
from collections import defaultdict
from nltk.corpus import wordnet as wn
from traceback import format_exc


def get_related_by_sensekey(sense_key, verbose=False):
    """ Collect the words related to a WordNet sense.

    The related words are the lemma names of the sense's synset plus the lemma
    names of its direct hyponyms and hypernyms, lower-cased and with "_"
    replaced by " ".

    :param sense_key: sense key like 'window%1:06:00::'; everything from the
        first "/" on is ignored
    :param verbose: if True, print the synset, definition, examples, hypernyms,
        hyponyms and the resulting related words
    :return: set of related words; an empty set if the lookup fails (the key
        and the traceback are printed in that case)
    """

    related = []
    try:
        # keep only the part of the key in front of the first "/"
        sense_key = sense_key.split("/")[0]

        synset = wn.lemma_from_key(sense_key).synset()
        lemmas = synset.lemma_names()
        hypernyms = [lemma.name()
                     for hypernym in synset.hypernyms()
                     for lemma in hypernym.lemmas()]
        hyponyms = [lemma.name()
                    for hyponym in synset.hyponyms()
                    for lemma in hyponym.lemmas()]

        related = [word.lower().replace("_", " ")
                   for word in lemmas + hyponyms + hypernyms]

        if verbose:
            # definition and examples are only needed for the report
            print("synset: %s" % (lemmas,))
            print("definition: %s" % (synset.definition(),))
            print("examples: %s" % (synset.examples(),))
            print("hypernyms: %s" % (hypernyms,))
            print("hyponyms: %s" % (hyponyms,))
            print("related: %s" % (related,))

    except Exception:
        # Best effort: an unknown key must not abort the whole conversion.
        # `Exception` (not a bare except) so that Ctrl-C / SystemExit still
        # stop a long-running cell.
        print("Bad key: %s" % (sense_key,))
        print(format_exc())

    return set(related)


def semeval_xml2csv(contexts_fpaths, keys_fpath, output_fpath):
    """ Convert SemEval contexts (XML) and gold keys into one tab-separated file.

    Every <instance> element that is a direct child of an XML root becomes one
    output row: context id, target lemma, part of speech, "tokenStart,tokenEnd",
    gold sense ids, an empty predicted-senses column, the gold related words
    (see get_related_by_sensekey), an empty predicted-related column and the
    context text.

    :param contexts_fpaths: glob pattern of the XML files with the contexts
    :param keys_fpath: UTF-8 key file, one whitespace-separated line per context:
        target, context id, then any number of gold sense ids
    :param output_fpath: path of the UTF-8 tab-separated file to write
    """
    # get keys: context id -> list of gold sense ids
    context_id2sense_ids = {}
    with codecs.open(keys_fpath, "r", "utf-8") as keys:
        for line in keys:
            fields = line.split()
            if len(fields) < 2:
                # no context id on this line (e.g. a blank line): report and go on
                print("bad line: '%s'" % line.strip())
                continue
            # fields[0] is the target word; it is not needed for the mapping
            context_id2sense_ids[fields[1]] = fields[2:]

    header = ["context_id", "target", "target_pos", "target_position",
              "gold_sense_ids", "predict_sense_ids", "golden_related",
              "predict_related", "context"]

    # parse xml
    # "<instance id="appear.v.1" lemma="appear" partOfSpeech="v" token="appear" tokenEnd="65" tokenStart="59">Tone it down a tad, or at least bring a froth cup when you appear        before cameras.)</instance>"
    with codecs.open(output_fpath, "w", "utf-8") as out:
        out.write("\t".join(header) + "\n")

        # sorted, so that the row order does not depend on the file system
        for word_fpath in sorted(glob(contexts_fpaths)):
            root = et.parse(word_fpath).getroot()

            for child in root:
                if child.tag != "instance":
                    continue

                context_id = child.attrib["id"]
                # a context without an entry in the key file gets empty gold columns
                sense_ids = context_id2sense_ids.get(context_id, [])

                golden_related = set()
                for sense_key in sense_ids:
                    golden_related.update(get_related_by_sensekey(sense_key))

                # Tabs and line breaks inside the context would break the row.
                # Each is replaced by exactly one space, so the character
                # offsets in target_position stay valid.
                context = child.text or ""
                for whitespace in ("\t", "\r", "\n"):
                    context = context.replace(whitespace, " ")

                row = [context_id,
                       child.attrib["lemma"],
                       child.attrib["partOfSpeech"],
                       child.attrib["tokenStart"] + "," + child.attrib["tokenEnd"],
                       ",".join(sense_ids),
                       "",
                       ",".join(sorted(golden_related)),  # sorted: reproducible output
                       "",
                       context]
                out.write("\t".join(row) + "\n")
    print(output_fpath)
        
    
# Inputs: glob pattern of the context XML files and the gold key file.
contexts_fpaths = "/Users/alex/work/joint/eval/contextualization-eval/semeval_2013_13/contexts/xml-format/*.xml"
keys_fpath = "/Users/alex/work/joint/eval/contextualization-eval/semeval_2013_13/keys/gold/all.key"
# Output: the tab-separated dataset written by the conversion.
output_fpath = "/Users/alex/Desktop/output-semeval-2013.csv"
semeval_xml2csv(contexts_fpaths=contexts_fpaths,
                keys_fpath=keys_fpath,
                output_fpath=output_fpath)


Bad key: lose%2:30:05::
Traceback (most recent call last):
  File "<ipython-input-12-9102932fc733>", line 20, in get_related_by_sensekey
    synset = wn.lemma_from_key(sense_key).synset()
  File "/usr/local/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.py", line 1202, in lemma_from_key
    raise WordNetError("No synset found for key %r" % key)
WordNetError: No synset found for key u'lose%2:30:05::'

Bad key: number%1:10:07::
Traceback (most recent call last):
  File "<ipython-input-12-9102932fc733>", line 20, in get_related_by_sensekey
    synset = wn.lemma_from_key(sense_key).synset()
  File "/usr/local/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.py", line 1202, in lemma_from_key
    raise WordNetError("No synset found for key %r" % key)
WordNetError: No synset found for key u'number%1:10:07::'

Bad key: part%1:06:01::
Traceback (most recent call last):
  File "<ipython-input-12-9102932fc733>", line 20, in get_related_by_sensekey
    synset = wn.lemma_from_key(sens

In [19]:
import codecs
from pandas import read_csv

# Paths used by this cell: the dataset file and (presumably) the key file to produce.
# NOTE(review): the recorded output of this cell shows the output path, but nothing
# in the visible code prints it or writes that file -- the cell looks trimmed; confirm.
dataset_fpath = "/Users/alex/work/joint/eval/contextualization-eval/data/Dataset-SemEval-2013-13-adagram-ukwac-wacky-raw.csv" 
output_fpath = "/Users/alex/Desktop/adagram.key"


/Users/alex/Desktop/adagram.key


In [16]:
import codecs
from pandas import read_csv
import argparse

# Print, for every context of the dataset, the golden related words, the
# predicted related words and their intersection.

dataset_fpath = "/Users/alex/work/joint/eval/contextualization-eval/data/Dataset-SemEval-2013-13-adagram-ukwac-wacky-raw.csv"

# Only the first TOP_N_PREDICTED predicted words of a context are compared.
TOP_N_PREDICTED = 50

df = read_csv(dataset_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)

skipped = 0
for i, row in df.iterrows():
    # Empty cells are read by pandas as NaN (a float), which has neither
    # .split() nor .upper(). Skip such rows explicitly and count them,
    # instead of hiding every possible error behind a bare except.
    values = (row.target, row.golden_related, row.predict_related)
    if not all(hasattr(value, "split") for value in values):
        skipped += 1
        continue

    golden = set(row.golden_related.split(","))
    predicted = set(row.predict_related.split(",")[:TOP_N_PREDICTED])
    common = golden.intersection(predicted)

    print(row.target.upper())
    print("golden (%d): %s" % (len(golden), golden))
    print("\npredicted (%d): %s" % (len(predicted), predicted))
    print("\nintersection (%d): %s" % (len(common), common))
    print("\n\n")

print("skipped rows with missing values: %d" % skipped)

ADD
golden (11): set([u'insert', u'sneak in', u'supply', u'slip in', u'stick in', u'tell', u'add', u'state', u'say', u'toss in', u'append'])

predicted (50): set([u'respond', u'represent', u'compare', u'inspire', u'give', u'imitate', u'relate', u'contribute', u'prefer', u'emphasise', u'explain', u'convey', u'dictate', u'attest', u'adding', u'memorize', u'attach', u'create', u'utilize', u'tend', u'allude', u'encourage', u'adapt', u'recommend', u'alter', u'emphasize', u'draw', u'recreate', u'incorporate', u'lend', u'emulate', u'ascribe', u'deliver', u'introduce', u'reflect', u'produce', u'induce', u'pertaining', u'appealed', u'interpret', u'conform', u'insert', u'exaggerate', u'pertain', u'borrow', u'ignore', u'adjust', u'demonstrate', u'assign', u'added'])

intersection (1): set([u'insert'])



ADD
golden (11): set([u'insert', u'sneak in', u'supply', u'slip in', u'stick in', u'tell', u'add', u'state', u'say', u'toss in', u'append'])

predicted (50): set([u'respond', u'represent', u'comp

In [14]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999999.
// NOTE(review): presumably this keeps long cell outputs fully expanded
// instead of collapsing them into a scroll box -- confirm against the IPython docs.
IPython.OutputArea.auto_scroll_threshold = 9999999;

<IPython.core.display.Javascript object>