# Building Sense Embeddings

The sense embedding of a given sense is calculated by averaging the context embeddings of all contexts in which that sense occurs. There exist several different methods for combining word embeddings to form context embeddings. Our starting point is applying a plain average (bag of words).

Reference: Iacobacci et al., Embeddings for Word Sense Disambiguation: An Evaluation Study
http://aclweb.org/anthology/P/P16/P16-1085.pdf

In [22]:
# Import necessary libraries
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor
import numpy as np
import collections
import os
import pickle
import dill

In [2]:
# Load an example embedding dictionary and one example SemCor sentence.
# NOTE: pickle can execute arbitrary code while loading; only unpickle files
# from a trusted source.
with open('glove_50d_50kvoc.pk', 'rb') as embedding_file:
    # The context manager closes the file handle once loading is done.
    embedding_dict = pickle.load(embedding_file)
example_sentence = semcor.sents()[0]

In [3]:
# First SemCor sentence requested with tag='sem'
# (presumably its sense-annotated chunks; verify against the NLTK reader).
example_chunk = semcor.tagged_sents(tag='sem')[0]

In [4]:
# First ten SemCor sentences requested with tag='sem', kept as a small sample.
example_sentence_list = semcor.tagged_sents(tag='sem')[:10]

In [5]:
# Build a function that combines word embeddings to form a context embedding:
def getContextEmb(sentence,center,window_size,embedding_dict,emb_size):
    """Sum the embeddings of the tokens in a window around ``center``.

    The window spans ``window_size`` tokens on each side of ``center`` plus
    the center token itself, clipped to the sentence boundaries. The result
    is the plain, unnormalised sum of the token vectors (it is not divided
    by the number of tokens).

    Args:
        sentence: Sequence of tokens of an untagged sentence.
        center: Position of the center word in ``sentence``.
        window_size: Number of tokens taken on each side of ``center``.
        embedding_dict: Mapping from token to its embedding vector.
        emb_size: Dimensionality of the embedding vectors.

    Returns:
        A NumPy array of length ``emb_size`` holding the summed vectors.
        Tokens that cannot be looked up contribute a fresh random vector
        drawn uniformly from [-1, 1), so the result is non-deterministic
        whenever such a token falls inside the window.
    """
    start_pos = max(0, center - window_size)
    end_pos = min(len(sentence), center + window_size + 1)
    output_embedding = np.zeros(emb_size)
    for word in sentence[start_pos:end_pos]:
        try:
            word_embedding = embedding_dict[word]
        except (KeyError, TypeError):
            # KeyError: out-of-vocabulary token. TypeError: token that cannot
            # be used as a dictionary key (e.g. a list). Anything else, such
            # as KeyboardInterrupt, is allowed to propagate.
            word_embedding = np.random.uniform(-1, 1, emb_size)
        output_embedding += word_embedding
    return output_embedding

Trying to create a method to form a dictionary of sense embeddings.

In [7]:
def buildSemEmb(tagged_sents,emb_size,embedding_dict,context_builder = getContextEmb,window_size = 3):
    """Accumulate one context-embedding sum per sense over tagged sentences.

    For every chunk whose label resolves to a synset, the context embedding
    around that chunk's position is computed with ``context_builder`` and
    added to the running total stored under the synset name. The totals are
    sums; they are not divided by the number of occurrences.

    Args:
        tagged_sents: Iterable of tagged sentences, each a sequence of chunks.
        emb_size: Dimensionality of the embedding vectors.
        embedding_dict: Mapping from token to embedding vector, forwarded to
            ``context_builder``.
        context_builder: Callable invoked as
            ``context_builder(sentence, idx, window_size, embedding_dict, emb_size)``
            that returns the context embedding for position ``idx``.
        window_size: Context window size forwarded to ``context_builder``.

    Returns:
        A ``collections.defaultdict`` mapping synset name to the accumulated
        vector. Looking up a missing key yields (and stores) a zero vector of
        length ``emb_size``.
    """
    output_dict = collections.defaultdict(lambda: np.zeros(emb_size))
    for sentence in tagged_sents:
        for idx, chunk in enumerate(sentence):
            # Exact type comparison on purpose: only plain lists are skipped.
            # NOTE(review): tagged chunks appear to be list subclasses, so
            # isinstance() would skip them as well; confirm before changing.
            if type(chunk) is list:
                continue
            try:
                sense_index = chunk.label().synset().name()
            except AttributeError:
                # Some labels are broken and do not resolve to a synset
                # (presumably plain strings without .synset()); skip them.
                continue
            # NOTE(review): the tagged sentence itself is passed on, so the
            # builder receives chunk objects rather than token strings;
            # confirm that the builder can look these up in embedding_dict.
            output_dict[sense_index] += context_builder(
                sentence, idx, window_size, embedding_dict, emb_size
            )
    return output_dict

Now we build a sense embedding dictionary for prediction. Notice that the output dictionary of buildSemEmb() is a collections.defaultdict() whose default value is the zero vector (np.zeros(emb_size)). Hence it returns a zero vector, and stores it under that key, when a sense does not exist in the dictionary.

In [8]:
# Build the sense-embedding dictionary over the SemCor corpus, using an
# embedding size of 50 and the embedding dictionary loaded earlier.
semcor_senseEmb = buildSemEmb(semcor.tagged_sents(tag='sem'),50,embedding_dict)

In [20]:
# Inspect the accumulated vector stored for one sense key.
semcor_senseEmb['commitment.n.03']

array([ 2.41965715, -0.70601881, -0.64355166,  0.81245141, -1.84942576,
       -1.29362126,  0.75211755, -1.47197524,  1.61507373, -1.16748291,
       -1.13113449, -2.64405335,  0.92219797,  1.38874225,  0.61148661,
        2.24869574,  2.09829951,  1.77529734,  1.30615384, -1.01640932,
        1.02975797,  3.60376203,  1.59939857, -0.1726787 , -1.75190285,
        0.22106166, -0.66422232, -0.05908176, -0.30704792,  0.50734715,
        3.40579833,  1.59532228,  4.21812127,  2.07470056, -0.11801319,
        0.39660769, -1.76262552,  1.43322866, -0.67192988, -1.02756017,
        0.94464215,  1.84417544,  0.68029068, -0.57246716, -0.30492983,
       -1.60090041, -0.04069953,  2.21882938,  0.79532995,  0.78956593])

## Experiment: bag-of-words comparison with sense embeddings

In [27]:
# Number of sense keys currently stored in the dictionary.
len(semcor_senseEmb)

25820