In [1]:
from main import parse_data

train = parse_data("data/English-train.xml")
test = parse_data("data/English-dev.xml")

In [2]:
from B import vectorize
from collections import defaultdict

INST_ID = 0
L_CON = 1
HEAD = 2
R_CON = 3
SENSE_ID = 4

## Data Exploration
Let's look at what is inside the training dataset

In [3]:
type(train)

dict

In [4]:
train_keys = train.keys()
train_keys[0]

u'begin.v'

In [5]:
type(train[train_keys[0]])

list

In [6]:
train[train_keys[0]][0]

(u'begin.v.bnc.00000159',
 u"uk director and organiser of the conference ,  peter johnson ,  said ,  for many ,  the day provided a reaffirmation of the vision for acet and a marvellous sense of unity .  it was the first time our national and international network had gathered together in one place and made us all realise just how much the work has grown .  three visitors from frankfurt , germany , recently visited acet 's offices before returning home to ",
 u'begin',
 u' a similar service , christian aids help ( cah ) . interest was expressed in all of the organisational aspects of home care including nursing ,  equipment loans and the volunteer programme .  like acet ,  cah will give both medical and practical help to aids patients in the home .  ',
 '369204')

## Feature Development
For the purpose of developing features, let's look at a single instance and develop algorithms.

In [7]:
test_inst = train[train_keys[0]][0]

test_inst

(u'begin.v.bnc.00000159',
 u"uk director and organiser of the conference ,  peter johnson ,  said ,  for many ,  the day provided a reaffirmation of the vision for acet and a marvellous sense of unity .  it was the first time our national and international network had gathered together in one place and made us all realise just how much the work has grown .  three visitors from frankfurt , germany , recently visited acet 's offices before returning home to ",
 u'begin',
 u' a similar service , christian aids help ( cah ) . interest was expressed in all of the organisational aspects of home care including nursing ,  equipment loans and the volunteer programme .  like acet ,  cah will give both medical and practical help to aids patients in the home .  ',
 '369204')

In [8]:
lContext = test_inst[L_CON].lower().split()
rContext = test_inst[R_CON].lower().split()

def countElts(l):
    result = defaultdict(int)
    for i in l:
        result[i] += 1
    return result

def bagOfWordsFeatures(lContext, rContext, window):
    '''
    :param lContext: a list of words appearing to the LEFT of the target
    :param rContext: a list of words appearing to the RIGHT of the target
    :param window: an integer number specifying a maximum distance from
        the target from which to select the context
    
    :return: dict where the key is a word and the value is its count
        (the number of appearances within the context)
    '''
    lCon = lContext[-window:]
    if len(lCon) < window:
        lCon = ['<B>'] * (window - len(lCon)) + lCon
    # END if

    rCon = rContext[:window]
    if len(rCon) < window:
        rCon = rCon + ['<E>'] * (window - len(rCon))
    # END if
    
    context = [elt + ".BOW" for elt in lCon + rCon]
    bagCounts = countElts(context)
    
    return bagCounts

bagOfWordsFeatures(lContext, rContext, 5)

defaultdict(int,
            {u',.BOW': 1,
             u'a.BOW': 1,
             u'before.BOW': 1,
             u'christian.BOW': 1,
             u'home.BOW': 1,
             u'offices.BOW': 1,
             u'returning.BOW': 1,
             u'service.BOW': 1,
             u'similar.BOW': 1,
             u'to.BOW': 1})

In [10]:
import math

def collocationFeatures(lContext, rContext, window, fn=lambda x: x):
    '''
    :param lContext: a list of words appearing to the LEFT of the target
    :param rContext: a list of words appearing to the RIGHT of the target
    :param window: an integer number specifying a maximum distance from
        the target from which to select the context
    :param fn: an optional scaling function to calculate a value for each
        collocation.
    
    :return: dict where the key is a word and the value is fn(distance)
        from the target word.  In cases where there are multiple appearances
        the closest distance is selected.
    '''
    lCon = lContext[-window:]
    if len(lCon) < window:
        lCon = ['<B>'] * (window - len(lCon)) + lCon
    # END if

    rCon = rContext[:window]
    if len(rCon) < window:
        rCon = rCon + ['<E>'] * (window - len(rCon))
    # END if
    
    result = {}
    for k in range(0, window):
        dist = fn(k)
        rFeat = rCon[k] + '.COLLOC'
        lFeat = lCon[-(k+1)] + '.COLLOC'
        if rFeat not in result.keys():
            result[rFeat] = dist
        if lFeat not in result.keys():
            result[lFeat] = -dist
    # END for
    
    return result

collocationFeatures(lContext, rContext, 5)

{u',.COLLOC': 3,
 u'a.COLLOC': 0,
 u'before.COLLOC': -3,
 u'christian.COLLOC': 4,
 u'home.COLLOC': -1,
 u'offices.COLLOC': -4,
 u'returning.COLLOC': -2,
 u'service.COLLOC': 2,
 u'similar.COLLOC': 1,
 u'to.COLLOC': 0}

In [11]:
import nltk

def posCollocationFeatures(target, lContext, rContext, window):
    '''
    :param target: the target word to be disambiguated
    :param lContext: a list of words appearing to the LEFT of the target
    :param rContext: a list of words appearing to the RIGHT of the target
    :param window: an integer number specifying a maximum distance from
        the target from which to select the context
    
    :return: dict where the key is a part of speech and the value is the distance
        from the target word.
    '''
    lCon = lContext[-window:]
    if len(lCon) < window:
        lCon = ['<B>'] * (window - len(lCon)) + lCon
    # END if

    rCon = rContext[:window]
    if len(rCon) < window:
        rCon = rCon + ['<E>'] * (window - len(rCon))
    # END if
    
    context = lCon + [target] + rCon
    context_Tagged = nltk.pos_tag(context)
    
    result = {}
    for k in range(window-1, -1, -1):
        rFeat = context_Tagged[-(k+1)][1] + '.POS'
        lFeat = context_Tagged[k][1] + '.POS'
        if rFeat not in result.keys():
            result[rFeat] = window - k
        if lFeat not in result.keys():
            result[lFeat] = -(window - k)
    # END for
    result['HEAD.POS'] = context_Tagged[window][1]
    
    return result

posCollocationFeatures(test_inst[HEAD].lower(), lContext, rContext, 5)

{',.POS': 4,
 'DT.POS': 1,
 'HEAD.POS': 'VB',
 'IN.POS': -4,
 'JJ.POS': 2,
 'NN.POS': -2,
 'NNS.POS': -5,
 'TO.POS': -1,
 'VBG.POS': -3}

In [17]:
from nltk.corpus import stopwords
import string

def rmStopWords(l):
    stops = set(stopwords.words('english'))
    return [w for w in l if w not in stops]

def rmPunct(s):
    tab = {ord(c): None for c in string.punctuation}
    return s.translate(tab)

print rmStopWords(lContext)
print rmPunct(test_inst[R_CON])

[u'uk', u'director', u'organiser', u'conference', u',', u'peter', u'johnson', u',', u'said', u',', u'many', u',', u'day', u'provided', u'reaffirmation', u'vision', u'acet', u'marvellous', u'sense', u'unity', u'.', u'first', u'time', u'national', u'international', u'network', u'gathered', u'together', u'one', u'place', u'made', u'us', u'realise', u'much', u'work', u'grown', u'.', u'three', u'visitors', u'frankfurt', u',', u'germany', u',', u'recently', u'visited', u'acet', u"'s", u'offices', u'returning', u'home']
 a similar service  christian aids help  cah   interest was expressed in all of the organisational aspects of home care including nursing   equipment loans and the volunteer programme   like acet   cah will give both medical and practical help to aids patients in the home   


In [21]:
window_size = 10

DEFAULT_OPS = {"STOPWORDS":    False, \
               "PUNCTUATION":  False, \
               "BAGOFWORDS":   True, \
               "COLLOCATION":  True, \
               "PARTOFSPEECH": True \
               }

def getFeatures(inst, ops):
    result = {}

    lContext = inst[L_CON].lower()
    rContext = inst[R_CON].lower()

    if ops['PUNCTUATION']:
        lContext = rmStopWords(lContext)
        rContext = rmStopWords(rContext)

    lContext = lContext.split()
    rContext = rContext.split()

    if ops['STOPWORDS']:
        lContext = rmStopWords(lContext)
        rContext = rmStopWords(rContext)
    if ops['BAGOFWORDS']:
        result.update(bagOfWordsFeatures(lContext, rContext, window_size))
    if ops['COLLOCATION']:
        result.update(collocationFeatures(lContext, rContext, window_size))
    if ops['PARTOFSPEECH']:
        result.update(posCollocationFeatures(inst[HEAD], lContext, rContext, window_size))

    return result

test_features = {"TEST": getFeatures(test_inst, DEFAULT_OPS)}

In [22]:
vectorize(test_features, test_features)

({'TEST': array([  1.,  -5.,   1.,   7.,   8.,   1.,   9.,  10.,   2.,   3.,   4.,
           1.,   1.,  -4.,   2.,  -2.,  -5.,  -6.,  -9.,  -1.,  -8.,  -3.,
           1.,   0.,   1.,  -6.,   1.,   5.,   1.,  -3.,   1.,   8.,   1.,
           4.,   1.,   6.,   1.,  -1.,   1.,  -4.,   1.,  -8.,   1.,  -2.,
           1.,   2.,   1.,   1.,   1.,   0.,   1.,  -7.])},
 {'TEST': array([  1.,  -5.,   1.,   7.,   8.,   1.,   9.,  10.,   2.,   3.,   4.,
           1.,   1.,  -4.,   2.,  -2.,  -5.,  -6.,  -9.,  -1.,  -8.,  -3.,
           1.,   0.,   1.,  -6.,   1.,   5.,   1.,  -3.,   1.,   8.,   1.,
           4.,   1.,   6.,   1.,  -1.,   1.,  -4.,   1.,  -8.,   1.,  -2.,
           1.,   2.,   1.,   1.,   1.,   0.,   1.,  -7.])})