# Distributional Semantics Model: Word-Window

** Note: Similarity Measure **

* **Cosine**: $ sim(u,v) = \frac{\sum_iu_iv_i}{\sqrt{\sum u_i^2}\sqrt{\sum v_i^2}} $


* **PPMI**: $ sim(w_i,w_j) = max\{log\frac{P(w_i,w_j)}{P(w_i)\cdot P(w_j)}, 0\} $


* **Jaccard**: $ sim(u,v) = \frac{\sum_imin\{u_i,v_i\}}{\sum_imax\{u_i,v_i\}} $ 

## I. "Traditional"

### A. Data Loading Facilities

In [1]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [2]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from collections import defaultdict
from functools import partial
from itertools import permutations, product
from string import punctuation

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def load_wiki(cutoffFreq=50):
    
    print "... extracting data"
    with open('wikicorpus.txt','rb') as f:
        raw = f.readlines()
    raw = [sent.split() for sent in raw if sent.startswith('<c>')] 
        # extract sentences; split sentences into word complexes.
    raw = [[map(partial(str.split, word), '|') for word in sent] for sent in raw] 
        # split word complexes into words.
    
    print "... cleaning data"
    sents = [[word[0][1].lower() for word in sent if len(word[0])>1 
              and word[0][2].startswith('N')
              and word[0][1].lower() not in stopwords
              and word[0][1] not in punctuation] for sent in raw]
        # extract lemmas => complete sents corpus .
    
    print "... building token list and vocabulary"
    tokens = [word for sent in sents for word in sent]
        # type: list of words.
    fdist = nltk.FreqDist(tokens)
    vocab = list(set(tokens))   
    
    print "... saving top %d-frequent in vocabulary" % cutoffFreq
    vocab = [word for word in vocab if fdist[word] >= cutoffFreq]
        # vocab is not returned, because the k-frequent cut latter can change it.
    sents = [[word.decode('utf-8','ignore') for word in sent if word in vocab] for sent in sents]
        # type: list of lists of words.
        
    return sents

In [5]:
sents = load_wiki()

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary


### B1. Analyzer: Cooccurrence Matrix Based

#### a. Model

In [6]:
import numpy as np

In [7]:
# SIMILARITY MEASURES
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix

def jaccard(w2w):
    jaccardSimilarities = np.zeros(w2w.shape)
    for i,wVec_i in enumerate(w2w):
        for j,wVec_j in enumerate(w2w):
            jaccardSimilarities[i][j] = sum(min(wVec_i,wVec_j) 
                                                for wVec_i,wVec_j in zip(wVec_i,wVec_j))/ \
                                        sum(max(wVec_i,wVec_j) 
                                                for wVec_i,wVec_j in zip(wVec_i,wVec_j))
    return jaccardSimilarities
    # SCIKIT-LEARN JACCARD
    # >>> import numpy as np
    # >>> from sklearn.metrics import jaccard_similarity_score
    # >>> y_pred = [0, 2, 1, 3]
    # >>> y_true = [0, 1, 2, 3]
    # >>> jaccard_similarity_score(y_true, y_pred)
    # 0.5
    # >>> jaccard_similarity_score(y_true, y_pred, normalize=False)
    # 2

In [9]:
class SimpleDistSem:
    
    def __init__(self, data=sents):
        self.sents = sents
        self.vocab = list({word for sent in self.sents for word in sent})
        self.wordToIndex = {word:i for i,word in enumerate(self.vocab)}
        self.indexToWord = {i:word for word,i in self.wordToIndex.iteritems()}
    
    def build_w2w_matrix(self):
        
        print "... counting words"
        cooccurrenceDict = defaultdict(int)
        def add_entry(word,contexts):
            for context in contexts:
                cooccurrenceDict[(self.wordToIndex[word],self.wordToIndex[context])] += 1
        for sent in self.sents:
            for word in sent:
                contexts = [w for w in sent if w!=word]
                add_entry(word,contexts)
                
        print "... building cooccurrence matrix"
        self.w2w = np.zeros((len(self.vocab),len(self.vocab)))
        for (widx_i,widx_j),count in cooccurrenceDict.iteritems():
            self.w2w[widx_i][widx_j] = count
    
    def build_similarity_matrix(self, similarity=ppmi):
        self.simMatrix = similarity(self.w2w)
    
    def k_most_similar(self, words, k=20):
        assert len(words)==len(filter(lambda w:1 if w in self.vocab else 0, words))
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.wordToIndex[word]]
            w2sim[word] = map(lambda idx:(self.indexToWord[idx],
                                          self.simMatrix[self.wordToIndex[word]][idx]),
                              np.argsort(simList)[::-1][:k])
            # [:k] -> [1:k+1] to skip self-similarty.
        return w2sim
    

In [10]:
%%time
ds = SimpleDistSem(sents)

CPU times: user 414 ms, sys: 75 ms, total: 489 ms
Wall time: 624 ms


In [11]:
%%time
ds.build_w2w_matrix()

... counting words
... building cooccurrence matrix
CPU times: user 18.9 s, sys: 544 ms, total: 19.5 s
Wall time: 19.4 s


#### b. Evaluator: K-Frequent

In [12]:
%%time
ds.build_similarity_matrix(ppmi)

CPU times: user 1.76 s, sys: 677 ms, total: 2.44 s
Wall time: 2.45 s


In [13]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 8.81 ms, sys: 685 µs, total: 9.49 ms
Wall time: 8.61 ms


In [14]:
w2sim['car']

[(u'bugatti', 4.9527736586389013),
 (u'brabham', 4.9161339514215019),
 (u'racing', 4.5414990646119815),
 (u'audi', 4.4569545520444169),
 (u'bogie', 4.4533767306965331),
 (u'clutch', 4.34957993701489),
 (u'brake', 4.303781867405136),
 (u'aston', 4.2316627721919184),
 (u'chassis', 4.2010965852956685),
 (u'grip', 4.1760155934775263),
 (u'earnhardt', 4.1697026796422909),
 (u'motors', 4.1104319795697029),
 (u'bentley', 4.1001287045704835),
 (u'cable', 4.000996428129918),
 (u'bmw', 3.9755909610087432),
 (u'prix', 3.8449707785916787),
 (u'driving', 3.8449707785916787),
 (u'tire', 3.8328002429714236),
 (u'drag', 3.7433277393339845),
 (u'wheel', 3.7327795828580266)]

In [20]:
%%time
ds.build_similarity_matrix(cosine)

CPU times: user 23.5 s, sys: 335 ms, total: 23.9 s
Wall time: 4.37 s


In [21]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 13.1 ms, sys: 1.54 ms, total: 14.6 ms
Wall time: 12.9 ms


In [22]:
w2sim['piano']

[(u'<text', nan),
 (u'piano', 0.99999999999999967),
 (u'composer', 0.69966561447368569),
 (u'cello', 0.65158806649998791),
 (u'concerto', 0.62814468385563815),
 (u'mozart', 0.61481959245104545),
 (u'violin', 0.60492309456281401),
 (u'repertoire', 0.5978334029835044),
 (u'bartk', 0.59402148569267632),
 (u'debussy', 0.59038513889274702),
 (u'bass', 0.58965311065244119),
 (u'quartet', 0.58928766235639241),
 (u'instrument', 0.57495837539565331),
 (u'orchestra', 0.57134920784866217),
 (u'trio', 0.56447162052240374),
 (u'sonata', 0.55335303018882542),
 (u'jazz', 0.54858777882388832),
 (u'vivaldi', 0.54782211463701003),
 (u'style', 0.54546613384641018),
 (u'ensemble', 0.53629183792580293)]

In [19]:
%%time
ds.build_similarity_matrix(jaccard)

In [None]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

In [None]:
w2sim['piano']

#### c. Evaluator: BLESS

In [23]:
import random

In [24]:
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero']
    
    return [map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), negPairs)]

##### PPMI

In [25]:
%%time
ds.build_similarity_matrix(ppmi)
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 1.64 s, sys: 637 ms, total: 2.27 s
Wall time: 2.28 s


In [26]:
print "Examples of Evaluation on Positive Relations (PPMI): "
print random.sample(posEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in posEval])

Examples of Evaluation on Positive Relations (PPMI): 
[('train', 'transport', 2.2302988178089156), ('car', 'transport', 1.1434975075691654), ('dress', 'clothing', 3.6998540116782288), ('piano', 'device', 0.54294334951781631), ('jet', 'craft', 0.0)]
Average PPMI:  1.58804713869


In [27]:
print "Examples of Evaluation on Negative Relations (PPMI): "
print random.sample(negEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in negEval])

Examples of Evaluation on Negative Relations (PPMI): 
[('van', 'belt', 2.3745710574629171), ('fighter', 'missile', 2.4072065468530175), ('car', 'brake', 4.303781867405136), ('horse', 'neck', 1.6025628271359271), ('coat', 'pattern', 2.1242004230418874)]
Average PPMI:  1.37760253331


##### Cosine

In [28]:
%%time
ds.build_similarity_matrix(cosine)
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 23.4 s, sys: 282 ms, total: 23.7 s
Wall time: 4.17 s


In [29]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('lizard', 'vertebrate', 0.30616568218162393), ('tiger', 'beast', 0.22822836358380735), ('bag', 'artifact', 0.18154098388172293), ('trumpet', 'device', 0.11180487394114597), ('beetle', 'insect', 0.36960661108537279)]
Average Cosine:  0.295899824705


In [30]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('fork', 'tooth', 0.096313490741254823), ('pub', 'alcohol', 0.23298932669551869), ('cow', 'leg', 0.24205085152973466), ('pistol', 'barrel', 0.3866296621449728), ('hat', 'wool', 0.17767521564980449)]
Average Cosine:  0.250902348547


##### Jaccard

In [None]:
%%time
# ds.build_similarity_matrijaccardccard) # don't run this again if ran, super slow (1hr)
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

In [None]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([jaccardVal for _,_,jaccardVal in posEval])

In [None]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([jaccardVal for _,_,jaccardVal in negEval])

### B2. Analyzer: Tf-Idf Matrix Based

#### a. Model

In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import theano.tensor as T
from theano import function
from sklearn.decomposition import TruncatedSVD

In [35]:
def cosine(tfidfMatrix):
    # no difference from the previous cosine, only this is theano-based.
    v = T.vector()
    vLen = T.sqrt(T.dot(v,v))
    vector_length = function([v], vLen)
    M = T.matrix()
    MtimesMT = T.dot(M,T.transpose(M))
    multiply_matrix = function([M], MtimesMT)
    tfidfMatrix_norm = tfidfMatrix / np.apply_along_axis(lambda r: vector_length(r).item(), 
                                                         1, tfidfMatrix)[:,np.newaxis]
    return multiply_matrix(tfidfMatrix_norm)


In [15]:
class SimpleDistSem:
    
    def __init__(self, data=load_wiki, kFrequent=50):
        self.sents = load_wiki(kFrequent)
   
    def build_tfidf_matrix(self, dimension=100):
        
        print "... building model"
        tfidf = TfidfVectorizer()
        doc2vocab = tfidf.fit_transform([' '.join(sent) for sent in self.sents])
            # this is a "doc x vocab" matrix (doc=sent in this case).
        self.vocab = tfidf.vocabulary_.keys()
        self.wordToIndex = tfidf.vocabulary_
        self.indexToWord = {i:word for word,i in self.wordToIndex.iteritems()}
        
        print "... building matrix"
        vocab2doc = doc2vocab.A.T
            # this is a "vocab x doc" matrix.
            
        print "... dimension reduction to %d" % dimension
        self.tfidfMatrix = TruncatedSVD(n_components=dimension).fit_transform(vocab2doc)        
    
    def build_similarity_matrix(self, similarity=cosine):
        self.simMatrix = similarity(self.tfidfMatrix)
    
    def k_most_similar(self, words, k=20):
        assert len(words)==len(filter(lambda w:1 if w in self.vocab else 0, words))
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.wordToIndex[word]]
            w2sim[word] = map(lambda idx:(self.indexToWord[idx],
                                          self.simMatrix[self.wordToIndex[word]][idx]),
                              np.argsort(simList)[::-1][:k])
        return w2sim

In [16]:
%%time
ds = SimpleDistSem(kFrequent=50)

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary
CPU times: user 4min 51s, sys: 7.79 s, total: 4min 58s
Wall time: 4min 55s


In [17]:
%%time
ds.build_tfidf_matrix()

... building model
... building matrix
... dimension reduction to 100
CPU times: user 11min 29s, sys: 7min 16s, total: 18min 45s
Wall time: 5min 59s


#### b. Evaluator: k-Frequent

In [47]:
%%time
ds.build_similarity_matrix(cosine)

CPU times: user 825 ms, sys: 133 ms, total: 958 ms
Wall time: 743 ms


In [48]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 10.6 ms, sys: 551 µs, total: 11.2 ms
Wall time: 10.3 ms


In [31]:
w2sim['car']

[(u'car', 1.0),
 (u'automobile', 0.78198512433256162),
 (u'manufacturer', 0.76825489012288861),
 (u'truck', 0.75936046819264114),
 (u'vehicle', 0.74034507695965179),
 (u'racing', 0.73296941278888672),
 (u'dodge', 0.73272329156215543),
 (u'ducati', 0.73025635105670872),
 (u'motorcycle', 0.72987873478148879),
 (u'aston', 0.72771677360033016),
 (u'factory', 0.7253843067022604),
 (u'motor', 0.7139360763061996),
 (u'bentley', 0.71382111106726009),
 (u'chrysler', 0.71184611174658552),
 (u'limited', 0.69660513896349774),
 (u'stock', 0.69489919650293464),
 (u'locomotive', 0.69143386648364602),
 (u'cable', 0.68722824072170041),
 (u'ford', 0.68420194003868762),
 (u'lease', 0.68215127135050091)]

#### c. Evaluator: BLESS

In [42]:
import random

In [43]:
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero']
    
    return [map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), negPairs)]

##### Cosine

In [49]:
%%time
# ds.build_similarity_matrix(cosine) # comment out if has been computed previously.
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 13.9 ms, sys: 4.95 ms, total: 18.8 ms
Wall time: 19.2 ms


In [50]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('turtle', 'pet', 0.45376598151239955), ('clarinet', 'artifact', 0.14487351849329463), ('battleship', 'ship', 0.75923317149541025), ('television', 'object', 0.066853681127950509), ('castle', 'building', 0.43025187595393954)]
Average Cosine:  0.395534739769


In [51]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('van', 'gear', 0.41969753514314606), ('bus', 'plate', 0.11086664248226279), ('sword', 'point', -0.00087320182307122163), ('helicopter', 'wheel', 0.49287113499505791), ('car', 'door', 0.36863646621214197)]
Average Cosine:  0.363030447208


## II. Word Embedding Based (Gensim)

### A. Load Data

In [54]:
sents = load_wiki()

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary


In [55]:
sents[:5]

[[u'anarchism'],
 [u'anarchism', u'philosophy', u'theory', u'attitude', u'state'],
 [u'anarchist', u'criterion', u'anarchism', u'criterion'],
 [u'oxford',
  u'companion',
  u'philosophy',
  u'position',
  u'anarchist',
  u'anarchist',
  u'share',
  u'family',
  u'resemblance'],
 [u'type', u'tradition', u'anarchism']]

### B. Build Model

In [56]:
from gensim.models import Word2Vec

In [58]:
%%time
w2v_model = Word2Vec(sents, size=100, window=4) 
    # if sents is raw data (i.e. low-freq. words unfiltered),
    #  use 'min_count' to control.

CPU times: user 22.8 s, sys: 1.29 s, total: 24.1 s
Wall time: 9.2 s


In [59]:
cosineSimilarities = np.asarray([w2v_model[word] for word])

array([ -2.75150329e-01,   6.99418545e-01,  -5.11950314e-01,
         6.59707561e-02,   4.91376370e-01,  -4.77761716e-01,
         5.42201400e-01,  -1.97092444e-01,   5.33426642e-01,
        -3.37237179e-01,  -8.23528618e-02,  -4.95483190e-01,
        -1.47832707e-01,  -5.31127512e-01,  -5.64234257e-01,
         2.55174309e-01,  -6.66471720e-01,  -1.57762356e-02,
         3.95696431e-01,   8.59119296e-02,   4.43060737e-04,
        -5.26001334e-01,   2.14888588e-01,  -7.58717954e-02,
        -4.19115603e-01,   1.17773570e-01,  -1.55215964e-01,
         1.13844804e-01,   7.16124177e-01,  -6.85070634e-01,
        -1.87814653e-01,   2.55785659e-02,  -2.51530170e-01,
         6.57502770e-01,  -5.09228885e-01,  -1.33832723e-01,
        -3.79972667e-01,  -7.30716437e-02,  -2.47103795e-02,
        -1.50842983e-02,   1.57926455e-01,   1.72145128e-01,
         2.03344733e-01,  -2.37459227e-01,  -7.21358359e-01,
        -1.29057765e-01,   3.44870687e-01,   1.70839652e-01,
        -9.33472216e-02,

In [76]:
vocab = w2v_model.vocab.keys()
wordToIndex = {word:i for i,word in enumerate(vocab)}
indexToWord = {i:word for word,i in wordToIndex.iteritems()}
w2vMatrix = np.asarray([w2v_model[word] for word in vocab])

In [77]:
cosineSimilarities = cosine(w2vMatrix)

### C. Evaluator: K-Frequent

In [78]:
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']

In [82]:
assert len(words)==len(filter(lambda w:1 if w in vocab else 0, words))
w2sim = {}
for word in words:
    simList = cosineSimilarities[wordToIndex[word]]
    w2sim[word] = map(lambda idx:(indexToWord[idx],
                                  cosineSimilarities[wordToIndex[word]][idx]),
                      np.argsort(simList)[::-1][1:20+1])

In [83]:
w2sim['car']

[(u'racing', 0.80745568246577037),
 (u'motorcycle', 0.77827624676000107),
 (u'truck', 0.77548379907232889),
 (u'ducati', 0.76853153410659369),
 (u'driver', 0.76733914567819295),
 (u'vehicle', 0.75342257784875111),
 (u'automobile', 0.75091631398808334),
 (u'bmw', 0.75091198003567983),
 (u'audi', 0.74906836369815033),
 (u'bugatti', 0.74045194356275501),
 (u'brabham', 0.7319402436920569),
 (u'locomotive', 0.73175412390384009),
 (u'motor', 0.71965120134839877),
 (u'bogie', 0.71595481866357158),
 (u'bike', 0.7138517139922359),
 (u'taxi', 0.70554621519881633),
 (u'carriage', 0.70307595216451924),
 (u'bicycle', 0.69848066962656186),
 (u'rider', 0.69688658673939274),
 (u'competitor', 0.69254138733413384)]

### D. Evaluator: BLESS

In [84]:
posEval, negEval = bless_evaluator(cosineSimilarities, indexers=[wordToIndex, indexToWord])

In [85]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('guitar', 'artifact', 0.17882556861542048), ('deer', 'animal', 0.55486907926090812), ('turtle', 'creature', 0.59347283689587227), ('box', 'object', 0.099544941891068189), ('radio', 'artifact', -0.08771763428802383)]
Average Cosine:  0.443507618235


In [86]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('castle', 'furniture', 0.27074505246656916), ('dress', 'button', 0.3428140659476267), ('truck', 'mirror', 0.29521023604557545), ('whale', 'mouth', 0.66658632206587454), ('cello', 'string', 0.81084896388357219)]
Average Cosine:  0.426877469438
