# Distributional Semantics Model

**Note: Similarity Measures**

* **Cosine**: $ sim(u,v) = \frac{\sum_i u_i v_i}{\sqrt{\sum_i u_i^2}\sqrt{\sum_i v_i^2}} $


* **PPMI**: $ sim(w_i,w_j) = \max\{\log\frac{P(w_i,w_j)}{P(w_i)\cdot P(w_j)}, 0\} $


* **Jaccard**: $ sim(u,v) = \frac{\sum_i \min\{u_i,v_i\}}{\sum_i \max\{u_i,v_i\}} $

## I. "Traditional"

### A. Data Loading Facilities

In [1]:
import os
# NOTE(review): machine-specific absolute path. load_wiki() opens 'wikicorpus.txt'
# relative to the working directory, so this call decides where the corpus is read from.
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [2]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from collections import defaultdict
from functools import partial
from itertools import permutations, product
from string import punctuation

In [3]:
import warnings
# Suppresses every warning for the rest of the session, so numerical problems
# (e.g. NumPy divide-by-zero RuntimeWarnings) will not be reported.
warnings.filterwarnings("ignore")

In [4]:
def load_wiki(cutoffFreq=50):
    """Load and clean the corpus in 'wikicorpus.txt' (read from the working directory).

    Only lines starting with '<c>' are used. Each such line is split on
    whitespace into word complexes, and each complex is split on '|'.
    For a complex to contribute a word it must have at least three fields,
    field 2 must start with 'N' (presumably a noun POS tag -- confirm against
    the corpus format), and field 1 (the lemma, per the original comments)
    must be neither an English stopword nor punctuation.

    Args:
        cutoffFreq: minimum corpus frequency a word needs to be kept.
            Note this is a frequency threshold, not a "top-k" selection,
            despite the wording of the progress message.

    Returns:
        A list of sentences, each a list of lower-cased unicode words.
        Sentences that lose all their words are kept as empty lists.

    Written for Python 2: words are byte strings until the final
    UTF-8 decode (undecodable bytes are dropped).
    """
    # Set membership is O(1); the module-level `stopwords` is a list.
    stopSet = set(stopwords)

    print("... extracting data")
    with open('wikicorpus.txt','rb') as f:
        # extract sentences; split sentences into word complexes.
        raw = [line.split() for line in f if line.startswith('<c>')]

    print("... cleaning data")
    sents = []
    for sent in raw:
        lemmas = []
        for wordComplex in sent:
            fields = wordComplex.split('|')
            # fields[2] is read below, so at least three fields are required
            # (the previous `> 1` guard let two-field complexes raise IndexError).
            if len(fields) < 3:
                continue
            lemma, tag = fields[1], fields[2]
            if (tag.startswith('N')
                    and lemma.lower() not in stopSet
                    and lemma not in punctuation):
                lemmas.append(lemma.lower())
        sents.append(lemmas)
        # extract lemmas => complete sents corpus.

    print("... building token list and vocabulary")
    tokens = [word for sent in sents for word in sent]
        # type: list of words.
    fdist = nltk.FreqDist(tokens)

    print("... saving top %d-frequent in vocabulary" % cutoffFreq)
    # A set keeps the per-token membership test below O(1); with a list this
    # filter scanned the whole vocabulary for every token in the corpus.
    vocab = {word for word in set(tokens) if fdist[word] >= cutoffFreq}
        # vocab is not returned, because a later k-frequent cut can change it.
    sents = [[word.decode('utf-8','ignore') for word in sent if word in vocab]
             for sent in sents]
        # type: list of lists of words.

    return sents

### B1. Analyzer: Cooccurrence Matrix Based

#### a. Model

In [27]:
import numpy as np

In [28]:
# SIMILARITY MEASURES
def cosine(w2w):
    """Pairwise cosine similarity between the rows of `w2w`.

    Args:
        w2w: 2-D array-like, one vector per row.

    Returns:
        A square float array whose entry [i, j] is the cosine of rows i and j.
        An all-zero row has similarity 0 with every row (itself included)
        instead of producing NaN through a division by zero.

    Note: a later cell in this notebook defines another function named
    `cosine`, which replaces this one in the namespace once it is run.
    """
    w2w = np.asarray(w2w, dtype=float)
    # Row-wise Euclidean norms without a per-row Python callback.
    rowNorms = np.sqrt(np.einsum('ij,ij->i', w2w, w2w))
    # Dividing a zero row by 1 leaves it zero, so its dot products are all 0.
    rowNorms[rowNorms == 0] = 1.0
    w2w_norm = w2w / rowNorms[:, np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    """Positive pointwise mutual information of a co-occurrence count matrix.

    Entry [i, j] of the result is max(log(P(i,j) / (P(i) * P(j))), 0), where
    P(i,j) = w2w[i,j] / total, P(i) is the row marginal and P(j) the column
    marginal. Cells with a zero count (and therefore whole rows/columns with
    a zero marginal) get PPMI 0.

    Args:
        w2w: 2-D array-like of non-negative counts. It is not modified.

    Returns:
        A float array with the same shape as `w2w`.
    """
    # Cast up front so integer count matrices do not hit integer division.
    w2w = np.asarray(w2w, dtype=float)
    totalSums = w2w.sum()
    if totalSums == 0:
        return np.zeros_like(w2w)

    pwi = w2w.sum(axis=1) / totalSums
    pwj = w2w.sum(axis=0) / totalSums
    # A zero marginal means every cell in that row/column is zero; dividing
    # those cells by 1 keeps them at 0 and avoids 0/0.
    pwi[pwi == 0] = 1.0
    pwj[pwj == 0] = 1.0

    ppmiMatrix = w2w / totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.

    # Take the log only where the ratio is positive; other cells stay 0.
    observed = ppmiMatrix > 0
    np.log(ppmiMatrix, out=ppmiMatrix, where=observed) # compute pmi.
    np.maximum(ppmiMatrix, 0, out=ppmiMatrix) # compute ppmi.
    return ppmiMatrix


In [29]:
class SimpleDistSem(object):
    """Distributional-semantics model built on a word-by-word co-occurrence matrix.

    Typical use: construct, call build_w2w_matrix(), then
    build_similarity_matrix(...), then k_most_similar(...).
    """

    def __init__(self, data=load_wiki, kFrequent=50):
        """Load the corpus and index its vocabulary.

        Args:
            data: callable taking the frequency cutoff and returning a list
                of sentences (lists of words).
            kFrequent: frequency cutoff forwarded to `data`.
        """
        # Honour the `data` argument (it used to be ignored in favour of a
        # hard-coded load_wiki call).
        self.sents = data(kFrequent)
        self.vocab = list({word for sent in self.sents for word in sent})
        self.wordToIndex = {word:i for i,word in enumerate(self.vocab)}
        self.indexToWord = dict(enumerate(self.vocab))

    def build_w2w_matrix(self):
        """Count within-sentence co-occurrences into `self.w2w`.

        Every ordered pair of token positions in a sentence is counted,
        including a position paired with itself, so the diagonal is non-zero
        for every word that occurs.
        """
        print("... counting words")
        cooccurrenceDict = defaultdict(int)
        for sent in self.sents:
            # Look each token up once rather than once per pair.
            indices = [self.wordToIndex[word] for word in sent]
            for pair in product(indices, repeat=2):
                cooccurrenceDict[pair] += 1

        print("... building cooccurrence matrix")
        self.w2w = np.zeros((len(self.vocab),len(self.vocab)))
        for pair in cooccurrenceDict:
            # `pair` is an (i, j) tuple, i.e. a direct 2-D index.
            self.w2w[pair] = cooccurrenceDict[pair]

    def build_similarity_matrix(self, similarity=ppmi):
        """Store `similarity(self.w2w)` as `self.simMatrix`.

        The default is bound to whatever `ppmi` refers to when this class
        definition is executed.
        """
        self.simMatrix = similarity(self.w2w)

    def k_most_similar(self, words, k=20):
        """Return {word: [(neighbour, score), ...]} with the k highest-scoring entries.

        Raises:
            AssertionError: if any of `words` is not in the vocabulary.
        """
        # Dict lookup instead of scanning the vocabulary list per word.
        missing = [w for w in words if w not in self.wordToIndex]
        assert not missing, "words not in vocabulary: %r" % (missing,)
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.wordToIndex[word]]
            topIndices = np.argsort(simList)[::-1][:k]
            # [:k] -> [1:k+1] to skip self-similarity.
            w2sim[word] = [(self.indexToWord[idx], simList[idx]) for idx in topIndices]
        return w2sim
    

In [30]:
%%time
ds = SimpleDistSem(kFrequent=50)

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary
CPU times: user 4min 40s, sys: 6.35 s, total: 4min 47s
Wall time: 4min 44s


In [31]:
%%time
ds.build_w2w_matrix()

... counting words
... building cooccurrence matrix
CPU times: user 16.3 s, sys: 433 ms, total: 16.7 s
Wall time: 16.7 s


#### b. Evaluator: K-Frequent

In [32]:
%%time
ds.build_similarity_matrix(ppmi)

CPU times: user 1.65 s, sys: 559 ms, total: 2.21 s
Wall time: 2.21 s


In [33]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 8.8 ms, sys: 639 µs, total: 9.44 ms
Wall time: 9.43 ms


In [34]:
w2sim['car']

[(u'car', 5.5676816083512408),
 (u'brabham', 4.7370186924636686),
 (u'bugatti', 4.7237837775965463),
 (u'racing', 4.3543655930385992),
 (u'bogie', 4.1806549399146231),
 (u'audi', 4.1720831279024368),
 (u'brake', 4.110064293137321),
 (u'clutch', 4.0859371771892361),
 (u'aston', 4.0789773262541758),
 (u'chassis', 4.0359974554728177),
 (u'grip', 3.9920710409571445),
 (u'motors', 3.9754986673044659),
 (u'earnhardt', 3.9556109155136614),
 (u'bentley', 3.8900626539905776),
 (u'cable', 3.8163895589909154),
 (u'bmw', 3.7358020900909055),
 (u'prix', 3.6938950894132119),
 (u'driving', 3.6889025678100911),
 (u'tire', 3.6707417844384955),
 (u'drag', 3.5695512828426086)]

In [35]:
%%time
ds.build_similarity_matrix(cosine)

CPU times: user 20.2 s, sys: 257 ms, total: 20.4 s
Wall time: 3.36 s


In [36]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 12 ms, sys: 620 µs, total: 12.7 ms
Wall time: 11.7 ms


In [37]:
w2sim['piano']

[(u'piano', 1.0),
 (u'sonata', 0.45607377928410825),
 (u'trio', 0.35838892239353981),
 (u'concerto', 0.32427804909836766),
 (u'pianist', 0.3046071109326966),
 (u'violin', 0.3016548704228233),
 (u'bartk', 0.28394365707596747),
 (u'beethoven', 0.2642449285886056),
 (u'cello', 0.2330752071904453),
 (u'mozart', 0.2111765802790872),
 (u'music', 0.20778712041515557),
 (u'quartet', 0.19455367242470653),
 (u'guitar', 0.18879834891146863),
 (u'satie', 0.18168601655873329),
 (u'flute', 0.18126266038699335),
 (u'composer', 0.17935882766467443),
 (u'shostakovich', 0.17870131920452129),
 (u'debussy', 0.17265077192418996),
 (u'bass', 0.17134824877272578),
 (u'percussion', 0.16699620460987527)]

#### c. Evaluator: BLESS

In [49]:
import random

In [38]:
def bless_evaluator(simMatrix=None, indexers=[None,None],
                    path='/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'):
    """Score BLESS concept/relatum pairs with a similarity matrix.

    The file at `path` is read as tab-separated lines of four fields:
    (concept, _, relation, relatum). For concept and relatum only the text
    before the first '-' is used. Lines with relation 'hyper' form the
    positive pairs, lines with relation 'mero' the negative pairs; other
    relations are ignored.

    Args:
        simMatrix: 2-D similarity lookup indexed as simMatrix[i][j].
        indexers: [wordToIndex, indexToWord]; only wordToIndex is used.
        path: location of the BLESS file. Defaults to the original
            hard-coded location.

    Returns:
        [positives, negatives], each a list of (concept, relatum, score).

    Raises:
        KeyError: if a concept or relatum is missing from wordToIndex.
        ValueError: if a non-blank line does not have exactly four fields.
    """
    wordToIndex, _ = indexers

    posPairs, negPairs = [], []
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            # Strip the line ending so it cannot leak into the last field.
            c, _, rel, r = line.rstrip('\r\n').split('\t')
            pair = (c.split('-')[0], r.split('-')[0])
            if rel == 'hyper':
                posPairs.append(pair)
            elif rel == 'mero':
                negPairs.append(pair)

    def score(pairs):
        # Attach the similarity score to each (concept, relatum) pair.
        return [(c, r, simMatrix[wordToIndex[c]][wordToIndex[r]]) for c, r in pairs]

    return [score(posPairs), score(negPairs)]

##### PPMI

In [47]:
%%time
ds.build_similarity_matrix(ppmi)
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 1.65 s, sys: 551 ms, total: 2.2 s
Wall time: 2.2 s


In [55]:
print "Examples of Evaluation on Positive Relations (PPMI): "
print random.sample(posEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in posEval])

Examples of Evaluation on Positive Relations (PPMI): 
[('hammer', 'artifact', 0.0), ('goat', 'food', 1.9087405887647915), ('horse', 'creature', 1.2505876398833882), ('fox', 'beast', 2.852315961784635), ('fighter', 'vehicle', 1.3481053885271306)]
Average PPMI:  1.48619039202


In [56]:
print "Examples of Evaluation on Negative Relations (PPMI): "
print random.sample(negEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in negEval])

Examples of Evaluation on Negative Relations (PPMI): 
[('helicopter', 'tail', 2.7288466816579824), ('cannon', 'ammunition', 4.3424453266452474), ('motorcycle', 'tank', 0.0), ('hammer', 'steel', 1.9859433074758956), ('hospital', 'window', 0.0)]
Average PPMI:  1.27640054163


##### Cosine

In [57]:
%%time
ds.build_similarity_matrix(cosine)
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 20.2 s, sys: 295 ms, total: 20.5 s
Wall time: 3.5 s


In [58]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('motorcycle', 'transport', 0.029442142376491426), ('hotel', 'structure', 0.017216513144524249), ('television', 'equipment', 0.058559154251075898), ('coyote', 'creature', 0.019324295262112278), ('turtle', 'vertebrate', 0.037552685932305083)]
Average Cosine:  0.0603423768913


In [59]:
# Negative ('mero') pairs: show a random sample of scored pairs, then the mean score.
# The header previously said "Positive" although negEval is being reported.
print("Examples of Evaluation on Negative Relations (Cosine): ")
print(random.sample(negEval, 5))
print("Average Cosine:  %s" % np.mean([cosineVal for _,_,cosineVal in negEval]))

Examples of Evaluation on Positive Relations (Cosine): 
[('cello', 'back', 0.053753511464451946), ('snake', 'poison', 0.033404743658572426), ('pub', 'glass', 0.022851286650711126), ('salmon', 'mouth', 0.061380927946826883), ('hotel', 'window', 0.025526976678457877)]
Average Cosine:  0.0468720110053


### B2. Analyzer: Tf-Idf Matrix Based

#### a. Model

In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import theano.tensor as T
from theano import function
from sklearn.decomposition import TruncatedSVD

In [35]:
def cosine(tfidfMatrix):
    """Pairwise cosine similarity between the rows of `tfidfMatrix`, using theano.

    Same computation as the earlier numpy `cosine` (normalise each row to
    unit length, then multiply the matrix by its transpose); only the vector
    length and the matrix product are evaluated through compiled theano
    functions. Defining it rebinds the name `cosine` in the notebook.
    """
    # Compiled function: Euclidean length of one vector.
    vec = T.vector()
    vector_length = function([vec], T.sqrt(T.dot(vec, vec)))

    # Compiled function: M times its own transpose.
    mat = T.matrix()
    multiply_matrix = function([mat], T.dot(mat, T.transpose(mat)))

    # One length per row, reshaped to a column so the division broadcasts row-wise.
    rowLengths = np.apply_along_axis(lambda row: vector_length(row).item(),
                                     1, tfidfMatrix)
    unitRows = tfidfMatrix / rowLengths[:,np.newaxis]
    return multiply_matrix(unitRows)

def jaccard(tfidfMatrix):
    """Pairwise generalised (weighted) Jaccard similarity between rows.

    Entry [i, j] is sum_k min(u_k, v_k) / sum_k max(u_k, v_k) for rows
    u = tfidfMatrix[i] and v = tfidfMatrix[j].

    The measure is only bounded to [0, 1] for non-negative vectors; with
    negative components (e.g. after SVD) values outside that range occur.
    A zero denominator follows NumPy division rules (nan/inf), it is not
    special-cased.

    Args:
        tfidfMatrix: 2-D array-like, one vector per row.

    Returns:
        A square float array of shape (n_rows, n_rows).
    """
    tfidfMatrix = np.asarray(tfidfMatrix, dtype=float)
    nRows = tfidfMatrix.shape[0]
    jaccardSimilarities = np.zeros((nRows, nRows))
    for i, wVec_i in enumerate(tfidfMatrix):
        # Broadcast row i against every row at once instead of looping over
        # rows and components in pure Python.
        minSums = np.minimum(wVec_i, tfidfMatrix).sum(axis=1)
        maxSums = np.maximum(wVec_i, tfidfMatrix).sum(axis=1)
        jaccardSimilarities[i] = minSums / maxSums
    return jaccardSimilarities

# SCIKIT-LEARN JACCARD
# >>> import numpy as np
# >>> from sklearn.metrics import jaccard_similarity_score
# >>> y_pred = [0, 2, 1, 3]
# >>> y_true = [0, 1, 2, 3]
# >>> jaccard_similarity_score(y_true, y_pred)
# 0.5
# >>> jaccard_similarity_score(y_true, y_pred, normalize=False)
# 2

In [15]:
class SimpleDistSem(object):
    """Distributional-semantics model built on SVD-reduced tf-idf word vectors.

    Each sentence is treated as a document. Running this cell replaces any
    earlier class of the same name in the notebook.
    Typical use: construct, call build_tfidf_matrix(), then
    build_similarity_matrix(...), then k_most_similar(...).
    """

    def __init__(self, data=load_wiki, kFrequent=50):
        """Load the corpus.

        Args:
            data: callable taking the frequency cutoff and returning a list
                of sentences (lists of words).
            kFrequent: frequency cutoff forwarded to `data`.
        """
        # Honour the `data` argument (it used to be ignored in favour of a
        # hard-coded load_wiki call).
        self.sents = data(kFrequent)

    def build_tfidf_matrix(self, dimension=100):
        """Build `self.tfidfMatrix`: one `dimension`-sized vector per vocabulary word.

        Also sets `self.vocab`, `self.wordToIndex` and `self.indexToWord`
        from the vectorizer's fitted vocabulary.
        """
        print("... building model")
        tfidf = TfidfVectorizer()
        doc2vocab = tfidf.fit_transform([' '.join(sent) for sent in self.sents])
            # this is a "doc x vocab" matrix (doc=sent in this case).
        self.wordToIndex = tfidf.vocabulary_
        self.vocab = list(self.wordToIndex)
        self.indexToWord = {i:word for word,i in self.wordToIndex.items()}

        print("... building matrix")
        # Keep the matrix sparse: TruncatedSVD accepts sparse input, and
        # densifying a "sentences x vocabulary" matrix is very costly.
        vocab2doc = doc2vocab.T
            # this is a "vocab x doc" matrix.

        print("... dimension reduction to %d" % dimension)
        self.tfidfMatrix = TruncatedSVD(n_components=dimension).fit_transform(vocab2doc)

    def build_similarity_matrix(self, similarity=cosine):
        """Store `similarity(self.tfidfMatrix)` as `self.simMatrix`.

        The default is bound to whatever `cosine` refers to when this class
        definition is executed.
        """
        self.simMatrix = similarity(self.tfidfMatrix)

    def k_most_similar(self, words, k=20):
        """Return {word: [(neighbour, score), ...]} with the k highest-scoring entries.

        The query word itself is not excluded from its own list.

        Raises:
            AssertionError: if any of `words` is not in the vocabulary.
        """
        # Dict lookup instead of scanning the vocabulary list per word.
        missing = [w for w in words if w not in self.wordToIndex]
        assert not missing, "words not in vocabulary: %r" % (missing,)
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.wordToIndex[word]]
            topIndices = np.argsort(simList)[::-1][:k]
            w2sim[word] = [(self.indexToWord[idx], simList[idx]) for idx in topIndices]
        return w2sim

In [16]:
%%time
ds = SimpleDistSem(kFrequent=50)

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary
CPU times: user 4min 51s, sys: 7.79 s, total: 4min 58s
Wall time: 4min 55s


In [17]:
%%time
ds.build_tfidf_matrix()

... building model
... building matrix
... dimension reduction to 100
CPU times: user 11min 29s, sys: 7min 16s, total: 18min 45s
Wall time: 5min 59s


#### b. Evaluator: k-Frequent

In [47]:
%%time
ds.build_similarity_matrix(cosine)

CPU times: user 825 ms, sys: 133 ms, total: 958 ms
Wall time: 743 ms


In [48]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 10.6 ms, sys: 551 µs, total: 11.2 ms
Wall time: 10.3 ms


In [31]:
w2sim['car']

[(u'car', 1.0),
 (u'automobile', 0.78198512433256162),
 (u'manufacturer', 0.76825489012288861),
 (u'truck', 0.75936046819264114),
 (u'vehicle', 0.74034507695965179),
 (u'racing', 0.73296941278888672),
 (u'dodge', 0.73272329156215543),
 (u'ducati', 0.73025635105670872),
 (u'motorcycle', 0.72987873478148879),
 (u'aston', 0.72771677360033016),
 (u'factory', 0.7253843067022604),
 (u'motor', 0.7139360763061996),
 (u'bentley', 0.71382111106726009),
 (u'chrysler', 0.71184611174658552),
 (u'limited', 0.69660513896349774),
 (u'stock', 0.69489919650293464),
 (u'locomotive', 0.69143386648364602),
 (u'cable', 0.68722824072170041),
 (u'ford', 0.68420194003868762),
 (u'lease', 0.68215127135050091)]

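**NB: After SVD reduction, some values in the tf-idf matrix become negative. The Jaccard measure above assumes non-negative components, so the scores below fall outside $[0, 1]$ and should not be read as meaningful similarities.**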

In [36]:
%%time
ds.build_similarity_matrix(jaccard) # super slow, searching for better algorithm.

CPU times: user 1h 58s, sys: 42.9 s, total: 1h 1min 41s
Wall time: 1h 1min 16s


In [38]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 10.2 ms, sys: 578 µs, total: 10.8 ms
Wall time: 10 ms


In [39]:
w2sim['car']

[(u'car', 1.0),
 (u'history', -0.15115789972759958),
 (u'text', -0.26226840947899999),
 (u'market', -0.36955511780147104),
 (u'economy', -0.38972338959638458),
 (u'year', -0.40801622790322367),
 (u'business', -0.45524213911197303),
 (u'case', -0.46700191886219455),
 (u'price', -0.48427401476008269),
 (u'time', -0.49814133092182583),
 (u'period', -0.51142309447448309),
 (u'society', -0.51356120088132173),
 (u'education', -0.51563384962275005),
 (u'order', -0.51637872789032258),
 (u'community', -0.51717864757827092),
 (u'father', -0.5208925301408166),
 (u'game', -0.52261063187878454),
 (u'law', -0.54649911286173602),
 (u'function', -0.54762230441489623),
 (u'growth', -0.55112694656448535)]

#### c. Evaluator: BLESS

In [42]:
import random

In [43]:
def bless_evaluator(simMatrix=None, indexers=[None,None],
                    path='/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'):
    """Score BLESS concept/relatum pairs with a similarity matrix.

    The file at `path` is read as tab-separated lines of four fields:
    (concept, _, relation, relatum). For concept and relatum only the text
    before the first '-' is used. Lines with relation 'hyper' form the
    positive pairs, lines with relation 'mero' the negative pairs; other
    relations are ignored.

    Args:
        simMatrix: 2-D similarity lookup indexed as simMatrix[i][j].
        indexers: [wordToIndex, indexToWord]; only wordToIndex is used.
        path: location of the BLESS file. Defaults to the original
            hard-coded location.

    Returns:
        [positives, negatives], each a list of (concept, relatum, score).

    Raises:
        KeyError: if a concept or relatum is missing from wordToIndex.
        ValueError: if a non-blank line does not have exactly four fields.
    """
    wordToIndex, _ = indexers

    posPairs, negPairs = [], []
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            # Strip the line ending so it cannot leak into the last field.
            c, _, rel, r = line.rstrip('\r\n').split('\t')
            pair = (c.split('-')[0], r.split('-')[0])
            if rel == 'hyper':
                posPairs.append(pair)
            elif rel == 'mero':
                negPairs.append(pair)

    def score(pairs):
        # Attach the similarity score to each (concept, relatum) pair.
        return [(c, r, simMatrix[wordToIndex[c]][wordToIndex[r]]) for c, r in pairs]

    return [score(posPairs), score(negPairs)]

##### Cosine

In [49]:
%%time
# ds.build_similarity_matrix(cosine) # comment out if has been computed previously.
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 13.9 ms, sys: 4.95 ms, total: 18.8 ms
Wall time: 19.2 ms


In [50]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('turtle', 'pet', 0.45376598151239955), ('clarinet', 'artifact', 0.14487351849329463), ('battleship', 'ship', 0.75923317149541025), ('television', 'object', 0.066853681127950509), ('castle', 'building', 0.43025187595393954)]
Average Cosine:  0.395534739769


In [51]:
# Negative ('mero') pairs: show a random sample of scored pairs, then the mean score.
# The header previously said "Positive" although negEval is being reported.
print("Examples of Evaluation on Negative Relations (Cosine): ")
print(random.sample(negEval, 5))
print("Average Cosine:  %s" % np.mean([cosineVal for _,_,cosineVal in negEval]))

Examples of Evaluation on Positive Relations (Cosine): 
[('van', 'gear', 0.41969753514314606), ('bus', 'plate', 0.11086664248226279), ('sword', 'point', -0.00087320182307122163), ('helicopter', 'wheel', 0.49287113499505791), ('car', 'door', 0.36863646621214197)]
Average Cosine:  0.363030447208


##### Jaccard

In [44]:
%%time
# ds.build_similarity_matrix(jaccard) # comment out if has been computed previously.
                                      # CAUTION: jaccard is slow!!!
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 15 ms, sys: 15.4 ms, total: 30.4 ms
Wall time: 60 ms


In [45]:
# Positive ('hyper') pairs scored with Jaccard: random sample, then the mean score.
# The summary line previously said "Average Cosine" although these are Jaccard scores.
print("Examples of Evaluation on Positive Relations (Jaccard): ")
print(random.sample(posEval, 5))
print("Average Jaccard:  %s" % np.mean([jaccardVal for _,_,jaccardVal in posEval]))

Examples of Evaluation on Positive Relations (Jaccard): 
[('glove', 'garment', -1.2472914246383657), ('castle', 'home', -0.91276277360885272), ('jet', 'plane', -1.6013089320024307), ('sheep', 'animal', -0.82145541582443582), ('jet', 'craft', -2.075346455557209)]
Average Cosine:  -1.32821427543


In [46]:
# Negative ('mero') pairs scored with Jaccard: random sample, then the mean score.
# Both labels were wrong before: the header said "Positive" and the summary "Cosine".
print("Examples of Evaluation on Negative Relations (Jaccard): ")
print(random.sample(negEval, 5))
print("Average Jaccard:  %s" % np.mean([jaccardVal for _,_,jaccardVal in negEval]))

Examples of Evaluation on Positive Relations (Jaccard): 
[('bus', 'mirror', -0.98619981130401546), ('cathedral', 'dome', -0.99491967294843175), ('restaurant', 'garden', -0.61123827919885343), ('cathedral', 'tower', -1.0782011960757136), ('castle', 'hall', -1.1341032130484032)]
Average Cosine:  -1.36918847379


### D. Accessories

In [8]:
import warnings
warnings.filterwarnings("ignore")

## II. Word Embedding Based (Gensim)

### A. Load Data

In [54]:
sents = load_wiki()

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary


In [55]:
sents[:5]

[[u'anarchism'],
 [u'anarchism', u'philosophy', u'theory', u'attitude', u'state'],
 [u'anarchist', u'criterion', u'anarchism', u'criterion'],
 [u'oxford',
  u'companion',
  u'philosophy',
  u'position',
  u'anarchist',
  u'anarchist',
  u'share',
  u'family',
  u'resemblance'],
 [u'type', u'tradition', u'anarchism']]

### B. Build Model

In [56]:
from gensim.models import Word2Vec

In [58]:
%%time
w2v_model = Word2Vec(sents, size=100, window=4) 
    # if sents is raw data (i.e. low-freq. words unfiltered),
    #  use 'min_count' to control.

CPU times: user 22.8 s, sys: 1.29 s, total: 24.1 s
Wall time: 9.2 s


In [59]:
# NOTE(review): the previous source of this cell was an incomplete list
# comprehension (a SyntaxError), which breaks Restart & Run All. The recorded
# output is a single embedding vector, so this was presumably a one-word
# lookup -- confirm which word. `cosineSimilarities` is assigned properly in a
# later cell.
w2v_model['car']

array([ -2.75150329e-01,   6.99418545e-01,  -5.11950314e-01,
         6.59707561e-02,   4.91376370e-01,  -4.77761716e-01,
         5.42201400e-01,  -1.97092444e-01,   5.33426642e-01,
        -3.37237179e-01,  -8.23528618e-02,  -4.95483190e-01,
        -1.47832707e-01,  -5.31127512e-01,  -5.64234257e-01,
         2.55174309e-01,  -6.66471720e-01,  -1.57762356e-02,
         3.95696431e-01,   8.59119296e-02,   4.43060737e-04,
        -5.26001334e-01,   2.14888588e-01,  -7.58717954e-02,
        -4.19115603e-01,   1.17773570e-01,  -1.55215964e-01,
         1.13844804e-01,   7.16124177e-01,  -6.85070634e-01,
        -1.87814653e-01,   2.55785659e-02,  -2.51530170e-01,
         6.57502770e-01,  -5.09228885e-01,  -1.33832723e-01,
        -3.79972667e-01,  -7.30716437e-02,  -2.47103795e-02,
        -1.50842983e-02,   1.57926455e-01,   1.72145128e-01,
         2.03344733e-01,  -2.37459227e-01,  -7.21358359e-01,
        -1.29057765e-01,   3.44870687e-01,   1.70839652e-01,
        -9.33472216e-02,

In [76]:
# Fix one ordering of the Word2Vec vocabulary and derive both lookups from it.
vocab = w2v_model.vocab.keys()
wordToIndex = dict((word, i) for i, word in enumerate(vocab))
indexToWord = dict(enumerate(vocab))
# Stack the embeddings row by row, in the same vocabulary order.
w2vMatrix = np.asarray([w2v_model[word] for word in vocab])

In [77]:
cosineSimilarities = cosine(w2vMatrix)

### C. Evaluator: K-Frequent

In [78]:
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']

In [82]:
# Every query word must be present in the Word2Vec vocabulary.
assert all(w in vocab for w in words)
w2sim = {}
for word in words:
    simList = cosineSimilarities[wordToIndex[word]]
    # Rank by descending similarity, drop the top hit (presumably the word
    # itself) and keep the next 20.
    neighbours = np.argsort(simList)[::-1][1:20+1]
    w2sim[word] = [(indexToWord[idx], simList[idx]) for idx in neighbours]

In [83]:
w2sim['car']

[(u'racing', 0.80745568246577037),
 (u'motorcycle', 0.77827624676000107),
 (u'truck', 0.77548379907232889),
 (u'ducati', 0.76853153410659369),
 (u'driver', 0.76733914567819295),
 (u'vehicle', 0.75342257784875111),
 (u'automobile', 0.75091631398808334),
 (u'bmw', 0.75091198003567983),
 (u'audi', 0.74906836369815033),
 (u'bugatti', 0.74045194356275501),
 (u'brabham', 0.7319402436920569),
 (u'locomotive', 0.73175412390384009),
 (u'motor', 0.71965120134839877),
 (u'bogie', 0.71595481866357158),
 (u'bike', 0.7138517139922359),
 (u'taxi', 0.70554621519881633),
 (u'carriage', 0.70307595216451924),
 (u'bicycle', 0.69848066962656186),
 (u'rider', 0.69688658673939274),
 (u'competitor', 0.69254138733413384)]

### D. Evaluator: BLESS

In [84]:
posEval, negEval = bless_evaluator(cosineSimilarities, indexers=[wordToIndex, indexToWord])

In [85]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('guitar', 'artifact', 0.17882556861542048), ('deer', 'animal', 0.55486907926090812), ('turtle', 'creature', 0.59347283689587227), ('box', 'object', 0.099544941891068189), ('radio', 'artifact', -0.08771763428802383)]
Average Cosine:  0.443507618235


In [86]:
# Negative ('mero') pairs: show a random sample of scored pairs, then the mean score.
# The header previously said "Positive" although negEval is being reported.
print("Examples of Evaluation on Negative Relations (Cosine): ")
print(random.sample(negEval, 5))
print("Average Cosine:  %s" % np.mean([cosineVal for _,_,cosineVal in negEval]))

Examples of Evaluation on Positive Relations (Cosine): 
[('castle', 'furniture', 0.27074505246656916), ('dress', 'button', 0.3428140659476267), ('truck', 'mirror', 0.29521023604557545), ('whale', 'mouth', 0.66658632206587454), ('cello', 'string', 0.81084896388357219)]
Average Cosine:  0.426877469438
