# Distributional Semantics Model: Narrow Word-Window (-2,+2)

### A. Data-Loading Facilities

In [4]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [5]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from collections import defaultdict, Counter
from functools import partial
from itertools import permutations, product
punctuation = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
from operator import itemgetter

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [27]:
def load_wiki(cutoffFreq=20):
    
    print "... extracting data"
    with open('wikicorpus.txt','rb') as f:
        raw = f.readlines()
    raw = [sent[4:].split() for sent in raw if sent.startswith('<c>')] 
        # sent[4:]: get rid of initial <c>.
        # extract sentences; split sentences into word complexes.
    raw = [[map(partial(str.split, word), '|') for word in sent] for sent in raw] 
        # split word complexes into words.
    
    print "... cleaning data"
    tagged_sents = [[(word[0][1].lower(),word[0][2]) for word in sent if len(word[0])>1 
                      and word[0][1].lower() not in stopwords
                      and word[0][1] not in punctuation] for sent in raw]
    
    print "... getting top-%d frequent context words and top-50 frequent nouns" % cutoffFreq
    all_tokens = [word for tagged_sent in tagged_sents for word,tag in tagged_sent]
        # type: list of words.
    context_fdist = Counter(all_tokens)
    context_vocab = list({word for word,freq in context_fdist.iteritems() if freq>=cutoffFreq})
    n_tokens = [word for tagged_sent in tagged_sents for word,tag in tagged_sent if tag.startswith('N')]
    n_fdist = Counter(n_tokens)
    n_vocab = list({word for word,freq in n_fdist.iteritems() if freq>=50})
    
    sents = [[word for word,tag in tagged_sent if word in context_vocab] for tagged_sent in tagged_sents]
    
    return (n_vocab, context_vocab, sents)


In [28]:
%%time
n_vocab, context_vocab, sents = load_wiki()

... extracting data
... cleaning data
... getting top-20 frequent context words and top-50 frequent nouns
CPU times: user 19min 23s, sys: 9.23 s, total: 19min 32s
Wall time: 19min 30s


In [7]:
import cPickle

In [8]:
# cleaned_data_path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/CODE/CLEANED_DATA/'
# cPickle.dump((n_vocab,context_vocab,sents), open(cleaned_data_path+'cleaned_wiki.p', 'wb')) 
# n_vocab, context_vocab, sents = cPickle.load(open(cleaned_data_path+'cleaned_wiki.p', 'rb'))

### B1. Analyzer: Cooccurrence Matrix Based

#### A. Model

In [9]:
import numpy as np

In [146]:
# SIMILARITY MEASURES
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    def iszero(m): return m==0
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix[iszero[ppmiMatrix]] = 1e-10 # handling logging-on-0 issue.
    ppmiMatrix = np.log(ppmiMatrix) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix

In [147]:
class SimpleDistSem:
    
    def __init__(self, target_vocab, context_vocab, sents):
        self.sents = sents 
            # a list of sents considered in the corpus.
            #  in this case, words in context_vocab.
        self.vocab = target_vocab
        self.context = context_vocab
        self.t2i = {t:i for i,t in enumerate(self.vocab)}
        self.c2i = {word:i for i,word in enumerate(self.context)}
        self.algos = {'ppmi':ppmi, 'cosine':cosine}
    
    def build_t2c_matrix(self, win_size):
        
        print "... counting words"
        cooccurrenceDict = defaultdict(int)
        for sent in self.sents:
            for i,target in enumerate(sent):
                contexts = sent[max(0,i-win_size):i] + \
                           sent[min(i+1,len(sent)):min(i+1+win_size,len(sent))]
                for context in contexts:
                    cooccurrenceDict[(target,context)] += 1
        
        print "... building cooccurrence matrix"
        self.t2c = np.zeros((len(self.vocab),len(self.context)))
        for target in self.vocab:
            for context in self.context:
                self.t2c[self.t2i[target]][self.c2i[context]] = cooccurrenceDict[(target,context)]
        
        print "... performing association transformation (ppmi)"
        self.t2c = ppmi(self.t2c)
    
    def build_similarity_matrix(self, similarity='ppmi'):
        self.sim_algo = similarity
        self.simMatrix = self.algos[similarity](self.t2c)
    
    def k_most_similar(self, words, k=20):
        assert len(words)==len(filter(lambda w:1 if w in self.vocab else 0, words))
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.t2i[word]]
            if self.sim_algo=='ppmi':
                w2sim[word] = map(lambda idx:(self.context[idx],
                                              self.simMatrix[self.t2i[word]][idx]),
                                  np.argsort(simList)[::-1][1:k+1])
            else:
                w2sim[word] = map(lambda idx:(self.vocab[idx],
                                              self.simMatrix[self.t2i[word]][idx]),
                                  np.argsort(simList)[::-1][1:k+1])                
            # [1:k+1]: skip self-similarty.
        return w2sim

In [148]:
%%time
ds = SimpleDistSem(n_vocab,context_vocab,sents)

CPU times: user 5.52 ms, sys: 25.9 ms, total: 31.4 ms
Wall time: 35.3 ms


In [150]:
%%time
ds.build_t2c_matrix(win_size=2)

... counting words
... building cooccurrence matrix
... performing association transformation (ppmi)
CPU times: user 5min 16s, sys: 3min 37s, total: 8min 54s
Wall time: 10min 26s


In [47]:
# model_path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/CODE/MODELS/'
# cPickle.dump(ds.t2c, open(model_path+'t2c_matrix.p', 'wb')) 

#### b. Evaluator: K-Frequent

In [151]:
%%time
ds.build_similarity_matrix('cosine')

CPU times: user 1min 3s, sys: 1.27 s, total: 1min 4s
Wall time: 11.4 s


In [152]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 7.96 ms, sys: 4.96 ms, total: 12.9 ms
Wall time: 25.6 ms


In [153]:
w2sim['car']

[('car', 1.0000000000000002),
 ('vehicle', 0.25072320894335415),
 ('racing', 0.24022578208684151),
 ('engine', 0.22363940122018253),
 ('truck', 0.21850334001874441),
 ('motor', 0.18665395294251125),
 ('automobile', 0.18078987685826148),
 ('passenger', 0.18012375221484972),
 ('audi', 0.17510730832578281),
 ('diesel', 0.17401171371376842),
 ('race', 0.17030898939042763),
 ('bus', 0.16792566694909561),
 ('drive', 0.16740259061597954),
 ('train', 0.16448985856932427),
 ('company', 0.16447986670458833),
 ('driver', 0.16132842234251166),
 ('design', 0.1611686939345211),
 ('motorcycle', 0.15610660086994815),
 ('bmw', 0.15464034114236788),
 ('bicycle', 0.15417229535706148)]

#### c. Evaluator: BLESS

In [154]:
import random

In [166]:
def bless_evaluator(simMatrix, indexers):
    t2i,c2i = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    crPairs = [(c.split('-')[0].lower(),r.split('-')[0].lower(),rel) for c,_,rel,r in bless]
    crPairs = filter(lambda crPair: crPair[1] in ds.t2i, crPairs) # there's some unseen relata.
    posPairs = [(c,r) for c,r,rel in crPairs if rel in ['hyper','coord']]
    negPairs = [(c,r) for c,r,rel in crPairs if rel in ['mero','random-n']]
    
    return [map(lambda (c,r):(c,r,simMatrix[t2i[c]][c2i[r]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[t2i[c]][c2i[r]]), negPairs)]

##### Cosine

In [167]:
%%time
# ds.build_similarity_matrix('cosine')
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.t2i,ds.t2i])

CPU times: user 19.4 ms, sys: 6.44 ms, total: 25.8 ms
Wall time: 22.6 ms


In [168]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('shirt', 'object', 0.041864159270515566), ('train', 'jet', 0.08989481104311528), ('guitar', 'banjo', 0.22317135865887036), ('battleship', 'van', 0.047521489275332625), ('truck', 'transport', 0.11725661176396823)]
Average Cosine:  0.100564104776


In [169]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('hospital', 'symmetry', 0.013632528572148493), ('salmon', 'parameter', 0.021756062636271396), ('dagger', 'railway', 0.01554310357048351), ('van', 'metal', 0.031358001782264146), ('grape', 'disagreement', 0.0086348801143316407)]
Average Cosine:  0.0513189291684


### C. Acc@1 & Acc@5

##### Data & Measures

In [104]:
path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
with open(path,'rb') as f:
    bless = f.readlines()
bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
cr_pairs = [(c.split('-')[0].lower(),r.split('-')[0].lower(),rel) for c,_,rel,r in bless] # len=4536
pos_pairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
neg_pairs = [(c,r) for c,r,rel in crPairs if rel=='mero']

In [105]:
cs = list({c for c,_,_ in cr_pairs})
rs = list({r for _,r,_ in cr_pairs})
print 'concepts: ', all(cs_i in ds.t2i for cs_i in cs)
print 'relata: ', all(rs_i in ds.t2i for rs_i in rs)

concepts:  True
relata:  False


In [106]:
unseen_relata = filter(lambda rs_i: rs_i not in ds.t2i, rs)
cr_pairs = filter(lambda cr: cr[1] not in unseen_relata, crPairs) # len=4527

In [140]:
from __future__ import division
def weighted_jaccard(u,v): return sum(map(min, zip(u,v))) / sum(map(max, zip(u,v)))
def csn(u,v): return np.dot(u,v)/(np.dot(u,u)*np.dot(v,v))
def get_vector(word): return ds.simMatrix[ds.t2i[word]]
cr_dict = {(c,r):rel for c,r,rel in cr_pairs}

##### PPMI + Weighted Jaccard Condition

In [129]:
%%time
ds.build_similarity_matrix('ppmi')

CPU times: user 4.43 s, sys: 1.85 s, total: 6.28 s
Wall time: 6.67 s


In [130]:
%%time
concepts = defaultdict(list)
for concept, relatum, _ in cr_pairs:
    concepts[concept] += [(relatum, weighted_jaccard(get_vector(concept),get_vector(relatum)))]

CPU times: user 57.5 s, sys: 174 ms, total: 57.6 s
Wall time: 57.7 s


In [131]:
from operator import itemgetter
for word in concepts.keys():
    concepts[word] = sorted(concepts[word], key=itemgetter(1), reverse=True)
# concepts['frog']
# [('lizard', 0.091162496824635675),
#  ('toad', 0.078255155511223695),
#  ('snake', 0.068625132400646119),
#  ('turtle', 0.064585659819217031),
#  ('vertebrate', 0.063845815047815918),
#  ... ]

In [132]:
num_concepts = len(concepts)
correct = 0
for concept,relata in concepts.iteritems():
    correct += 1 if cr_dict[(concept,relata[0][0])] in ['hyper','coord'] else 0
print 'Acc@1 (ppmi): %.2f' % (correct/num_concepts)

Acc@1 (ppmi): 0.81


In [133]:
num_concepts = len(concepts)*5
correct = 0
for concept,relata in concepts.iteritems():
    for i in xrange(5):
        correct += 1 if cr_dict[(concept,relata[i][0])] in ['hyper','coord'] else 0
print 'Acc@5 (ppmi): %.2f' % (correct/num_concepts)

Acc@5 (ppmi): 0.71


##### PPMI + Cosine Condition

In [141]:
%%time
ds.build_similarity_matrix('ppmi')

CPU times: user 4.18 s, sys: 1.68 s, total: 5.85 s
Wall time: 6.2 s


In [142]:
%%time
concepts = defaultdict(list)
for concept, relatum, _ in cr_pairs:
    concepts[concept] += [(relatum, csn(get_vector(concept),get_vector(relatum)))]

CPU times: user 124 ms, sys: 956 µs, total: 125 ms
Wall time: 125 ms


In [143]:
from operator import itemgetter
for word in concepts.keys():
    concepts[word] = sorted(concepts[word], key=itemgetter(1), reverse=True)
# concepts['frog']
# [('lizard', 0.091162496824635675),
#  ('toad', 0.078255155511223695),
#  ('snake', 0.068625132400646119),
#  ('turtle', 0.064585659819217031),
#  ('vertebrate', 0.063845815047815918),
#  ... ]

In [144]:
num_concepts = len(concepts)
correct = 0
for concept,relata in concepts.iteritems():
    correct += 1 if cr_dict[(concept,relata[0][0])] in ['hyper','coord'] else 0
print 'Acc@1 (ppmi): %.2f' % (correct/num_concepts)

Acc@1 (ppmi): 0.81


In [145]:
num_concepts = len(concepts)*5
correct = 0
for concept,relata in concepts.iteritems():
    for i in xrange(5):
        correct += 1 if cr_dict[(concept,relata[i][0])] in ['hyper','coord'] else 0
print 'Acc@5 (ppmi): %.2f' % (correct/num_concepts)

Acc@5 (ppmi): 0.74
