# Distributional Semantics Model: Narrow Word-Window (-2,+2)

### A. Data-Loading Facilities

In [1]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [2]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from collections import defaultdict
from functools import partial
from itertools import permutations, product
punctuation = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def load_wiki(cutoffFreq=50):
    
    print "... extracting data"
    with open('wikicorpus.txt','rb') as f:
        raw = f.readlines()
    raw = [sent[4:].split() for sent in raw if sent.startswith('<c>')] 
        # sent[4:]: get rid of initial <c>.
        # extract sentences; split sentences into word complexes.
    raw = [[map(partial(str.split, word), '|') for word in sent] for sent in raw] 
        # split word complexes into words.
    
    print "... cleaning data"
    sents = [[word[0][1].lower() for word in sent if len(word[0])>1 
              and (word[0][2].startswith('N') or word[0][2]=='.')
              and word[0][1].lower() not in stopwords
              and word[0][1] not in punctuation] for sent in raw]
        # extract lemmas => complete sents corpus .
    
    print "... building token list and vocabulary"
    tokens = [word for sent in sents for word in sent]
        # type: list of words.
    fdist = nltk.FreqDist(tokens)
    vocab = list(set(tokens))   
    
    print "... saving top %d-frequent in vocabulary" % cutoffFreq
    vocab = [word for word in vocab if fdist[word] >= cutoffFreq]
        # vocab is not returned, because the k-frequent cut latter can change it.
    sents = [word.decode('utf-8','ignore') for sent in sents for word in sent if word in vocab]
        # type: list of all words in the corpus, separated by periods.
    sents.append('.') # handle -1 window lookahead +2 error.
        
    return sents

In [5]:
%%time
sents = load_wiki(50)

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary
CPU times: user 4min 56s, sys: 7.46 s, total: 5min 3s
Wall time: 5min 1s


### B1. Analyzer: Cooccurrence Matrix Based

#### A. Model

In [6]:
import numpy as np

In [7]:
# SIMILARITY MEASURES
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix

In [12]:
class SimpleDistSem:
    
    def __init__(self, data=sents, kFrequent=50):
        self.sents = sents # a list of all words in the corpus (count=2573841).
        self.vocab = list({word for word in sents})
        self.wordToIndex = {word:i for i,word in enumerate(self.vocab)}
        self.indexToWord = {i:word for word,i in self.wordToIndex.iteritems()}
    
    def build_w2w_matrix(self):
        
        print "... counting words"
        cooccurrenceDict = defaultdict(int)
        def add_entry(w_i,w_j): 
            cooccurrenceDict[(self.wordToIndex[w_i],self.wordToIndex[w_j])] += 1
            cooccurrenceDict[(self.wordToIndex[w_j],self.wordToIndex[w_i])] += 1
        for i,word in enumerate(self.sents):
            if word=='.': continue # not pass, because we do not execute anything after this.
            elif self.sents[i+2]=='.':
                add_entry(word,self.sents[i-2])
                add_entry(word,self.sents[i-1])
                add_entry(word,self.sents[i+1])                
            elif self.sents[i+1]=='.':
                add_entry(word,self.sents[i-2])
                add_entry(word,self.sents[i-1])
            elif self.sents[i-1]=='.':
                add_entry(word,self.sents[i+1])
                add_entry(word,self.sents[i+2])
            elif self.sents[i-2]=='.':
                add_entry(word,self.sents[i-1])
                add_entry(word,self.sents[i+1])
                add_entry(word,self.sents[i+2])
            elif i==0: 
                add_entry(word,self.sents[i+1])
                add_entry(word,self.sents[i+2])
            elif i==1: 
                add_entry(word,self.sents[i-1])
                add_entry(word,self.sents[i+1])
                add_entry(word,self.sents[i+2])
            else: 
                add_entry(word,self.sents[i-2])
                add_entry(word,self.sents[i-1])
                add_entry(word,self.sents[i+1])
                add_entry(word,self.sents[i+2])                
        
        print "... building cooccurrence matrix"
        self.w2w = np.zeros((len(self.vocab),len(self.vocab)))
        for (widx_i,widx_j),count in cooccurrenceDict.iteritems():
            self.w2w[widx_i][widx_j] = count   
    
    def build_similarity_matrix(self, similarity=ppmi):
        self.simMatrix = similarity(self.w2w)
    
    def k_most_similar(self, words, k=20):
        assert len(words)==len(filter(lambda w:1 if w in self.vocab else 0, words))
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.wordToIndex[word]]
            w2sim[word] = map(lambda idx:(self.indexToWord[idx],
                                          self.simMatrix[self.wordToIndex[word]][idx]),
                              np.argsort(simList)[::-1][1:k+1])
            # [1:k+1]: skip self-similarty.
        return w2sim

In [13]:
%%time
ds = SimpleDistSem(sents)

CPU times: user 298 ms, sys: 3.35 ms, total: 301 ms
Wall time: 300 ms


In [14]:
%%time
ds.build_w2w_matrix()

... counting words
... building cooccurrence matrix
CPU times: user 14.9 s, sys: 552 ms, total: 15.5 s
Wall time: 15.5 s


#### b. Evaluator: K-Frequent

In [15]:
%%time
ds.build_similarity_matrix(ppmi)

CPU times: user 1.63 s, sys: 636 ms, total: 2.27 s
Wall time: 2.27 s


In [16]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 8.95 ms, sys: 985 µs, total: 9.93 ms
Wall time: 8.7 ms


In [17]:
w2sim['car']

[(u'brabham', 4.7541390061810009),
 (u'bogie', 4.7196794382386171),
 (u'racing', 4.6159968217289284),
 (u'brake', 4.3700947885249093),
 (u'earnhardt', 4.3508297443719703),
 (u'bentley', 4.336193236849228),
 (u'cable', 4.333252451412112),
 (u'clutch', 4.3034034140262367),
 (u'audi', 4.1868158357746275),
 (u'chassis', 4.1559314130182834),
 (u'truck', 4.1102047334027869),
 (u'aston', 4.0998044587849973),
 (u'grip', 4.0076891698771915),
 (u'car', 3.9737637378896324),
 (u'prix', 3.9188333410889142),
 (u'installation', 3.8923217478803402),
 (u'motors', 3.8778311826189782),
 (u'tire', 3.8627233996270061),
 (u'luxury', 3.8419753494828979),
 (u'crash', 3.794477126063899)]

In [18]:
%%time
ds.build_similarity_matrix(cosine)

CPU times: user 22.6 s, sys: 310 ms, total: 22.9 s
Wall time: 3.77 s


In [19]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 13 ms, sys: 813 µs, total: 13.9 ms
Wall time: 13.1 ms


In [20]:
w2sim['piano']

[(u'concerto', 0.69962000806699731),
 (u'sonata', 0.66682645809820673),
 (u'cello', 0.66204078544049516),
 (u'violin', 0.62362338390182581),
 (u'trio', 0.54569317313435917),
 (u'bartk', 0.51422402079027529),
 (u'instrument', 0.49846288561417884),
 (u'bass', 0.49324167360193788),
 (u'composer', 0.48117823426359407),
 (u'guitar', 0.48099499031158183),
 (u'debussy', 0.47360253110598505),
 (u'quartet', 0.47347787243971523),
 (u'piece', 0.46483406817803208),
 (u'banjo', 0.44451716233948796),
 (u'repertoire', 0.42313460635578409),
 (u'solo', 0.4161858330271393),
 (u'string', 0.40897887998984539),
 (u'mozart', 0.40546351019303656),
 (u'music', 0.40448759314494348),
 (u'orchestra', 0.40356894488532824)]

#### c. Evaluator: BLESS

In [21]:
import random

In [22]:
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero']
    
    return [map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), negPairs)]

##### PPMI

In [23]:
%%time
ds.build_similarity_matrix(ppmi)
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 1.7 s, sys: 625 ms, total: 2.33 s
Wall time: 2.34 s


In [24]:
print "Examples of Evaluation on Positive Relations (PPMI): "
print random.sample(posEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in posEval])

Examples of Evaluation on Positive Relations (PPMI): 
[('sheep', 'mammal', 2.9225073882803363), ('restaurant', 'construction', 1.6390546363558582), ('elephant', 'vertebrate', 0.0), ('pig', 'mammal', 0.0), ('goat', 'food', 1.3948544777760254)]
Average PPMI:  1.50076975329


In [25]:
print "Examples of Evaluation on Negative Relations (PPMI): "
print random.sample(negEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in negEval])

Examples of Evaluation on Negative Relations (PPMI): 
[('cathedral', 'archbishop', 3.3945832542402079), ('dress', 'neck', 0.0), ('truck', 'accelerator', 0.0), ('snake', 'skin', 3.2467953233848528), ('violin', 'neck', 0.0)]
Average PPMI:  1.30109975988


##### Cosine

In [26]:
%%time
ds.build_similarity_matrix(cosine)
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 23.1 s, sys: 291 ms, total: 23.4 s
Wall time: 3.89 s


In [27]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('cello', 'instrument', 0.60768635898244827), ('flute', 'instrument', 0.42458316313315286), ('piano', 'furniture', 0.22483378356353043), ('castle', 'housing', 0.18794690763427052), ('hospital', 'construction', 0.25102776063439169)]
Average Cosine:  0.228813688324


In [28]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('train', 'engine', 0.19246061385996641), ('cathedral', 'ceiling', 0.19485368119580454), ('fighter', 'crew', 0.2569492415497649), ('lizard', 'leg', 0.22180841313329286), ('pistol', 'grip', 0.18631294795623843)]
Average Cosine:  0.178549498886
