# Distributional Semantics Model: Narrow Word-Window (-2,+2)

### A. Data-Loading Facilities

In [12]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [23]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from collections import defaultdict, Counter
from functools import partial
from itertools import permutations, product
punctuation = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
from operator import itemgetter

In [14]:
import warnings
warnings.filterwarnings("ignore")

In [27]:
def load_wiki(cutoffFreq=20):
    
    print "... extracting data"
    with open('wikicorpus.txt','rb') as f:
        raw = f.readlines()
    raw = [sent[4:].split() for sent in raw if sent.startswith('<c>')] 
        # sent[4:]: get rid of initial <c>.
        # extract sentences; split sentences into word complexes.
    raw = [[map(partial(str.split, word), '|') for word in sent] for sent in raw] 
        # split word complexes into words.
    
    print "... cleaning data"
    tagged_sents = [[(word[0][1].lower(),word[0][2]) for word in sent if len(word[0])>1 
                      and word[0][1].lower() not in stopwords
                      and word[0][1] not in punctuation] for sent in raw]
    
    print "... getting top-%d frequent context words and top-50 frequent nouns" % cutoffFreq
    all_tokens = [word for tagged_sent in tagged_sents for word,tag in tagged_sent]
        # type: list of words.
    context_fdist = Counter(all_tokens)
    context_vocab = list({word for word,freq in context_fdist.iteritems() if freq>=20})
    n_tokens = [word for tagged_sent in tagged_sents for word,tag in tagged_sent if tag.startswith('N')]
    n_fdist = Counter(n_tokens)
    n_vocab = list({word for word,freq in n_fdist.iteritems() if freq>=50})
    
    sents = [[word for word,tag in tagged_sent if word in context_vocab] for tagged_sent in tagged_sents]
    
    return (n_vocab, context_vocab, sents)


In [28]:
%%time
n_vocab, context_vocab, sents = load_wiki()

... extracting data
... cleaning data
... getting top-20 frequent context words and top-50 frequent nouns
CPU times: user 19min 23s, sys: 9.23 s, total: 19min 32s
Wall time: 19min 30s


In [44]:
import cPickle

In [45]:
cleaned_data_path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/CODE/CLEANED_DATA/'
cPickle.dump((n_vocab,context_vocab,sents), open(cleaned_data_path+'cleaned_wiki.p', 'wb')) 
# obj = cPickle.load(open('save.p', 'rb'))

### B1. Analyzer: Cooccurrence Matrix Based

#### A. Model

In [38]:
import numpy as np

In [39]:
# SIMILARITY MEASURES
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix

In [103]:
class SimpleDistSem:
    
    def __init__(self, target_vocab, context_vocab, sents):
        self.sents = sents 
            # a list of sents considered in the corpus.
            #  in this case, words in context_vocab.
        self.vocab = target_vocab
        self.context = context_vocab
        self.t2i = {t:i for i,t in enumerate(self.vocab)}
        self.c2i = {word:i for i,word in enumerate(self.context)}
        self.algos = {'ppmi':ppmi, 'cosine':cosine}
    
    def build_t2c_matrix(self, win_size):
        
        print "... counting words"
        cooccurrenceDict = defaultdict(int)
        for sent in self.sents:
            for i,target in enumerate(sent):
                contexts = sent[max(0,i-win_size):i] + \
                           sent[min(i+1,len(sent)):min(i+1+win_size,len(sent))]
                for context in contexts:
                    cooccurrenceDict[(target,context)] += 1
        
        print "... building cooccurrence matrix"
        self.t2c = np.zeros((len(self.vocab),len(self.context)))
        for target in self.vocab:
            for context in self.context:
                self.t2c[self.t2i[target]][self.c2i[context]] = cooccurrenceDict[(target,context)]
    
    def build_similarity_matrix(self, similarity='ppmi'):
        self.sim_algo = similarity
        self.simMatrix = self.algos[similarity](self.t2c)
    
    def k_most_similar(self, words, k=20):
        assert len(words)==len(filter(lambda w:1 if w in self.vocab else 0, words))
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.t2i[word]]
            if self.sim_algo=='ppmi':
                w2sim[word] = map(lambda idx:(self.context[idx],
                                              self.simMatrix[self.t2i[word]][idx]),
                                  np.argsort(simList)[::-1][1:k+1])
            else:
                w2sim[word] = map(lambda idx:(self.vocab[idx],
                                              self.simMatrix[self.t2i[word]][idx]),
                                  np.argsort(simList)[::-1][1:k+1])                
            # [1:k+1]: skip self-similarty.
        return w2sim

In [104]:
%%time
ds = SimpleDistSem(n_vocab,context_vocab,sents)

CPU times: user 8.09 ms, sys: 9.56 ms, total: 17.7 ms
Wall time: 31.8 ms


In [None]:
%%time
ds.build_t2c_matrix(win_size=2)

... counting words
... building cooccurrence matrix


In [47]:
model_path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/CODE/MODELS/'
cPickle.dump(ds.t2c, open(model_path+'t2c_matrix.p', 'wb')) 

#### b. Evaluator: K-Frequent

In [100]:
%%time
ds.build_similarity_matrix('ppmi')

CPU times: user 4.16 s, sys: 2.1 s, total: 6.26 s
Wall time: 6.57 s


In [102]:
ds.simMatrix.shape

(6971, 18609)

In [101]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

IndexError: list index out of range

In [33]:
w2sim['car']

[(u'brabham', 4.7766275467196584),
 (u'bogie', 4.7055917736029063),
 (u'racing', 4.5943839798650412),
 (u'earnhardt', 4.366924146418258),
 (u'bentley', 4.3662664679992869),
 (u'brake', 4.3409486600149974),
 (u'cable', 4.3255062609917951),
 (u'clutch', 4.2657252387774101),
 (u'audi', 4.2176254439829988),
 (u'aston', 4.1303838909076473),
 (u'chassis', 4.128978409298913),
 (u'truck', 4.0516232018194867),
 (u'grip', 3.9780431663256288),
 (u'car', 3.9593991033033911),
 (u'installation', 3.9023097782248346),
 (u'motors', 3.8902269603187118),
 (u'prix', 3.8135937342977959),
 (u'tire', 3.8054304236586352),
 (u'compact', 3.7933090631262902),
 (u'crash', 3.7869508643837979)]

In [34]:
%%time
ds.build_similarity_matrix('cosine')

CPU times: user 20.8 s, sys: 272 ms, total: 21 s
Wall time: 3.54 s


In [35]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 12 ms, sys: 649 µs, total: 12.6 ms
Wall time: 11.8 ms


In [36]:
w2sim['piano']

[(u'piano', 0.99999999999999956),
 (u'cello', 0.84187590856948435),
 (u'instrument', 0.83042142568560717),
 (u'piece', 0.82429209108329438),
 (u'concerto', 0.82214317322249686),
 (u'style', 0.80750453162609459),
 (u'composition', 0.80621877549042775),
 (u'music', 0.80335035929212972),
 (u'sound', 0.80095692227873194),
 (u'sonata', 0.79813149147023077),
 (u'violin', 0.7963416282295428),
 (u'work', 0.79472264984458352),
 (u'arrangement', 0.79370701901133001),
 (u'performance', 0.79002073528385708),
 (u'career', 0.78407199429323937),
 (u'influence', 0.78106297327464669),
 (u'tone', 0.78064091165776106),
 (u'theme', 0.78052972790344377),
 (u'guitar', 0.78019642679685886),
 (u'history', 0.78010361181095722)]

In [85]:
ds.simMatrix.shape

(6971, 6971)

#### c. Evaluator: BLESS

In [52]:
import random

In [87]:
def bless_evaluator(simMatrix, indexers=[t2i,c2i]):
    t2i,c2i = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero']
    
    return [map(lambda (c,r):(c,r,simMatrix[t2i[c]][c2i[r]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[t2i[c]][c2i[r]]), negPairs)]

##### PPMI

In [89]:
%%time
ds.build_similarity_matrix('ppmi')
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.t2i,c2i])

CPU times: user 4.05 s, sys: 1.53 s, total: 5.58 s
Wall time: 5.59 s


In [90]:
print "Examples of Evaluation on Positive Relations (PPMI): "
print random.sample(posEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in posEval])

Examples of Evaluation on Positive Relations (PPMI): 
[('shirt', 'garment', 0.0), ('flute', 'object', 0.0), ('bomb', 'object', 0.0), ('jet', 'plane', 0.0), ('library', 'construction', 0.0)]
Average PPMI:  0.111013883294


In [91]:
print "Examples of Evaluation on Negative Relations (PPMI): "
print random.sample(negEval, 5)
print "Average PPMI: ", np.mean([ppmiVal for _,_,ppmiVal in negEval])

Examples of Evaluation on Negative Relations (PPMI): 
[('bowl', 'clay', 0.0), ('oak', 'trunk', 0.0), ('van', 'seat', 0.0), ('table', 'cover', 0.0), ('tiger', 'eye', 0.0)]
Average PPMI:  0.0664503314987


##### Cosine

In [92]:
%%time
ds.build_similarity_matrix('cosine')
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.t2i,ds.t2i])

CPU times: user 57.3 s, sys: 717 ms, total: 58 s
Wall time: 9.38 s


In [93]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('dagger', 'object', 0.65870104098146565), ('horse', 'mammal', 0.65559039669903407), ('sword', 'device', 0.73989582866053138), ('bag', 'object', 0.78645090412740359), ('ferry', 'craft', 0.45008036681292374)]
Average Cosine:  0.573630019172


In [94]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('restaurant', 'seat', 0.615422359359027), ('restaurant', 'chef', 0.50549522953873005), ('cannon', 'charge', 0.53031500999697223), ('goat', 'mouth', 0.40557216613367419), ('box', 'side', 0.58359322474165565)]
Average Cosine:  0.558184338667
