# Simple Distributional Semantics Model

### A. Data Loading Facilities

In [74]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [75]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from collections import defaultdict
from functools import partial
from itertools import permutations
from string import punctuation

In [76]:
def load_wiki(cutoffFreq=50):
    
    print "... extracting data"
    with open('wikicorpus.txt','rb') as f:
        raw = f.readlines()
    raw = [sent.split() for sent in raw if sent.startswith('<c>')] 
        # extract sentences; split sentences into word complexes.
    raw = [[map(partial(str.split, word), '|') for word in sent] for sent in raw] 
        # split word complexes into words.
    
    print "... cleaning data"
    sents = [[word[0][1].lower() for word in sent if len(word[0])>1 
              and word[0][2].startswith('N')
              and word[0][1].lower() not in stopwords
              and word[0][1] not in punctuation] for sent in raw]
        # extract lemmas => complete sents corpus .
    
    print "... building token list and vocabulary"
    tokens = [word for sent in sents for word in sent]
        # type: list of words.
    fdist = nltk.FreqDist(tokens)
    vocab = list(set(tokens))   
    
    print "... saving top %d-frequent in vocabulary" % cutoffFreq
    vocab = [word for word in vocab if fdist[word] >= cutoffFreq]
    sents = [[word.decode('utf-8','ignore') for word in sent if word in vocab] for sent in sents]
        # type: list of lists of words.
        
    return [sents, tokens, vocab]

### B. Analyzer

In [77]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [78]:
# SIMILARITY MEASURES
def cosine(w2w):
    return
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix
          
def lmi(w2w):
    return

In [79]:
class SimpleDistSem:
    
    def __init__(self, data=load_wiki, kFrequent=50):
        self.sents, self.tokens, self.vocab = data(50)
    
    def build_w2w_matrix(self, vectorizer=CountVectorizer):
        
        print "... building model and paraphernalias"
        model = vectorizer()
        model.fit_transform([' '.join(sent) for sent in self.sents])
        self.vocab = model.vocabulary_.keys()
        self.wordToIndex = model.vocabulary_
        self.indexToWord = {v:k for k,v in self.wordToIndex.items()}
        self.vocabSize = len(self.vocab)
        
        print "... building word-to-word matrix"
        w2w = np.zeros((self.vocabSize,self.vocabSize))
        for sent in self.sents:
            for w_i,w_j in permutations(sent,2):
                if w_i in self.vocab and w_j in self.vocab:
                    w2w[self.wordToIndex[w_i]][self.wordToIndex[w_j]] += 1
        self.w2w = w2w
    
    def build_similarity_matrix(self, similarity=ppmi):
        self.simMatrix = similarity(self.w2w)
    
    def k_most_similar(self, words, k=20):
        assert len(words)==len(filter(lambda w:1 if w in self.vocab else 0, words))
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.wordToIndex[word]]
            w2sim[word] = map(lambda idx:(self.indexToWord[idx],
                                          self.simMatrix[self.wordToIndex[word]][idx]),
                              np.argsort(simList)[::-1][:k])
        return w2sim
    

In [81]:
%%time
ds = SimpleDistSem(kFrequent=100)

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary
CPU times: user 5min 11s, sys: 8.96 s, total: 5min 20s
Wall time: 5min 23s


In [82]:
%%time
ds.build_w2w_matrix()

... building model and paraphernalias
... building word-to-word matrix
CPU times: user 1h 2min 17s, sys: 22.9 s, total: 1h 2min 40s
Wall time: 1h 2min 29s


In [83]:
%%time
ds.build_similarity_matrix()

CPU times: user 1.81 s, sys: 746 ms, total: 2.56 s
Wall time: 2.96 s


In [84]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
w2sim = ds.k_most_similar(words)

CPU times: user 7.06 ms, sys: 2.63 ms, total: 9.69 ms
Wall time: 14.1 ms


In [88]:
w2sim['car']

[(u'bugatti', 4.8527980020768435),
 (u'brabham', 4.8313494586693597),
 (u'racing', 4.4619548146786379),
 (u'car', 4.4586625842872083),
 (u'bogie', 4.3360280214393345),
 (u'audi', 4.2875567856731793),
 (u'clutch', 4.2313513332882513),
 (u'brake', 4.2258495851681701),
 (u'aston', 4.1464781874132743),
 (u'chassis', 4.1382022781094143),
 (u'grip', 4.0938699684444142),
 (u'motors', 4.0904116142730658),
 (u'earnhardt', 4.0813858030657535),
 (u'bentley', 4.0659757740553975),
 (u'cable', 3.9069692227710919),
 (u'bmw', 3.8521240552396079),
 (u'driving', 3.7989865555424505),
 (u'prix', 3.7752967844200458),
 (u'tire', 3.7719578831545308),
 (u'drag', 3.6860582350175921)]

### C. Evaluators

In [94]:
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero']
    
    
    return [map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), negPairs)]

In [95]:
%%time
posEval, negEval = bless_evaluator(ds.simMatrix, indexers=[ds.wordToIndex, ds.indexToWord])

CPU times: user 14.8 ms, sys: 5.27 ms, total: 20.1 ms
Wall time: 17.7 ms


In [97]:
posEval

[('frog', 'animal', 3.3320557700805304),
 ('frog', 'beast', 0.0),
 ('frog', 'creature', 0.0),
 ('frog', 'vertebrate', 3.879642982150346),
 ('lizard', 'animal', 1.6260814361188607),
 ('lizard', 'beast', 0.0),
 ('lizard', 'carnivore', 4.7110134121801606),
 ('lizard', 'creature', 3.4886539816037847),
 ('lizard', 'reptile', 5.8582097007040144),
 ('lizard', 'vertebrate', 4.2531101898685124),
 ('snake', 'animal', 2.681094840732654),
 ('snake', 'beast', 3.684011271118945),
 ('snake', 'creature', 2.7519079169895235),
 ('snake', 'reptile', 4.2051729042155976),
 ('snake', 'vertebrate', 4.2095113058141962),
 ('turtle', 'animal', 2.3693030276000884),
 ('turtle', 'beast', 4.2476881953402792),
 ('turtle', 'creature', 2.6224376606509123),
 ('turtle', 'food', 0.0),
 ('turtle', 'pet', 0.0),
 ('turtle', 'reptile', 4.7688498284369318),
 ('turtle', 'vertebrate', 0.0),
 ('phone', 'artifact', 0.0),
 ('phone', 'commodity', 0.0),
 ('phone', 'device', 3.1683963253919254),
 ('phone', 'equipment', 2.161913518605

In [98]:
negEval

[('frog', 'blood', 0.0),
 ('frog', 'bone', 0.0),
 ('frog', 'eye', 0.0),
 ('frog', 'foot', 0.0),
 ('frog', 'head', 0.0),
 ('frog', 'leg', 4.3559271623967595),
 ('frog', 'lung', 0.0),
 ('frog', 'poison', 4.485416532501703),
 ('frog', 'skin', 3.3401031507876127),
 ('frog', 'tongue', 0.0),
 ('lizard', 'blood', 0.0),
 ('lizard', 'eye', 0.0),
 ('lizard', 'head', 0.84088686145944602),
 ('lizard', 'leg', 4.0362471895549801),
 ('lizard', 'mouth', 2.6132695932862471),
 ('lizard', 'scale', 0.0),
 ('lizard', 'skin', 0.0),
 ('lizard', 'stripe', 0.0),
 ('lizard', 'tail', 2.9997262961746785),
 ('lizard', 'tongue', 0.0),
 ('snake', 'bone', 1.5864192550464704),
 ('snake', 'eye', 2.0785228787153276),
 ('snake', 'head', 1.490435157965075),
 ('snake', 'mouth', 1.8765235286719857),
 ('snake', 'poison', 3.0235253869374978),
 ('snake', 'scale', 0.83489600181724122),
 ('snake', 'skin', 3.2645063663432978),
 ('snake', 'tail', 0.0),
 ('snake', 'tongue', 2.4961051508326557),
 ('snake', 'tooth', 0.0),
 ('turtle',

### D. Accessories

In [8]:
import warnings
warnings.filterwarnings("ignore")