# Simple Distributional Semantics Model

### A. Data Loading Facilities

In [47]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [48]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from collections import defaultdict
from functools import partial
from itertools import permutations
from string import punctuation

In [49]:
def load_wiki(cutoffFreq=50):
    
    print "... extracting data"
    with open('wikicorpus.txt','rb') as f:
        raw = f.readlines()
    raw = [sent.split() for sent in raw if sent.startswith('<c>')] 
        # extract sentences; split sentences into word complexes.
    raw = [[map(partial(str.split, word), '|') for word in sent] for sent in raw] 
        # split word complexes into words.
    
    print "... cleaning data"
    sents = [[word[0][1].lower() for word in sent if len(word[0])>1 
              and word[0][2].startswith('N')
              and word[0][1].lower() not in stopwords
              and word[0][1] not in punctuation] for sent in raw]
        # extract lemmas => complete sents corpus .
    
    print "... building token list and vocabulary"
    tokens = [word for sent in sents for word in sent]
        # type: list of words.
    fdist = nltk.FreqDist(tokens)
    vocab = list(set(tokens))   
    
    print "... saving top %d-frequent in vocabulary" % cutoffFreq
    vocab = [word for word in vocab if fdist[word] >= cutoffFreq]
    sents = [[word.decode('utf-8','ignore') for word in sent if word in vocab] for sent in sents]
        # type: list of lists of words.
        
    return [sents, tokens, vocab]

### B. Analyzer

In [59]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [60]:
# SIMILARITY MEASURES
def cosine(w2w):
    return
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix
          
def lmi(w2w):
    return

In [61]:
class SimpleDistSem:
    
    def __init__(self, data=load_wiki, kFrequent=50):
        self.sents, self.tokens, self.vocab = data(50)
    
    def build_w2w_matrix(self, vectorizer=CountVectorizer):
        
        print "... building model and paraphernalias"
        model = vectorizer()
        model.fit_transform([' '.join(sent) for sent in self.sents])
        self.vocab = model.vocabulary_.keys()
        self.wordToIndex = model.vocabulary_
        self.indexToWord = {v:k for k,v in self.wordToIndex.items()}
        self.vocabSize = len(self.vocab)
        
        print "... building word-to-word matrix"
        w2w = np.zeros((self.vocabSize,self.vocabSize))
        for sent in self.sents:
            for w_i,w_j in permutations(sent,2):
                if w_i in self.vocab and w_j in self.vocab:
                    w2w[self.wordToIndex[w_i]][self.wordToIndex[w_j]] += 1
        self.w2w = w2w
    
    def build_similarity_matrix(self, similarity=ppmi):
        self.simMatrix = similarity(self.w2w)
    
    def k_most_similar(self, words, k=20):
        assert len(words)==len(filter(lambda w:1 if w in self.vocab else 0, words))
        w2sim = {}
        for word in words:
            simList = self.simMatrix[self.wordToIndex[word]]
            w2sim[word] = map(lambda idx:(self.IndexToWord[idx],
                                          self.simMatrix[self.wordToIndex[word]][idx]),
                              np.argsort(simList)[::-1][:k])
        return w2sim
    

In [53]:
%%time
ds = SimpleDistSem(kFrequent=50)

... extracting data
... cleaning data
... building token list and vocabulary
... saving top 50-frequent in vocabulary
CPU times: user 6min 8s, sys: 7.24 s, total: 6min 16s
Wall time: 6min 15s


In [54]:
%%time
ds.build_w2w_matrix()

... building model and paraphernalias
... building word-to-word matrix
CPU times: user 1h 24s, sys: 20.1 s, total: 1h 44s
Wall time: 1h 28s


In [63]:
%%time
ds.build_similarity_matrix(similarity=ppmi)

CPU times: user 1.73 s, sys: 633 ms, total: 2.37 s
Wall time: 2.6 s


In [70]:
%%time
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
# w2sim = ds.k_most_similar(words, 20)
'''
[('frog', 'animal', 3.3325151288269708),
 ('frog', 'beast', 0.0),
 ('frog', 'creature', 0.0),
 ('frog', 'vertebrate', 3.8798665752467492),
 ('lizard', 'animal', 1.625313047626979),
 ('lizard', 'beast', 0.0),
 ('lizard', 'carnivore', 4.7100092580382418),
 ('lizard', 'creature', 3.4876498274618664),
 ('lizard', 'reptile', 5.8572055465620956),
 ('lizard', 'vertebrate', 4.2521060357265936),
 ...

[('frog', 'blood', 0.0),
 ('frog', 'bone', 0.0),
 ('frog', 'eye', 0.0),
 ('frog', 'foot', 0.0),
 ('frog', 'head', 0.0),
 ('frog', 'leg', 4.3561507554931627),
 ('frog', 'lung', 0.0),
 ('frog', 'poison', 4.4856401255981062),
 ('frog', 'skin', 3.3403267438840158),
 ('frog', 'tongue', 0.0),
 ('lizard', 'blood', 0.0),
 ('lizard', 'eye', 0.0),
 ('lizard', 'head', 0.84024111741419227),
 ('lizard', 'leg', 4.0352430354130613),
 ('lizard', 'mouth', 2.6126872910805572),
 ...
'''

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


0.0

### C. Evaluators

In [None]:
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero']
    
    print "Evaluation of Positive Pairs: "
    map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), posPairs)
    
    print "Evaluation of Negative Pairs: "
    map(lambda (c,r):(c,r,simMatrix[wordToIndex[c]][wordToIndex[r]]), negPairs)

In [None]:
bless_evaluator(ds.build_similarity_matrix(), indexers=[ds.wordToIndex, ds.indexToWord])

### D. Accessories

In [8]:
import warnings
warnings.filterwarnings("ignore")

# Breakdown

### A. Preprocessing

In [30]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [31]:
import nltk
from nltk.corpus import stopwords
from collections import defaultdict

In [32]:
from functools import partial

In [33]:
%%time
# read raw file
with open('wikicorpus.txt','rb') as f:
    raw = f.readlines()

CPU times: user 3.82 s, sys: 740 ms, total: 4.56 s
Wall time: 4.58 s


In [34]:
%%time
# extract sentences
raw = [sent for sent in raw if sent.startswith('<c>')]
print raw[1]

<c> Anarchism|Anarchism|NNP|I-NP|O|N is|be|VBZ|I-VP|O|(S[dcl]\NP)/NP a|a|DT|I-NP|O|NP[nb]/N political|political|JJ|I-NP|O|N/N philosophy|philosophy|NN|I-NP|O|N encompassing|encompass|VBG|I-VP|O|(S[ng]\NP)/NP theories|theory|NNS|I-NP|O|N and|and|CC|I-NP|O|conj attitudes|attitude|NNS|I-NP|O|N which|which|WDT|B-NP|O|(NP\NP)/(S[dcl]\NP) consider|consider|VBP|I-VP|O|((S[dcl]\NP)/(S[to]\NP))/NP the|the|DT|I-NP|O|NP[nb]/N state|state|NN|I-NP|O|N to|to|TO|I-VP|O|(S[to]\NP)/(S[b]\NP) be|be|VB|I-VP|O|(S[b]\NP)/(S[adj]\NP) unnecessary|unnecessary|JJ|I-ADJP|O|S[adj]\NP ,|,|,|I-ADJP|O|, harmful|harmful|JJ|I-ADJP|O|S[adj]\NP ,|,|,|I-ADJP|O|, and/|and/|JJ|I-ADJP|O|S[adj]\NP or|or|CC|I-ADJP|O|conj undesirable|undesirable|JJ|I-ADJP|O|S[adj]\NP .|.|.|O|O|.

CPU times: user 2.5 s, sys: 43.6 ms, total: 2.54 s
Wall time: 2.54 s


In [35]:
%%time
# split sentences into word complexes
raw = [sent.split() for sent in raw]
print raw[1]

['<c>', 'Anarchism|Anarchism|NNP|I-NP|O|N', 'is|be|VBZ|I-VP|O|(S[dcl]\\NP)/NP', 'a|a|DT|I-NP|O|NP[nb]/N', 'political|political|JJ|I-NP|O|N/N', 'philosophy|philosophy|NN|I-NP|O|N', 'encompassing|encompass|VBG|I-VP|O|(S[ng]\\NP)/NP', 'theories|theory|NNS|I-NP|O|N', 'and|and|CC|I-NP|O|conj', 'attitudes|attitude|NNS|I-NP|O|N', 'which|which|WDT|B-NP|O|(NP\\NP)/(S[dcl]\\NP)', 'consider|consider|VBP|I-VP|O|((S[dcl]\\NP)/(S[to]\\NP))/NP', 'the|the|DT|I-NP|O|NP[nb]/N', 'state|state|NN|I-NP|O|N', 'to|to|TO|I-VP|O|(S[to]\\NP)/(S[b]\\NP)', 'be|be|VB|I-VP|O|(S[b]\\NP)/(S[adj]\\NP)', 'unnecessary|unnecessary|JJ|I-ADJP|O|S[adj]\\NP', ',|,|,|I-ADJP|O|,', 'harmful|harmful|JJ|I-ADJP|O|S[adj]\\NP', ',|,|,|I-ADJP|O|,', 'and/|and/|JJ|I-ADJP|O|S[adj]\\NP', 'or|or|CC|I-ADJP|O|conj', 'undesirable|undesirable|JJ|I-ADJP|O|S[adj]\\NP', '.|.|.|O|O|.']
CPU times: user 1.2 s, sys: 128 ms, total: 1.33 s
Wall time: 1.29 s


In [36]:
%%time
# split word complexes into words
raw = [[map(partial(str.split, word), '|') for word in sent] for sent in raw]
print raw[1]

[[['<c>']], [['Anarchism', 'Anarchism', 'NNP', 'I-NP', 'O', 'N']], [['is', 'be', 'VBZ', 'I-VP', 'O', '(S[dcl]\\NP)/NP']], [['a', 'a', 'DT', 'I-NP', 'O', 'NP[nb]/N']], [['political', 'political', 'JJ', 'I-NP', 'O', 'N/N']], [['philosophy', 'philosophy', 'NN', 'I-NP', 'O', 'N']], [['encompassing', 'encompass', 'VBG', 'I-VP', 'O', '(S[ng]\\NP)/NP']], [['theories', 'theory', 'NNS', 'I-NP', 'O', 'N']], [['and', 'and', 'CC', 'I-NP', 'O', 'conj']], [['attitudes', 'attitude', 'NNS', 'I-NP', 'O', 'N']], [['which', 'which', 'WDT', 'B-NP', 'O', '(NP\\NP)/(S[dcl]\\NP)']], [['consider', 'consider', 'VBP', 'I-VP', 'O', '((S[dcl]\\NP)/(S[to]\\NP))/NP']], [['the', 'the', 'DT', 'I-NP', 'O', 'NP[nb]/N']], [['state', 'state', 'NN', 'I-NP', 'O', 'N']], [['to', 'to', 'TO', 'I-VP', 'O', '(S[to]\\NP)/(S[b]\\NP)']], [['be', 'be', 'VB', 'I-VP', 'O', '(S[b]\\NP)/(S[adj]\\NP)']], [['unnecessary', 'unnecessary', 'JJ', 'I-ADJP', 'O', 'S[adj]\\NP']], [[',', ',', ',', 'I-ADJP', 'O', ',']], [['harmful', 'harmful', 'J

In [37]:
%%time
# extract lemmas => complete sents corpus
stopwords = stopwords.words('english')
from string import punctuation
sents = [[word[0][1].lower() for word in sent if len(word[0])>1 
          and word[0][2].startswith('N')
          and word[0][1].lower() not in stopwords
          and word[0][1] not in punctuation] for sent in raw]
print sents[1]



['anarchism', 'philosophy', 'theory', 'attitude', 'state']
CPU times: user 45.7 s, sys: 734 ms, total: 46.4 s
Wall time: 46 s


In [38]:
%%time
# build token list
tokens = [word for sent in sents for word in sent]
print len(tokens)

2919790
CPU times: user 285 ms, sys: 18.8 ms, total: 304 ms
Wall time: 302 ms


In [39]:
%%time
# build vocabulary list
vocab = list(set(tokens))
print len(vocab)

151330
CPU times: user 255 ms, sys: 3.85 ms, total: 258 ms
Wall time: 259 ms


In [40]:
%%time
# generate word-count distribution
fdist = nltk.FreqDist(tokens)

CPU times: user 1.04 s, sys: 11.7 ms, total: 1.05 s
Wall time: 1.05 s


In [41]:
%%time
# extract more-than-50-count words
vocab50 = [word for word in vocab if fdist[word] >= 50]
print vocab50[:10]
print len(vocab50)

['pigment', 'wednesday', 'thrace', 'hp', 'existentialism', 'numeral', 'controversy', 'kidd', 'topography', 'projection']
6971
CPU times: user 66.9 ms, sys: 5.46 ms, total: 72.4 ms
Wall time: 68.2 ms


In [42]:
%%time
# cull vocab in sents to leave only vocab50
sents = [[word.decode('utf-8','ignore') for word in sent if word in vocab50] for sent in sents]

CPU times: user 3min 54s, sys: 2.7 s, total: 3min 57s
Wall time: 3min 55s


In [43]:
sents[1]

[u'anarchism', u'philosophy', u'theory', u'attitude', u'state']

In [83]:
%%time
# build cooccurrence matrix I
from sklearn.feature_extraction.text import CountVectorizer
sentsInStrings = [' '.join(sent) for sent in sents]
# sentsInStrings = [' '.join(sent).decode('utf-8','ignore') for sent in sents]
cv = CountVectorizer()
rawCooccur = cv.fit_transform(sentsInStrings)

CPU times: user 3.05 s, sys: 256 ms, total: 3.31 s
Wall time: 3.5 s


In [85]:
%%time
# building cooccurrence matrix II
import numpy as np
l = len(cv.vocabulary_)
cm = np.zeros((l,l))
for sent in sents:
    for w_i,w_j in permutations(sent,2):
        if w_i in cv.vocabulary_ and w_j in cv.vocabulary_:
            cm[cv.vocabulary_[w_i]][cv.vocabulary_[w_j]] += 1

### B. Compute PPMI

In [343]:
%%time
# build PPMI table
totalCounts = float(cm.sum())
def ppmi():
    rowSums, colSums = cm.sum(axis=1), cm.sum(axis=0)
    pwi, pwj, ppmiMatrix = rowSums/totalCounts, colSums/totalCounts, cm/totalCounts
    ppmiMatrix /= pwi[:,np.newaxis]
    ppmiMatrix /= pwj
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix))
    ppmiMatrix = np.maximum(ppmiMatrix, 0)
    return ppmiMatrix

ppmiMatrix = ppmi()

#     pw1w2 = cm[cv.vocabulary_[w1]][cv.vocabulary_[w2]] / totalCounts
#     pw1, pw2 = sum(cm[cv.vocabulary_[w1]]), sum(cm[cv.vocabulary_[w2]])
#     return max(np.log(pw1w2/(pw1*pw2)), 0)



CPU times: user 1.77 s, sys: 684 ms, total: 2.45 s
Wall time: 2.47 s


### C. Top 20 Similiar Words

In [351]:
wordList = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
wordToIdx = cv.vocabulary_
idxToWord = {v:k for k,v in wordToIdx.items()}

In [391]:
def twenty_most_similar(w):
    ppmiList = ppmiMatrix[wordToIdx[w]]
    contextByPPMI = map(lambda idx:(idxToWord[idx],ppmiMatrix[wordToIdx[w]][idx]), 
                        np.argsort(ppmiList)[::-1])
    return contextByPPMI[:20]

In [390]:
twenty_most_similar('car')

[(u'bugatti', 4.8523328830750989),
 (u'brabham', 4.8308843396676151),
 (u'racing', 4.4614896956768924),
 (u'car', 4.4587365004256378),
 (u'bogie', 4.335562902437589),
 (u'audi', 4.2870916666714347),
 (u'clutch', 4.2308862142865067),
 (u'brake', 4.2253844661664255),
 (u'aston', 4.1460130684115288),
 (u'chassis', 4.1377371591076697),
 (u'grip', 4.0934048494426696),
 (u'motors', 4.0899464952713211),
 (u'earnhardt', 4.0809206840640089),
 (u'bentley', 4.0655106550536537),
 (u'cable', 3.9065041037693473),
 (u'bmw', 3.8516589362378633),
 (u'driving', 3.7985214365407054),
 (u'prix', 3.7798610138233029),
 (u'tire', 3.7714927641527862),
 (u'drag', 3.685593116015847)]

### D. BLESS Evaluation

In [416]:
path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
with open(path,'rb') as f:
    bless = f.readlines()

In [417]:
bless = [line.split('\t') for line in bless]
print bless[1]

['frog-n', 'amphibian_reptile', 'coord', 'snake-n\n']


In [418]:
crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
concepts = list(set([c for c,_,_ in crPairs]))

In [420]:
posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper']
negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero']

In [426]:
map(lambda (c,r):(c,r,ppmiMatrix[wordToIdx[c]][wordToIdx[r]]), posPairs)

[('frog', 'animal', 3.3325151288269708),
 ('frog', 'beast', 0.0),
 ('frog', 'creature', 0.0),
 ('frog', 'vertebrate', 3.8798665752467492),
 ('lizard', 'animal', 1.625313047626979),
 ('lizard', 'beast', 0.0),
 ('lizard', 'carnivore', 4.7100092580382418),
 ('lizard', 'creature', 3.4876498274618664),
 ('lizard', 'reptile', 5.8572055465620956),
 ('lizard', 'vertebrate', 4.2521060357265936),
 ('snake', 'animal', 2.6803264522407724),
 ('snake', 'beast', 3.6830071169770262),
 ('snake', 'creature', 2.7509037628476047),
 ('snake', 'reptile', 4.2041687500736789),
 ('snake', 'vertebrate', 4.2085071516722774),
 ('turtle', 'animal', 2.3685346391082067),
 ('turtle', 'beast', 4.2466840411983604),
 ('turtle', 'creature', 2.6214335065089935),
 ('turtle', 'food', 0.0),
 ('turtle', 'pet', 0.0),
 ('turtle', 'reptile', 4.767845674295013),
 ('turtle', 'vertebrate', 0.0),
 ('phone', 'artifact', 0.0),
 ('phone', 'commodity', 0.0),
 ('phone', 'device', 3.167587464679003),
 ('phone', 'equipment', 2.160909364463

In [427]:
map(lambda (c,r):(c,r,ppmiMatrix[wordToIdx[c]][wordToIdx[r]]), negPairs)

[('frog', 'blood', 0.0),
 ('frog', 'bone', 0.0),
 ('frog', 'eye', 0.0),
 ('frog', 'foot', 0.0),
 ('frog', 'head', 0.0),
 ('frog', 'leg', 4.3561507554931627),
 ('frog', 'lung', 0.0),
 ('frog', 'poison', 4.4856401255981062),
 ('frog', 'skin', 3.3403267438840158),
 ('frog', 'tongue', 0.0),
 ('lizard', 'blood', 0.0),
 ('lizard', 'eye', 0.0),
 ('lizard', 'head', 0.84024111741419227),
 ('lizard', 'leg', 4.0352430354130613),
 ('lizard', 'mouth', 2.6126872910805572),
 ('lizard', 'scale', 0.0),
 ('lizard', 'skin', 0.0),
 ('lizard', 'stripe', 0.0),
 ('lizard', 'tail', 2.9987221420327597),
 ('lizard', 'tongue', 0.0),
 ('snake', 'bone', 1.585730707661156),
 ('snake', 'eye', 2.0776907971884766),
 ('snake', 'head', 1.4897894139198213),
 ('snake', 'mouth', 1.8759412264662956),
 ('snake', 'poison', 3.022521232795579),
 ('snake', 'scale', 0.83508321043723321),
 ('snake', 'skin', 3.263502212201379),
 ('snake', 'tail', 0.0),
 ('snake', 'tongue', 2.4951009966907374),
 ('snake', 'tooth', 0.0),
 ('turtle', 