# Distributional Semantics Model: Syntactic-Dependency

In [3]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stopwords = stopwords.words('english')
from collections import defaultdict, Counter
from functools import partial
from itertools import permutations, product
from string import punctuation
import numpy as np

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
class LoadWiki:
    # load wiki dump in the following format:
    # [ ([(rel, dependended, dependant), (..), ...], 
    #    [(tk, tk_lemma, pos), (..), ...]),
    #   ([...], [...]), 
    #   ... ]
    
    def __init__(self):
        self.raw = []
        with open('wikicorpus.txt','rb') as f:
            bufferList = [] # temporarily store dependencies until encounter <c>.
            for line in f.readlines():
                if line.startswith('('):
                    bufferList.append(self.__dependency_line_processor(line))
                elif line.startswith('<c>'):
                    self.raw.append((bufferList, self.__pos_line_processor(line)))
                    bufferList = []
                else: pass
    
    def __dependency_line_processor(self, line):
        if line[-2]=='_': # get rid of parentheses in (rel,dpdd,dpdt,_) case.
            tmp = line[1:-2].split()
            return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
        else: # get rid of parentheses in (rel,_,dpdd,dpdt), (rel,dpdd,dpdt), (rel,dpdd,dpdt,..), (..) case.
            tmp = line[1:-1].split()
            if len(tmp)<3: return
            elif len(tmp)==3 or len(tmp)==4: return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
            else: 
                return (tmp[0],tmp[-2].split('_')[0],tmp[-1].split('_')[0]) # format: (rel, depended, dependant)
    
    def __term_splitter(self, term):
        tmp = term.split('|')
        return (tmp[0],tmp[1],tmp[2]) # save only tk, lm, pos in a tuple.
    
    def __pos_line_processor(self, line):
        tmp = line[4:].split() # get rid of initial <c>
        return map(self.__term_splitter, tmp) 
    
    def get_data(self):
        return self.raw

In [7]:
%%time
data = LoadWiki().get_data() # the number of sents = 397238

CPU times: user 32.4 s, sys: 1.99 s, total: 34.4 s
Wall time: 34.3 s


In [8]:
dependencies = [depList for depList,sentList in data]
sents = [sentList for depList,sentList in data]

In [16]:
%%time
subVocab = [PorterStemmer().stem(word.decode('utf-8','ignore')) 
            for sent in sents 
            for word,_,pos in sent if pos.startswith('N')]

In [18]:
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix

In [19]:
class DepedencyDict:
    
    def __init__(self, dependencies, sents):
        self.dependencies = dependencies
        self.sents = sents
        self.depDict = defaultdict(list)
        self.stemmer = PorterStemmer()
        self.__build_dependency_dict()
        
    def __build_dependency_dict(self):
        print "... extracting dependencies"
        for i,sent in enumerate(sents):
            for word in sent:
                if word[2].startswith('N'):
                    dps = self.__extract_dependencies(i,safe(word[0]))
                    if len(dps)!=0: self.depDict[self.stemmer.stem(safe(word[0]))].extend(dps)
    
    def __extract_dependencies(self, sentIndex, word):
        dps = []
        for dp in self.dependencies[sentIndex]:
            if dp is not None and word in dp:
                dps.append(self.stemmer.stem(safe(dp[2]))) if word==safe(dp[1]) \
                    else dps.append(self.stemmer.stem(safe(dp[1])))
            else: pass
        return dps

def safe(word):
    return word.decode('utf-8','ignore')

In [11]:
%%time
depDict = DepedencyDict(dependencies,sents).depDict

... extracting dependencies
CPU times: user 1min 54s, sys: 1.18 s, total: 1min 55s
Wall time: 1min 56s


In [47]:
class SimpleDistSem:
    
    def __init__(self, sents, depDict, kFrequent=50):
        self.stemmer = PorterStemmer()
        print "... counting word frequencies"
        self.freqCounts = Counter([self.stemmer.stem(safe(word[0])) for sent in sents for word in sent])
        self.depDict = depDict
        print "... building vocabulary"
        self.vocab = [word for word in self.depDict.keys() if self.freqCounts[word]>kFrequent]
        self.contexts = list({value for values in ds.depDict.values() for value in values})
        self.wordToIndex = {word:index for index,word in enumerate(self.vocab)}
        self.contextToIndex = {context:index for index,context in enumerate(self.contexts)}
    
    def build_w2w_matrix(self):
        
        print "... building full w2w matrix"
        self.w2c = np.zeros((len(self.vocab),len(self.contexts)))
        counter = 0
        for word in self.vocab:
            counter += 1
            for dep in self.depDict[word]:
                self.w2c[self.wordToIndex[word]][self.contextToIndex[dep]] += 1
            if counter % 500 == 0: print "... processed %d words" % counter
    

In [48]:
%%time
ds = SimpleDistSem(sents,depDict)

... counting word frequencies
... building vocabulary
CPU times: user 1min 31s, sys: 408 ms, total: 1min 32s
Wall time: 1min 32s


In [49]:
%%time
ds.build_w2w_matrix()

... building full w2w matrix
... processed 500 words
... processed 1000 words
... processed 1500 words
... processed 2000 words
... processed 2500 words
... processed 3000 words
... processed 3500 words
... processed 4000 words
... processed 4500 words
... processed 5000 words
... processed 5500 words
... processed 6000 words
... processed 6500 words
... processed 7000 words
... processed 7500 words
... processed 8000 words
... processed 8500 words
... processed 9000 words
CPU times: user 4.78 s, sys: 418 ms, total: 5.2 s
Wall time: 5.13 s


In [50]:
%%time
cosineSimilarities = cosine(ds.w2c)

CPU times: user 1min 45s, sys: 1.19 s, total: 1min 46s
Wall time: 16.7 s


In [51]:
%%time
vocab = ds.vocab
wordToIndex = ds.wordToIndex
indexToWord = {i:w for w,i in wordToIndex.iteritems()}
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
words = filter(lambda w:1 if w in vocab else 0, words)
w2sim = {}
for word in words:
    simList = cosineSimilarities[wordToIndex[word]]
    w2sim[word] = map(lambda idx:(indexToWord[idx],
                                  cosineSimilarities[wordToIndex[word]][idx]),
                      np.argsort(simList)[::-1][1:20+1])

CPU times: user 18.5 ms, sys: 2.19 ms, total: 20.7 ms
Wall time: 20.4 ms


In [53]:
w2sim['car']

[(u'ship', 0.99646414343514145),
 (u'machin', 0.99582312027210418),
 (u'design', 0.99581473404469589),
 (u'divis', 0.99553674599771103),
 (u'charact', 0.99535878392129906),
 (u'shell', 0.99523853078832369),
 (u'network', 0.99521343885009617),
 (u'structur', 0.99497686513167716),
 (u'class', 0.99496173147785805),
 (u'tank', 0.99479915008256747),
 (u'compani', 0.99477120855467938),
 (u'gener', 0.99471366053696786),
 (u'track', 0.99466696541411481),
 (u'build', 0.99465408211611483),
 (u'librari', 0.99450893332439605),
 (u'tube', 0.99446350552347296),
 (u'system', 0.99443211166413215),
 (u'celebr', 0.99435074029852422),
 (u'programm', 0.9943261932744496),
 (u'signal', 0.9943073166718277)]

In [81]:
import random
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    
    filter(lambda w:1 if w in vocab else 0, words)
    
    def filter_vocab(w):
        if w in ds.vocab: return w # this is way too hacky, i know.
    
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper' 
                 if filter_vocab(c) is not None and filter_vocab(r) is not None]
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero' 
                 if filter_vocab(c) is not None and filter_vocab(r) is not None]
    
    stemmer = PorterStemmer()
    
    return [map(lambda (c,r):(c,r,simMatrix[wordToIndex[stemmer.stem(c)]][wordToIndex[stemmer.stem(r)]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[wordToIndex[stemmer.stem(c)]][wordToIndex[stemmer.stem(r)]]), negPairs)]

In [82]:
posEval, negEval = bless_evaluator(ds.w2c, indexers=[ds.wordToIndex, ds.vocab])

In [83]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('pig', 'mammal', 0.0), ('sword', 'object', 0.0), ('sword', 'artifact', 0.0), ('bull', 'mammal', 0.0), ('pistol', 'weapon', 0.0)]
Average Cosine:  0.00558659217877


In [84]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('corn', 'kernel', 0.0), ('car', 'diesel', 0.0), ('car', 'light', 0.0), ('dolphin', 'tail', 0.0), ('pig', 'mouth', 0.0)]
Average Cosine:  0.0218281036835
