# Distributional Semantics Model: Syntactic-Dependency

In [1]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stopwords = stopwords.words('english')
from collections import defaultdict, Counter
from functools import partial
from itertools import permutations, product
from string import punctuation
import numpy as np

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
class LoadWiki:
    # load wiki dump in the following format:
    # [ ([(rel, dependended, dependant), (..), ...], 
    #    [(tk, tk_lemma, pos), (..), ...]),
    #   ([...], [...]), 
    #   ... ]
    
    def __init__(self):
        self.raw = []
        with open('wikicorpus.txt','rb') as f:
            bufferList = [] # temporarily store dependencies until encounter <c>.
            for line in f.readlines():
                if line.startswith('('):
                    bufferList.append(self.__dependency_line_processor(line))
                elif line.startswith('<c>'):
                    self.raw.append((bufferList, self.__pos_line_processor(line)))
                    bufferList = []
                else: pass
    
    def __dependency_line_processor(self, line):
        if line[-2]=='_': # get rid of parentheses in (rel,dpdd,dpdt,_) case.
            tmp = line[1:-2].split()
            return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
        else: # get rid of parentheses in (rel,_,dpdd,dpdt), (rel,dpdd,dpdt), (rel,dpdd,dpdt,..), (..) case.
            tmp = line[1:-1].split()
            if len(tmp)<3: return
            elif len(tmp)==3 or len(tmp)==4: return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
            else: 
                return (tmp[0],tmp[-2].split('_')[0],tmp[-1].split('_')[0]) # format: (rel, depended, dependant)
    
    def __term_splitter(self, term):
        tmp = term.split('|')
        return (tmp[0],tmp[1],tmp[2]) # save only tk, lm, pos in a tuple.
    
    def __pos_line_processor(self, line):
        tmp = line[4:].split() # get rid of initial <c>
        return map(self.__term_splitter, tmp) 
    
    def get_data(self):
        return self.raw

In [5]:
%%time
data = LoadWiki().get_data() # the number of sents = 397238

CPU times: user 34 s, sys: 2.31 s, total: 36.3 s
Wall time: 36.6 s


In [6]:
dependencies = [depList for depList,sentList in data]
sents = [sentList for depList,sentList in data]

In [56]:
%%time
# GET RID OF SENTS W/ EMPTY DEPS
emptyIdx = [i for i,item in enumerate(dependencies) if item==[]]
sents = [sent for i,sent in enumerate(sents) if i not in emptyIdx]

In [60]:
dependencies = [dep for dep in dependencies if dep!=[]]
    # len(deps)=len(sents)=370645

In [76]:
# DEPENDENCY TYPES
depTypes = list({dep[0] for depList in dependencies for dep in depList if dep is not None})

In [100]:
print depTypes

['xcomp', 'xmod', 'ncsubj', 'xsubj', 'ncmod', 'cmod', 'det', 'dobj', 'obj2', 'iobj', 'aux', 'conj', 'ccomp']


In [119]:
# SAVE k MOST SYNTACTICALLY-SIGNIFICANT DISTINGUISHER RELS
depTypesWide = ['xcomp','xmod','ncsubj','xsubj','ncmod','cmod','dobj','obj2','iobj','ccomp']
depTypesNarrow = ['ncsubj','xsubj','ncmod','dobj','obj2','iobj']
    # for meanings of tags, see http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.2.8742&rep=rep1&type=pdf

In [140]:
# RESTRICT CONTEXT: ONLY FREQUENT-ENOUGH ONES
tokens = [PorterStemmer().stem(safe(w)) for sent in sents for w,_,_ in sent]

In [142]:
tokensFreq = Counter(tokens)

In [144]:
tokensFreq.items()[:10]

[(u'', 4779),
 (u'Y1l', 1),
 (u'Hansabank', 1),
 (u'Keach', 1),
 (u'woodi', 13),
 (u'un-studi', 1),
 (u'spideri', 1),
 (u'Simka', 1),
 (u'Pronk', 1),
 (u'Dagg', 1)]

In [164]:
tokensFreq.most_common(2001)[-1]

(u'nine', 485)

In [120]:
dependencies[1]

[('ncmod', '', 'anarchists'),
 ('ncmod', '', 'criteria'),
 ('dobj', 'constitutes', 'anarchism'),
 ('dobj', 'for', 'what'),
 ('ncmod', '', 'criteria'),
 ('dobj', 'have', 'criteria'),
 ('aux', 'have', 'may'),
 ('ncsubj', 'have', 'anarchists'),
 ('det', 'other', 'each'),
 ('dobj', 'with', 'other'),
 ('iobj', 'disagree', 'with'),
 ('det', 'criteria', 'these'),
 ('ncsubj', 'are', 'criteria'),
 ('dobj', 'on', 'what'),
 ('ncmod', '', 'disagree'),
 ('ncmod', '', 'disagree'),
 ('ncsubj', 'disagree', 'they'),
 ('conj', 'and', 'disagree'),
 ('conj', 'and', 'may')]

In [99]:
sents[1]

[('Specific', 'Specific', 'NNP'),
 ('anarchists', 'anarchist', 'NNS'),
 ('may', 'may', 'MD'),
 ('have', 'have', 'VB'),
 ('additional', 'additional', 'JJ'),
 ('criteria', 'criterion', 'NNS'),
 ('for', 'for', 'IN'),
 ('what', 'what', 'WP'),
 ('constitutes', 'constitute', 'VBZ'),
 ('anarchism', 'anarchism', 'NN'),
 (',', ',', ','),
 ('and', 'and', 'CC'),
 ('they', 'they', 'PRP'),
 ('often', 'often', 'RB'),
 ('disagree', 'disagree', 'VBP'),
 ('with', 'with', 'IN'),
 ('each', 'each', 'DT'),
 ('other', 'other', 'NN'),
 ('on', 'on', 'IN'),
 ('what', 'what', 'WP'),
 ('these', 'these', 'DT'),
 ('criteria', 'criterion', 'NNS'),
 ('are', 'be', 'VBP'),
 ('.', '.', '.')]

In [256]:
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)

# this PPMI algorithm unfit for w2w matrix when target words != context words!
# def ppmi(w2w):
#     rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
#     pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
#     ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
#     ppmiMatrix /= pwj # * 1/pwj by col.
#     ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
#     ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
#     return ppmiMatrix

In [121]:
class DepedencyDict:
    
    def __init__(self, dependencies, sents):
        self.dependencies = dependencies
        self.sents = sents
        self.depDict = defaultdict(list)
        self.stemmer = PorterStemmer()
        self.__build_dependency_dict()
        
    def __build_dependency_dict(self):
        print "... extracting dependencies"
        for i,sent in enumerate(sents):
            for word in sent:
                if word[2].startswith('N'):
                    dps = self.__extract_dependencies(i,safe(word[0]))
                    if len(dps)!=0: self.depDict[self.stemmer.stem(safe(word[0]))].extend(dps)
    
    def __extract_dependencies(self, sentIndex, word):
        dps = []
        for dp in self.dependencies[sentIndex]:
            if dp is not None and word in dp and dp[0] in depTypesNarrow:
                if word==safe(dp[1]) and dp[2] not in punctuation:
                    dps.append(self.stemmer.stem(safe(dp[2])))
                elif word==safe(dp[2]) and dp[1] not in punctuation:
                    dps.append(self.stemmer.stem(safe(dp[1])))
                else: pass
            else: pass
        return dps

def safe(word):
    return word.decode('utf-8','ignore')

In [122]:
%%time
depDict = DepedencyDict(dependencies,sents).depDict

... extracting dependencies
CPU times: user 1min 38s, sys: 1.07 s, total: 1min 39s
Wall time: 1min 40s


In [138]:
sents[1]

[('Specific', 'Specific', 'NNP'),
 ('anarchists', 'anarchist', 'NNS'),
 ('may', 'may', 'MD'),
 ('have', 'have', 'VB'),
 ('additional', 'additional', 'JJ'),
 ('criteria', 'criterion', 'NNS'),
 ('for', 'for', 'IN'),
 ('what', 'what', 'WP'),
 ('constitutes', 'constitute', 'VBZ'),
 ('anarchism', 'anarchism', 'NN'),
 (',', ',', ','),
 ('and', 'and', 'CC'),
 ('they', 'they', 'PRP'),
 ('often', 'often', 'RB'),
 ('disagree', 'disagree', 'VBP'),
 ('with', 'with', 'IN'),
 ('each', 'each', 'DT'),
 ('other', 'other', 'NN'),
 ('on', 'on', 'IN'),
 ('what', 'what', 'WP'),
 ('these', 'these', 'DT'),
 ('criteria', 'criterion', 'NNS'),
 ('are', 'be', 'VBP'),
 ('.', '.', '.')]

In [235]:
class SimpleDistSem:
    
    def __init__(self, sents, depDict, kFrequent=50):
        self.stemmer = PorterStemmer()
        print "... counting word frequencies"
        self.freqCounts = Counter([self.stemmer.stem(safe(word[0])) for sent in sents for word in sent])
        self.depDict = depDict
        print "... building vocabulary"
        temp_vocab = [word for word in self.depDict.keys() if self.freqCounts[word]>kFrequent]
        self.contexts = list({value for values in self.depDict.values() for value in values 
                              if tokensFreq[value]>400 and not ''})
        self.vocab = [word for word in temp_vocab if any([context_word in self.contexts for context_word in self.depDict[word]])]
        self.wordToIndex = {word:index for index,word in enumerate(self.vocab)}
        self.contextToIndex = {context:index for index,context in enumerate(self.contexts)}
    
    def build_w2c_matrix(self):
        
        print "... building full w2c matrix"
        self.w2c = np.zeros((len(self.vocab),len(self.contexts)))
        counter = 0
        for word in self.vocab:
            counter += 1
            for dep in self.depDict[word]:
                if dep in self.contexts:
                    self.w2c[self.wordToIndex[word]][self.contextToIndex[dep]] += 1
            if counter % 500 == 0: print "... processed %d words" % counter

In [234]:
ds.contexts[2]

u'oldest'

In [231]:
'upgrad' in ds.contexts

False

In [237]:
#%%timem
ds = SimpleDistSem(sents,depDict)

... counting word frequencies
... building vocabulary


In [238]:
%%time
ds.build_w2c_matrix()

... building full w2c matrix
... processed 500 words
... processed 1000 words
... processed 1500 words
... processed 2000 words
... processed 2500 words
... processed 3000 words
... processed 3500 words
... processed 4000 words
... processed 4500 words
... processed 5000 words
... processed 5500 words
... processed 6000 words
... processed 6500 words
... processed 7000 words
... processed 7500 words
... processed 8000 words
... processed 8500 words
CPU times: user 2min 26s, sys: 1.97 s, total: 2min 28s
Wall time: 2min 27s


In [239]:
ds.w2c.shape

(8652, 2127)

In [200]:
ds.contexts[:10]

[u'',
 u'four',
 u'oldest',
 u'dynasti',
 u'whose',
 u'accus',
 u'accur',
 u'concret',
 u'Western',
 u'under']

##### PPMI

In [247]:
%%time
ppmiSimilarities = ppmi(ds.w2c)

CPU times: user 590 ms, sys: 238 ms, total: 829 ms
Wall time: 837 ms


In [248]:
%%time
vocab = ds.vocab
wordToIndex = ds.wordToIndex
indexToWord = {i:w for w,i in wordToIndex.iteritems()}
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
words = filter(lambda w:1 if w in vocab else 0, words)
w2sim = {}
for word in words:
    simList = ppmiSimilarities[wordToIndex[word]]
    w2sim[word] = map(lambda idx:(indexToWord[idx],
                                  ppmiSimilarities[wordToIndex[word]][idx]),
                      np.argsort(simList)[::-1][1:20+1])

CPU times: user 19.7 ms, sys: 1.8 ms, total: 21.6 ms
Wall time: 21.2 ms


In [249]:
w2sim['car']

[(u'Era', 4.9201998645415781),
 (u'Crystal', 4.5708242230844576),
 (u'Altern', 4.4848817932837326),
 (u'altitud', 4.4003244052556694),
 (u'IEC', 4.3349416459928181),
 (u'Practic', 4.079416685175568),
 (u'Inca', 4.0222582713356196),
 (u'excav', 3.8899903027578691),
 (u'like', 3.8641471902922646),
 (u'Parson', 3.7113438973905701),
 (u'sexual', 3.7071772246957244),
 (u'cooper', 3.6805089776135631),
 (u'cadmium', 3.3541123929810914),
 (u'clergi', 3.3446635037831589),
 (u'Presley', 3.3291110907756742),
 (u'Energi', 3.310761952107478),
 (u'dilut', 3.2072212731666374),
 (u'Dick', 3.1327787969121212),
 (u'reput', 3.1322354662820748),
 (u'wherebi', 3.1284403953135231)]

##### Cosine

In [240]:
%%time
cosineSimilarities = cosine(ds.w2c)

CPU times: user 10.9 s, sys: 301 ms, total: 11.2 s
Wall time: 2.66 s


In [241]:
%%time
vocab = ds.vocab
wordToIndex = ds.wordToIndex
indexToWord = {i:w for w,i in wordToIndex.iteritems()}
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
words = filter(lambda w:1 if w in vocab else 0, words)
w2sim = {}
for word in words:
    simList = cosineSimilarities[wordToIndex[word]]
    w2sim[word] = map(lambda idx:(indexToWord[idx],
                                  cosineSimilarities[wordToIndex[word]][idx]),
                      np.argsort(simList)[::-1][1:20+1])

CPU times: user 22.3 ms, sys: 58.8 ms, total: 81.1 ms
Wall time: 91.9 ms


In [242]:
w2sim['car']

[(u'engin', 0.94309930395038566),
 (u'system', 0.93603132778170495),
 (u'machin', 0.93453145228422441),
 (u'comput', 0.92688048974110959),
 (u'ship', 0.92525594823520196),
 (u'program', 0.92502353671012238),
 (u'vehicl', 0.92378263830994045),
 (u'oper', 0.92292229346271104),
 (u'design', 0.92091667909932851),
 (u'aircraft', 0.91975513877538151),
 (u'architectur', 0.91930301613423682),
 (u'train', 0.91903461634408234),
 (u'unit', 0.91724829715729406),
 (u'anim', 0.91670523078538657),
 (u'vessel', 0.91664412667338602),
 (u'divis', 0.9163243037919615),
 (u'activ', 0.91521280439143304),
 (u'tank', 0.91469086363275265),
 (u'experi', 0.91413547847980448),
 (u'boat', 0.91363809520929029)]

#### BLESS Evaluation

In [243]:
import random
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    
    filter(lambda w:1 if w in vocab else 0, words)
    
    def filter_vocab(w):
        if w in ds.vocab: return w # this is way too hacky, i know.
    
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper' 
                 if filter_vocab(c) is not None and filter_vocab(r) is not None]
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero' 
                 if filter_vocab(c) is not None and filter_vocab(r) is not None]
    
    stemmer = PorterStemmer()
    
    return [map(lambda (c,r):(c,r,simMatrix[wordToIndex[stemmer.stem(c)]][wordToIndex[stemmer.stem(r)]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[wordToIndex[stemmer.stem(c)]][wordToIndex[stemmer.stem(r)]]), negPairs)]

##### Cosine

In [260]:
posEval, negEval = bless_evaluator(cosineSimilarities, indexers=[ds.wordToIndex, ds.vocab])

In [261]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('potato', 'food', 0.59673120904353727), ('guitar', 'instrument', 0.85472223983043261), ('bull', 'beast', 0.62697262442409385), ('cat', 'mammal', 0.73782331602938822), ('rat', 'mammal', 0.91269136147598096)]
Average Cosine:  0.685915096846


In [262]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('fork', 'plastic', 0.68949141703171235), ('van', 'floor', 0.19955151307702232), ('bomber', 'motor', 0.78723632322391224), ('cannon', 'ball', 0.7652596655938072), ('whale', 'fin', 0.34123184732844725)]
Average Cosine:  0.596797240985
