# Distributional Semantics Model: Syntactic-Dependency

In [1]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stopwords = stopwords.words('english')
from collections import defaultdict, Counter
from functools import partial
from itertools import permutations, product
from string import punctuation
import numpy as np

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
class LoadWiki:
    # load wiki dump in the following format:
    # [ ([(rel, dependended, dependant), (..), ...], 
    #    [(tk, tk_lemma, pos), (..), ...]),
    #   ([...], [...]), 
    #   ... ]
    
    def __init__(self):
        self.raw = []
        with open('wikicorpus.txt','rb') as f:
            bufferList = [] # temporarily store dependencies until encounter <c>.
            for line in f.readlines():
                if line.startswith('('):
                    bufferList.append(self.__dependency_line_processor(line))
                elif line.startswith('<c>'):
                    self.raw.append((bufferList, self.__pos_line_processor(line)))
                    bufferList = []
                else: pass
    
    def __dependency_line_processor(self, line):
        if line[-2]=='_': # get rid of parentheses in (rel,dpdd,dpdt,_) case.
            tmp = line[1:-2].split()
            return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
        else: # get rid of parentheses in (rel,_,dpdd,dpdt), (rel,dpdd,dpdt), (rel,dpdd,dpdt,..), (..) case.
            tmp = line[1:-1].split()
            if len(tmp)<3: return
            elif len(tmp)==3 or len(tmp)==4: return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
            else: 
                return (tmp[0],tmp[-2].split('_')[0],tmp[-1].split('_')[0]) # format: (rel, depended, dependant)
    
    def __term_splitter(self, term):
        tmp = term.split('|')
        return (tmp[0],tmp[1],tmp[2]) # save only tk, lm, pos in a tuple.
    
    def __pos_line_processor(self, line):
        tmp = line[4:].split() # get rid of initial <c>
        return map(self.__term_splitter, tmp) 
    
    def get_data(self):
        return self.raw

In [5]:
%%time
data = LoadWiki().get_data() # the number of sents = 397238

CPU times: user 34 s, sys: 2.31 s, total: 36.3 s
Wall time: 36.6 s


In [6]:
dependencies = [depList for depList,sentList in data]
sents = [sentList for depList,sentList in data]

In [56]:
%%time
# GET RID OF SENTS W/ EMPTY DEPS
emptyIdx = [i for i,item in enumerate(dependencies) if item==[]]
sents = [sent for i,sent in enumerate(sents) if i not in emptyIdx]

In [60]:
dependencies = [dep for dep in dependencies if dep!=[]]
    # len(deps)=len(sents)=370645

In [76]:
# DEPENDENCY TYPES
depTypes = list({dep[0] for depList in dependencies for dep in depList if dep is not None})

In [77]:
depTypes

['xcomp',
 'xmod',
 'ncsubj',
 'xsubj',
 'ncmod',
 'cmod',
 'det',
 'dobj',
 'obj2',
 'iobj',
 'aux',
 'conj',
 'ccomp']

In [79]:
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix

In [80]:
class DepedencyDict:
    
    def __init__(self, dependencies, sents):
        self.dependencies = dependencies
        self.sents = sents
        self.depDict = defaultdict(list)
        self.stemmer = PorterStemmer()
        self.__build_dependency_dict()
        
    def __build_dependency_dict(self):
        print "... extracting dependencies"
        for i,sent in enumerate(sents):
            for word in sent:
                if word[2].startswith('N'):
                    dps = self.__extract_dependencies(i,safe(word[0]))
                    if len(dps)!=0: self.depDict[self.stemmer.stem(safe(word[0]))].extend(dps)
    
    def __extract_dependencies(self, sentIndex, word):
        dps = []
        for dp in self.dependencies[sentIndex]:
            if dp is not None and word in dp:
                if word==safe(dp[1]) and dp[2] not in punctuation:
                    dps.append(self.stemmer.stem(safe(dp[2])))
                elif word==safe(dp[2]) and dp[1] not in punctuation:
                    dps.append(self.stemmer.stem(safe(dp[1])))
                else: pass
#                 dps.append(self.stemmer.stem(safe(dp[2]))) if word==safe(dp[1]) \
#                     else dps.append(self.stemmer.stem(safe(dp[1])))
            else: pass
        return dps

def safe(word):
    return word.decode('utf-8','ignore')

In [81]:
%%time
depDict = DepedencyDict(dependencies,sents).depDict

... extracting dependencies
CPU times: user 1min 45s, sys: 171 ms, total: 1min 45s
Wall time: 1min 45s


In [84]:
class SimpleDistSem:
    
    def __init__(self, sents, depDict, kFrequent=50):
        self.stemmer = PorterStemmer()
        print "... counting word frequencies"
        self.freqCounts = Counter([self.stemmer.stem(safe(word[0])) for sent in sents for word in sent])
        self.depDict = depDict
        print "... building vocabulary"
        self.vocab = [word for word in self.depDict.keys() if self.freqCounts[word]>kFrequent]
        self.contexts = list({value for values in self.depDict.values() for value in values})
        self.wordToIndex = {word:index for index,word in enumerate(self.vocab)}
        self.contextToIndex = {context:index for index,context in enumerate(self.contexts)}
    
    def build_w2c_matrix(self):
        
        print "... building full w2c matrix"
        self.w2c = np.zeros((len(self.vocab),len(self.contexts)))
        counter = 0
        for word in self.vocab:
            counter += 1
            for dep in self.depDict[word]:
                self.w2c[self.wordToIndex[word]][self.contextToIndex[dep]] += 1
            if counter % 500 == 0: print "... processed %d words" % counter

In [85]:
%%time
ds = SimpleDistSem(sents,depDict)

... counting word frequencies
... building vocabulary
CPU times: user 1min 22s, sys: 481 ms, total: 1min 22s
Wall time: 1min 22s


In [86]:
%%time
ds.build_w2c_matrix()

... building full w2c matrix
... processed 500 words
... processed 1000 words
... processed 1500 words
... processed 2000 words
... processed 2500 words
... processed 3000 words
... processed 3500 words
... processed 4000 words
... processed 4500 words
... processed 5000 words
... processed 5500 words
... processed 6000 words
... processed 6500 words
... processed 7000 words
... processed 7500 words
... processed 8000 words
... processed 8500 words
CPU times: user 3.52 s, sys: 381 ms, total: 3.9 s
Wall time: 3.85 s


In [88]:
ds.w2c.shape

(8685, 17638)

In [87]:
%%time
cosineSimilarities = cosine(ds.w2c)

CPU times: user 1min 33s, sys: 1.2 s, total: 1min 34s
Wall time: 15.2 s


In [89]:
%%time
vocab = ds.vocab
wordToIndex = ds.wordToIndex
indexToWord = {i:w for w,i in wordToIndex.iteritems()}
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
words = filter(lambda w:1 if w in vocab else 0, words)
w2sim = {}
for word in words:
    simList = cosineSimilarities[wordToIndex[word]]
    w2sim[word] = map(lambda idx:(indexToWord[idx],
                                  cosineSimilarities[wordToIndex[word]][idx]),
                      np.argsort(simList)[::-1][1:20+1])

CPU times: user 23.8 ms, sys: 67.8 ms, total: 91.6 ms
Wall time: 89.7 ms


In [90]:
w2sim['car']

[(u'system', 0.97359998771879197),
 (u'structur', 0.97263485741860878),
 (u'design', 0.96832065907347264),
 (u'charact', 0.96640669846764304),
 (u'divis', 0.96638110633317631),
 (u'ship', 0.96609023407073136),
 (u'network', 0.96605111606453675),
 (u'program', 0.96516951583765997),
 (u'hous', 0.96341195016901793),
 (u'branch', 0.96229483123467285),
 (u'letter', 0.96222918429087723),
 (u'stone', 0.96079214339859165),
 (u'plot', 0.95993755015311311),
 (u'machin', 0.95992822971600389),
 (u'sound', 0.95945330361446468),
 (u'forc', 0.95932615190546255),
 (u'unit', 0.95902237068157614),
 (u'count', 0.95883709439604148),
 (u'shell', 0.9586924138689481),
 (u'class', 0.95837460062135793)]

In [91]:
import random
def bless_evaluator(simMatrix=None, indexers=[None,None]):
    wordToIndex, indexToWord = indexers
    path = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03/BLESS_part.txt'
    with open(path,'rb') as f:
        bless = f.readlines()
    bless = [line.split('\t') for line in bless] # split into (concept, _, relation, relatum).
    
    filter(lambda w:1 if w in vocab else 0, words)
    
    def filter_vocab(w):
        if w in ds.vocab: return w # this is way too hacky, i know.
    
    crPairs = [(c.split('-')[0],r.split('-')[0],rel) for c,_,rel,r in bless]
    posPairs = [(c,r) for c,r,rel in crPairs if rel=='hyper' 
                 if filter_vocab(c) is not None and filter_vocab(r) is not None]
    negPairs = [(c,r) for c,r,rel in crPairs if rel=='mero' 
                 if filter_vocab(c) is not None and filter_vocab(r) is not None]
    
    stemmer = PorterStemmer()
    
    return [map(lambda (c,r):(c,r,simMatrix[wordToIndex[stemmer.stem(c)]][wordToIndex[stemmer.stem(r)]]), posPairs),
            map(lambda (c,r):(c,r,simMatrix[wordToIndex[stemmer.stem(c)]][wordToIndex[stemmer.stem(r)]]), negPairs)]

In [95]:
posEval, negEval = bless_evaluator(cosineSimilarities, indexers=[ds.wordToIndex, ds.vocab])

In [96]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(posEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in posEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('piano', 'artifact', 0.50474002855218159), ('violin', 'artifact', 0.55263562539169031), ('bowl', 'vessel', 0.85618419681912072), ('jet', 'aircraft', 0.86721147284187294), ('jet', 'craft', 0.80909679986787131)]
Average Cosine:  0.747963592901


In [97]:
print "Examples of Evaluation on Positive Relations (Cosine): "
print random.sample(negEval, 5)
print "Average Cosine: ", np.mean([cosineVal for _,_,cosineVal in negEval])

Examples of Evaluation on Positive Relations (Cosine): 
[('car', 'plate', 0.92880148980402744), ('fighter', 'top', 0.59276522238339391), ('train', 'driver', 0.82425045445738843), ('pub', 'glass', 0.76026941907066259), ('bowl', 'interior', 0.84189714921644343)]
Average Cosine:  0.71709650429
