# Distributional Semantics Model: Syntactic-Dependency

In [3]:
import os
os.chdir("/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/DISTRIBUTIONAL_SEMANTICS/ASSIGNMENT_03")

In [79]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stopwords = stopwords.words('english')
from collections import defaultdict, Counter
from functools import partial
from itertools import permutations, product
from string import punctuation
import numpy as np

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
class LoadWiki:
    # load wiki dump in the following format:
    # [ ([(rel, dependended, dependant), (..), ...], 
    #    [(tk, tk_lemma, pos), (..), ...]),
    #   ([...], [...]), 
    #   ... ]
    
    def __init__(self):
        self.raw = []
        with open('wikicorpus.txt','rb') as f:
            bufferList = [] # temporarily store dependencies until encounter <c>.
            for line in f.readlines():
                if line.startswith('('):
                    bufferList.append(self.__dependency_line_processor(line))
                elif line.startswith('<c>'):
                    self.raw.append((bufferList, self.__pos_line_processor(line)))
                    bufferList = []
                else: pass
    
    def __dependency_line_processor(self, line):
        if line[-2]=='_': # get rid of parentheses in (rel,dpdd,dpdt,_) case.
            tmp = line[1:-2].split()
            return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
        else: # get rid of parentheses in (rel,_,dpdd,dpdt), (rel,dpdd,dpdt), (rel,dpdd,dpdt,..), (..) case.
            tmp = line[1:-1].split()
            if len(tmp)<3: return
            elif len(tmp)==3 or len(tmp)==4: return (tmp[0],tmp[1].split('_')[0],tmp[2].split('_')[0])
            else: 
                print "TEST: ", tmp
                return (tmp[0],tmp[-2].split('_')[0],tmp[-1].split('_')[0]) # format: (rel, depended, dependant)
    
    def __term_splitter(self, term):
        tmp = term.split('|')
        return (tmp[0],tmp[1],tmp[2]) # save only tk, lm, pos in a tuple.
    
    def __pos_line_processor(self, line):
        tmp = line[4:].split() # get rid of initial <c>
        return map(self.__term_splitter, tmp) 
    
    def get_data(self):
        return self.raw

In [7]:
%%time
data = LoadWiki().get_data() # the number of sents = 397238

CPU times: user 32 s, sys: 1.87 s, total: 33.9 s
Wall time: 33.9 s


In [8]:
dependencies = [depList for depList,sentList in data]
sents = [sentList for depList,sentList in data]

In [84]:
def cosine(w2w):
    w2w_norm = w2w / np.apply_along_axis(lambda r: np.sqrt(np.dot(r,r))
                               , 1, w2w)[:,np.newaxis]
    return np.dot(w2w_norm, w2w_norm.T)
    
def ppmi(w2w):
    rowSums, colSums, totalSums = w2w.sum(axis=1), w2w.sum(axis=0), w2w.sum()
    pwi, pwj, ppmiMatrix = rowSums/totalSums, colSums/totalSums, w2w/totalSums
    ppmiMatrix /= pwi[:,np.newaxis] # * 1/pwi by row.
    ppmiMatrix /= pwj # * 1/pwj by col.
    ppmiMatrix = np.nan_to_num(np.log(ppmiMatrix)) # compute pmi.
    ppmiMatrix = np.maximum(ppmiMatrix, 0) # compute ppmi.
    return ppmiMatrix

In [45]:
class DepedencyDict:
    
    def __init__(self, dependencies, sents):
        self.dependencies = dependencies
        self.sents = sents
        self.depDict = defaultdict(list)
        self.stemmer = PorterStemmer()
        self.__build_dependency_dict()
        
    def __build_dependency_dict(self):
        print "... extracting dependencies"
        for i,sent in enumerate(sents):
            for word in sent:
                if word[2].startswith('N'):
                    dps = self.__extract_dependencies(i,safe(word[0]))
                    if len(dps)!=0: self.depDict[self.stemmer.stem(safe(word[0]))].extend(dps)
    
    def __extract_dependencies(self, sentIndex, word):
        dps = []
        for dp in self.dependencies[sentIndex]:
            if dp is not None and word in dp:
                dps.append(self.stemmer.stem(safe(dp[2]))) if word==safe(dp[1]) \
                    else dps.append(self.stemmer.stem(safe(dp[1])))
            else: pass
        return dps

def safe(word):
    return word.decode('utf-8','ignore')

In [46]:
%%time
depDict = DepedencyDict(dependencies,sents).depDict

... extracting dependencies
CPU times: user 1min 47s, sys: 457 ms, total: 1min 47s
Wall time: 1min 47s


In [81]:
class SimpleDistSem:
    
    def __init__(self, sents, depDict, kFrequent=50):
        self.stemmer = PorterStemmer()
        print "... counting word frequencies"
        self.freqCounts = Counter([self.stemmer.stem(safe(item[1])) for sent in sents for item in sent])
        self.depDict = depDict
        print "... building vocabulary"
        self.vocab = [word for word in self.depDict.keys() if self.freqCounts[word]>kFrequent]
        self.wordToIndex = {word:index for index,word in enumerate(self.vocab)}
    
    def build_w2w_matrix(self):
        print "... building w2w matrix"
        self.w2w = np.zeros((len(self.vocab),len(self.vocab)))
        counter = 0
        for word in self.vocab:
            counter += 1
            for dep in self.depDict[word]:
                if dep in self.vocab:
                    self.w2w[self.wordToIndex[word]][self.wordToIndex[dep]] += 1
                    self.w2w[self.wordToIndex[dep]][self.wordToIndex[word]] += 1
            if counter % 500 == 0: print "... processed %d words" % counter
        

In [82]:
%%time
t = SimpleDistSem(sents,depDict)

... counting word frequencies
... building vocabulary
CPU times: user 1min 22s, sys: 312 ms, total: 1min 22s
Wall time: 1min 22s


In [83]:
%%time
t.build_w2w_matrix()

... building w2w matrix
... processed 500 words
... processed 1000 words
... processed 1500 words
... processed 2000 words
... processed 2500 words
... processed 3000 words
... processed 3500 words
... processed 4000 words
... processed 4500 words
... processed 5000 words
... processed 5500 words
... processed 6000 words
... processed 6500 words
... processed 7000 words
... processed 7500 words
... processed 8000 words
... processed 8500 words
CPU times: user 15min 36s, sys: 7.79 s, total: 15min 44s
Wall time: 15min 37s


In [85]:
%%time
cosineSimilarities = cosine(t.w2w)

CPU times: user 39.9 s, sys: 645 ms, total: 40.6 s
Wall time: 7.31 s


In [91]:
%%time
vocab = t.vocab
wordToIndex = t.wordToIndex
indexToWord = {i:w for w,i in wordToIndex.iteritems()}
words = ['car','bus','hospital','hotel','gun','bomb','horse','fox','table','bowl','guitar','piano']
words = filter(lambda w:1 if w in vocab else 0, words)
w2sim = {}
for word in words:
    simList = cosineSimilarities[wordToIndex[word]]
    w2sim[word] = map(lambda idx:(indexToWord[idx],
                                  cosineSimilarities[wordToIndex[word]][idx]),
                      np.argsort(simList)[::-1][1:20+1])

CPU times: user 17.6 ms, sys: 1.47 ms, total: 19 ms
Wall time: 18.4 ms


In [93]:
w2sim['gun']

[(u'furthermor', nan),
 (u'Von', nan),
 (u'Abdul', nan),
 (u'Chrono', nan),
 (u'Kenneth', nan),
 (u'160', nan),
 (u'Tommi', nan),
 (u'comed', nan),
 (u'gun', 1.0000000000000004),
 (u'partner', 0.99651059833415512),
 (u'compon', 0.99633584457458257),
 (u'card', 0.99579196919351554),
 (u'processor', 0.99574415374759728),
 (u'rifl', 0.99554245509484351),
 (u'vehicl', 0.99550160533292054),
 (u'characterist', 0.99544846742783843),
 (u'regiment', 0.99543944807642926),
 (u'battalion', 0.99543862475385603),
 (u'engin', 0.99540763897557161),
 (u'facil', 0.99524023077397883)]