### Project Kojak

** The problem  **

We will attempt to identify amboguously defined words - words that are homographs (spelled the same, but with multiple meanings) and determine the exact meaning of the word from a context window.

Here we attempt to do this in a few stages
1. train a word embedding on some training corpus using skip-gram (Here we use 1000 sholarly research papers) 
2. identify common homographs and extract the various context windows
3. interpret the context windows as vectors in the embedding space and appy a clustering algorith (DBSCAN). Each cluster is interpreted as a distinct definition of the homograph. Each cluster then is representative vector.
4. apply to a test corpus - match context of given homograph to most similar group.


### This notebook

Loads a pre-trained word embedding model, uses DBSCAN clustering to identify several 'definitions' of a set of homographs, and saves those definitions for later use.

In [1]:
import gensim
import json
import os
import re
import time
from nltk.corpus import stopwords
from nltk import tokenize
from nltk import pos_tag
from pprint import pprint



In [2]:
# Declare stopwords, preprocess the data from source file

stop = stopwords.words('english')
stop += ['?','!',':',';','[',']','[]','“' ]
stop += ['.', ',', '(', ')', "'", '"',"''",'""',"``",'”', '“', '?', '!', '’', 'et', 'al', 'al.']
stop = set(stop)

class MyPapers(object):
    # a memory-friendly way to load a large corpora
     def __init__(self, dirname):
            self.dirname = dirname
 
     def __iter__(self):
        with open(self.dirname) as data_file:    
            data = json.load(data_file)
        # iterate through all file names in our directory
        for paper in data:
            sentences = tokenize.sent_tokenize(paper['full_text'])
            for sentence in sentences:
                try:
                    line = re.sub(r'[?\.,!:;\(\)“\[\]]',' ',sentence)
                    line = [word for word in line.lower().split() if word not in stop]
                    yield line
                except:
                    print("Empty line found")
                    continue
                

In [None]:
#Instantiate iterable on the data

#papers is an iterable of scholarly papers, tokenized for prcessing
papers = MyPapers('data/train_data.json') 

## Word embeddings

Import word2vec word embeddings trained on 2848 scholarly journal articles

In [4]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from functools import reduce

In [5]:
model = gensim.models.word2vec.Word2Vec.load("data/journal.txt")

In [6]:
model.corpus_count

182533

In [7]:
vectors = model.wv

In [8]:
len(vectors.vocab)

127777

** contexts to vectors **

In [123]:
from collections import Counter, defaultdict
# The function takes as arguments a list of tokenized documents and a window size
# and returns each word in the document along with its window context as a tuple

def generate_word_counts(documents):
    counts = Counter()
    
    for document in documents:        
        for word in set(document):                    
            counts[word] += 1
            
    return counts

# Takes list of word tokens as arguments
# Returns a list of vectors whose components are the arithmetic mean of the 
# corresponding component of all of the input vectors

def get_vectors(word_list):
    vecs = []
    for word in word_list:
        try:
            vecs.append(vectors[word])
        except:
            print("{} missing from vocabulary".format(word))
            #continue
    return vecs

# Takes list of vectors as arguments
# Returns a single vector whose components are the arithmetic mean of the 
# corresponding component of all of the input vectors

def vector_average(word_list):
    vector_list = get_vectors(word_list)
    A = np.array(vector_list)
    dim = A.shape[0]
    ones = np.ones(dim)
    return ones.dot(A)/len(word_list)

# Takes list of tokenized documents, target word and window size as arguments
# Returns list of vectors where each vector represents the context window 
# of the target word in the word embedding space

def context2vectors(documents,target):

    context_vectors = []

    for document in documents:
        sentence = document
        if target in sentence:
            str_sentence = streamlined_sentence(sentence)
            context_vectors.append(vector_average(get_vectors(str_sentence)))
                    
    return context_vectors


# Takes list of vectors as arguments
# Returns a single vector whose components are the arithmetic mean of the 
# corresponding component of all of the input vectors weighted by Inverse Document Frecuency

def vector_average2(words): #, word_counts, vectors):
    
    total = sum(list(word_counts.values()))
    vocab = set(vectors.vocab.keys())
    words = [x for x in words if x in vocab]
    vector_list = list(map((lambda x: vectors[x]*np.log((1 + total)/(1 + word_counts[x]))),words))
    
    if len(vector_list) == 0:
        return 0
    elif len(vector_list) == 1:
        vector_sum = vector_list[0]
    else:
        vector_sum = reduce((lambda x,y: np.add(x,y)),vector_list)
        
    weighted_average = (1.0/np.linalg.norm(vector_sum))*vector_sum
    
    return weighted_average

# Takes list of tokenized documents, target word and window size as arguments
# Returns list of vectors where each vector represents the context window 
# of the target word in the word embedding space

def context2vectors2(documents,target):

    context_vectors = []

    for document in documents:
        sentence = document
        if target in sentence:
            sentence.remove(target)
            #str_sentence = streamlined_sentence(sentence)
            context_vectors.append(vector_average2(sentence))
                    
    return context_vectors

In [10]:
#Instantiate iterable on the data

#papers is an iterable of scholarly papers, tokenized for prcessing
papers = MyPapers('data/train_data.json') 

In [None]:
def MyPapers_plus(papers):
    
    phrases = gensim.models.phrases.Phrases(sentences = papers, min_count = 5, threshold = 150)
    bigram = gensim.models.phrases.Phraser(phrases)
    phrases2 = gensim.models.phrases.Phrases(sentences = bigram[papers], min_count = 5, threshold = 300)
    trigram = gensim.models.phrases.Phraser(phrases2)
    
    return trigram[bigram[papers]]

In [19]:
word_counts = generate_word_counts(MyPapers_plus(papers))

In [20]:
word_counts['new_york_city']

14

In [130]:
#dictionary = gensim.corpora.dictionary.Dictionary(MyPapers_plus(papers))
#text = [dictionary.doc2bow(c) for c in MyPapers_plus(papers)]

** Clustering with DBSAN **

Use DBSCAN to determine similar usages of the target homographs. Each of these similar usages will be combined to a representative vector in the embedding space and constitute a "definition" of that word.

In [21]:
def streamlined_sentence(sentence):
    POS = {'JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBD','VBZ'}
    st_sent = [word[0] for word in pos_tag(sentence) if word[1] in POS]
    return st_sent

In [92]:
#Arguments: The desired cluster number, a list of documents making up the corpus, the target homograph
#           a list of labels for the conext sentences indicating the homographs usage, and window size
# The function prints the representative context windows for the target word within the desired cluster   

def print_cluster_context(cluster_number, documents, target, labels):
    
    context_vectors = []

    for document in documents:
        sentence = document
        if target in sentence:
            str_sentence = streamlined_sentence(sentence)
            context_vectors.append(str_sentence)
            
    for i, label in enumerate(labels):
        if label == cluster_number:
            print(context_vectors[i])
            
#Arguments: The desired cluster number, a list of documents making up the corpus, the target homograph
#           a list of labels for the conext sentences indicating the homographs usage, and window size
# The function prints the representative context windows for the target word within the desired cluster   

def cluster_context(documents, target, labels):
    
    context_sentences = []

    for document in documents:
        sentence = document
        if target in sentence:
            str_sentence = streamlined_sentence(sentence)
            context_sentences.append(str_sentence)
                     
    clustered_sentences = defaultdict(list)                
    for i, label in enumerate(labels):
        clustered_sentences[label].append(context_sentences[i])
    
    return clustered_sentences

# Arguments: List of vectors, each representing a context window and a list of labels 
#            corresponding to the context vectors
# Returns:   A dictionary where keys are the identified labels from clustering and the value is a single 
#            representing the cluster

def identify_definition(context_vectors, labels):
    
    cluster_numbers = set(labels)
    definitions = dict()
    cluster_vectors = defaultdict(list)
    
    if len(set(labels)) == 1:
        print("No consistent definition found")
        definitions[0] = np.zeros(len(context_vectors[0]))
        return definitions
    
    for i, label in enumerate(labels):
        cluster_vectors[label].append(context_vectors[i])
    
    for key in cluster_vectors.keys():
        if key < 0:
            continue
        else:
            v = vector_average(cluster_vectors[key])
            definitions[key] = v/np.linalg.norm(v)
                    
    return definitions


In [80]:
target = u'train'
context_vectors = context2vectors2(MyPapers_plus(papers), target)

In [81]:
epsilon = .1

In [85]:
dbscan = DBSCAN(eps = epsilon, metric = 'cosine', algorithm = 'brute', min_samples = 3)
dbscan.fit(context_vectors)

DBSCAN(algorithm='brute', eps=0.1, leaf_size=30, metric='cosine',
    min_samples=3, n_jobs=1, p=None)

In [86]:
labels = dbscan.labels_
n_clusters = len(set(labels)) # - (1 if -1 in labels else 0)
print(n_clusters)

4


In [87]:
labels

array([-1, -1, -1,  1, -1,  0, -1,  1, -1, -1, -1, -1,  0, -1, -1,  0, -1,
        1,  0,  0, -1, -1, -1, -1, -1,  1, -1, -1,  0, -1, -1, -1, -1,  0,
       -1, -1,  0, -1, -1,  0,  0,  0, -1,  0, -1,  0,  2, -1, -1, -1, -1,
       -1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  0, -1, -1, -1, -1, -1, -1,
        0,  2,  2, -1, -1,  0,  0,  0,  0,  0, -1, -1, -1,  0, -1, -1, -1,
        0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1,
       -1, -1,  0, -1, -1, -1, -1, -1])

In [78]:
target_definitions = identify_definition(context_vectors, labels)

In [79]:
contexts = cluster_context(MyPapers_plus(papers), target, labels)

# Recording Definitions

Use the DBSCAN clusters to determine the various definitions of a word, then create a dictionary for the word

In [125]:
def define_window(window, dictionary):
    cosine_dists = []
    wv = vector_average(window)
    window_vector = wv/np.linalg.norm(wv)
    for k,v in dictionary.items():
        cosine_dists.append((1 - np.dot(window_vector, v),k))
    cosine_dists.sort()
    print(cosine_dists)
    return cosine_dists[0][1]
    
def extract_dictionary(papers, homographs):
    dictionary = dict()
    for word in homographs:
        print("Calculating context vectors for \"{}\"".format(word))
        context_vectors = context2vectors2(papers, word)
        print("Clustering...")
        dbscan = DBSCAN(eps = epsilon, metric = 'cosine', algorithm = 'brute', min_samples = 3)
        dbscan.fit(context_vectors)
        labels = dbscan.labels_
        print("found {} distinct definitions".format(len(set(labels))))
        print("Building definitions for \"{}\"".format(word))
        dictionary[word] = identify_definition(context_vectors, labels)
        
    print("Dictionary complete")    
    return dictionary    

In [89]:
homographs = ['attribute', 'bank', 'charge', 'park', 'train']

dictionary = extract_dictionary(papers, homographs)

Calculating context vectors for attribute
Clustering...
found 4 distinct definitions
Building definition for attribute
Calculating context vectors for bank
Clustering...
found 3 distinct definitions
Building definition for bank
Calculating context vectors for charge
Clustering...
found 6 distinct definitions
Building definition for charge
Calculating context vectors for second
Clustering...
found 13 distinct definitions
Building definition for second
Calculating context vectors for train
Clustering...
found 4 distinct definitions
Building definition for train
Dictionary complete


In [42]:
dictionary['bank'].keys()

dict_keys([0, 1])

## Testing

In [98]:
# TEST WINDOW #

target = u'train'
context_vectors = context2vectors2(MyPapers_plus(papers), target)
dbscan = DBSCAN(eps = epsilon, metric = 'cosine', algorithm = 'brute', min_samples = 4)
dbscan.fit(context_vectors)
labels = dbscan.labels_
#target_definitions = identify_definition(context_vectors, labels)
target_definitions = dictionary[target]


contexts = cluster_context(MyPapers_plus(papers), target, labels)
correct = 0
wrong = 0
for c in contexts:
    for window in contexts[c]:
        d = define_window(window, target_definitions)
        if c == d:
            correct += 1
        else:
            print("Cluster {}, defined as {} \n".format(c, d), window, '\n')
            if c != -1:
                wrong += 1
                
print("Number correct: {}\nNumber wrong: {}".format(correct, wrong))

Cluster -1, defined as 1 
 ['model', 'working', 'helps', 'train', 'students', 'develop', 'trust', 'researchers', 'university', 'company'] 

Cluster -1, defined as 0 
 ['paper', 'complex', 'network', 'analysis', 'performed', 'time-varying', 'brain', 'functional', 'connectomes', 'persistent', 'peripheral', 'neuropathic', 'pain', 'obtained', 'means', 'local', 'field', 'potential', 'spike', 'train', 'analysis'] 

Cluster -1, defined as 0 
 ['hence', 'study', 'regions', 'cortex', 'ventral', 'posterolateral', 'vpl', 'nuclei', 'thalamus', 'chosen', 'putting', 'probes', 'local', 'field', 'potential', 'spike', 'train', 'analysis', 'aim', 'reconstruct', 'functional', 'time-varying', 'brain', 'activity', 'regions'] 

Cluster -1, defined as 1 
 ['based', 'success', 'uk', 'national', 'health', 'service', 'mandated', 'primary', 'care', 'services', 'screening', 'treatment', 'anxiety', 'depression', 'common', 'curriculum', 'service', 'entitled', 'improving', 'access', 'psychological', 'therapies', 'de

In [129]:
def read_glossary(glossary):
    
    vector_glossary = dict()
    
    for k, v in glossary.items():
        vector_glossary[k] = {key:vector_average2(tokenize.word_tokenize(value)) for (key,value) in v.items()}
    
    return vector_glossary

def context(documents, target):
    
    context_sentences = []

    for document in documents:
        #print(document[:15])
        sentence = document
        if target in sentence:
            #str_sentence = streamlined_sentence(sentence)
            #print[str_sentence]
            sentence.remove(target)
            context_sentences.append(sentence)
            
    return context_sentences


In [136]:
g = read_glossary(glossary)

In [134]:
target = u'charge'
context_sentences = context(MyPapers_plus(papers), target)
#print(context_sentences)

for c in context_sentences:
    d = define_window(c, g[target])
    print("{}\nDefined as {} \n".format(c, d))



[(0.16059814709758236, 5), (0.23080756133305957, 1), (0.26926161686981109, 3), (0.28137601933693446, 4), (0.3843184364280906, 2)]
['specifically', 'mothers', 'become', 'manager', 'taking', 'full', 'childrearing', 'affairs', 'assume', 'responsibilities', 'sociocultural', 'reproduction']
Defined as 5 

[(0.20935028349878992, 3), (0.24966084715189585, 2), (0.5282130816233771, 4), (0.52897297361852635, 1), (0.58132746878072261, 5)]
['dust', 'particles', 'attach', 'onto', 'surface', 'due', 'gravity', 'electrostatic', 'mechanical', 'effects', 'wind', 'water', 'droplets']
Defined as 3 

[(0.17710670130761486, 3), (0.25376895531419441, 2), (0.4569239663491701, 1), (0.48875424951993385, 4), (0.49887785183826339, 5)]
['deposition', 'held', 'variation', 'electrical', 'potential', 'near', 'surface', 'double', 'layer', 'surface', 'energy', 'effects', 'capillary', 'effects', 'addition', 'gravity', 'electrostatic', 'forces', 'qasem']
Defined as 3 

[(0.1846605199103839, 3), (0.25221095216650369, 2), 

In [135]:
charge_def = {1:"(criminal law) a pleading describing some wrong or offense",
              2:"(explosive) a quantity of explosive to be set off at one time",
              3:"(physics) the quantity of unbalanced electricity in a body (either positive or negative) and construed as an excess or deficiency of electrons",
              4:"(finance) request for payment in exchange for a good or service",
              5:"the responsibility of taking care or control of someone or something."}

state_def = {1:"(physics) the condition of matter with respect to structure, form, constitution, phase, or the like",
            2:"a nation or territory considered as an organized political community under one government",
            3:"express something definitely or clearly in speech or writing"}
glossary = { 'charge':charge_def, 'state':state_def}

In [137]:
target = u'state'
context_sentences = context(MyPapers_plus(papers), target)
#print(context_sentences)

for c in context_sentences:
    d = define_window(c, g[target])
    print("{}\nDefined as {} \n".format(c, d))

[(0.27474465878066157, 1), (0.34133801464450564, 3), (0.39960212011433138, 2)]
['mazzoni', 'cad/cam', 'procedure', 'surgical', 'application', 'minimized', 'interval', '2', 'weeks', 'aim', 'present', 'retrospective', 'observational', 'study', 'describe', 'development', 'in-house', 'workflow', 'reduced', 'planning', 'time', 'thereby', 'allowing', 'cad/cam', 'based', 'jaw', 'reconstruction', 'microvascular', 'fibula', 'graft', 'even', 'urgent', 'cases', 'retrospective', 'analysis', 'done', '30', 'patients', 'case', 'one', 'patient', 'illustrated', 'full', 'workflow']
Defined as 1 

[(0.31922753104571799, 2), (0.34476964513183295, 1), (0.40652343177275196, 3)]
['need', 'ethics', 'approval', 'waived', 'ethics', 'commission', 'chamber', 'medicine', 'rhineland-pfalz', 'according', 'berufsordnung', '§', '15', 'landeskrankenhausgesetz', '§', '36', 'und', '§', '37', 'workflow', 'applied', '30', 'cases', 'primary', 'secondary', 'reconstruction', 'time', 'january', '2014', 'january', '2016']
Defin

[(0.2511769469744759, 1), (0.38363806544073675, 2), (0.4281299564251273, 3)]
['test', 'data', 'obtained', 'simulation', 'covers', 'possible', 'states', 'sg', 'namely', 'normal', 'internal', 'fault', 'state', 'external', 'fault', 'state', 'various', 'conditions', 'fault', 'type', 'fault', 'location', 'fault', 'resistance', 'fault', 'inception', 'angle', 'rest_paper_organized', 'follows']
Defined as 1 

[(0.25043151933793784, 1), (0.35555411770040068, 3), (0.43284297667279104, 2)]
['section', 'prove', 'main', 'result', 'well-known', 'bsdes', 'provide', 'stochastic', 'representation', 'solutions', 'semi-linear', 'parabolic', 'partial_differential_equations', 'pdes', 'sometimes', 'called', 'nonlinear', 'feynman–kac', 'formula”']
Defined as 1 

[(0.27931971509511, 1), (0.36255688726061774, 2), (0.36429906156223191, 3)]
['stochastic', 'programming', 'model', 'developed', 'using', 'flow', 'method', 'express', 'different', 'states', 'batteries', 'objective', 'maximize', 'station’s', 'net', 'pr

[(0.21945071483131828, 2), (0.2765557292984846, 1), (0.30004582950275371, 3)]
['however', 'insurance', 'payouts', 'account', 'temporary', 'labor', 'inactivity', 'business', 'miss', 'consider', 'either', 'inter-industrial', 'impacts', 'inter-regional', 'connections', 'resulting', 'less', 'economic', 'damages', 'aspect', 'study', 'analyzed', 'u', 'economies', 'could', 'impacted', 'short-term', 'job', 'losses', 'generated', 'sandy', 'path', 'north', 'florida', 'new', 'hampshire']
Defined as 2 

[(0.17902759521736433, 2), (0.29487623969642263, 3), (0.32306547318407142, 1)]
['fema', 'keeps', 'continuing', 'collaborate', 'local', 'federal', 'partners', 'supporting', 'recovery', 'individuals', 'families', 'businesses', 'communities', 'superstorm', 'begins', 'economic', 'damage', 'assessment', 'measured', 'accurately']
Defined as 2 

[(0.24367519207331845, 2), (0.28040492817278784, 3), (0.28316105105476708, 1)]
['therefore', 'unless', 'unexpected', 'natural_disasters', 'avoided', 'local', 'fed

[(0.17551939982147413, 2), (0.27257259448973792, 3), (0.33916134637727779, 1)]
['first', 'strongest', 'drivers', 'csr', 'among', 'large-scale', 'mining', 'firms', 'operating', 'ghana', 'need', 'procure', 'protect', 'promote', 'company', 'reputation', 'desire', 'pre-empt', 'stiffer', 'regulation', 'encouragement', 'pre-existing', 'development', 'plans', 'host', 'communities']
Defined as 2 

[(0.23474967541883718, 2), (0.23669370547550783, 1), (0.25165808276262092, 3)]
['moderately', 'strong', 'drivers', 'related', 'expectations', 'agitations', 'various', 'internal_external', 'stakeholders', 'fear', 'sanctions', 'industry', 'peer', 'pressure', 'mimicry', 'descending', 'order', 'importance', 'second', 'beside', 'heightened', 'sense', 'moral_obligation', 'towards', 'shareholders', 'e']
Defined as 2 

[(0.24532369039801272, 3), (0.30473200104790243, 2), (0.3537125011415081, 1)]
['baralt', 'gurzynski-weiss', 'failed', 'find', 'significant', 'differences', 'anxiety', 'face-to-face', 'computer

[(0.27950481675144978, 1), (0.31831768259392534, 3), (0.34520123929287283, 2)]
['set', 'training', 'programs', 'next', 'used', 'determine', 'relationship', 'variable', 'power', 'consumption', 'hardware', 'component', 'selected', 'model']
Defined as 1 

[(0.26454380833444735, 1), (0.36311519879800569, 3), (0.36389666429927647, 2)]
['smartphone', 'components', 'held', 'particular', 'long', 'periods', 'time', 'state', 'discharge', 'battery', 'monitored', 'using', 'built-in', 'voltage', 'sensors', 'therefore', 'providing', 'estimate', 'power', 'consumption', 'particular', 'activity', 'state']
Defined as 1 

[(0.30785320123244064, 3), (0.31376107914433349, 1), (0.35372593164145272, 2)]
['therefore', 'mobile_multimedia', 'receiver', 'select', 'crowd', 'service', 'rejection', 'probability', 'sequence', 'video', 'frames', 'according', 'competition', 'video', 'frames', 'based', 'opportunistic_wavelet', 'model', 'proposed', 'cooperative', 'control', 'protocol', 'multimedia', 'mobile_crowd', 'ser

[(0.25860315571950387, 2), (0.31858935604791028, 3), (0.34655997276173367, 1)]
['next', 'section', 'reviews', 'literature', 'transnational', 'activities', 'argues', 'immigration-control', 'policies', 'create', 'irregular', 'legal_statuses', 'direct_indirect', 'effects', 'mobile', 'non-mobile_transnational', 'activities']
Defined as 2 

[(0.23515310574971948, 2), (0.28122314434028595, 3), (0.29215910143125445, 1)]
['final', 'sections', 'discuss', 'results', 'emphasis', 'effects', 'legal_status', 'vary', 'national', 'context', 'conclude', 'implications', 'findings', 'policies', 'immigration', 'control', 'co-development', 'section', 'reviews', 'literature', 'role', 'migrants’', 'transnational', 'activities', 'arguing', 'immigration-control', 'policies', 'may', 'constrain', 'cross-border', 'action']
Defined as 2 

[(0.22055021563363519, 2), (0.25923887055236949, 3), (0.31087442421346245, 1)]
['expected', 'migrants', 'mercy', 'immigration-control', 'mechanisms', 'ability', 'make', 'trips', 