### Project Kojak

**The problem**

We will attempt to identify ambiguously defined words - homographs (words that are spelled the same but have multiple meanings) - and determine the exact meaning of each occurrence from its context window.

Here we attempt to do this in a few stages:
1. train a word embedding on some training corpus using skip-gram (here we use 1000 scholarly research papers)
2. identify common homographs and extract the various context windows
3. interpret the context windows as vectors in the embedding space and apply a clustering algorithm (DBSCAN). Each cluster is interpreted as a distinct definition of the homograph and is then summarized by a representative vector.
4. apply to a test corpus - match the context of a given homograph to the most similar group.


### This notebook

Loads a pre-trained word embedding model, uses DBSCAN clustering to identify several 'definitions' of a set of homographs, and saves those definitions for later use.

In [1]:
import gensim
import json
import os
import re
import time
from nltk.corpus import stopwords
from pprint import pprint



In [2]:
# Stopword set used by MyPapers when tokenizing the source file:
# NLTK's English stopwords plus punctuation / quote tokens and the
# tokens 'et', 'al', 'al.'

stop = set(stopwords.words('english'))
stop.update(['?','!',':',';','[',']','[]','“' ])
stop.update(['.', ',', '(', ')', "'", '"',"''",'""',"``",'”', '“', '?', '!', '’', 'et', 'al', 'al.'])

class MyPapers(object):
    # Re-iterable stream of tokenized papers.
    #
    # Every call to __iter__ re-opens the JSON file, so the same object can be
    # passed to several consumers that each need a full pass over the corpus.

    def __init__(self, dirname):
        # NOTE: despite its name, `dirname` is handed straight to open(), i.e.
        # it is the path of a single JSON file.
        self.dirname = dirname

    def __iter__(self):
        # Yields one list of tokens per paper: lower-cased, whitespace-split,
        # stopwords removed, then listed punctuation characters replaced by a
        # space inside each remaining token.
        with open(self.dirname) as data_file:
            data = json.load(data_file)

        for paper in data:
            # Only the text lookup is guarded. The yield must stay outside the
            # try block: a handler wrapped around it would also intercept the
            # GeneratorExit raised when a consumer stops early, and the
            # generator would then fail with "generator ignored GeneratorExit".
            try:
                words = paper['full_text'].lower().split()
            except (KeyError, TypeError, AttributeError):
                # Missing 'full_text', a non-mapping entry, or non-string text
                print("Empty document found")
                continue

            line = [word for word in words if word not in stop]
            # NOTE(review): punctuation is replaced by a space, not removed, so
            # tokens such as 'analysis ' keep a trailing blank. Left unchanged
            # because the pre-trained embedding was presumably built with the
            # same tokenization - confirm before altering.
            line = [re.sub(r'[?\.,!:;\(\)“\[\]]',' ',l) for l in line]
            yield line
                

## Word embeddings

Import word2vec word embeddings trained on 2848 scholarly journal articles

In [3]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from functools import reduce

In [4]:
# Load the previously saved gensim Word2Vec model from disk
model = gensim.models.word2vec.Word2Vec.load("data/journal.txt")

In [5]:
model.corpus_count

2848

In [6]:
# Module-level handle on the model's word vectors; get_vectors() and
# vector_average2() read this global directly
vectors = model.wv

In [7]:
len(vectors.vocab)

200481

**Contexts to vectors**

In [8]:
from collections import Counter, defaultdict
# Arguments: an iterable of tokenized documents and a window size
# Returns:   a Counter of tokens. For every position in every document the
#            target word is counted once, and every word within `window_size`
#            positions on either side of it is counted once more.

def generate_word_counts(documents, window_size = 6):
    tally = Counter()

    for tokens in documents:
        n_tokens = len(tokens)
        for position, target in enumerate(tokens):
            # Clip the window to the document boundaries
            first = max(position - window_size, 0)
            last = min(position + window_size + 1, n_tokens)

            tally[target] += 1
            tally.update(tokens[j] for j in range(first, last) if j != position)

    return tally

In [9]:
# Instantiate an iterable over the training data

# papers is an iterable of scholarly papers, tokenized for processing
papers = MyPapers('data/train_data.json') 

In [10]:
# Window-based token counts over the whole corpus; vector_average2() reads
# this module-level Counter to weight word vectors
word_counts = generate_word_counts(papers)

In [11]:
# Takes a list of word tokens as argument
# Returns a list with the embedding vector of each token, in the same order,
# looked up in the module-level `vectors`

def get_vectors(word_list):
    return [vectors[token] for token in word_list]

# Takes a list of equally sized vectors as argument
# Returns a single vector holding, component by component, the arithmetic
# mean of the inputs (computed as a ones-vector dot product divided by the
# number of input vectors)

def vector_average(vector_list):
    stacked = np.array(vector_list)
    count = stacked.shape[0]
    return np.dot(np.ones(count), stacked) / count

# Takes list of tokenized documents, target word and window size as arguments
# Returns list of vectors where each vector is the plain (unweighted) mean of
# the embedding vectors of the words in one context window of the target word
#
# NOTE(review): get_vectors() does a direct vectors[word] lookup, so a context
# word missing from the embedding vocabulary presumably raises - verify, or
# use context2vectors2, which filters such words out.

def context2vectors(documents,target,window_size = 6):

    context_vectors = []

    for document in documents:
        if target not in document:
            continue
        for word, context in generate_windows([document], window_size):
            if word == target:
                # get_vectors() returns vectors, so they must be averaged by
                # vector_average(); vector_average2() expects tokens instead.
                context_vectors.append(vector_average(get_vectors(context)))

    return context_vectors


In [12]:
# Takes a list of word tokens as argument
# Returns a single vector: the mean of the embedding vectors of the tokens
# found in the vocabulary, each weighted by
#     log((1 + total) / (1 + word_counts[token]))
# where `total` is the sum of all counts in the module-level `word_counts`
# (an inverse-frequency weighting in the spirit of IDF).
# Returns the scalar 0 when none of the tokens is in the vocabulary.
#
# NOTE(review): the 0 fallback is a scalar, not a vector; callers that stack
# or normalise the result should be aware of that.

def vector_average2(words):
    # Dict membership is O(1); materialising the key list for every word made
    # each lookup linear in the vocabulary size.
    known = [x for x in words if x in vectors.vocab]

    if not known:
        return 0

    total = sum(word_counts.values())
    weighted = [vectors[x]*np.log((1 + total)/(1 + word_counts[x])) for x in known]

    # Left-to-right accumulation; with a single vector reduce returns it as is
    vector_sum = reduce(np.add, weighted)

    return (1.0/len(known))*vector_sum

# Takes list of tokenized documents, target word and window size as arguments
# Returns one entry per occurrence of the target word: the result of
# vector_average2() applied to the words of that occurrence's context window

def context2vectors2(documents,target,window_size = 6):

    averaged_windows = []

    for tokens in documents:
        if target not in tokens:
            continue
        for word, context in generate_windows([tokens], window_size):
            if word == target:
                averaged_windows.append(vector_average2(context))

    return averaged_windows

In [13]:
# Build a gensim Dictionary over the corpus and convert every paper to its
# bag-of-words representation.
# NOTE(review): `dictionary` is rebound further down to the result of
# extract_dictionary(), and `text` is not referenced again in the visible part
# of this notebook - confirm both are still needed.
dictionary = gensim.corpora.dictionary.Dictionary(papers)
text = [dictionary.doc2bow(c) for c in papers]

**Clustering with DBSCAN**

Use DBSCAN to determine similar usages of the target homographs. Each of these similar usages will be combined to a representative vector in the embedding space and constitute a "definition" of that word.

In [14]:
# Arguments: an iterable of tokenized documents and a window size
# Yields:    for every position in every document, a tuple
#            (word, context) where context is the list of words lying within
#            `window_size` positions on either side of the word

def generate_windows(documents, window_size):
    for tokens in documents:
        n_tokens = len(tokens)
        for position, target in enumerate(tokens):
            # Clip the window to the document boundaries
            first = max(position - window_size, 0)
            last = min(position + window_size + 1, n_tokens)

            neighbours = [tokens[j] for j in range(first, last) if j != position]

            yield (target, neighbours)
            

# Arguments: the desired cluster number, a list of documents making up the
#            corpus, the target homograph, a list of cluster labels (one per
#            context window of the target, in corpus order) and window size
# The function prints every context window of the target word whose label
# equals the desired cluster number

def print_cluster_context(cluster_number, documents, target, labels, window_size = 6):

    target_windows = []

    for tokens in documents:
        if target not in tokens:
            continue
        for word, context in generate_windows([tokens], window_size):
            if word == target:
                target_windows.append(context)

    # labels[i] is taken to belong to the i-th collected window
    for position, label in enumerate(labels):
        if label == cluster_number:
            print(target_windows[position])
            


In [15]:
# Arguments: a list of documents making up the corpus, the target homograph,
#            a list of cluster labels (one per context window of the target,
#            in corpus order) and window size
# Returns:   a defaultdict(list) mapping each label to the context windows
#            (lists of words) that carry it

def cluster_context(documents, target, labels, window_size = 6):

    target_windows = []

    for tokens in documents:
        if target not in tokens:
            continue
        for word, context in generate_windows([tokens], window_size):
            if word == target:
                target_windows.append(context)

    # labels[i] is taken to belong to the i-th collected window
    grouped = defaultdict(list)
    for position, label in enumerate(labels):
        grouped[label].append(target_windows[position])

    return grouped


In [16]:
# Arguments: List of vectors, each representing a context window, and a list
#            of labels where labels[i] belongs to context_vectors[i]
# Returns:   A dictionary whose keys are the non-negative labels and whose
#            values are the unit-length mean vector of that label's members.
#            Negative labels are skipped.

def identify_definition(context_vectors, labels):

    members = defaultdict(list)
    for position, label in enumerate(labels):
        members[label].append(context_vectors[position])

    definitions = {}
    for label, grouped in members.items():
        if label < 0:
            continue
        centroid = vector_average(grouped)
        definitions[label] = centroid/np.linalg.norm(centroid)

    return definitions


In [17]:
# Pick the homograph to analyse and turn each of its context windows in the
# corpus into one weighted-average vector (see context2vectors2)
target = u'extract'
context_vectors = context2vectors2(papers, target)

In [18]:
# Cluster the context vectors with DBSCAN under cosine distance
dbscan = DBSCAN(eps = .07, metric = 'cosine', algorithm = 'brute', min_samples = 5)
dbscan.fit(context_vectors)

DBSCAN(algorithm='brute', eps=0.07, leaf_size=30, metric='cosine',
    min_samples=5, n_jobs=1, p=None)

In [19]:
# One cluster label per context vector.
# NOTE: because the correction below is commented out, n_clusters counts every
# distinct label, including the -1 label if it is present.
labels = dbscan.labels_
n_clusters = len(set(labels)) # - (1 if -1 in labels else 0)
print(n_clusters)

4


# Recording Definitions

Use the DBSCAN clusters to determine the various definitions of a word, then create a dictionary for the word

In [22]:
# One unit-length representative vector per non-negative cluster label
target_definitions = identify_definition(context_vectors, labels)

In [23]:
# Group the raw context windows (lists of words) of the target by cluster label
contexts = cluster_context(papers, target, labels)

In [24]:
# Arguments: a context window (list of words) and a dictionary mapping
#            definition labels to unit-length definition vectors
# Returns:   the label whose vector has the smallest cosine distance
#            (1 - dot product) to the normalised weighted average of the window
def define_window(window, dictionary):
    averaged = vector_average2(window)
    unit_window = averaged/np.linalg.norm(averaged)
    ranked = sorted(
        (1 - np.dot(unit_window, definition), label)
        for label, definition in dictionary.items()
    )

    return ranked[0][1]
    

In [34]:
# Arguments: an iterable of tokenized papers, a list of homographs, and the
#            DBSCAN `eps` / `min_samples` settings (defaults match the values
#            used elsewhere in this notebook)
# Returns:   a dictionary mapping each homograph to its definitions, i.e. the
#            {cluster label: unit vector} dictionary built by
#            identify_definition(). A homograph with no occurrences in the
#            corpus maps to an empty dictionary.
def extract_dictionary(papers, homographs, eps = .07, min_samples = 5):
    dictionary = dict()
    for word in homographs:
        print("Calculating context vectors for {}".format(word))
        context_vectors = context2vectors2(papers, word)
        if not context_vectors:
            # DBSCAN cannot be fitted on zero samples
            print("No occurrences of {} found, skipping".format(word))
            dictionary[word] = dict()
            continue
        print("Clustering...")
        dbscan = DBSCAN(eps = eps, metric = 'cosine', algorithm = 'brute', min_samples = min_samples)
        dbscan.fit(context_vectors)
        labels = dbscan.labels_
        # Negative labels mark noise; identify_definition() skips them too,
        # so they are not counted as definitions here.
        n_definitions = len({label for label in labels if label >= 0})
        print("found {} distinct definitions".format(n_definitions))
        print("Building definition for {}".format(word))
        dictionary[word] = identify_definition(context_vectors, labels)

    print("Dictionary complete")
    return dictionary

In [33]:
# Homographs for which cluster-based definitions are built. The assignment
# must be live: the call below raises NameError on a fresh run otherwise.
homographs = ['attribute', 'bank', 'charge', 'feet', 'second', 'train']

dictionary = extract_dictionary(papers, homographs)

Calculating context vectors for attribute
Clustering...
Calculating context vectors for bank
Clustering...
Calculating context vectors for charge
Clustering...
Calculating context vectors for feet
Clustering...
Calculating context vectors for second
Clustering...
Calculating context vectors for train
Clustering...


In [42]:
dictionary['bank'].keys()

dict_keys([0, 1])

## Testing

In [56]:
# TEST WINDOW #
# Re-cluster the context windows of one homograph with the same DBSCAN
# settings, then check whether define_window() assigns each window to the
# definition that carries its own cluster label. Definitions are taken from
# the prebuilt dictionary.

target = u'train'
context_vectors = context2vectors2(papers, target)
target_definitions = dictionary[target]
dbscan = DBSCAN(eps = .07, metric = 'cosine', algorithm = 'brute', min_samples = 5)
dbscan.fit(context_vectors)
labels = dbscan.labels_

contexts = cluster_context(papers, target, labels)
correct = 0
wrong = 0
for cluster_label, cluster_windows in contexts.items():
    for window in cluster_windows:
        assigned = define_window(window, target_definitions)
        if cluster_label == assigned:
            correct += 1
            continue
        print("Cluster {}, defined as {} \n".format(cluster_label, assigned), window, '\n')
        # Windows labelled -1 are printed but not counted as wrong
        if cluster_label != -1:
            wrong += 1

print("Number correct: {}\nNumber wrong: {}".format(correct, wrong))

Cluster -1, defined as 0 
 ['obtained', 'means', 'local', 'field', 'potential', 'spike', 'analysis ', 'wide', 'range', 'topological', 'network', 'measures'] 

Cluster -1, defined as 0 
 ['putting', 'probes', 'local', 'field', 'potential', 'spike', 'analysis ', 'aim', 'reconstruct', 'functional', 'time-varying', 'brain'] 

Cluster -1, defined as 2 
 ['improving', 'access', 'psychological', 'therapies', ' iapt ', 'developed', 'low', 'intensity', 'psychological', 'wellbeing', 'practitioners', ' pwps '] 

Cluster -1, defined as 1 
 [' imo ', 'requires', 'using', 'marine', 'radar', 'simulator', 'seafarers ', 'marine', 'radar', 'simulator', 'acceptable', 'approach'] 

Cluster -1, defined as 0 
 ['endpoint', 'detection', 'detect', 'learned', ' in', 'examples', 'high', 'voltage', 'sockets  ', 'hypothesis', 'cannot', 'reproduce'] 

Cluster -1, defined as 0 
 ['  ', 'employs', 'use', 'virtual', 'reality', 'goggles', 'construction', 'site', 'workers', 'forming', 'ability', 'identify'] 

Cluster -

Cluster -1, defined as 0 
 ['similarity', 'functions', 'concatenated', 'feature', 'vector ', 'used', 'classifier', 'determining', 'whether', 'corresponding', 'two', 'tracks'] 

Cluster -1, defined as 0 
 ['first ', 'similarities', 'based', 'different', 'descriptors', 'adopted', 'classifier ', 'then ', 'decision', 'made', 'classifier', 'integrated'] 

Cluster -1, defined as 0 
 ['collect', 'large', 'number', 'ecg', 'heartbeats', 'order', 'reliable', 'biometric', 'system ', 'hence ', 'issue', 'might'] 

Cluster -1, defined as 0 
 ['size', 'environment', 'allocating', 'small', 'number', 'observations', 'model ', 'chose', 'arbitrary', 'number', '20', 'observations'] 

Cluster -1, defined as 1 
 ['embedded', 'systems ', 'multiple', 'objects', 'environment', 'used', 'evolved', 'neural', 'network', 'time '] 

Cluster -1, defined as 0 
 ['sequence', 'file', 'parsing', 'routine ', 'strand', 'able', 'classify', 'malware', 'data', 'changes while', 'produce', 'winning'] 

Cluster -1, defined as 1 

In [61]:
# Show which definition each window carrying label -1 is matched to
for noise_window in contexts[-1]:
    print(define_window(noise_window, target_definitions), noise_window, '\n')

0 ['obtained', 'means', 'local', 'field', 'potential', 'spike', 'analysis ', 'wide', 'range', 'topological', 'network', 'measures'] 

0 ['putting', 'probes', 'local', 'field', 'potential', 'spike', 'analysis ', 'aim', 'reconstruct', 'functional', 'time-varying', 'brain'] 

2 ['improving', 'access', 'psychological', 'therapies', ' iapt ', 'developed', 'low', 'intensity', 'psychological', 'wellbeing', 'practitioners', ' pwps '] 

1 [' imo ', 'requires', 'using', 'marine', 'radar', 'simulator', 'seafarers ', 'marine', 'radar', 'simulator', 'acceptable', 'approach'] 

0 ['endpoint', 'detection', 'detect', 'learned', ' in', 'examples', 'high', 'voltage', 'sockets  ', 'hypothesis', 'cannot', 'reproduce'] 

0 ['  ', 'employs', 'use', 'virtual', 'reality', 'goggles', 'construction', 'site', 'workers', 'forming', 'ability', 'identify'] 

0 ['construction', 'plant', 'equipment', 'represented', 'virtual', 'environment', 'students', 'increase', 'learning', 'experience ', 'showed', 'improvement'] 



0 ['shown', 'bci', 'potential', 'alternative', 'treatment', 'option', 'adhd', 'subjects', ' –  ', 'neurofeedback', 'adhd', 'treatment'] 

0 ['relative', 'frequency', 'training', 'set ', 'creating', 'balanced', 'set', 'undersampling', 'majority', 'class', 'retaining', 'original'] 

0 ['pre-diagnosis', 'model', 'considerably', 'fewer', 'data', 'points', 'on ', 'result ', 'likely', 'pre-diagnosis', 'accuracy', 'scores'] 

1 ['  ', 'truccolo', '   ', 'modir', '   ', 'spike', 'analysis', 'active', 'neurobiological', 'research', 'area', 'calling'] 

0 ['machine', 'learning ', 'adjacency', 'matrix', 'given', 'goal', 'model', 'tuning', 'matrix', 'latent', 'features', 'way'] 

1 ['sekimoto', '   ', 'model', 'used', 'finding', 'optimal', 'schedule', 'based', 'train', 'demand ', 'proposed', 'orlin '] 

1 ['used', 'finding', 'optimal', 'train', 'schedule', 'based', 'demand ', 'proposed', 'orlin ', 'serafini ', 'ukovich', ' –  '] 

1 ['  ', 'gives', 'methodology', 'extract', 'relevant', 'data', 's