### Project Kojak

**The problem**

We will attempt to identify ambiguously defined words - words that are homographs (spelled the same, but with multiple meanings) - and determine the exact meaning of each such word from its context window.

Here we attempt to do this in a few stages:
1. train a word embedding on some training corpus using skip-gram (here we use 1000 scholarly research papers)
2. identify common homographs and extract the various context windows
3. interpret the context windows as vectors in the embedding space and apply a clustering algorithm (DBSCAN). Each cluster is interpreted as a distinct definition of the homograph. Each cluster is then represented by a single representative vector.
4. apply to a test corpus - match the context of a given homograph to the most similar cluster.


In [2]:
import gensim
import json
import os
import re
import time
from nltk.corpus import stopwords
from pprint import pprint



In [3]:
# Stop list used while tokenizing the papers: NLTK's English stopwords plus
# punctuation tokens and citation filler ("et", "al", "al.").
# It is a set so that the per-token membership test is a hash lookup.

stop = set(stopwords.words('english'))
stop.update([
    '?', '!', ':', ';', '[', ']', '[]', '“',
    '.', ',', '(', ')', "'", '"', "''", '""', "``", '”', '’',
    'et', 'al', 'al.',
])

class MyPapers(object):
    """Re-iterable stream of tokenized papers read from one JSON file.

    The file is opened and parsed again on every iteration, so the same
    object can be handed to consumers that need several passes over the
    corpus.  Note that ``json.load`` reads the whole file into memory for
    the duration of a pass.

    Each yielded item is a list of lower-cased, whitespace-split tokens of a
    paper's ``'full_text'`` field, with stopwords (module-level ``stop``)
    removed and punctuation characters replaced by a space.
    """

    # Punctuation replaced by a single space inside every surviving token.
    # Compiled once instead of being looked up for each token.
    _PUNCTUATION = re.compile(r'[?\.,!:;\(\)“\[\]]')

    def __init__(self, dirname):
        # Despite its name, ``dirname`` is the path of a single JSON file.
        self.dirname = dirname

    def __iter__(self):
        with open(self.dirname) as data_file:
            data = json.load(data_file)

        for paper in data:
            # Only the field access can legitimately fail (missing key, null
            # text, malformed entry).  The ``yield`` is kept outside the
            # ``try`` so that closing the generator early (GeneratorExit) is
            # never swallowed and mistaken for an empty document.
            try:
                raw_tokens = paper['full_text'].lower().split()
            except (KeyError, TypeError, AttributeError):
                print("Empty document found")
                continue

            # NOTE(review): stopwords are filtered before punctuation is
            # stripped, and punctuation becomes a space rather than being
            # removed, so tokens such as 'the,' survive and tokens may carry
            # whitespace.  Left unchanged because everything derived from
            # these tokens depends on them - confirm before changing.
            yield [
                self._PUNCTUATION.sub(' ', token)
                for token in raw_tokens
                if token not in stop
            ]
                

## Extract Context Window 

In [4]:
# Declare which word we are searching for
target = u'extract'

In [5]:
# Build the corpus iterable and keep only the papers that mention the target.

# `papers` yields one token list per scholarly paper in the training file.
papers = MyPapers('data/train_data.json')

# Token lists of the papers in which the target word occurs at least once.
target_corpus = [tokens for tokens in papers if target in tokens]

len(target_corpus)

163

In [9]:
def print_10_contexts(target, paper_start, target_corpus):
    """Print the target word's context windows for up to 10 papers.

    Args:
        target: word whose context windows are printed.
        paper_start: index in ``target_corpus`` of the first paper to show.
        target_corpus: indexable sequence of tokenized papers.

    Papers ``paper_start`` .. ``paper_start + 9`` are shown; printing stops
    early when the corpus has fewer papers than that.  Windows come from
    ``generate_windows`` (defined elsewhere) with a window size of 6; each
    window is read as ``(word, context_words)``.
    """
    print(target.upper())

    for paper_count in range(1, 11):
        # Running off the end of the corpus is the only expected failure, so
        # only the lookup is guarded; errors raised while generating windows
        # now surface instead of silently ending the loop.
        try:
            paper = target_corpus[paper_start + paper_count - 1]
        except IndexError:
            break

        windows = generate_windows([paper], 6)
        print('\nPAPER {}'.format(paper_count))

        count = 1
        for w in windows:
            if w[0] == target:
                print('{}: {}'.format(count, make_sentence(w[1])))
                count += 1
    

In [8]:
def make_sentence(words_list):
    """Join words into a single space-terminated sentence.

    Every word, including the last one, is followed by one space.  The
    joined text is returned UTF-8 encoded (``bytes`` on Python 3); an empty
    ``words_list`` gives an empty result.
    """
    pieces = []
    for word in words_list:
        pieces.append(word + ' ')
    sentence = ''.join(pieces)
    return sentence.encode('utf-8')
    

In [None]:
#paper_start = 50

#print_10_contexts(target, paper_start, target_corpus)

## Word embeddings

In [10]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from functools import reduce

In [12]:
# Train the skip-gram word embedding on the tokenized papers.

model = gensim.models.word2vec.Word2Vec(
    sentences=papers,
    size=300,       # dimensionality of the word vectors
    window=6,       # maximum distance between a word and its context words
    min_count=1,    # keep every token, including those seen only once
    workers=4,      # number of training threads
    sg=1,           # 1 selects skip-gram (0 would be CBOW)
)

In [13]:
model.corpus_count

2848

In [14]:
model.save("data/journal.txt")

In [13]:
vectors = model.wv

In [15]:
len(vectors.vocab)

200481

**Contexts to vectors**

In [7]:
from collections import Counter
def generate_word_counts(documents, window_size=6):
    """Count how often each word appears across all context windows.

    For every position in every document the word at that position is
    counted once, and every word within ``window_size`` positions on either
    side (clipped at the document boundaries) is counted once as well.  A
    word therefore scores one point per window it falls into, not one point
    per occurrence.

    Args:
        documents: iterable of tokenized documents (indexable sequences).
        window_size: number of context positions taken on each side.

    Returns:
        collections.Counter mapping each word to its window-based count.
    """
    counts = Counter()

    for document in documents:
        n_tokens = len(document)
        for index, word in enumerate(document):
            # The centre word itself.
            counts[word] += 1

            # Its neighbours, with the window clipped to the document.
            lo = max(0, index - window_size)
            hi = min(n_tokens, index + window_size + 1)
            counts.update(document[i] for i in range(lo, hi) if i != index)

    return counts

In [14]:
word_counts = generate_word_counts(papers)

In [40]:
def get_vectors(word_list):
    """Look up the embedding vector of every word in ``word_list``.

    Each word is used to index the module-level ``vectors`` object; the
    results are returned as a list in the same order as the input words.
    Presumably a word missing from the embedding raises ``KeyError`` -
    verify against the type of ``vectors``.
    """
    return [vectors[word] for word in word_list]

def vector_average(vector_list):
    """Return the component-wise arithmetic mean of the given vectors.

    ``vector_list`` is a sequence of equal-length vectors; the result is a
    single vector whose k-th component is the mean of the k-th components
    of all input vectors.
    """
    stacked = np.array(vector_list)
    count = stacked.shape[0]
    # A row of ones times the stacked vectors adds them up component-wise.
    component_sums = np.dot(np.ones(count), stacked)
    return component_sums / count

def context2vectors(documents,target,window_size = 6):
    """Embed every context window of ``target`` as an unweighted mean vector.

    Args:
        documents: iterable of tokenized documents.
        target: word whose context windows are collected.
        window_size: window size passed on to ``generate_windows``.

    Returns:
        A list with one vector per occurrence of ``target``: the plain
        arithmetic mean of the embedding vectors of the context words.
        Documents that do not contain ``target`` are skipped.
    """
    context_vectors = []

    for document in documents:
        if target not in document:
            continue
        for w in generate_windows([document], window_size):
            if w[0] == target:
                # get_vectors() already returns vectors, so they must go to
                # vector_average(); vector_average2() expects words.
                context_vectors.append(vector_average(get_vectors(w[1])))

    return context_vectors


In [53]:
def vector_average2(words):
    """Return the frequency-weighted mean embedding of ``words``.

    Words that are not in the embedding vocabulary are dropped.  Each
    remaining word's vector is scaled by an IDF-style weight

        log((1 + total) / (1 + word_counts[word]))

    where ``total`` is the sum of all counts in the module-level
    ``word_counts``; the scaled vectors are summed and divided by the number
    of remaining words.  ``vectors`` is likewise read from module level.

    Args:
        words: list of word tokens (not vectors).

    Returns:
        The weighted average vector.

    Raises:
        ValueError: if no word of ``words`` is in the vocabulary, in which
            case there is nothing to average.
    """
    # Membership is tested against the vocabulary mapping directly; turning
    # its keys into a list for every word made each test a linear scan.
    known = [w for w in words if w in vectors.vocab]
    if not known:
        raise ValueError("no in-vocabulary words to average")

    total = sum(word_counts.values())
    weighted = [
        vectors[w] * np.log((1 + total) / (1 + word_counts[w]))
        for w in known
    ]
    vector_sum = reduce(np.add, weighted)
    weighted_average = (1.0 / len(known)) * vector_sum

    return weighted_average

def context2vectors2(documents,target,window_size = 6):
    """Embed every context window of ``target`` as a weighted mean vector.

    For each document containing ``target``, the windows produced by
    ``generate_windows`` are scanned; whenever a window's word equals
    ``target``, its context words are passed to ``vector_average2`` and the
    resulting vector is collected.  Returns the list of collected vectors,
    in document order and then window order.
    """
    context_vectors = []

    for document in documents:
        if target not in document:
            continue
        context_vectors.extend(
            vector_average2(w[1])
            for w in generate_windows([document], window_size)
            if w[0] == target
        )

    return context_vectors

In [52]:
toys = ['reduce', 'exhibit', 'uhuhuhuh']
vector_average2(toys)

array([ -2.40521717e+00,  -1.19343948e+00,  -1.42371440e+00,
         8.04274201e-01,  -9.90902126e-01,  -6.63835406e-01,
        -1.86185598e-01,  -8.15678716e-01,  -1.22230339e+00,
         4.86684978e-01,   9.18192387e-01,   6.61811233e-02,
         2.46386677e-01,   1.81828976e-01,   6.29535437e-01,
         1.30542898e+00,  -1.30601048e+00,  -1.55078506e+00,
         2.45777607e+00,  -5.20702481e-01,   8.27073097e-01,
         2.75134623e-01,   2.69613647e+00,   6.78881824e-01,
         7.89619327e-01,   3.08323646e+00,  -9.20088410e-01,
         1.98221743e-01,  -1.96112704e+00,   6.03531599e-01,
        -1.56243110e+00,  -2.66677094e+00,  -1.92627811e+00,
        -1.79501092e+00,   3.01403999e+00,   4.84251618e-01,
        -3.92093658e+00,   3.61459541e+00,  -3.49280381e+00,
         2.02580500e+00,  -1.61408472e+00,   3.01098049e-01,
         1.93309903e-01,  -7.72316515e-01,   6.16106629e-01,
         1.62125945e-01,  -1.94880807e+00,   1.47990203e+00,
         1.53829885e+00,

In [38]:
dictionary = gensim.corpora.dictionary.Dictionary(papers)
text = [dictionary.doc2bow(c) for c in papers]

In [54]:
target = u'extract'
#papers = MyPapers('abstract_scraper/full.json')
context_vectors = context2vectors2(papers, target)

**Clustering with DBSCAN**

In [None]:
def print_cluster_context(cluster_number, documents, target, labels, window_size = 6):
    """Print the context windows of ``target`` assigned to one cluster.

    The context windows of ``target`` are collected in the same order in
    which ``context2vectors`` / ``context2vectors2`` visit them (document
    order, then window order), so ``labels[i]`` is taken to be the cluster
    label of the i-th collected context.

    Args:
        cluster_number: cluster label whose contexts are printed.
        documents: iterable of tokenized documents.
        target: word whose context windows are collected.
        labels: sequence of cluster labels, one per context window.
        window_size: window size passed on to ``generate_windows``.
    """
    contexts = []

    for document in documents:
        if target in document:
            for w in generate_windows([document], window_size):
                if w[0] == target:
                    contexts.append(w[1])

    for i, label in enumerate(labels):
        if label == cluster_number:
            print(contexts[i])

In [None]:
dbscan = DBSCAN(eps = .085, metric = 'cosine', algorithm = 'brute', min_samples = 5)
dbscan.fit(context_vectors)

In [None]:
# Cluster label assigned to each context vector by the fitted DBSCAN model.
labels = dbscan.labels_
# NOTE(review): this counts distinct labels, so the -1 label (DBSCAN's noise
# marker) is counted as a cluster whenever it is present; the commented-out
# term would exclude it - confirm which count is intended.
n_clusters = len(set(labels)) # - (1 if -1 in labels else 0)
print(n_clusters)

In [74]:
dbscan.labels_

array([-1, -1, -1, -1,  0, -1, -1, -1,  0,  0, -1, -1, -1, -1, -1, -1, -1,
       -1,  0,  0,  3,  0,  0,  0, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,
        0, -1,  1, -1, -1,  1, -1, -1,  0, -1, -1, -1, -1, -1, -1, -1,  0,
        0,  0,  0, -1, -1, -1,  0, -1, -1, -1,  1,  1,  1, -1, -1,  1, -1,
        0,  0, -1,  2, -1, -1,  2, -1,  0,  0,  0, -1, -1,  0,  0, -1,  0,
        0,  0,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  0, -1,
       -1,  0,  0, -1, -1, -1, -1,  0,  0,  0,  0,  0, -1, -1, -1, -1,  2,
       -1,  1,  1,  1,  1,  0,  0,  3, -1,  0, -1, -1,  0,  0,  0, -1,  0,
        0,  0,  0, -1,  0,  0,  2, -1, -1, -1,  0,  0,  0,  0, -1,  0,  0,
        0,  0,  0, -1,  0,  0,  0,  0,  0,  3,  0, -1, -1, -1, -1, -1, -1,
        0,  0,  0,  0, -1,  2, -1,  1,  0,  0, -1,  0,  0,  0, -1, -1, -1,
        1,  1, -1,  1,  0, -1,  0, -1, -1, -1, -1,  0,  0, -1, -1,  1,  1,
        1,  1,  1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1

In [18]:
#papers = MyPapers('abstract_scraper/full.json')

print_cluster_context(0,papers, target, labels)

NameError: name 'print_cluster_context' is not defined

In [19]:
# Agglomerative clustering

ag = AgglomerativeClustering(n_clusters = 4, affinity = 'cosine', linkage = 'complete')
ag.fit(context_vectors)

AgglomerativeClustering(affinity='cosine', compute_full_tree='auto',
            connectivity=None, linkage='complete',
            memory=Memory(cachedir=None), n_clusters=4,
            pooling_func=<function mean at 0x7fa1180f8488>)

In [20]:
labels = ag.labels_
n_clusters = len(set(labels)) # - (1 if -1 in labels else 0)
print(n_clusters)

4


**Dimension Reduction**

Since the vectors have 100+ dimensions, DBSCAN is ineffective. Other clustering algorithms succeed at separating the contexts into a predetermined number of clusters, which correspond to different definitions. However, if a word has more or fewer actual definitions than that number, this is counterproductive.

In order for DBSCAN to be more effective, we will apply Singular Value Decomposition to reduce the number of dimensions.

In [145]:
from sklearn.decomposition import TruncatedSVD

In [146]:
svd = TruncatedSVD(n_components=3)
X = svd.fit_transform(context_vectors)

In [147]:
print(X)

[[  2.09653699e+00   9.86560049e-01  -4.98928487e-02]
 [  1.94450880e+00  -6.11871523e-01   3.64613031e-01]
 [  2.03454784e+00  -1.19563971e-01   3.25418990e-01]
 [  2.09710585e+00  -1.30777934e-01  -6.34665916e-02]
 [  2.02457269e+00   3.07932152e-01   7.13693917e-02]
 [  1.91288530e+00   3.58860554e-01   1.25679875e-01]
 [  2.12255555e+00   4.89122976e-01  -5.58709171e-02]
 [  1.87757986e+00   9.92744570e-01   7.74797185e-01]
 [  2.07143207e+00  -4.44112490e-01   6.42279786e-02]
 [  1.86279062e+00  -4.43689195e-01   1.54736463e-01]
 [  1.92619547e+00  -1.95084935e-01   3.89550796e-01]
 [  2.12316574e+00  -4.92781616e-01   3.06446612e-01]
 [  2.25374134e+00  -4.70839000e-01   2.59099326e-01]
 [  1.99044402e+00  -4.46640690e-01   3.91463825e-01]
 [  1.97654672e+00  -2.35431915e-01   1.46650506e-01]
 [  1.69957163e+00  -4.33071575e-01   5.19589092e-01]
 [  2.07675030e+00  -4.04436838e-01  -7.57310571e-02]
 [  1.97515014e+00  -4.23584322e-01   2.21571590e-01]
 [  1.92706746e+00  -5.85987

In [157]:
dbscan = DBSCAN(eps = .2, metric = 'euclidean', algorithm = 'auto')
dbscan.fit(X)

DBSCAN(algorithm='auto', eps=0.2, leaf_size=30, metric='euclidean',
    min_samples=5, n_jobs=1, p=None)

In [158]:
# Cluster label assigned to each reduced context vector by the fitted DBSCAN model.
labels = dbscan.labels_
# NOTE(review): this counts distinct labels, so the -1 label (DBSCAN's noise
# marker) is counted as a cluster whenever it is present; the commented-out
# term would exclude it - confirm which count is intended.
n_clusters = len(set(labels)) # - (1 if -1 in labels else 0)
print(n_clusters)

3


In [159]:
labels

array([ 1,  0,  0,  0, -1, -1,  1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1, -1,  1,  1, -1,  1, -1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  1, -1,  1,  1,  1,  1, -1,  0,  0,  0,  0,  0,  0, -1,
        0,  0,  1,  0,  0,  0,  0,  0,  1, -1,  1,  0,  0,  0, -1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  0,  0,  0, -1,
        0,  0,  0,  0,  0, -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  1,  1,  0,  0,  1,  1,  0,  0,  0,  0,
        0,  0, -1, -1, -1, -1, -1, -1,  0,  1])

In [161]:
#papers = MyPapers('abstract_scraper/full.json')

print_cluster_context(0,papers, target, labels)

['attach', 'onto', 'surface', 'due', 'gravity ', 'electrostatic', 'mechanical', 'effects', ' wind', 'water', 'droplets  ', 'deposition ']
['first', 'report', '[] ', 'increase', 'force', 'explained', 'accumulation', 'borders', 'microscopic', 'air', 'gap ', 'dielectrics']
['speeds ', 'model', 'needs', 'modifications', 'account', 'virtual', 'decrease', 'due', 'relative', 'motions the', 'voltages', 'used']
['broad', 'peak', 'centered', '~3', 'ev ', 'corresponding', 'transfer', 'absorption ', 'could', 'extract', 'band', 'gap']
['[nbo]', 'octahedral', 'structures', 'favor', 'possible', 'delocalization', 'carriers', '[] ', 'secondly ', 'conduction', 'bands', 'consisting']
['potential', 'h+/h', 'promote', 'separation', 'transfer', 'photo-induced', 'carriers', 'result', 'high', 'photocatalytic', 'activities', '[] ']
['470', 'nm', 'linbo ', 'means', 'linbo', 'longer', 'carrier', 'lifetime', 'improved', 'efficiency', 'interfacial', 'charge']
['charge', 'carrier', 'lifetime', 'improved', 'efficien