In [1]:
import time
import re
import numpy as np
import pyemd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import euclidean_distances

In [2]:
# Combined English + German stop words, handed to the CountVectorizer below.
stopwords_set = set(stopwords.words('english')) | set(stopwords.words('german'))

In [3]:
class Document:
    '''A corpus entry: an identifier, its text, and slots for derived features.'''

    def __init__(self, text):
        '''
        Build a document from an indexable record.

        text -- record whose element 0 is the document id and whose element 1
                is the document body (despite the name, not just the body)
        '''
        body = text[1]
        identifier = text[0]
        self.text, self.id = body, identifier
        # `avg` starts as an empty list and `hash` as 0; both are placeholders.
        self.avg, self.hash = [], 0

In [4]:
# Read the tab-separated token file: column 0 is the posting id, column 1 the
# token string. Each usable row becomes a Document.
start = time.time()
documents = []
skipped = 0
with open('/data/Evaluations/training_set/tokens100.csv') as documentTokens:
    for tokens in documentTokens:
        line = tokens.strip().split('\t')
        # A blank line, or a row whose text column is empty (strip() removes
        # the trailing tab), yields a single field and would raise IndexError
        # inside Document.__init__ — skip it instead of aborting the load.
        if len(line) < 2:
            skipped += 1
            continue
        documents.append(Document(line))

if skipped:
    print('skipped %d malformed lines' % skipped)
print(time.time() - start)

33.042436838150024


In [5]:
# Learn the bag-of-words vocabulary over every loaded document, ignoring the
# combined English/German stop words. fit() returns the vectorizer itself.
start = time.time()
nbow = CountVectorizer(stop_words=stopwords_set).fit(
    [doc.text for doc in documents]
)
print(time.time() - start)

592.2192177772522


In [6]:
# Show the fitted vectorizer's configuration.
nbow

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words={'was', 'einiges', 'how', 'for', 'mich', 'demselben', 'meines', 'then', 'so', 'dieser', 'as', 'any', 'meinem', 'you', 'muss', 'andere', 'why', 'been', 'zur', 'daß', 'where', 'by', 'allen', 'yourself', 'wasn', 'da', 'keine', 'vor', 'alle', 'er', 'nun', 'auch', 'did', 'deines', 'wie', 'woll...', 'noch', 'didn', 'selbst', 'against', 'aller', 'is', 'würden', 'auf', 'derer', 'einmal', 'musste'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
# Load the pre-trained word2vec model (vector size 200) and cache two lookups
# used by the flow-graph helpers: the vectorizer's feature names (index -> word)
# and the model vocabulary as a set for fast membership tests.
start = time.time()
model200 = Word2Vec.load('/data/word2vec_new/word2vec_models/w2vmodel_200')
names = nbow.get_feature_names()
vocabulary = set(model200.index2word)
print(time.time() - start)

26.74746608734131


In [8]:
# Documents must have more than this many whitespace-separated tokens to be kept.
MIN_TOKENS = 30

start = time.time()
filtered = [
    document for document in documents
    if len(document.text.split()) > MIN_TOKENS
]
print(time.time() - start)

# id -> text lookup for the retained documents; if an id occurs more than once
# the last occurrence wins.
filtered_dict = {document.id: document.text for document in filtered}

45.782058000564575


In [9]:
def avg(doc, dim=200):
    '''
    A document is represented by its average word vector.

    Tokens are obtained by whitespace-splitting ``doc.text``; tokens that are
    not in ``vocabulary`` are ignored. Note that this tokenisation differs from
    the CountVectorizer used by the flow-graph helpers (which lowercases and
    drops stop words).

    doc -- object exposing a ``text`` string
    dim -- length of the zero vector returned when no token has an embedding
           (defaults to 200, the vector size used by the flow-graph helpers)

    Returns the element-wise mean of the token vectors, or a zero vector of
    length ``dim`` when the document has no in-vocabulary token.
    '''
    vectors = [model200[token] for token in doc.text.split() if token in vocabulary]
    if not vectors:
        # np.mean over an empty list yields a scalar nan (plus a RuntimeWarning);
        # nan distances cannot be ordered reliably when candidates are sorted.
        return np.zeros(dim)
    return np.mean(vectors, 0)

In [10]:
# Attach the averaged word vector to every retained document. This mutates the
# Document objects in `filtered` in place.
start = time.time()
for document in filtered:
    document.avg = avg(document)
    
print(time.time() - start)

1219.6014111042023


In [11]:
def _normalized(mass):
    '''Scale a count vector so it sums to 1; an all-zero vector is returned as is.'''
    total = sum(mass)
    if total == 0:
        # Avoid dividing by zero when no word of the document carried any mass.
        total = 1
    return mass / total


def _flow_graph(doc1, doc2, keep_unknown, dim=200):
    '''
    Shared implementation of flow_graph_wmd / flow_graph_rwmd.

    doc1, doc2   -- raw document strings, vectorised with ``nbow``
    keep_unknown -- True: one row per word occurring in either document, with
                    words missing from ``vocabulary`` left as zero mass and a
                    zero vector. False: rows only for words in ``vocabulary``.
    dim          -- embedding size of ``model200``

    Returns (source, sink, vecs): the normalised word counts of doc1 and doc2
    and the matching word vectors, all aligned row by row.
    '''
    row1, row2 = nbow.transform([doc1, doc2])
    # Sorted feature indices that are non-zero in at least one document.
    index = np.union1d(row1.indices, row2.indices)

    counts1 = row1.toarray().ravel()
    counts2 = row2.toarray().ravel()

    # (output row, feature index) for every word that has an embedding.
    known = [(slot, feature) for slot, feature in enumerate(index)
             if names[feature] in vocabulary]
    if keep_unknown:
        size = len(index)
    else:
        size = len(known)
        # Re-number the rows so the known words are packed contiguously.
        known = [(slot, feature) for slot, (_, feature) in enumerate(known)]

    source = np.zeros(size)
    sink = np.zeros(size)
    vecs = np.zeros(shape=(size, dim))
    for slot, feature in known:
        source[slot] = counts1[feature]
        sink[slot] = counts2[feature]
        vecs[slot] = model200[names[feature]]

    return (_normalized(source), _normalized(sink), vecs)


def flow_graph_wmd(doc1, doc2):
    '''
    Build the transport problem used by emd().

    Returns (source, sink, vecs) with one row per word occurring in either
    document; words without an embedding keep zero mass and a zero vector.
    '''
    return _flow_graph(doc1, doc2, keep_unknown=True)


def flow_graph_rwmd(doc1, doc2):
    '''
    Build the transport problem used by rwmd().

    Returns (source, sink, vecs) with rows only for words that have an
    embedding in ``vocabulary``.
    '''
    return _flow_graph(doc1, doc2, keep_unknown=False)

In [12]:
doc1 = documents[1].text
doc2 = documents[2].text
%time source, sink, vecs = flow_graph_rwmd(doc1, doc2)

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 5.86 ms


In [13]:
def emd(doc1, doc2):
    '''
    Earth mover's distance between two documents.

    The word masses and vectors come from flow_graph_wmd(); ground distances
    are the pairwise Euclidean distances between the word vectors. When the
    flow graph has at most two rows the result is ``float('inf')``.
    '''
    source, sink, vecs = flow_graph_wmd(doc1, doc2)
    if len(vecs) <= 2:
        return float('inf')
    ground_distances = euclidean_distances(vecs)
    return pyemd.emd(source, sink, ground_distances)

In [14]:
# Time one exact earth mover's distance computation on the sample pair.
%time emd(doc1, doc2)

CPU times: user 1.89 s, sys: 2.74 s, total: 4.63 s
Wall time: 488 ms


30.075525931577488

In [15]:
def rwmd(doc1, doc2):
    '''
    Relaxed distance between two documents.

    Each word of one document sends all of its mass to the nearest word (by
    Euclidean distance of the word vectors) that carries mass in the other
    document; the larger of the two one-sided costs is returned.

    Returns ``float('inf')`` when either document has no in-vocabulary mass,
    matching the value emd() uses for pairs it cannot compare.
    '''
    source, sink, vecs = flow_graph_rwmd(doc1, doc2)
    has_source = source > 0
    has_sink = sink > 0
    if not has_source.any() or not has_sink.any():
        # Nothing to transport: without this guard the minima below would be
        # taken over empty selections (or euclidean_distances would be handed
        # an empty array) and raise.
        return float('inf')

    weights = euclidean_distances(vecs)
    # Cheapest destination for every row, restricted to words with mass on the
    # opposite side.
    nearest_sink = weights[:, has_sink].min(axis=1)
    nearest_source = weights[:, has_source].min(axis=1)
    return max(np.dot(nearest_sink, source), np.dot(nearest_source, sink))

In [16]:
# Time one relaxed-distance computation on the same sample pair.
%time rwmd(doc1, doc2)

CPU times: user 20 ms, sys: 96 ms, total: 116 ms
Wall time: 19 ms


24.60578776575931

In [17]:
# Load the ground-truth file as an RDD of raw text lines.
# NOTE(review): `sc` is not defined in the visible cells — presumably the
# SparkContext injected by a PySpark kernel; confirm.

groundTruthRDDraw = sc.textFile('/data/groundTruth/groundTruth.csv')
def parseGroundTruth(x):
    '''
    Parse one ground-truth line of the form ``<postingId>\\t<id> <id> ...``.

    x -- a single text line (without the trailing newline)

    Returns ``(postingId, similarIds)`` where ``postingId`` is an int and
    ``similarIds`` is a list of ints. A line with no second column yields an
    empty list instead of raising IndexError. Columns after the second are
    ignored.
    '''
    fields = x.split('\t')
    postingId = int(fields[0])
    similarIds = [int(s) for s in fields[1].split()] if len(fields) > 1 else []
    return postingId, similarIds

# Parse every line into (postingId, similarIds) pairs, then collect both the
# full mapping and the list of posting ids on the driver.
groundTruthRDD = groundTruthRDDraw.map(parseGroundTruth)
groundTruth = groundTruthRDD.collectAsMap()
sample_idsRDD = groundTruthRDD.keys()
sample_ids = sample_idsRDD.collect()

In [18]:
import random
from scipy.spatial.distance import euclidean

In [19]:
def getPredictions(k=20, num_candidates=50000):
    '''
    Pick a random query document that has ground truth and return its nearest
    neighbours under emd(), using a prefetch-and-prune search.

    k              -- number of neighbours to return
    num_candidates -- how many of the closest documents (by centroid distance)
                      are examined

    Steps:
      1. rank all other documents by the Euclidean distance between averaged
         word vectors and keep the ``num_candidates`` closest;
      2. compute emd() for the first ``k`` candidates;
      3. for each remaining candidate, compute emd() only if rwmd() is below
         the current k-th best distance; otherwise count it as pruned.

    Prints the fraction of candidates that were pruned and returns
    ``(query_id, prediction)`` where ``query_id`` is an int and ``prediction``
    is the list of neighbour ids ordered by increasing distance.

    Raises ValueError when no document in ``filtered`` has a ground-truth entry.
    '''
    # Choose the query uniformly among documents with ground truth. Building
    # the eligible list up front avoids an endless retry loop when there is no
    # such document, and the set makes each membership test constant-time.
    known_ids = set(sample_ids)
    eligible = [doc for doc in filtered if int(doc.id) in known_ids]
    if not eligible:
        raise ValueError('no filtered document has a ground-truth entry')
    query = random.choice(eligible)

    # Centroid distance to every other document. The query is excluded by id
    # rather than by assuming it sorts first: another document at distance 0
    # (e.g. a duplicate text) could otherwise take its place.
    # NOTE(review): avg() is an unweighted mean over whitespace tokens, while
    # emd() uses the vectorizer's stop-word-filtered counts, so this distance
    # may not be a strict lower bound of emd() — confirm.
    wcd = {doc.id: euclidean(query.avg, doc.avg)
           for doc in filtered if doc.id != query.id}
    candidates = sorted(wcd.items(), key=lambda item: item[1])[:num_candidates]

    top_k = [(doc_id, emd(query.text, filtered_dict[doc_id]))
             for doc_id, _ in candidates[:k]]
    if not top_k:
        # No candidate to compare against (or k == 0).
        print('pruned %f' % 0.0)
        return (int(query.id), [])

    kth = max(top_k, key=lambda item: item[1])
    num_prune = 0
    for doc_id, _ in candidates[k:]:
        text = filtered_dict[doc_id]
        if rwmd(query.text, text) < kth[1]:
            new_wmd = emd(query.text, text)
            if new_wmd < kth[1]:
                # Replace the current worst neighbour and find the new worst.
                top_k.remove(kth)
                top_k.append((doc_id, new_wmd))
                kth = max(top_k, key=lambda item: item[1])
        else:
            num_prune += 1

    print('pruned %f' % (num_prune / len(candidates)))
    top_k.sort(key=lambda item: item[1])
    prediction = [doc_id for doc_id, _ in top_k]
    return (int(query.id), prediction)

In [20]:
sample_predictions = []