In [41]:
import sys
# Prepend the project directories to the module search path. Because both are
# inserted at position 0, the second one (AIDA-package) ends up being searched
# first, followed by AIDA-SCRIPTS, and both take precedence over installed
# packages with the same module names.
# NOTE(review): presumably this is where `helpers` and `kmedoids` are found --
# confirm. These absolute paths are specific to one machine/user account.
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")
sys.path.insert(0, "/home/04233/sw33286/AIDA-package")

In [42]:
import os
import time
import random
import dill
import numpy as np

from sklearn.cluster import KMeans

from kmedoids import kMedoids
from scipy.spatial.distance import pdist, squareform

from sklearn.metrics import accuracy_score

from helpers import Indexer
from itertools import chain

In [8]:
# Link to NYT data folder

nyt_code_dir = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_event_code/"
# os.listdir returns entries in arbitrary order; sorting keeps the mapping from
# a random index to a file name stable from one run to the next.
FILE_NAMES = sorted(os.listdir(nyt_code_dir))
NUM_FILES = len(FILE_NAMES)

# Link to dictionary information

info_path = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k_event.p"
# NOTE(review): dill.load can execute arbitrary code while unpickling; only
# point this at files from a trusted source.
# The file unpickles to a pair: an indexer (used elsewhere via `.get_object`)
# and a mapping from indexed objects to embedding vectors.
with open(info_path, 'rb') as info_file:  # close the handle deterministically
    indexer100k, word2emb100k = dill.load(info_file)

In [43]:
def sent_to_avg_emb(esent):
    """Return the mean embedding of all codes in an encoded sentence.

    `esent` is treated as an iterable of iterables of codes and is flattened
    before lookup. Each code is turned into an object with
    `indexer100k.get_object` and that object is used as the key into
    `word2emb100k`. The collected vectors are stacked into an array and
    averaged along axis 0.
    """
    vectors = []
    for code in chain.from_iterable(esent):
        token = indexer100k.get_object(code)
        vectors.append(word2emb100k[token])
    stacked = np.array(vectors)
    return stacked.mean(axis=0)

def get_rand_mixture():
    """Load one randomly chosen mixture file and featurize its sentences.

    A file name is drawn uniformly from `FILE_NAMES` using NumPy's global
    random state. The file is expected to unpickle to a 3-tuple
    `(edoc_a, edoc_b, edoc_mix)`; only the first and last are used here.

    Returns:
        A pair `(embs, labels)` of NumPy arrays with one entry per sentence of
        `edoc_mix`: `embs` holds the averaged embedding from
        `sent_to_avg_emb`, and `labels` holds 0 when the sentence is found in
        `edoc_a` and 1 otherwise (so a sentence present in both source
        documents is labelled 0).
    """
    rand_filename = FILE_NAMES[np.random.randint(0, NUM_FILES)]
    # os.path.join works whether or not nyt_code_dir ends with a separator, and
    # the context manager closes the handle (this function is called thousands
    # of times per experiment).
    # NOTE(review): dill.load can execute arbitrary code; trusted files only.
    with open(os.path.join(nyt_code_dir, rand_filename), 'rb') as mixture_file:
        edoc_a, _, edoc_mix = dill.load(mixture_file)
    edoc_avg_embs, edoc_lbs = [], []
    for esent in edoc_mix:
        edoc_lbs.append(0 if esent in edoc_a else 1)
        edoc_avg_embs.append(sent_to_avg_emb(esent))
    return np.array(edoc_avg_embs), np.array(edoc_lbs)

# K-Means

def run_kmeans(doc):
    """Fit a 2-cluster scikit-learn KMeans model on `doc` and return its `labels_`."""
    model = KMeans(n_clusters=2)
    model.fit(doc)
    return model.labels_

# K-Medoids

def to_labels(C, doc_len): # C: {cls:[datum_id, ...], ...}
    """Convert a cluster -> member-indices mapping into a flat 0/1 label list.

    Only the members listed under key 1 are consulted: those positions are
    set to 1 and every other position stays 0.

    Args:
        C: mapping from cluster id to an iterable of datum indices.
        doc_len: number of data points, i.e. the length of the result.

    Returns:
        A list of `doc_len` ints, each 0 or 1.
    """
    labels = [0 for _ in range(doc_len)]
    cluster_one_members = C[1]
    for member in cluster_one_members:
        labels[member] = 1
    return labels

def run_kmedoids(doc):
    """Cluster the rows of `doc` into two groups with k-medoids.

    The condensed pairwise distances from `pdist` (default metric) are expanded
    to a square matrix and passed to `kMedoids` with k=2. The second value it
    returns (the cluster -> member-indices mapping) is converted to a 0/1 label
    list by `to_labels`.
    """
    distance_matrix = squareform(pdist(doc))
    clustering = kMedoids(distance_matrix, 2)
    _, members_by_cluster = clustering
    n_points = len(doc)
    return to_labels(members_by_cluster, n_points)

In [45]:
def flip_clust(clust):
    """Swap the two cluster ids in a binary labelling.

    Every entry equal to 1 becomes 0; every other entry becomes 1.
    Returns the result as a NumPy array.
    """
    flipped = []
    for label in clust:
        if label == 1:
            flipped.append(0)
        else:
            flipped.append(1)
    return np.array(flipped)

def clust_accuracy(true, pred):
    """Score a 2-cluster assignment against true labels, ignoring cluster naming.

    Cluster ids are arbitrary, so the prediction is scored both as given and
    with its two ids swapped (via `flip_clust`); the higher of the two
    `accuracy_score` values is returned.
    """
    score_as_given = accuracy_score(true, pred)
    score_swapped = accuracy_score(true, flip_clust(pred))
    return max(score_as_given, score_swapped)

def rand_evaluation(eval_fn, k=1000):
    """Average the clustering accuracy of `eval_fn` over `k` random mixtures.

    Args:
        eval_fn: callable taking the feature array from `get_rand_mixture`
            and returning predicted cluster labels.
        k: number of random mixtures to draw and score.

    Returns:
        The mean (via `np.mean`) of the `k` `clust_accuracy` values.
    """
    def _score_one_mixture():
        # Draw a mixture, cluster it, then score against its true labels.
        doc_mix, doc_labels = get_rand_mixture()
        predicted = eval_fn(doc_mix)
        return clust_accuracy(doc_labels, predicted)

    return np.mean([_score_one_mixture() for _ in range(k)])

### K-Means results

In [47]:
%%time

# K-Means experiment: n_round rounds, each averaging the clustering accuracy
# over n_sample randomly drawn mixtures.
n_round = 10
n_sample = 1000
# One mean accuracy per round.
avg_accs = []
start = time.time()
for i in range(n_round):
    print('... Round {}'.format(i+1), end=' ')
    avg_accs.append(rand_evaluation(run_kmeans, k=n_sample))
    print('(time elapsed = {})'.format(time.time()-start))
    # Restart the clock so the next printed figure covers a single round.
    start = time.time()
print('\n')
# Mean of the per-round means; every round uses the same n_sample, so this is
# also the mean over all n_round*n_sample samples.
print('Average accuracy over {} samples = {}'.format(n_round*n_sample, np.mean(avg_accs)))

... Round 1 (time elapsed = 17.779610872268677)
... Round 2 (time elapsed = 18.366278886795044)
... Round 3 (time elapsed = 17.64204502105713)
... Round 4 (time elapsed = 17.59988760948181)
... Round 5 (time elapsed = 17.79797601699829)
... Round 6 (time elapsed = 17.701401233673096)
... Round 7 (time elapsed = 17.260531663894653)
... Round 8 (time elapsed = 17.30229115486145)
... Round 9 (time elapsed = 17.535613298416138)
... Round 10 (time elapsed = 17.329511642456055)


Average accuracy over 10000 samples = 0.6475835282767155
CPU times: user 1min 39s, sys: 2.21 s, total: 1min 42s
Wall time: 2min 56s


### K-Medoids results

In [48]:
%%time

# K-Medoids experiment: n_round rounds, each averaging the clustering accuracy
# over n_sample randomly drawn mixtures.
n_round = 10
n_sample = 1000
# One mean accuracy per round.
avg_accs = []
start = time.time()
for i in range(n_round):
    print('... Round {}'.format(i+1), end=' ')
    avg_accs.append(rand_evaluation(run_kmedoids, k=n_sample))
    print('(time elapsed = {})'.format(time.time()-start))
    # Restart the clock so the next printed figure covers a single round.
    start = time.time()
print('\n')
# Mean of the per-round means; every round uses the same n_sample, so this is
# also the mean over all n_round*n_sample samples.
print('Average accuracy over {} samples = {}'.format(n_round*n_sample, np.mean(avg_accs)))

... Round 1 (time elapsed = 9.545044183731079)
... Round 2 (time elapsed = 9.90142822265625)
... Round 3 (time elapsed = 9.728882551193237)
... Round 4 (time elapsed = 9.336431503295898)
... Round 5 (time elapsed = 9.819844722747803)
... Round 6 (time elapsed = 9.640764236450195)
... Round 7 (time elapsed = 9.59523868560791)
... Round 8 (time elapsed = 9.054999113082886)
... Round 9 (time elapsed = 8.870564699172974)
... Round 10 (time elapsed = 9.697452068328857)


Average accuracy over 10000 samples = 0.6469381204408126
CPU times: user 20 s, sys: 2.45 s, total: 22.5 s
Wall time: 1min 35s
