In [1]:
import sys

# Directory placed at the front of the module search path so that modules
# stored there take precedence when imported later in the notebook.
SCRIPTS_DIR = "/work/04233/sw33286/AIDA-SCRIPTS"
sys.path.insert(0, SCRIPTS_DIR)

In [2]:
import random
import dill
import numpy as np

from kmedoids import kMedoids
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import accuracy_score

In [3]:
from nyt_reader import NytReader

# Keyword arguments handed to NytReader, gathered in one place so the
# three filesystem locations are easy to spot and change.
nyt_reader_kwargs = dict(
    nyt_dir="/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_code/",
    utils_dir="/work/04233/sw33286/AIDA-SCRIPTS",
    dic_path="/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k.p",
)
nyt_reader = NytReader(**nyt_reader_kwargs)

In [4]:
save_path = "/work/04233/sw33286/AIDA-MODEL-SAVE/SCDV/nyt-v100k-gmm100comp-gmm50iter-p004.p"
# NOTE(review): dill.load can execute arbitrary code embedded in the file;
# only load checkpoints that come from a trusted source.
# The context manager guarantees the file handle is closed once the object
# has been deserialized (the previous bare open() call was never closed).
with open(save_path, 'rb') as model_file:
    scdv = dill.load(model_file)

In [5]:
# Preprocessing facilities

def sent_to_scdv_emb(reader, encoder, sent, max_word_code=100001):
    """Sum the per-word vectors of one sentence into a single vector.

    Args:
        reader: Not used by this function; kept so the signature lines up
            with the other helpers in this notebook.
        encoder: Object exposing `n_gmm_comps`, `wordemb_dim` and
            `wordcode2unsp_emb`, the latter indexable by word code.
        sent: Iterable of integer word codes.
        max_word_code: Largest word code (inclusive) that is looked up.
            Codes outside `[0, max_word_code]` are skipped. The default
            keeps the bound that used to be hard-coded here.

    Returns:
        1-D float array of length `n_gmm_comps * wordemb_dim`; all zeros
        when no code of `sent` falls inside the accepted range.
    """
    dv = np.zeros(encoder.n_gmm_comps * encoder.wordemb_dim)
    for w_idx in sent:
        # Skip codes outside the accepted range instead of failing on them.
        if 0 <= w_idx <= max_word_code:
            dv += encoder.wordcode2unsp_emb[w_idx]
    return dv

def doc_to_scdv_emb(reader, encoder, doc):
    """Encode each sentence of `doc` and collect the vectors in one array.

    Every sentence is passed through `sent_to_scdv_emb`; the results are
    wrapped in a NumPy array in the same order as the sentences of `doc`.
    """
    sentence_vectors = []
    for sentence in doc:
        sentence_vectors.append(sent_to_scdv_emb(reader, encoder, sentence))
    return np.array(sentence_vectors)

def to_labels(C, doc_len):
    """Turn a cluster -> members mapping into a flat 0/1 label list.

    Args:
        C: Mapping of the form {cls: [datum_id, ...], ...}; only the
            members listed under key 1 are consulted.
        doc_len: Number of labels to produce.

    Returns:
        List of length `doc_len` holding 1 at every position listed in
        `C[1]` and 0 everywhere else.
    """
    labels = [0 for _ in range(doc_len)]
    for member_idx in C[1]:
        labels[member_idx] = 1
    return labels

def run_kmedoids(reader, encoder, doc):
    """Split the sentences of `doc` into two clusters and return 0/1 labels.

    The sentences are embedded, their pairwise distances are arranged in a
    square matrix, and that matrix is given to `kMedoids` with k=2.
    """
    sentence_embs = doc_to_scdv_emb(reader, encoder, doc)
    dist_matrix = squareform(pdist(sentence_embs))
    # Only the second return value (cluster -> members) is needed here.
    _unused, clusters = kMedoids(dist_matrix, 2)
    return to_labels(clusters, len(doc))

In [6]:
# Evaluation

def flip_clust(clust):
    """Swap the two cluster labels: every 1 becomes 0, anything else becomes 1.

    Returns the flipped labels as a NumPy array.
    """
    flipped = []
    for label in clust:
        if label == 1:
            flipped.append(0)
        else:
            flipped.append(1)
    return np.array(flipped)

def clust_accuracy(true, pred):
    """Score a two-way clustering regardless of how the clusters are numbered.

    The prediction is scored both as given and with its labels swapped;
    the better of the two accuracies is returned.
    """
    acc_as_given = accuracy_score(true, pred)
    acc_flipped = accuracy_score(true, flip_clust(pred))
    return max(acc_as_given, acc_flipped)

def rand_evaluation(reader, encoder, k=1000):
    """Average clustering accuracy over `k` randomly drawn mixtures.

    Args:
        reader: Object whose `get_rand_mixture()` returns a
            `(doc_mix, doc_labels)` pair.
        encoder: Encoder forwarded to `run_kmedoids`.
        k: Number of mixtures to draw and score.

    Returns:
        Mean of the `k` per-mixture accuracies.
    """
    accuracies = []
    for _ in range(k):
        # Draw from the reader that was passed in rather than from the
        # notebook-level `nyt_reader` global, so the argument is honoured.
        doc_mix, doc_labels = reader.get_rand_mixture()
        try:
            acc = clust_accuracy(doc_labels, run_kmedoids(reader, encoder, doc_mix))
        except Exception:
            # Best-effort: a mixture that cannot be clustered or scored is
            # counted as 0.5. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop a long run.
            acc = 0.5
        accuracies.append(acc)
    return np.mean(accuracies)

In [7]:
%%time

n_round = 10
n_sample = 1000
avg_accs = []
# Run n_round independent evaluations of n_sample random mixtures each and
# report the mean of the per-round averages.
for round_idx in range(n_round):
    print('... Round {}'.format(round_idx + 1))
    round_acc = rand_evaluation(nyt_reader, scdv, k=n_sample)
    avg_accs.append(round_acc)
print('\n')
overall_acc = np.mean(avg_accs)
print('Average accuracy over {} samples = {}'.format(n_round * n_sample, overall_acc))

... Round 1
... Round 2
... Round 3
... Round 4
... Round 5
... Round 6
... Round 7
... Round 8
... Round 9
... Round 10


Average accuracy over 10000 samples = 0.6146036991396805
CPU times: user 2min 55s, sys: 4.68 s, total: 3min
Wall time: 5min 39s
