## Pairwise Classification Aided K-Medoids Clustering

* Original work: Disentangling Story Salads (Wang, Holgate, Durrett and Erk, 2018)

In [3]:
# Copyright 2018 @Jacob Su Wang. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import sys
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

import os
import time
import random
import shutil
import dill
import numpy as np

import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell, DropoutWrapper

from helpers import Indexer, batch, checkpoint_model
from itertools import chain, product
from collections import defaultdict

from kmedoids import kMedoids
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import accuracy_score

from pairwise_classifier import PairwiseSentenceClassifier

class MixtureReader:
    """Read pickled document mixtures from disk.

    A mixture is a sequence of sentences (each a sequence of token ids) drawn
    from two documents, together with a 0/1 label per sentence. When a
    non-zero context length is configured, a fixed-length context window over
    the flattened mixture is returned as well.
    """

    def __init__(self, data_dir, data_type, context):
        """Store the reader configuration.

        Args:
            data_dir: directory prefix; it is concatenated directly with the
                filename, so it must end with a path separator.
            data_type: 'nyt' or 'wiki'; selects the on-disk file layout.
            context: int, 0 to disable context or the context length.
        """
        assert data_type in ['nyt', 'wiki']

        self.data_dir = data_dir
        self.data_type = data_type
        self.context = context # int: 0 or context-length.

    def get_mixture(self, filename):
        """Load one mixture file.

        Returns:
            (doc_mix, doc_lbs) when context is disabled, otherwise
            (doc_mix, doc_lbs, ctx) where ctx is a [1, context] ndarray.
        """
        if self.data_type == 'nyt':
            return self.__get_nyt_mixture(filename)
        return self.__get_wiki_mixture(filename) # == wiki

    def _load(self, filename):
        """Unpickle `data_dir + filename`, closing the file handle afterwards."""
        # NOTE(review): dill.load can execute arbitrary code embedded in the
        # file; only point this at trusted data.
        with open(self.data_dir + filename, 'rb') as f:
            return dill.load(f)

    def _make_context(self, doc_mix):
        """Return the first `self.context` tokens of the flattened mixture.

        The result is a [1, context] ndarray; mixtures shorter than the
        context length are right-padded with 0.
        """
        ctx_len = self.context
        flat = list(chain.from_iterable(doc_mix))
        padding = [0] * max(0, ctx_len - len(flat))
        return np.array([flat[:ctx_len] + padding])

    def _with_context(self, doc_mix, doc_lbs):
        """Append the context window to the result when context is enabled."""
        if self.context:
            return doc_mix, doc_lbs, self._make_context(doc_mix)
        return doc_mix, doc_lbs

    def __get_nyt_mixture(self, filename):
        """Load an nyt file, stored as a (da, db, doc_mix) triple.

        A sentence is labelled 0 when it is found in `da` and 1 otherwise.
        """
        da, _, doc_mix = self._load(filename)
        doc_lbs = [0 if sentcode in da else 1 for sentcode in doc_mix]
        return self._with_context(doc_mix, doc_lbs)

    def __get_wiki_mixture(self, filename):
        """Load a wiki file, stored as a (doc_mix, doc_lbs) pair."""
        doc_mix, doc_lbs = self._load(filename)
        return self._with_context(doc_mix, doc_lbs)
        


class PscKMedoids:
    """Two-way k-medoids clustering driven by a pairwise sentence classifier.

    The distance between two sentences is 1 minus the classifier's score for
    the pair; a document mixture is clustered into two groups and scored
    against its gold labels.
    """

    def __init__(self, psc_clf, data_type):
        """Wrap a classifier and build the matching mixture reader.

        Args:
            psc_clf: classifier object exposing `config`, `sess`, `scores`,
                `FILENAMES` and the input placeholders used in `__dist`.
            data_type: 'nyt' or 'wiki', forwarded to MixtureReader.
        """
        self.psc_clf = psc_clf
        config = self.psc_clf.config
        self.mix_reader = MixtureReader(config['data_dir'],
                                        data_type=data_type,
                                        context=config['context_length'] if config['context'] else 0)

    def __to_labels(self, C, doc_len): # C: {cls:[datum_id, ...], ...}
        """Turn a cluster assignment dict into a flat 0/1 label list."""
        lbs = [0]*doc_len
        for idx in C[1]:
            lbs[idx] = 1
        return lbs

    def __flip_clust(self, clust):
        """Swap the two cluster ids (1 -> 0, anything else -> 1)."""
        return np.array([0 if i==1 else 1 for i in clust])

    def __clust_accuracy(self, true, pred):
        """Accuracy under the better of the two possible cluster namings."""
        # Cluster ids are arbitrary, so score both labelings and keep the best.
        return max(accuracy_score(true, pred),
                   accuracy_score(true, self.__flip_clust(pred)))

    def __dist(self, x1, x2):
        """Distance between two sentences: 1 - classifier score for the pair."""
        x1, x1_len = batch([x1])
        x2, x2_len = batch([x2])
        fd = {self.psc_clf.input_x1:x1, self.psc_clf.input_x1_length:x1_len,
              self.psc_clf.input_x2:x2, self.psc_clf.input_x2_length:x2_len,
              self.psc_clf.keep_prob:1.0}
        if self.psc_clf.config['context']:
            # self.ctx is set by evaluate_single for the current document.
            fd[self.psc_clf.input_ctx] = self.ctx
        conf = self.psc_clf.sess.run(self.psc_clf.scores, feed_dict=fd)
        return 1-conf[0]

    def evaluate_single(self, doc_mix, doc_lbs, ctx=False, method='average'):
        """Cluster one mixture into two groups and return its accuracy.

        Args:
            doc_mix: sequence of sentences (token-id sequences).
            doc_lbs: gold 0/1 label per sentence.
            ctx: False, or a [1, CTX_LEN] ndarray used as classifier context.
            method: unused; kept for backward compatibility.
        """
        if ctx is not False: # ctx, if true, is a [1,CTX_LEN] ndarray, true/false check causes ambiguity.
            self.ctx = ctx
        doc_mix_sq, _ = batch(doc_mix)
        # presumably batch() returns a [max_len, n_sentences] array, so the
        # transpose yields one row per sentence for pdist -- TODO confirm.
        doc_mix_sq = doc_mix_sq.T
        _, doc_mix_clust = kMedoids(squareform(pdist(doc_mix_sq,metric=self.__dist)), 2)
        doc_prd = self.__to_labels(doc_mix_clust, len(doc_mix))
        acc = self.__clust_accuracy(doc_lbs, doc_prd)
        return acc

    def _evaluate_files(self, filenames, verbose):
        """Evaluate each file, print and return the mean accuracy."""
        accs = []
        for filename in filenames:
            try:
                acc = self.evaluate_single(*self.mix_reader.get_mixture(filename))
            except Exception:
                # Best effort: a file that fails to load or cluster is scored
                # at chance level. KeyboardInterrupt/SystemExit still propagate.
                acc = 0.5
            accs.append(acc)
            if verbose:
                print('File {}: acc = {}'.format(filename, acc))
        avg_acc = np.mean(accs)
        print('\nAverage accuracy = {}'.format(avg_acc))
        return avg_acc

    def evaluate_rand(self, k=100, verbose=True):
        """Evaluate k files sampled without replacement from psc_clf.FILENAMES."""
        filenames = np.random.choice(self.psc_clf.FILENAMES, size=k, replace=False)
        return self._evaluate_files(filenames, verbose)

    def evaluate_given(self, filenames, verbose=False):
        """Evaluate the given files and return their mean accuracy."""
        return self._evaluate_files(filenames, verbose)
    
    
if __name__ == "__main__":
    # Script entry point: build the pairwise classifier from command-line
    # options, run k-medoids evaluation, and write the average accuracy to disk.

    import argparse

    def _str2bool(value):
        """Parse a command-line boolean.

        argparse's `type=bool` treats every non-empty string (including '0'
        and 'False') as True, so boolean options are parsed explicitly.
        """
        text = str(value).strip().lower()
        if text in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if text in ('0', 'false', 'f', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))

    parser = argparse.ArgumentParser()
    # PSC configs
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--vocab_size', type=int, default=100001)
    parser.add_argument('--emb_size', type=int, default=300)
    parser.add_argument('--n_layer', type=int, default=2)
    parser.add_argument('--hid_size', type=int, default=100)
    parser.add_argument('--keep_prob', type=float, default=0.7)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--n_epoch', type=int, default=3)
    parser.add_argument('--train_size', type=int, default=10)
    parser.add_argument('--verbose', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=5)
    parser.add_argument('--data_dir', type=str, default="/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_code/")
    parser.add_argument('--info_path', type=str, default="/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k.p")
    parser.add_argument('--init_with_glove', type=_str2bool, default=True)
    parser.add_argument('--save_dir', type=str, default="/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/temp/")
    parser.add_argument('--save_name', type=str, default="temp-model")
    parser.add_argument('--restore_dir', type=str, default="/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/temp/")
    parser.add_argument('--restore_name', type=str, default="temp-model.meta")
    parser.add_argument('--load_from_saved', type=_str2bool, default=True)
    parser.add_argument('--track_dir', type=str, default="/work/04233/sw33286/AIDA-TRACKS/sentence-tracks/")
    parser.add_argument('--new_track', type=_str2bool, default=True)
    parser.add_argument('--session_id', type=str, default='0000')
    parser.add_argument('--mutual_attention', type=_str2bool, default=False)
    parser.add_argument('--context', type=_str2bool, default=False)
    parser.add_argument('--context_length', type=int, default=500)
    # KMed configs
    parser.add_argument('--data_type', type=str, default='nyt')
    parser.add_argument('--eval_rand', type=int, default=10) # evaluation size
    parser.add_argument('--eval_desig', type=str, default="/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/top_tsim_filename-tsim_tuples.p")
    parser.add_argument('--eval_verbose', type=int, default=0)
    parser.add_argument('--eval_res_path', type=str, default="/work/04233/sw33286/AIDA-TRACKS/sentence-tracks/eval_res.txt")
    args = parser.parse_args()

    # Classifier options are forwarded under the same names as the CLI flags.
    psc_keys = ('batch_size', 'vocab_size', 'emb_size',
                'n_layer', 'hid_size',
                'keep_prob', 'learning_rate',
                'n_epoch', 'train_size', 'verbose',
                'save_freq',
                'data_dir', 'info_path',
                'init_with_glove',
                'save_dir', 'save_name',
                'restore_dir', 'restore_name',
                'load_from_saved',
                'track_dir', 'new_track', 'session_id',
                'mutual_attention',
                'context', 'context_length')
    config = {key: getattr(args, key) for key in psc_keys}

    psc_clf = PairwiseSentenceClassifier(config)
    print('\n')

    data_type, eval_rand, eval_desig = args.data_type, args.eval_rand, args.eval_desig
    eval_verbose, eval_res_path = args.eval_verbose, args.eval_res_path
    kmed = PscKMedoids(psc_clf, data_type=data_type)
    if eval_rand:
        # Non-zero --eval_rand: evaluate that many randomly sampled files.
        res = kmed.evaluate_rand(k=eval_rand, verbose=eval_verbose)
    else:
        # --eval_rand 0: evaluate the designated (filename, tsim) list instead.
        with open(eval_desig, 'rb') as f:
            filename_tsim_tuples = dill.load(f)
        res = kmed.evaluate_given([fn for fn,tsim in filename_tsim_tuples], verbose=eval_verbose)
    with open(eval_res_path, 'w') as f:
        f.write('Average clustering accuracy = {}'.format(res))
    
#     # testing
#     psc_clf = PairwiseSentenceClassifier(config)
#     kmed = PscKMedoids(psc_clf, data_type='wiki')
#     print('\n')
#     kmed.evaluate_rand(k=10)

### Functionality check

In [11]:
!python3 psc_kmed.py # load temp model (only trained with 10 instances)


I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties: 
name: Tesla K40m
major: 3 minor: 5 memoryClockRate (GHz) 0.745
pciBusID 0000:08:00.0
Total memory: 11.17GiB
Free memory: 11.10GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0:   Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bu

### NYT evaluation check

In [19]:
!python3 psc_kmed.py --restore_dir /work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-with-context-mutual-attention/ --restore_name our-model-with-context-mutual-attention-00.meta --mutual_attention 1 --context 1


I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties: 
name: Tesla K40m
major: 3 minor: 5 memoryClockRate (GHz) 0.745
pciBusID 0000:08:00.0
Total memory: 11.17GiB
Free memory: 11.10GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0:   Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bu

### Wiki evaluation check

In [20]:
!python3 psc_kmed.py --data_dir /work/04233/sw33286/AIDA-DATA/wiki-sample-50k-clean-nytcode/ --restore_dir /work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-with-context-mutual-attention/ --restore_name our-model-with-context-mutual-attention-00.meta --mutual_attention 1 --context 1


I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties: 
name: Tesla K40m
major: 3 minor: 5 memoryClockRate (GHz) 0.745
pciBusID 0000:08:00.0
Total memory: 11.17GiB
Free memory: 11.10GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0:   Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bu