initial commit
Yu Zhang committed Aug 23, 2019
1 parent ed4b08b commit 1b4a58c
Showing 11 changed files with 1,636 additions and 0 deletions.
Empty file modified README.md (mode 100644 → 100755)
47 changes: 47 additions & 0 deletions eval.py
@@ -0,0 +1,47 @@
import argparse
from sklearn.metrics import f1_score, confusion_matrix

parser = argparse.ArgumentParser(description='main', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset', default='ai', choices=['ai', 'bio'])
args = parser.parse_args()

dataset = args.dataset

print('\n### Testing ###')

# Map each lower-level (child) label to its upper-level (parent) label.
# Each line of label_hier.txt lists a parent label followed by its children.
p = dict()
with open(dataset+'/label_hier.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        for label in tmp[1:]:
            p[label.lower()] = tmp[0].lower()

# Gold labels: labels.txt holds one lower-level label per line; the
# upper-level gold label is that label's parent in the hierarchy.
y_u = []
y_d = []
with open(dataset+'/labels.txt') as fin:
    for line in fin:
        dl = line.strip().lower()
        y_u.append(p[dl])
        y_d.append(dl)

# Predictions: out.txt holds, per line, the predicted upper-level and
# lower-level labels separated by whitespace.
y_u_pred = []
y_d_pred = []
with open(dataset+'/out.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        y_u_pred.append(tmp[0].lower())
        y_d_pred.append(tmp[1].lower())

print('Upper Micro F1 score:', f1_score(y_u, y_u_pred, average='micro'))
print('Upper Macro F1 score:', f1_score(y_u, y_u_pred, average='macro'))

print('Lower Micro F1 score:', f1_score(y_d, y_d_pred, average='micro'))
print('Lower Macro F1 score:', f1_score(y_d, y_d_pred, average='macro'))
print('Lower-level Confusion Matrix:')
print(confusion_matrix(y_d, y_d_pred))

print('Overall Micro F1 score:', f1_score(y_u+y_d, y_u_pred+y_d_pred, average='micro'))
print('Overall Macro F1 score:', f1_score(y_u+y_d, y_u_pred+y_d_pred, average='macro'))
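For reference, a minimal sketch of the input files eval.py expects, inferred from the parsing code above (the label names are illustrative, not taken from the actual datasets):

    # <dataset>/label_hier.txt: each line is a parent label followed by its child labels
    machine_learning  supervised_learning  unsupervised_learning
    # <dataset>/labels.txt: one gold lower-level label per line
    supervised_learning
    # <dataset>/out.txt: per line, the predicted upper-level label, then the predicted lower-level label
    machine_learning supervised_learning

Running python eval.py --dataset ai then reports micro/macro F1 at the upper level, at the lower level, and overall (both levels pooled), plus a lower-level confusion matrix.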
41 changes: 41 additions & 0 deletions flat.py
@@ -0,0 +1,41 @@
from sklearn.metrics import f1_score, confusion_matrix

dataset = 'bio'  # hardcoded; switch to 'ai' for the other dataset

# Map each lower-level (child) label to its upper-level (parent) label.
p = dict()
with open(dataset+'/label_hier.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        for label in tmp[1:]:
            p[label.lower()] = tmp[0].lower()

# Gold labels, read exactly as in eval.py.
y_u = []
y_d = []
with open(dataset+'/labels.txt') as fin:
    for line in fin:
        dl = line.strip().lower()
        y_u.append(p[dl])
        y_d.append(dl)

# Predictions of a flat classifier: out.txt holds one predicted lower-level
# label per line; the upper-level prediction is that label's parent.
y_u_pred = []
y_d_pred = []
with open(dataset+'/out.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        y_u_pred.append(p[tmp[0].lower()])
        y_d_pred.append(tmp[0].lower())

print('Upper Micro/Macro:')
print(f1_score(y_u, y_u_pred, average='micro'))
print(f1_score(y_u, y_u_pred, average='macro'))

print('Lower Micro/Macro:')
print(f1_score(y_d, y_d_pred, average='micro'))
print(f1_score(y_d, y_d_pred, average='macro'))
print(confusion_matrix(y_d, y_d_pred))

print('Overall Micro/Macro:')
print(f1_score(y_u+y_d, y_u_pred+y_d_pred, average='micro'))
print(f1_score(y_u+y_d, y_u_pred+y_d_pred, average='macro'))
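Unlike eval.py, flat.py scores a flat classifier: here out.txt carries a single predicted lower-level label per line, and the upper-level prediction is derived by mapping that label to its parent through the hierarchy file. A hypothetical line, matching the format sketched above:

    # <dataset>/out.txt for flat.py: one leaf prediction per line
    supervised_learning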
307 changes: 307 additions & 0 deletions gen.py
@@ -0,0 +1,307 @@
import numpy as np
import os
np.random.seed(1234)
from spherecluster import SphericalKMeans, VonMisesFisherMixture, sample_vMF
from collections import defaultdict
from keras.preprocessing.sequence import pad_sequences
import pickle
from time import time
from multiprocessing import Pool


def sample_mix_vMF(center, kappa, weight, num_doc):
    # Draw num_doc samples from a mixture of vMF distributions: first pick a
    # component by its mixture weight, then sample from that component.
    distrib_idx = np.random.choice(range(len(center)), num_doc, p=weight)
    samples = []
    for idx in distrib_idx:
        samples.append(sample_vMF(center[idx], kappa[idx], 1))
    samples = np.array(samples)
    samples = np.reshape(samples, (num_doc, -1))
    return samples
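# Hypothetical usage sketch (shapes are illustrative): draw 5 unit vectors
# from a two-component vMF mixture with unit-norm means mu1, mu2:
#   samples = sample_mix_vMF([mu1, mu2], [50.0, 80.0], [0.6, 0.4], num_doc=5)
#   samples.shape  ->  (5, embedding_dim)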


def seed_expansion(relevant_nodes, prob_sup_array, sz, vocab_dict, embedding_mat):
    # Expand each node's seed keywords to the sz vocabulary words closest
    # (by embedding dot product) to the weighted centroid of its seeds.
    vocab_sz = len(vocab_dict)
    for j, relevant_node in enumerate(relevant_nodes):
        word_class = relevant_node.keywords
        prob_sup_class = prob_sup_array[j]
        expanded_class = []
        seed_vec = np.zeros(vocab_sz)
        if len(word_class) < sz:
            for i, word in enumerate(word_class):
                seed_vec[vocab_dict[word]] = prob_sup_class[i]
            expanded = np.dot(embedding_mat.transpose(), seed_vec)
            expanded = np.dot(embedding_mat, expanded)
            word_expanded = sorted(range(len(expanded)), key=lambda k: expanded[k], reverse=True)
            for i in range(sz):
                expanded_class.append(word_expanded[i])
            relevant_node.expanded = np.array(expanded_class)
        else:
            # Already at least sz seeds; keep them as-is.
            relevant_node.expanded = np.array([vocab_dict[w] for w in word_class])


def label_expansion(relevant_nodes, write_path, vocabulary_inv, embedding_mat, manual_num=None, fitting='mix'):
    # Expand each class's seed keywords, then fit vMF distributions to the
    # expanded keyword embeddings.
    print("Retrieving top-t nearest words...")
    vocab_dict = {v: k for k, v in vocabulary_inv.items()}
    prob_sup_array = []
    current_szes = []
    all_class_keywords = []
    children_nodes = []
    for relevant_node in relevant_nodes:
        if relevant_node.children:
            children_nodes += relevant_node.children
        else:
            children_nodes += [relevant_node]
    for children_node in children_nodes:
        current_sz = len(children_node.keywords)
        current_szes.append(current_sz)
        prob_sup_array.append([1/current_sz] * current_sz)
        all_class_keywords += children_node.keywords
    current_sz = np.min(current_szes)
    if manual_num is None:
        # Grow the expansion size t until the classes' expanded keyword sets
        # start to overlap, then back off by one.
        while len(all_class_keywords) == len(set(all_class_keywords)):
            print(f'current_sz: {current_sz}')
            current_sz += 1
            seed_expansion(children_nodes, prob_sup_array, current_sz, vocab_dict, embedding_mat)
            all_class_keywords = [w for relevant_node in children_nodes for w in relevant_node.expanded]
        seed_expansion(children_nodes, prob_sup_array, current_sz-1, vocab_dict, embedding_mat)
    else:
        seed_expansion(children_nodes, prob_sup_array, manual_num, vocab_dict, embedding_mat)
    if manual_num is None:
        print(f"Final expansion size t = {len(children_nodes[0].expanded)}")
    else:
        print(f"Manual expansion size t = {manual_num}")

    centers = []
    kappas = []
    weights = []
    if write_path is not None:
        if not os.path.exists(write_path):
            os.makedirs(write_path)
        else:
            # Truncate any expanded.txt left over from a previous run.
            open(os.path.join(write_path, 'expanded.txt'), 'w').close()
    for relevant_node in relevant_nodes:
        children_nodes = relevant_node.children if relevant_node.children else [relevant_node]
        num_children = len(children_nodes)
        expanded_class = []
        if fitting == 'mix':
            # Fit one vMF mixture (one component per child) over the union of
            # the children's expanded keyword embeddings.
            for child in children_nodes:
                expanded_class = np.concatenate((expanded_class, child.expanded))
                print([vocabulary_inv[w] for w in child.expanded])
            vocab_expanded = [vocabulary_inv[w] for w in expanded_class]
            expanded_mat = embedding_mat[np.asarray(list(set(expanded_class)), dtype='int32')]
            vmf_soft = VonMisesFisherMixture(n_clusters=num_children, n_jobs=15, random_state=0)
            vmf_soft.fit(expanded_mat)
            center = vmf_soft.cluster_centers_
            kappa = vmf_soft.concentrations_
            weight = vmf_soft.weights_
            print(f'weight: {weight}')
            print(f'kappa: {kappa}')
            centers.append(center)
            kappas.append(kappa)
            weights.append(weight)
        elif fitting == 'separate':
            # Fit a single vMF distribution per child and weight the
            # components uniformly.
            center = []
            kappa = []
            weight = []
            for child in children_nodes:
                assert len(child.expanded) > 0
                expanded_class = np.concatenate((expanded_class, child.expanded))
                expanded_mat = embedding_mat[np.asarray(child.expanded, dtype='int32')]
                vmf_soft = VonMisesFisherMixture(n_clusters=1, n_jobs=15, random_state=0)
                vmf_soft.fit(expanded_mat)
                center.append(vmf_soft.cluster_centers_[0])
                kappa.append(vmf_soft.concentrations_[0])
                weight.append(1/num_children)
                # Ranking of the vocabulary by similarity to this child's
                # center (computed here but not used further).
                expanded = np.dot(embedding_mat, center[-1])
                word_expanded = sorted(range(len(expanded)), key=lambda k: expanded[k], reverse=True)
            vocab_expanded = [vocabulary_inv[w] for w in expanded_class]
            print(f'Class {relevant_node.name}:')
            print(vocab_expanded)
            print(f'weight: {weight}')
            print(f'kappa: {kappa}')
            centers.append(center)
            kappas.append(kappa)
            weights.append(weight)
        if write_path is not None:
            with open(os.path.join(write_path, 'expanded.txt'), 'a') as f:
                f.write(relevant_node.name + '\t')
                f.write(' '.join(vocab_expanded) + '\n')

    print("Finished vMF distribution fitting.")
    return centers, kappas, weights


def bow_pseudodocs(relevant_nodes, expand_num, background_array, sequence_length, len_avg,
                   len_std, num_doc, interp_weight, vocabulary_inv, embedding_mat, save_dir=None, total_num=50):
    # Generate num_doc bag-of-words pseudo-documents per class: sample a
    # discourse vector from the class vMF mixture, turn it into a word
    # distribution over the top total_num words, interpolate with the
    # background distribution, and draw the document's words from it.
    n_classes = len(relevant_nodes)

    # Normalize embeddings so dot products act as cosine similarities.
    for i in range(len(embedding_mat)):
        embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])

    centers, kappas, weights = label_expansion(relevant_nodes, save_dir, vocabulary_inv, embedding_mat, expand_num)

    background_vec = interp_weight * background_array
    docs = np.zeros((num_doc*n_classes, sequence_length), dtype='int32')
    label = np.zeros((num_doc*n_classes, n_classes))

    for i in range(n_classes):
        docs_len = len_avg*np.ones(num_doc)
        center = centers[i]
        kappa = kappas[i]
        weight = weights[i]
        discourses = sample_mix_vMF(center, kappa, weight, num_doc)
        for j in range(num_doc):
            discourse = discourses[j]
            prob_vec = np.dot(embedding_mat, discourse)
            prob_vec = np.exp(prob_vec)
            # Keep only the total_num highest-scoring words.
            sorted_idx = np.argsort(-prob_vec)
            delete_idx = sorted_idx[total_num:]
            prob_vec[delete_idx] = 0
            prob_vec /= np.sum(prob_vec)
            # Interpolate with the background word distribution.
            prob_vec *= 1 - interp_weight
            prob_vec += background_vec
            doc_len = int(docs_len[j])
            docs[i*num_doc+j][:doc_len] = np.random.choice(len(prob_vec), size=doc_len, p=prob_vec)
            # Smoothed (soft) labels: a little mass on every class.
            label[i*num_doc+j] = interp_weight/n_classes*np.ones(n_classes)
            label[i*num_doc+j][i] += 1 - interp_weight

    with open(os.path.join(save_dir, 'pseudo_docs_bow.txt'), 'w') as f:
        for doc in docs:
            f.write(" ".join([vocabulary_inv[ele] for ele in doc]) + '\n')
    with open(os.path.join(save_dir, 'pseudo_docs_bow.pkl'), 'wb') as f:
        pickle.dump([docs, label], f, protocol=4)
    return docs, label


def lstm_pseudodocs(parent_node, expand_num, sequence_length, len_avg, sent_length, len_std, num_doc,
                    interp_weight, vocabulary_inv, lm, common_words, save_dir=None):
    # Generate pseudo-documents with an LSTM language model: each sentence
    # starts from a seed word sampled from the class vMF mixture and is then
    # continued by the language model lm.
    relevant_nodes = parent_node.children
    embedding_mat = parent_node.embedding
    n_classes = len(relevant_nodes)

    for i in range(len(embedding_mat)):
        embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])

    centers, kappas, weights = label_expansion(relevant_nodes, save_dir, vocabulary_inv, embedding_mat, expand_num)

    # For each class, take the vocabulary word closest to each sampled
    # discourse vector as a seed word.
    seed_words = []
    for i in range(n_classes):
        center = centers[i]
        kappa = kappas[i]
        weight = weights[i]
        discourses = sample_mix_vMF(center, kappa, weight, num_doc)
        prob_mat = np.dot(discourses, embedding_mat.transpose())
        seeds = np.argmax(prob_mat, axis=1)
        seed_words.append(seeds)

    doc_len = int(len_avg)
    num_sent = int(np.ceil(doc_len/sent_length))
    docs = np.zeros((num_doc*n_classes, sequence_length), dtype='int32')
    label = np.zeros((num_doc*n_classes, n_classes))
    for i in range(n_classes):
        docs_class = gen_with_seeds(relevant_nodes[i].name, lm, seed_words[i], doc_len, sent_length,
                                    common_words, vocabulary_inv, save_dir=save_dir)
        for j in range(num_doc):
            docs[i*num_doc+j, :doc_len] = docs_class[j]
            # Smoothed (soft) labels, as in bow_pseudodocs.
            label[i*num_doc+j] = interp_weight/n_classes*np.ones(n_classes)
            label[i*num_doc+j][i] += 1 - interp_weight

    return docs, label


def gen_next(common_words, total_words, pred):
    # Sample the next word id from the LM distribution over the common_words
    # most frequent words plus one catch-all slot; if the catch-all slot is
    # drawn, substitute a uniformly sampled rare word.
    select = np.random.choice(common_words+1, p=pred)
    pred_trim = select
    if select == common_words:
        pred_real = np.random.choice(range(common_words, total_words))
    else:
        pred_real = select
    return pred_real, pred_trim
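# Worked example (hypothetical numbers): with common_words=10000 and `pred` a
# distribution over 10001 slots, gen_next returns (word_id, word_id) when a
# frequent word is drawn, and (a uniform id in [10000, total_words), 10000)
# when the catch-all slot is drawn; the trimmed id keeps the LM input
# within its in-vocabulary range.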


def gen_with_seeds(class_name, lm, seeds, doc_len, sent_length, common_words, vocabulary_inv, save_dir=None):
    # Generate one pseudo-document per seed: every sent_length-th position
    # restarts a sentence with that document's seed word; the remaining
    # positions are predicted by the language model.
    t0 = time()
    pool = Pool(10)
    doc_len = int(doc_len)

    sent_cnt = 0
    print(f'Pseudodocs generation for class {class_name}...')

    cur_seq = [[] for _ in range(len(seeds))]
    for i in range(doc_len):
        if i % sent_length == 0:
            # Start a new sentence from each document's seed word.
            pred_real = [seed for seed in seeds]
            pred_trim = [min(seed, common_words) for seed in seeds]
            temp_seq = [[] for _ in range(len(seeds))]
            sent_cnt += 1
        else:
            # Continue all sentences with the LM; gen_next maps each predicted
            # distribution to (real word id, trimmed id for the LM input).
            padded_seq = pad_sequences(temp_seq, maxlen=sent_length-1, padding='pre')
            pred = lm.predict(padded_seq, verbose=0)
            args = [(common_words, len(vocabulary_inv), ele) for ele in pred]
            res = pool.starmap(gen_next, args)
            pred_real = [ele[0] for ele in res]
            pred_trim = [ele[1] for ele in res]
        assert len(pred_real) == len(cur_seq)
        for j in range(len(cur_seq)):
            cur_seq[j].append(pred_real[j])
            temp_seq[j].append(pred_trim[j])

    cur_seq = np.array(cur_seq)
    print(f'Pseudodocs generation time: {time() - t0:.2f}s')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    with open(os.path.join(save_dir, f'{class_name}_pseudo_docs.txt'), 'w') as f:
        for seq in cur_seq:
            f.write(" ".join([vocabulary_inv[ele] for ele in seq]) + '\n')
    with open(os.path.join(save_dir, f'{class_name}_pseudo_docs.pkl'), 'wb') as f:
        pickle.dump(cur_seq, f, protocol=4)
    return cur_seq


def augment(x, relevant_nodes, total_len, save_dir=None):
    # Oversample the labeled documents by whole copies until the set reaches
    # roughly total_len documents.
    print("Labeled documents augmentation...")
    y = np.zeros((0, len(relevant_nodes)))
    sup_idx = []
    for i, node in enumerate(relevant_nodes):
        sup_idx += node.sup_idx
        labels = np.zeros((len(node.sup_idx), len(relevant_nodes)))
        labels[:, i] = 1.0
        y = np.concatenate((y, labels), axis=0)
    docs = x[sup_idx]
    curr_len = len(docs)
    copy_times = int(total_len/curr_len) - 1
    new_docs = docs
    new_y = y
    for _ in range(copy_times):
        new_docs = np.concatenate((new_docs, docs), axis=0)
        new_y = np.concatenate((new_y, y), axis=0)

    print("Finished labeled documents augmentation.")
    return new_docs, new_y
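As a rough illustration of how bow_pseudodocs might be driven, here is a toy sketch; the Node class, vocabulary, and embeddings below are hypothetical stand-ins (the real node objects come from elsewhere in this commit), and spherecluster must be installed:

    import numpy as np
    from gen import bow_pseudodocs

    # Hypothetical stand-in for the node objects the functions above read.
    class Node:
        def __init__(self, name, keywords):
            self.name = name
            self.keywords = keywords  # seed keyword strings
            self.children = []        # a leaf: label_expansion treats it as its own child
            self.expanded = []

    # Toy vocabulary with unit-normalized random embeddings (illustrative only).
    vocab = ['learning', 'model', 'image', 'object'] + [f'w{i}' for i in range(96)]
    vocabulary_inv = {i: w for i, w in enumerate(vocab)}
    emb = np.random.randn(len(vocab), 50)
    emb /= np.linalg.norm(emb, axis=1, keepdims=True)
    background = np.full(len(vocab), 1.0/len(vocab))  # uniform background distribution

    nodes = [Node('ml', ['learning', 'model']), Node('vision', ['image', 'object'])]
    docs, labels = bow_pseudodocs(nodes, expand_num=10, background_array=background,
                                  sequence_length=120, len_avg=100, len_std=10,
                                  num_doc=20, interp_weight=0.2,
                                  vocabulary_inv=vocabulary_inv,
                                  embedding_mat=emb, save_dir='./pseudo')
    # docs: (40, 120) int32 matrix of word ids; labels: (40, 2) soft label matrix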