Commit
Yu Zhang committed on Aug 23, 2019
1 parent ed4b08b · commit 1b4a58c
Showing 11 changed files with 1,636 additions and 0 deletions.
@@ -0,0 +1,47 @@
import argparse

from sklearn.metrics import confusion_matrix, f1_score

parser = argparse.ArgumentParser(description='main', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset', default='ai', choices=['ai', 'bio'])
args = parser.parse_args()

dataset = args.dataset

print('\n### Testing ###')

# Map each lower-level (leaf) label to its upper-level (parent) label.
p = dict()
with open(dataset + '/label_hier.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        for label in tmp[1:]:
            p[label.lower()] = tmp[0].lower()

# Ground-truth labels: y_d holds the leaf labels, y_u their parents.
y_u = []
y_d = []
with open(dataset + '/labels.txt') as fin:
    for line in fin:
        dl = line.strip().lower()
        y_u.append(p[dl])
        y_d.append(dl)

# Predictions: each line of out.txt holds the predicted upper and lower label.
y_u_pred = []
y_d_pred = []
with open(dataset + '/out.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        y_u_pred.append(tmp[0].lower())
        y_d_pred.append(tmp[1].lower())

print('Upper Micro F1 score:', f1_score(y_u, y_u_pred, average='micro'))
print('Upper Macro F1 score:', f1_score(y_u, y_u_pred, average='macro'))

print('Lower Micro F1 score:', f1_score(y_d, y_d_pred, average='micro'))
print('Lower Macro F1 score:', f1_score(y_d, y_d_pred, average='macro'))
print('Lower-level Confusion Matrix:')
print(confusion_matrix(y_d, y_d_pred))

print('Overall Micro F1 score:', f1_score(y_u + y_d, y_u_pred + y_d_pred, average='micro'))
print('Overall Macro F1 score:', f1_score(y_u + y_d, y_u_pred + y_d_pred, average='macro'))
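From the parsing logic above, each line of label_hier.txt holds a parent label followed by its children, labels.txt holds one gold leaf label per document, and out.txt holds the predicted upper and lower label per line. A minimal sketch of toy inputs matching those formats (all label names here are hypothetical):

import os

# Hypothetical two-document toy dataset illustrating the expected file formats.
os.makedirs('ai', exist_ok=True)
with open('ai/label_hier.txt', 'w') as f:
    f.write('cs ml nlp\n')        # parent label followed by its children
with open('ai/labels.txt', 'w') as f:
    f.write('ml\nnlp\n')          # gold leaf label per document
with open('ai/out.txt', 'w') as f:
    f.write('cs ml\ncs nlp\n')    # predicted upper and lower label per line

Running the evaluation script with --dataset ai on these files would report perfect scores, since the predictions match the gold labels exactly.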
@@ -0,0 +1,41 @@
from sklearn.metrics import confusion_matrix, f1_score

dataset = 'bio'

# Map each lower-level (leaf) label to its upper-level (parent) label.
p = dict()
with open(dataset + '/label_hier_hier.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        for label in tmp[1:]:
            p[label.lower()] = tmp[0].lower()

# Ground-truth labels: y_d holds the leaf labels, y_u their parents.
y_u = []
y_d = []
with open(dataset + '/labels.txt') as fin:
    for line in fin:
        dl = line.strip().lower()
        y_u.append(p[dl])
        y_d.append(dl)

# Here out.txt holds only the predicted leaf label; the upper-level
# prediction is derived from it through the parent map.
y_u_pred = []
y_d_pred = []
with open(dataset + '/out.txt') as fin:
    for line in fin:
        tmp = line.strip().split()
        y_u_pred.append(p[tmp[0].lower()])
        y_d_pred.append(tmp[0].lower())

print('Upper Micro/Macro:')
print(f1_score(y_u, y_u_pred, average='micro'))
print(f1_score(y_u, y_u_pred, average='macro'))

print('Lower Micro/Macro:')
print(f1_score(y_d, y_d_pred, average='micro'))
print(f1_score(y_d, y_d_pred, average='macro'))
print(confusion_matrix(y_d, y_d_pred))

print('Overall Micro/Macro:')
print(f1_score(y_u + y_d, y_u_pred + y_d_pred, average='micro'))
print(f1_score(y_u + y_d, y_u_pred + y_d_pred, average='macro'))
@@ -0,0 +1,307 @@
import os
import pickle
from collections import defaultdict
from multiprocessing import Pool
from time import time

import numpy as np
np.random.seed(1234)
from keras.preprocessing.sequence import pad_sequences
from spherecluster import SphericalKMeans, VonMisesFisherMixture, sample_vMF


def sample_mix_vMF(center, kappa, weight, num_doc):
    """Draw num_doc unit vectors from a mixture of von Mises-Fisher distributions."""
    # Pick a mixture component for each pseudo-document, then sample from it.
    distrib_idx = np.random.choice(range(len(center)), num_doc, p=weight)
    samples = []
    for idx in distrib_idx:
        samples.append(sample_vMF(center[idx], kappa[idx], 1))
    samples = np.array(samples)
    samples = np.reshape(samples, (num_doc, -1))
    return samples

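For reference, a minimal sketch of calling sample_mix_vMF on a hypothetical two-component mixture (unit-norm means and made-up concentrations; none of these numbers come from the repository):

import numpy as np

# Two hypothetical unit-norm component means in 4-d embedding space.
center = np.array([[1.0, 0.0, 0.0, 0.0],
                   [0.0, 1.0, 0.0, 0.0]])
kappa = [50.0, 100.0]    # concentration per component
weight = [0.3, 0.7]      # mixture weights, must sum to 1
samples = sample_mix_vMF(center, kappa, weight, num_doc=5)
print(samples.shape)     # (5, 4): one unit vector per pseudo-document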
def seed_expansion(relevant_nodes, prob_sup_array, sz, vocab_dict, embedding_mat):
    """Expand each node's keyword set to sz words via embedding similarity."""
    vocab_sz = len(vocab_dict)
    for j, relevant_node in enumerate(relevant_nodes):
        word_class = relevant_node.keywords
        prob_sup_class = prob_sup_array[j]
        expanded_class = []
        seed_vec = np.zeros(vocab_sz)
        if len(word_class) < sz:
            # Score every vocabulary word by similarity to the weighted
            # centroid of the seed keywords, then keep the top sz words.
            for i, word in enumerate(word_class):
                seed_vec[vocab_dict[word]] = prob_sup_class[i]
            expanded = np.dot(embedding_mat.transpose(), seed_vec)
            expanded = np.dot(embedding_mat, expanded)
            word_expanded = sorted(range(len(expanded)), key=lambda k: expanded[k], reverse=True)
            for i in range(sz):
                expanded_class.append(word_expanded[i])
            relevant_node.expanded = np.array(expanded_class)
        else:
            # Already at least sz keywords: use them as-is.
            relevant_node.expanded = np.array([vocab_dict[w] for w in word_class])

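The two matrix products in seed_expansion score every vocabulary word by its dot-product similarity to the probability-weighted seed centroid. A minimal sketch with hypothetical toy numbers:

import numpy as np

# Hypothetical toy vocabulary of 5 words with 3-d embeddings.
embedding_mat = np.array([[1.0, 0.0, 0.0],
                          [0.9, 0.1, 0.0],
                          [0.0, 1.0, 0.0],
                          [0.0, 0.0, 1.0],
                          [0.7, 0.3, 0.0]])
seed_vec = np.zeros(5)
seed_vec[0] = 1.0                      # single seed word with weight 1

centroid = embedding_mat.T @ seed_vec  # weighted seed centroid
scores = embedding_mat @ centroid      # similarity of every word to it
print(np.argsort(-scores))             # word 0 ranks first, then 1, then 4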
def label_expansion(relevant_nodes, write_path, vocabulary_inv, embedding_mat, manual_num=None, fitting='mix'):
    """Expand class keywords and fit a vMF distribution per (child) class."""
    print("Retrieving top-t nearest words...")
    vocab_dict = {v: k for k, v in vocabulary_inv.items()}
    prob_sup_array = []
    current_szes = []
    all_class_keywords = []
    children_nodes = []
    for relevant_node in relevant_nodes:
        if relevant_node.children:
            children_nodes += relevant_node.children
        else:
            children_nodes += [relevant_node]
    for children_node in children_nodes:
        current_sz = len(children_node.keywords)
        current_szes.append(current_sz)
        prob_sup_array.append([1 / current_sz] * current_sz)
        all_class_keywords += children_node.keywords
    current_sz = np.min(current_szes)
    if manual_num is None:
        # Grow the expansion size until two classes share a keyword,
        # then back off by one so the expanded sets stay disjoint.
        while len(all_class_keywords) == len(set(all_class_keywords)):
            print(f'current_sz: {current_sz}')
            current_sz += 1
            seed_expansion(children_nodes, prob_sup_array, current_sz, vocab_dict, embedding_mat)
            all_class_keywords = [w for relevant_node in children_nodes for w in relevant_node.expanded]
        seed_expansion(children_nodes, prob_sup_array, current_sz - 1, vocab_dict, embedding_mat)
    else:
        seed_expansion(children_nodes, prob_sup_array, manual_num, vocab_dict, embedding_mat)
    if manual_num is None:
        print(f"Final expansion size t = {len(children_nodes[0].expanded)}")
    else:
        print(f"Manual expansion size t = {manual_num}")

    centers = []
    kappas = []
    weights = []
    if write_path is not None:
        if not os.path.exists(write_path):
            os.makedirs(write_path)
        else:
            # Truncate any previous expansion output.
            f = open(os.path.join(write_path, 'expanded.txt'), 'w')
            f.close()
    for relevant_node in relevant_nodes:
        children_nodes = relevant_node.children if relevant_node.children else [relevant_node]
        num_children = len(children_nodes)
        expanded_class = []
        if fitting == 'mix':
            # Fit one vMF mixture over the pooled expanded words of all children.
            for child in children_nodes:
                expanded_class = np.concatenate((expanded_class, child.expanded))
                print([vocabulary_inv[w] for w in child.expanded])
            vocab_expanded = [vocabulary_inv[w] for w in expanded_class]
            expanded_mat = embedding_mat[np.asarray(list(set(expanded_class)), dtype='int32')]
            vmf_soft = VonMisesFisherMixture(n_clusters=num_children, n_jobs=15, random_state=0)
            vmf_soft.fit(expanded_mat)
            center = vmf_soft.cluster_centers_
            kappa = vmf_soft.concentrations_
            weight = vmf_soft.weights_
            print(f'weight: {weight}')
            print(f'kappa: {kappa}')
            centers.append(center)
            kappas.append(kappa)
            weights.append(weight)
        elif fitting == 'separate':
            # Fit a single vMF per child and weight the components uniformly.
            center = []
            kappa = []
            weight = []
            for child in children_nodes:
                assert child.expanded != []
                expanded_class = np.concatenate((expanded_class, child.expanded))
                expanded_mat = embedding_mat[np.asarray(child.expanded, dtype='int32')]
                vmf_soft = VonMisesFisherMixture(n_clusters=1, n_jobs=15, random_state=0)
                vmf_soft.fit(expanded_mat)
                center.append(vmf_soft.cluster_centers_[0])
                kappa.append(vmf_soft.concentrations_[0])
                weight.append(1 / num_children)
                expanded = np.dot(embedding_mat, center[-1])
                word_expanded = sorted(range(len(expanded)), key=lambda k: expanded[k], reverse=True)
            vocab_expanded = [vocabulary_inv[w] for w in expanded_class]
            print(f'Class {relevant_node.name}:')
            print(vocab_expanded)
            print(f'weight: {weight}')
            print(f'kappa: {kappa}')
            centers.append(center)
            kappas.append(kappa)
            weights.append(weight)
        if write_path is not None:
            f = open(os.path.join(write_path, 'expanded.txt'), 'a')
            f.write(relevant_node.name + '\t')
            f.write(' '.join(vocab_expanded) + '\n')
            f.close()

    print("Finished vMF distribution fitting.")
    return centers, kappas, weights

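The core of label_expansion is the spherecluster fit. A minimal sketch of the same call on hypothetical random unit-norm vectors (the data here is made up; the attribute names match those used above):

import numpy as np
from spherecluster import VonMisesFisherMixture

# Hypothetical expanded keyword embeddings: 20 unit-norm vectors in 8-d.
rng = np.random.RandomState(0)
X = rng.randn(20, 8)
X /= np.linalg.norm(X, axis=1, keepdims=True)

vmf = VonMisesFisherMixture(n_clusters=2, n_jobs=1, random_state=0)
vmf.fit(X)
print(vmf.cluster_centers_.shape)  # (2, 8): one mean direction per component
print(vmf.concentrations_)         # one kappa per component
print(vmf.weights_)                # mixture weights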
def bow_pseudodocs(relevant_nodes, expand_num, background_array, sequence_length, len_avg,
                   len_std, num_doc, interp_weight, vocabulary_inv, embedding_mat, save_dir=None, total_num=50):
    """Generate bag-of-words pseudo-documents by sampling from class vMF mixtures."""
    n_classes = len(relevant_nodes)

    # Normalize word embeddings onto the unit sphere.
    for i in range(len(embedding_mat)):
        embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])

    centers, kappas, weights = label_expansion(relevant_nodes, save_dir, vocabulary_inv, embedding_mat, expand_num)

    background_vec = interp_weight * background_array
    docs = np.zeros((num_doc * n_classes, sequence_length), dtype='int32')
    label = np.zeros((num_doc * n_classes, n_classes))

    for i in range(n_classes):
        docs_len = len_avg * np.ones(num_doc)
        center = centers[i]
        kappa = kappas[i]
        weight = weights[i]
        # One "discourse" direction per pseudo-document.
        discourses = sample_mix_vMF(center, kappa, weight, num_doc)
        for j in range(num_doc):
            discourse = discourses[j]
            # Word distribution: exponentiated similarity scores, truncated
            # to the total_num most likely words, then interpolated with
            # the background word distribution.
            prob_vec = np.dot(embedding_mat, discourse)
            prob_vec = np.exp(prob_vec)
            sorted_idx = np.argsort(-prob_vec)
            delete_idx = sorted_idx[total_num:]
            prob_vec[delete_idx] = 0
            prob_vec /= np.sum(prob_vec)
            prob_vec *= 1 - interp_weight
            prob_vec += background_vec
            doc_len = int(docs_len[j])
            docs[i * num_doc + j][:doc_len] = np.random.choice(len(prob_vec), size=doc_len, p=prob_vec)
            # Smoothed pseudo-label: uniform mass plus a spike on class i.
            label[i * num_doc + j] = interp_weight / n_classes * np.ones(n_classes)
            label[i * num_doc + j][i] += 1 - interp_weight

    f = open(os.path.join(save_dir, 'pseudo_docs_bow.txt'), 'w')
    for doc in docs:
        f.write(" ".join([vocabulary_inv[ele] for ele in doc]) + '\n')
    f.close()
    with open(os.path.join(save_dir, 'pseudo_docs_bow.pkl'), 'wb') as f:
        pickle.dump([docs, label], f, protocol=4)
    return docs, label

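The pseudo-labels built above are smoothed rather than one-hot: every class gets interp_weight / n_classes of the mass and the generating class gets the remaining 1 - interp_weight. For example, with hypothetical values n_classes = 4 and interp_weight = 0.2:

import numpy as np

n_classes, interp_weight = 4, 0.2   # hypothetical settings
label = interp_weight / n_classes * np.ones(n_classes)
label[0] += 1 - interp_weight       # pseudo-doc generated from class 0
print(label)                        # [0.85 0.05 0.05 0.05], sums to 1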
def lstm_pseudodocs(parent_node, expand_num, sequence_length, len_avg, sent_length, len_std, num_doc,
                    interp_weight, vocabulary_inv, lm, common_words, save_dir=None):
    """Generate pseudo-documents with an LSTM language model seeded per class."""
    relevant_nodes = parent_node.children
    embedding_mat = parent_node.embedding
    n_classes = len(relevant_nodes)

    # Normalize word embeddings onto the unit sphere.
    for i in range(len(embedding_mat)):
        embedding_mat[i] = embedding_mat[i] / np.linalg.norm(embedding_mat[i])

    centers, kappas, weights = label_expansion(relevant_nodes, save_dir, vocabulary_inv, embedding_mat, expand_num)

    # For each class, sample discourse vectors and take the nearest word
    # as the seed for language-model generation.
    seed_words = []
    for i in range(n_classes):
        center = centers[i]
        kappa = kappas[i]
        weight = weights[i]
        discourses = sample_mix_vMF(center, kappa, weight, num_doc)
        prob_mat = np.dot(discourses, embedding_mat.transpose())
        seeds = np.argmax(prob_mat, axis=1)
        seed_words.append(seeds)

    doc_len = int(len_avg)
    num_sent = int(np.ceil(doc_len / sent_length))
    docs = np.zeros((num_doc * n_classes, sequence_length), dtype='int32')
    label = np.zeros((num_doc * n_classes, n_classes))
    for i in range(n_classes):
        docs_class = gen_with_seeds(relevant_nodes[i].name, lm, seed_words[i], doc_len, sent_length,
                                    common_words, vocabulary_inv, save_dir=save_dir)
        for j in range(num_doc):
            docs[i * num_doc + j, :doc_len] = docs_class[j]
            # Smoothed pseudo-label, as in bow_pseudodocs.
            label[i * num_doc + j] = interp_weight / n_classes * np.ones(n_classes)
            label[i * num_doc + j][i] += 1 - interp_weight

    return docs, label

def gen_next(common_words, total_words, pred):
    """Sample the next word id from a distribution over the common_words
    most frequent words plus one catch-all bucket for all rarer words."""
    select = np.random.choice(common_words + 1, p=pred)
    pred_trim = select
    if select == common_words:
        # Catch-all bucket selected: draw a rare word uniformly at random.
        pred_real = np.random.choice(range(common_words, total_words))
    else:
        pred_real = select
    return pred_real, pred_trim

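A minimal sketch of calling gen_next with a hypothetical trimmed LM output (the probabilities are made up; pred must have length common_words + 1 and sum to 1):

import numpy as np

# Hypothetical trimmed LM output: 4 common words + 1 catch-all bucket.
pred = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
real_id, trimmed_id = gen_next(common_words=4, total_words=10000, pred=pred)
# trimmed_id is in [0, 4]; when it hits 4 (the catch-all bucket), real_id
# is a uniformly drawn rare-word id in [4, 10000).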
def gen_with_seeds(class_name, lm, seeds, doc_len, sent_length, common_words, vocabulary_inv, save_dir=None):
    """Generate one pseudo-document per seed word using the language model lm."""
    docs = np.zeros((len(seeds), doc_len), dtype='int32')
    t0 = time()
    pool = Pool(10)
    doc_len = int(doc_len)

    sent_cnt = 0
    print(f'Pseudodocs generation for class {class_name}...')

    cur_seq = [[] for _ in range(len(seeds))]
    for i in range(doc_len):
        if i % sent_length == 0:
            # Start a new sentence from the class seed word.
            pred_real = [seed for seed in seeds]
            pred_trim = [min(seed, common_words) for seed in seeds]
            temp_seq = [[] for _ in range(len(seeds))]
            sent_cnt += 1
        else:
            # Continue each sentence: feed the trimmed context to the LM
            # and sample the next word for every document in parallel.
            padded_seq = pad_sequences(temp_seq, maxlen=sent_length - 1, padding='pre')
            pred = lm.predict(padded_seq, verbose=0)
            args = [(common_words, len(vocabulary_inv), ele) for ele in pred]
            res = pool.starmap(gen_next, args)
            pred_real = [ele[0] for ele in res]
            pred_trim = [ele[1] for ele in res]
        assert len(pred_real) == len(cur_seq)
        for j in range(len(cur_seq)):
            cur_seq[j].append(pred_real[j])
            temp_seq[j].append(pred_trim[j])

    pool.close()
    cur_seq = np.array(cur_seq)
    print(f'Pseudodocs generation time: {time() - t0:.2f}s')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    f = open(os.path.join(save_dir, f'{class_name}_pseudo_docs.txt'), 'w')
    for seq in cur_seq:
        f.write(" ".join([vocabulary_inv[ele] for ele in seq]) + '\n')
    f.close()
    with open(os.path.join(save_dir, f'{class_name}_pseudo_docs.pkl'), 'wb') as f:
        pickle.dump(cur_seq, f, protocol=4)
    return cur_seq

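gen_with_seeds only needs lm to expose a Keras-style predict method returning a distribution over common_words + 1 ids per input row. A minimal sketch with a hypothetical stub model, so the generation loop can be exercised without a trained LSTM (class UniformLM, the vocabulary, and all sizes here are made up):

import numpy as np

class UniformLM:
    """Hypothetical stand-in for the trained LSTM language model."""
    def predict(self, padded_seq, verbose=0):
        n, k = len(padded_seq), 5  # common_words = 4, plus the catch-all bucket
        return np.full((n, k), 1.0 / k)

vocab_inv = {i: f'w{i}' for i in range(10)}  # hypothetical 10-word vocabulary
docs = gen_with_seeds('toy_class', UniformLM(), seeds=np.array([0, 1]),
                      doc_len=6, sent_length=3, common_words=4,
                      vocabulary_inv=vocab_inv, save_dir='out_toy')
print(docs.shape)  # (2, 6): one 6-word pseudo-document per seed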
def augment(x, relevant_nodes, total_len, save_dir=None):
    """Oversample the labeled documents of each class up to roughly total_len."""
    print("Labeled documents augmentation...")
    y = np.zeros((0, len(relevant_nodes)))
    sup_idx = []
    for i, node in enumerate(relevant_nodes):
        sup_idx += node.sup_idx
        labels = np.zeros((len(node.sup_idx), len(relevant_nodes)))
        labels[:, i] = 1.0
        y = np.concatenate((y, labels), axis=0)
    docs = x[sup_idx]
    curr_len = len(docs)
    # Duplicate the labeled set until it approaches total_len, e.g. with
    # 3 labeled docs and total_len = 10, each doc ends up appearing 3 times.
    copy_times = int(total_len / curr_len) - 1
    new_docs = docs
    new_y = y
    for _ in range(copy_times):
        new_docs = np.concatenate((new_docs, docs), axis=0)
        new_y = np.concatenate((new_y, y), axis=0)

    print("Finished labeled documents augmentation.")
    return new_docs, new_y