In [31]:
%load_ext autoreload
%autoreload 2
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,6,7"
os.environ['HF_HOME'] = '/shared/data3/pk36/.cache'

In [32]:
!export HF_HOME=/shared/data3/pk36/.cache

In [33]:
%load_ext autoreload
%autoreload 2
from collections import Counter
from taxonomy import Taxonomy, Paper
from utils import filter_phrases, cosine_similarity_embeddings, average_with_harmonic_series, rank_by_significance, rank_by_discriminative_significance, rank_by_relation
from model_definitions import sentence_model
import subprocess
import shutil
from tqdm import tqdm
import json
import pickle as pk
import hdbscan
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [63]:
class Args:
    def __init__(self):
        self.track = "Question Answering"
        self.dim = "Methodology"
        self.dataset = "qa_papers"
        self.input_file = "datasets/phrase_emnlp.txt"
        self.iters = 4
        self.model = "bert_full_ft"
        self.override = True

args = Args()

**Pre-Processing**

In [65]:
print(args.input_file)

datasets/phrase_emnlp.txt


In [60]:
if args.override or (not os.path.exists(f"datasets/{args.dataset}/phrase_{args.dataset}.txt")):
    # pre-process
    os.chdir("./preprocessing")
    subprocess.check_call(['./auto_phrase.sh', args.dataset])
    os.chdir("../")
else:
    print("already pre-processed!")

[32m===Corpus Pre-processing===[m


100%|██████████| 287/287 [00:08<00:00, 33.95it/s]


[32m===Compilation===[m
[32m===Tokenization===[m
Current step: Tokenizing input file...[0K


real	0m2.342s
user	0m19.168s
sys	0m1.422s


Detected Language: EN[0K
Current step: Tokenizing wikipedia phrases...[0K
No provided expert labels.[0K
[32m===Part-Of-Speech Tagging===[m
Current step: Merging...[0Ks...[0K
[32m===AutoPhrasing===[m


=== Current Settings ===
Iterations = 2
Minimum Support Threshold = 10
Maximum Length Threshold = 6
POS-Tagging Mode Enabled
Number of threads = 10
Labeling Method = DPDN
	Auto labels from knowledge bases
	Max Positive Samples = -1
Loading data...
# of total tokens = 2088439
max word token id = 66199
# of documents = 287
# of distinct POS tags = 57
Mining frequent phrases...
selected MAGIC = 66221
# of frequent phrases = 87032
Extracting features...
Constructing label pools...
	The size of the positive pool = 3873
	The size of the negative pool = 82754
# truth patterns = 75189
Estimating Phrase Quality...
Segmenting...
Rectifying features...
Estimating Phrase Quality...
Segmenting...
Dumping results...
Done.

real	0m6.444s
user	0m31.714s
sys	0m2.139s


[32m===Saving Model and Results===[m
[32m===Generating Output===[m
[32m===Tokenization===[m
Current step: Tokenizing input file...[0K


real	0m2.087s
user	0m17.306s
sys	0m0.813s


Detected Language: EN[0K
[32m===Part-Of-Speech Tagging===[m
Current step: Merging...[0Ks...[0K
[32m===Phrasal Segmentation===[m


=== Current Settings ===
Segmentation Model Path = models/NEW/segmentation.model
After the phrasal segmentation, only following phrases will be highlighted with <phrase> and </phrase>
	Q(multi-word phrases) >= 0.700000
	Q(single-word phrases) >= 1.000000
POS guided model loaded.
# of loaded patterns = 21920
# of loaded truth patterns = 79062
POS transition matrix loaded
Phrasal segmentation finished.
   # of total highlighted quality phrases = 161016
   # of total processed sentences = 316830
   avg highlights per sentence = 0.508209

real	0m2.945s
user	0m2.778s
sys	0m0.040s


[32m===Generating Output===[m
[32m===Segmented Corpus Post-processing===[m


287it [00:00, 1900.12it/s]


Phrase segmented corpus written to ../datasets/qa_papers/phrase_qa_papers.txt


**Base Taxonomy Construction & Reading in Papers**

In [64]:
# input: track, dimension -> get base taxonomy (2 levels) -> Class Tree, Class Node (description, seed words)

taxo = Taxonomy(args.track, args.dim, args.input_file)
base_taxo = taxo.buildBaseTaxo(levels=1, k=5, num_terms=20)

print(base_taxo)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/phrase_emnlp.txt'

In [62]:
taxo.root.label

'Types of Methodology Proposed in Text Classification Research Papers'

In [92]:
# format the input keywords file for seetopic -> get phrases -> filter using LLM
dir_name = (args.track + "_" + args.dim).lower().replace(" ", "_")

if not os.path.exists(f"SeeTopic/{dir_name}"):
    os.makedirs(f"SeeTopic/{dir_name}")

with open(f"SeeTopic/{dir_name}/{dir_name}.txt", "w") as f:
    for p in taxo.collection:
        f.write(f"{p.text}\n")


## get first level of children
children_with_terms = taxo.root.getChildren(terms=True)
with open(f"SeeTopic/{dir_name}/keywords_0.txt", "w") as f:
    for idx, c in enumerate(children_with_terms):
        str_c = ",".join(c[1])
        f.write(f"{idx}:{c[0]},{str_c}\n")

**Phrase Mining for Level 1**

In [93]:
os.chdir("./SeeTopic")
subprocess.check_call(['./seetopic.sh', dir_name, str(args.iters), args.model])
os.chdir("../")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32m===Get PLM Embeddings===[m


Some weights of BertModel were not initialized from the model checkpoint at /home/pk36/Comparative-Summarization/bert_full_ft/checkpoint-8346/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


####### CONSTRUCTING AND TOKENIZING VOCAB #######
####### COMPUTING STATIC EMBEDDINGS #######


100%|██████████| 5349/5349 [00:41<00:00, 130.25it/s]


[32m===Iter 0: PLM Module===[m
[32m===Iter 1: PLM Module===[m
[32m===Iter 1: Local Module===[m
make: 'cate' is up to date.
Starting training using file ../text_classification_methodology/text_classification_methodology.txt
Reading topics from file text_classification_methodology_1/keywords.txt
Vocab size: 5313
Words in train file: 186360
Read 5 topics
naive_bayes	decision_trees	random_forest	
non_negative_matrix_factorization	latent_semantic_analysis	topic_modeling	
self_training	co_training	transfer_learning	
convolutional_neural_networks	recurrent_neural_networks	long_short_term_memory	
bagging	stacking	voting	
Pre-training for 2 epochs, in total 2 + 10 = 12 epochs
Topic mining results written to file text_classification_methodology_1/res_cate.txt
[32m===Iter 1: Ensemble===[m
[32m===Iter 2: PLM Module===[m
[32m===Iter 2: Local Module===[m
make: 'cate' is up to date.
Starting training using file ../text_classification_methodology/text_classification_methodology.txt
Reading

In [94]:
word2emb = {}
with open(f'./SeeTopic/{dir_name}/embedding_{args.model}.txt') as fin:
	for line in fin:
		data = line.strip().split()
		if len(data) != 769:
			continue
		word = data[0]
		emb = np.array([float(x) for x in data[1:]])
		emb = emb / np.linalg.norm(emb)
		word2emb[word] = emb

taxo.word2emb = word2emb

In [95]:
if os.path.exists(os.path.join(f'SeeTopic/{dir_name}', 'static_emb.pk')):
	with open(os.path.join(f'SeeTopic/{dir_name}', 'static_emb.pk'), "rb") as f:
		saved_emb = pk.load(f)
		static_emb = saved_emb["static_emb"]
		token_lens = saved_emb["token_lens"]
		tokenized_sents = saved_emb["tokenized_sents"]
		tokenized_docs = saved_emb["tokenized_docs"]

	for p_id, paper in enumerate(taxo.collection):
		paper.sentences = tokenized_docs[p_id]
		paper.tokenized = tokenized_sents[p_id]

In [96]:
taxo.static_emb = static_emb
taxo.root.children

[supervised_learning,
 unsupervised_learning,
 semi_supervised_learning,
 deep_learning,
 ensemble_methods]

In [99]:
with open(f"./SeeTopic/{dir_name}/keywords_seetopic.txt", "r") as f:
    children_phrases = [i.strip().split(":")[1].split(",") for i in f.readlines()]
    filtered_children_phrases = []
    for c_id, c in enumerate(taxo.root.children):
        # other parents
        other_parents = "\n".join([f"Sibling topic: {i.label}; Description: {i.desc}" for i in taxo.root.children if i != c])
        other_terms = [p for child in children_phrases[:c_id] + children_phrases[c_id+1:] for p in child]
        # filter the child phrases
        child_phrases = filter_phrases(c, children_phrases[c_id], word2emb, other_parents, other_terms)
        # child_phrases = filter_phrases(c, children_phrases[c_id], other_parents=other_parents)
        filtered_children_phrases.append(child_phrases)

---
supervised_learning_filtering_explanation:'supervised_learning' is redundant and 'naive_bayes', 'decision_trees', 'random_forest','support_vector_machines', 'logistic_regression', 'k_nearest_neighbors', 'gradient_boosting', 'neural_networks', 'feature_selection', 'feature_engineering', 'data_augmentation', 'cross_validation', 'bagging','stacking', 'voting', 'feature_space', 'capsule_network', 'projection_method', 'knn', 'feature_weighting', 'dimensionality_reduction', 'tc', 'text_categorization', 'feature_representations', 'weights','snad', 'alleviate','multi-grained','statistical_methods', 'ontology-based', 'tackling', 'higher_education', 'epat-bert', 'document-level', 'aspect-level_sentiment_classification', 'feature_mapping', 'climate_change_denial', 'digitalization', 'classification_algorithms', 'commentary', 'feature_vectors','specialized', 'parameterization', 'arises', 'tried', 'domain_knowledge', 'feature_weights', 'conceptum','mlm', 'feature_extraction', 'external','maintai

In [100]:
for c_id, c in enumerate(taxo.root.children):
    c.addTerms(filtered_children_phrases[c_id], mined=True, addToParent=True)

**Get initial, exact-matching pool of papers**

In [101]:
for c in taxo.root.children:
    print(c.label, len(c.papers))

supervised_learning 58
unsupervised_learning 132
semi_supervised_learning 28
deep_learning 111
ensemble_methods 29


**Node-Oriented Sentence Representations**

In [102]:
class_phrase_reprs = taxo.getClassReprs(taxo.root.children, phrase=True)

In [103]:
for c in tqdm(taxo.root.children):
    c.rankPapers(class_phrase_reprs)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:29<00:00,  5.99s/it]


In [104]:
class_reprs = taxo.getClassReprs(taxo.root.children, phrase=False)

In [105]:
paper_reprs = []
for p in taxo.root.papers:
    if p.emb is None:
        paper_reprs.append(p.computePaperEmb(class_reprs, phrase=False))
    else:
        paper_reprs.append(p.emb)

In [106]:
class_labels, mapping = taxo.mapPapers(paper_reprs, taxo.root.children, class_reprs)

In [122]:
for c in taxo.root.children:
    c.density = len(c.papers)/(len(c.parent.papers)/(1 + len(c.parent.children)))

In [123]:
print(f"unmapped: {len(mapping[-1])}, {len(mapping[-1])/(len(taxo.root.papers)/(1 + len(taxo.root.children)))}")
for k in taxo.root.children:
    print(f"{k.label}: {len(k.papers)}, {k.density}")

unmapped: 216, 1.296
supervised_learning: 193, 1.1580000000000001
unsupervised_learning: 201, 1.206
semi_supervised_learning: 468, 2.8080000000000003
deep_learning: 212, 1.272
ensemble_methods: 225, 1.35


**Depth Expansion**

In [125]:
curr_node = taxo.root
global_taxo = taxo.toDict()

for c in curr_node.children:
    if c.density >= 1:
        c.genCommonSenseChildren(global_taxo, k=2, num_terms=20)
    



In [133]:
", ".join([l.label for l in taxo.root.children[4].children])

'bagging, boosting, stacking, voting, hybrid'

In [None]:
# format the input keywords file for seetopic -> get phrases -> filter using LLM
dir_name = (args.track + "_" + args.dim).lower().replace(" ", "_")

if not os.path.exists(f"SeeTopic/{dir_name}"):
    os.makedirs(f"SeeTopic/{dir_name}")

with open(f"SeeTopic/{dir_name}/{dir_name}.txt", "w") as f:
    for p in curr_node.papers:
        f.write(f"{p.text}\n")


## get first level of children
children_with_terms = curr_node.getChildren(terms=True)
with open(f"SeeTopic/{dir_name}/keywords_0.txt", "w") as f:
    for idx, c in enumerate(children_with_terms):
        str_c = ",".join(c[1])
        f.write(f"{idx}:{c[0]},{str_c}\n")

In [56]:
len(taxo.root.papers)

308

In [135]:
root_repr = taxo.root.updateNodeEmb(phrase=True)

In [51]:
for k, v in mapping.items():
    if k == -1:
        print(f"unmapped: {len(v)}")
    else:
        print(f"{taxo.root.children[k]}: {len(v)}")

unmapped: 185
supervised_learning: 659
unsupervised_learning: 26
semi_supervised_learning: 86
deep_learning: 308
ensemble_methods: 73


In [134]:
potential_siblings = taxo.siblingExpansion(taxo.root, mapping)

ValueError: shapes (5215,768) and (1,) not aligned: 768 (dim 1) != 1 (dim 0)

In [37]:
potential_siblings

{'learning': (0, 365845168),
 'phrase': (1, 1024959936),
 'graph': (2, 1615149900),
 'deep_learning': (3, 1698770700),
 'models': (4, 2831294100),
 'fusion': (5, 3547610640),
 'fuzzy': (6, 3946380102),
 'model': (7, 6582211200),
 'machine': (8, 6842318000),
 'academy': (9, 8327732224),
 'linear': (10, 10512634170),
 'optimization': (11, 10868002320),
 'hierarchical': (12, 11764767168),
 'neural': (13, 12319519744),
 'supervised_learning': (14, 12370806720),
 'algorithms': (15, 12546031680),
 'sampling': (16, 13347406368),
 'deep_belief_network': (17, 14062305852),
 'network': (18, 14796022164),
 'ctas': (19, 16306138212),
 'pivotal': (20, 17494725600),
 'algorithm': (21, 18272217600),
 'deep': (22, 18588007440),
 'technique': (23, 19392854737),
 'classification': (24, 19428041880),
 'architecture': (25, 20834904200),
 'ensemble_learning': (26, 22131726120),
 'supervised_machine_learning': (27, 23189579800),
 'machine_learning': (28, 23568924464),
 'posing': (29, 26935359616),
 'supervi

In [45]:
vocab_list = list(potential_siblings.keys())
phrase_reprs = [taxo.static_emb[w] for w in vocab_list]

In [46]:
distance = pairwise_distances(phrase_reprs, metric='cosine')
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='precomputed')
db = clusterer.fit(distance.astype('float64'))

In [47]:
clusters = {l:[] for l in set(db.labels_)}
for idx, l in enumerate(db.labels_):
    clusters[l].append(vocab_list[idx])

In [48]:
clusters

{0: ['extractive_summarization',
  'summarization_techniques',
  'summarize',
  'summarisation',
  'summaries',
  'summarization'],
 1: ['86.93', '96.54', '97.2', '82.67', '93.3', '60.3'],
 2: ['0.59',
  '0.530',
  '0.97',
  '0.689',
  '0.8638',
  '0.9648',
  '0.635',
  '0.99',
  '0.81',
  '0.80',
  '0.90',
  '0.75'],
 3: ['assimilation-individualization',
  'objectivity-abstraction',
  'determination-indetermination',
  'nomination-categorization',
  'nomination-identification'],
 4: ['1988',
  '1992',
  '2011',
  '©2004',
  '2012',
  '2003',
  '2006',
  '2009',
  '1995',
  '2014',
  '1980',
  '2007',
  '2013',
  '2010',
  '©2010',
  '©_springer-verlag',
  '©_2011_springer-verlag',
  '2005',
  '2008'],
 5: ['posing',
  'mary',
  'petersburg',
  'highlighting',
  'attracts',
  'bringing',
  'succeeding',
  'yielding',
  'whereas',
  'listings',
  'suggesting'],
 6: ['manual_annotation',
  'annotate',
  'annotators',
  'annotator',
  'annotated_data',
  'annotations',
  'annotation',
  

In [35]:
cos_sim = cosine_similarity_embeddings(paper_reprs, class_reprs)

In [37]:
cos_sim[10]

array([0.78230924, 0.7538578 , 0.77479338, 0.80200045, 0.75536154])

In [38]:
[cos_sim[c.papers[-1].id, c_id] for c_id, c in enumerate(taxo.root.children)]

[0.8891159917311015,
 0.9054942803449103,
 0.8125112536457095,
 0.8037140825601731,
 0.7696589366397533]

In [41]:
temp_d = {1:3, 4:5}

In [43]:
[i for i in temp_d]

[1, 4]

In [39]:
id = 3
mapping[-1][id], taxo.collection[mapping[-1][id]]

(12,
 title : a comparison of classification methods for predicting deception in computer-mediated_communication ; abstract : the increased chance of deception in computer-mediated_communication and the potential risk of taking action based on deceptive information calls for automatic detection of deception . to achieve the ultimate_goal of automatic prediction of deception , we selected four common classification methods and empirically compared their performance in predicting deception . the deception and truth data were collected during two experimental studies . the results suggest that all of the four methods were promising for predicting deception with cues to deception . among them , neural_networks exhibited consistent performance and were robust across test settings . the comparisons also highlighted the importance of selecting important input variables and removing noise in an attempt to enhance the performance of classification methods . the selected cues offer both methodol

In [276]:
all_mapped = []
unmapped = []
class_map = {i:[] for i in np.arange(len(taxo.root.children))}

for p_id, l in enumerate(class_labels):
    if len(l) <4:
        all_mapped.append(p_id)
        for c in l:
            class_map[c].append(p_id)
    else:
        unmapped.append(p_id)

class_map = {i:sorted(class_map[i], key=lambda x: -cos_sim[x, i]) for i in np.arange(len(taxo.root.children))}

In [278]:
for c_id, c in enumerate(taxo.root.children):
    print(c.label, len(class_map[c_id]))

supervised_learning 445
unsupervised_learning 463
semi_supervised_learning 98
deep_learning 104
ensemble_methods 262


In [233]:
# bottom_classes = np.argmax(np.diff(np.sort(cos_sim, axis=1), axis=1), axis=1) + 1

# classes = np.argsort(cos_sim, axis=1)
# class_labels = [classes[p_id][b:] for p_id, b in enumerate(bottom_classes)] 

In [99]:
taxo.root.children

[supervised_learning,
 unsupervised_learning,
 semi_supervised_learning,
 deep_learning,
 ensemble_methods]

In [98]:
total = 0
for idx, l in enumerate(class_labels):
    if len(l) < 4:
        print(idx, l)
        total += 1
print(total)

1 [0 2]
20 [0 2]
30 [0 2]
32 [1 0 2]
41 [1 0 2]
49 [2]
61 [3 0 2]
63 [2]
69 [2 0]
74 [2]
149 [1 0 2]
152 [2]
153 [0 1 2]
155 [3 0 2]
159 [0]
215 [0]
232 [1 0 2]
235 [0 2]
261 [1 0 2]
291 [1 0 2]
292 [1 2 0]
293 [0 2]
326 [1 0 2]
341 [2]
367 [1 0 2]
380 [0 2]
391 [2]
404 [1 0 2]
412 [0 2]
413 [0]
416 [0 1 2]
419 [1 0 2]
428 [0 1 2]
430 [1 2 0]
449 [1 0 2]
451 [0 2]
471 [0]
475 [1 0 2]
482 [1 0 2]
500 [2]
508 [0]
510 [0 2]
528 [2 0]
536 [0]
586 [2]
597 [2]
600 [2 0]
616 [0 2]
663 [0 1 2]
670 [0 2]
702 [0 2 1]
703 [2]
736 [0 2]
758 [1 0 2]
759 [0 2]
766 [1 0 2]
777 [0 2]
794 [3 0 2]
798 [1 0 2]
803 [0 2]
811 [2 0]
849 [2 0]
870 [2]
928 [2 0]
948 [0 1 2]
950 [0 2]
954 [1 0 2]
957 [0 2]
960 [0 1 2]
967 [2 0]
974 [2 0]
978 [0 2]
979 [0 1 2]
73


In [96]:
cos_sim

array([[0.41849622, 0.41452385, 0.4203428 , 0.39699356, 0.4142338 ],
       [0.45134438, 0.44479938, 0.45326388, 0.44773577, 0.44168179],
       [0.367142  , 0.36561152, 0.37268315, 0.34918534, 0.35883899],
       ...,
       [0.39107231, 0.38916193, 0.39409682, 0.37851251, 0.38478113],
       [0.35289365, 0.35163603, 0.35420271, 0.32782443, 0.3481511 ],
       [0.47117052, 0.46534621, 0.46529638, 0.44525649, 0.46789527]])

In [19]:
[len([w for w in c.all_node_terms if w in static_emb]) for c in taxo.root.children]

[28, 20, 11, 97, 16]

In [17]:
[len(c.all_node_terms) for c in taxo.root.children]

[29, 33, 24, 105, 23]

In [17]:
class_reprs = [average_with_harmonic_series(np.concatenate([static_emb[w].reshape((1,-1)) for w in c.all_node_terms if w in static_emb], axis=0)) for c in taxo.root.children]

In [24]:
iv_terms = [w for w in taxo.collection[0].vocabulary if w in word2emb]
ranked_tok = rank_by_significance(np.concatenate([word2emb[w].reshape((-1, 768)) for w in iv_terms], axis=0), class_reprs)

In [26]:
ranked_tok

{92: 0,
 9: 1,
 70: 2,
 87: 3,
 65: 4,
 17: 5,
 50: 6,
 60: 7,
 32: 8,
 44: 9,
 84: 10,
 105: 11,
 2: 12,
 39: 13,
 13: 14,
 7: 15,
 41: 16,
 28: 17,
 59: 18,
 57: 19,
 66: 20,
 71: 21,
 11: 22,
 85: 23,
 67: 24,
 94: 25,
 93: 26,
 102: 27,
 78: 28,
 99: 29,
 89: 30,
 8: 31,
 61: 32,
 74: 33,
 101: 34,
 23: 35,
 52: 36,
 77: 37,
 72: 38,
 18: 39,
 33: 40,
 62: 41,
 42: 42,
 98: 43,
 3: 44,
 37: 45,
 43: 46,
 6: 47,
 88: 48,
 45: 49,
 53: 50,
 0: 51,
 104: 52,
 25: 53,
 81: 54,
 76: 55,
 1: 56,
 91: 57,
 79: 58,
 54: 59,
 86: 60,
 51: 61,
 15: 62,
 22: 63,
 97: 64,
 100: 65,
 14: 66,
 10: 67,
 35: 68,
 69: 69,
 21: 70,
 36: 71,
 55: 72,
 64: 73,
 106: 74,
 95: 75,
 48: 76,
 12: 77,
 80: 78,
 96: 79,
 19: 80,
 24: 81,
 73: 82,
 30: 83,
 107: 84,
 4: 85,
 46: 86,
 108: 87,
 83: 88,
 38: 89,
 26: 90,
 47: 91,
 16: 92,
 58: 93,
 5: 94,
 82: 95,
 56: 96,
 31: 97,
 68: 98,
 34: 99,
 63: 100,
 75: 101,
 49: 102,
 40: 103,
 103: 104,
 20: 105,
 27: 106,
 90: 107,
 29: 108}

In [120]:
cosine_similarity_embeddings([static_emb["deep_learning"].numpy()], class_reprs)

array([[0.95788369, 0.93276707, 0.95489257, 0.94776222, 0.93760071]])

In [23]:
for idx, rank in ranked_tok.items():
    print(f"rank: {rank}; token: {iv_terms[idx]}")

rank: 0; token: semi-supervised_learning
rank: 1; token: multi-task_learning
rank: 2; token: transfer_learning
rank: 3; token: multi-class
rank: 4; token: binary_classification
rank: 5; token: supervised_learning
rank: 6; token: prediction_accuracy
rank: 7; token: hierarchies
rank: 8; token: generalization
rank: 9; token: outperforms
rank: 10; token: classification_task
rank: 11; token: nearest_neighbor
rank: 12; token: mtl
rank: 13; token: classifying
rank: 14; token: training_examples
rank: 15; token: datasets
rank: 16; token: jointly
rank: 17; token: task
rank: 18; token: concept
rank: 19; token: approach
rank: 20; token: classification
rank: 21; token: tasks
rank: 22; token: abstract
rank: 23; token: also
rank: 24; token: problem
rank: 25; token: methods
rank: 26; token: using
rank: 27; token: developed
rank: 28; token: approaches
rank: 29; token: namely
rank: 30; token: especially
rank: 31; token: independently
rank: 32; token: classes
rank: 33; token: work
rank: 34; token: improv

In [25]:
for idx, rank in ranked_tok.items():
    print(f"rank: {rank}; token: {iv_terms[idx]}")

rank: 0; token: semi-supervised_learning
rank: 1; token: multi-task_learning
rank: 2; token: binary_classification
rank: 3; token: transfer_learning
rank: 4; token: multi-class
rank: 5; token: supervised_learning
rank: 6; token: prediction_accuracy
rank: 7; token: hierarchies
rank: 8; token: generalization
rank: 9; token: outperforms
rank: 10; token: nearest_neighbor
rank: 11; token: classification_task
rank: 12; token: classifying
rank: 13; token: training_examples
rank: 14; token: mtl
rank: 15; token: datasets
rank: 16; token: task
rank: 17; token: jointly
rank: 18; token: concept
rank: 19; token: approach
rank: 20; token: classification
rank: 21; token: tasks
rank: 22; token: abstract
rank: 23; token: also
rank: 24; token: problem
rank: 25; token: our
rank: 26; token: approaches
rank: 27; token: especially
rank: 28; token: independently
rank: 29; token: developed
rank: 30; token: methods
rank: 31; token: using
rank: 32; token: namely
rank: 33; token: classes
rank: 34; token: improve

In [92]:
# class representations
class_reprs = [c.emb for c in taxo.root.children]
taxo.root.children

[supervised_learning_methods,
 unsupervised_learning_methods,
 deep_learning_methods,
 ensemble_methods,
 transfer_learning_and_domain_adaptation]

In [91]:
cosine_similarity_embeddings([sentence_model.encode(taxo.collection[7].title + 
                                                    "[SEP]" + 
                                                    taxo.collection[7].abstract)], 
                             class_reprs)

array([[0.78571259, 0.76266018, 0.8442609 , 0.87398888, 0.87627614]])

In [None]:
array([[0.85956018, 0.90474514, 0.90334376, 0.88769259, 0.8412127 ]])


In [59]:
classes = [[] for i in taxo.root.children]
unmapped = []

for p in range(len(collection)):
    class_freq = [0] * len(taxo.root.children)

    for c_id, c in enumerate(taxo.root.children):
        # how many total mentions of the node terms
        class_freq[c_id] = np.sum([collection[p].vocabulary[ele] for ele in c.all_node_terms if ele in collection[p].vocabulary.keys()])
    
    nonzero_idx = np.nonzero(class_freq)[0]
    if len(nonzero_idx) == 0:
        unmapped.append(p)
        continue

    for i in nonzero_idx:
        # score: class_i_mentions / log(total_len)
        score = class_freq[i] / np.log(collection[p].length)
        classes[i].append((score, p))

classes = [sorted(c, reverse=True) for c in classes]

In [60]:
len(unmapped)

303

In [55]:
taxo.root.children[-1].all_node_terms

['bagging',
 'boosting',
 'stacking',
 'voting',
 'weighted_voting',
 'random_forest',
 'gradient_boosting',
 'neural_network_ensemble',
 'decision_tree_ensemble',
 'support_vector_machine_ensemble',
 'k_nearest_neighbors_ensemble',
 'feature_bagging',
 'feature_boosting',
 'model_selection',
 'hyperparameter_tuning',
 'cross_validation',
 'ensemble_methods',
 'random_forests',
 'base_learners',
 'ensemble_learning',
 'feature_combination',
 'ensemble_techniques',
 'cluster_based',
 'rbf',
 'mnb',
 'dt',
 'radial_basis_function',
 'base_classifiers',
 'gaussian_naive_bayes',
 'multilayer_perceptron',
 'c4.5',
 'adaboost',
 'attention_layer',
 'feed-forward',
 'thresholding',
 'multinomial_logistic_regression',
 'ensemble_classifier',
 'memory-based',
 'k-nearest_neighbor',
 'nearest_neighbor',
 'principal_component_analysis']

In [67]:
for idx, i in enumerate(classes):
    print(taxo.root.children[idx])
    for p in i[:10]:
        print(p[-1], collection[p[-1]].title)
    print("\n")

supervised_learning
15 arabic_text_categorization via binary particle_swarm_optimization and support_vector_machines ; abstract : document_categorization concerns automatically assigning a category label to a text document , and has increasingly many applications , particularly in the domains of organizing , browsing and search in large_document_collections . it is typically achieved via machine_learning , where a model is built on the basis of a ( typically ) large collection of document features . feature_selection is critical in this process , since there are typically several thousand potential features ( distinct words or terms ) . here we explore binary particle_swarm_optimization ( bpso ) hybridized with either k-nearest-neighbour ( knn ) or a support_vector_machine ( svm ) , for feature_selection in arabic document_categorization tasks . comparison between feature_selection methods is done on the basis of using the selected features , in conjunction with each of svm , c4.5 and 

In [None]:
with open("datasets/updated_qa_phrases.json", "w", encoding='utf-8') as f:
    json_out = {}
    json_out[taxo.root.label] = {"description": taxo.root.desc, "seeds": taxo.root.seeds, "terms": taxo.root.all_node_terms}
    for c in taxo.root.children:
        json_out[c.label] = {"description": c.desc, "seeds": c.seeds, "terms": c.all_node_terms}
    json.dump(json_out, f, ensure_ascii=False, indent=4)