In [12]:
%load_ext autoreload
%autoreload 2
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1,2,3,4"
from collections import Counter
from taxonomy import Taxonomy, Paper
from utils import filter_phrases, cosine_similarity_embeddings
import subprocess
import shutil
import re
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
class Args:
    """Run configuration for this notebook.

    Every parameter is optional; ``Args()`` with no arguments reproduces the
    configuration that used to be hard-coded in ``__init__``.

    Attributes:
        track: Research track; passed to ``Taxonomy`` and used (with ``dim``)
            to derive the SeeTopic working-directory name.
        dim: Taxonomy dimension; passed to ``Taxonomy`` alongside ``track``.
        input_file: Path of the corpus file; passed to ``Taxonomy`` and copied
            into the SeeTopic working directory.
        iters: Iteration count handed to ``seetopic.sh`` as its second argument.
        model: Model name handed to ``seetopic.sh`` as its third argument; it
            also appears in the embedding file name read later on.
    """

    def __init__(self,
                 track="Text Classification",
                 dim="Methodology",
                 input_file="datasets/sample_1k.txt",
                 iters=4,
                 model="bert_full_ft"):
        self.track = track
        self.dim = dim
        self.input_file = input_file
        self.iters = iters
        self.model = model

    def __repr__(self):
        # Readable echo when the config object is displayed in a cell.
        fields = ", ".join(f"{key}={value!r}" for key, value in vars(self).items())
        return f"Args({fields})"

args = Args()

**Base Taxonomy Construction & Reading in Papers**

In [51]:
# Build the base taxonomy for the configured track and dimension.
# input: track, dimension -> get base taxonomy -> Class Tree, Class Node (description, seed words)
# NOTE(review): this comment used to say "2 levels" while the call below passes
# levels=1 -- presumably that counts the root plus one level of children;
# confirm against Taxonomy.buildBaseTaxo.

taxo = Taxonomy(args.track, args.dim, args.input_file)
# presumably num_terms is the number of terms requested per node -- TODO confirm
base_taxo = taxo.buildBaseTaxo(levels=1, num_terms=20)

print(base_taxo)



{'Types of Methodology Proposed in Text Classification Research Papers': {'description': None, 'seeds': None, 'terms': ['naive_bayes', 'decision_trees', 'random_forest', 'support_vector_machines', 'logistic_regression', 'k_nearest_neighbors', 'gradient_boosting', 'neural_networks', 'feature_selection', 'feature_engineering', 'data_augmentation', 'cross_validation', 'hyperparameter_tuning', 'model_selection', 'ensemble_methods', 'bagging', 'boosting', 'stacking', 'voting', 'weighted_voting', 'kmeans', 'hierarchical_clustering', 'density_based_clustering', 'dbscan', 'apriori', 'association_rule_learning', 'frequent_itemset_mining', 'decision_trees_for_clustering', 'self_organizing_maps', 'competitive_learning', 'non_negative_matrix_factorization', 'latent_semantic_analysis', 'topic_modeling', 'non_negative_factorization', 'matrix_factorization', 'collaborative_filtering', 'content_based_filtering', 'self_training', 'co_training', 'generative_adversarial_networks', 'semi_supervised_neural

In [52]:
# format the input keywords file for seetopic -> get phrases -> filter using LLM
# The working-directory name is derived from the track and dimension
# (lower-cased, spaces replaced by underscores).
dir_name = (args.track + "_" + args.dim).lower().replace(" ", "_")
seetopic_dir = os.path.join("SeeTopic", dir_name)

# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(seetopic_dir, exist_ok=True)

# Copy the corpus only once; an existing copy is left untouched.
corpus_copy = os.path.join(seetopic_dir, f"{dir_name}.txt")
if not os.path.exists(corpus_copy):
    shutil.copyfile(args.input_file, corpus_copy)

## get first level of children
children_with_terms = taxo.root.getChildren(terms=True)
with open(os.path.join(seetopic_dir, "keywords_0.txt"), "w") as f:
    for idx, child in enumerate(children_with_terms):
        # Each entry is indexed as (name, terms); one output line per child:
        # "<idx>:<name>,<term>,<term>,..."
        child_name, child_terms = child[0], child[1]
        joined_terms = ",".join(child_terms)
        f.write(f"{idx}:{child_name},{joined_terms}\n")

In [53]:
# Display the taxonomy object to inspect its current state.
taxo

{"Types of Methodology Proposed in Text Classification Research Papers": {"description": null, "seeds": null, "terms": ["naive_bayes", "decision_trees", "random_forest", "support_vector_machines", "logistic_regression", "k_nearest_neighbors", "gradient_boosting", "neural_networks", "feature_selection", "feature_engineering", "data_augmentation", "cross_validation", "hyperparameter_tuning", "model_selection", "ensemble_methods", "bagging", "boosting", "stacking", "voting", "weighted_voting", "kmeans", "hierarchical_clustering", "density_based_clustering", "dbscan", "apriori", "association_rule_learning", "frequent_itemset_mining", "decision_trees_for_clustering", "self_organizing_maps", "competitive_learning", "non_negative_matrix_factorization", "latent_semantic_analysis", "topic_modeling", "non_negative_factorization", "matrix_factorization", "collaborative_filtering", "content_based_filtering", "self_training", "co_training", "generative_adversarial_networks", "semi_supervised_neural

**Phrase Mining for Level 1**

In [54]:
# Run the SeeTopic pipeline script from inside the SeeTopic directory.
# Passing cwd= confines the directory change to the child process. The earlier
# os.chdir()/os.chdir("../") pair left the kernel stranded inside SeeTopic/
# whenever the script failed, breaking every later relative path.
# On POSIX the relative "./seetopic.sh" is resolved against cwd.
subprocess.check_call(
    ["./seetopic.sh", dir_name, str(args.iters), args.model],
    cwd="./SeeTopic",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32m===Get PLM Embeddings===[m


Some weights of BertModel were not initialized from the model checkpoint at /home/pk36/Comparative-Summarization/bert_full_ft/checkpoint-8346/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5359/5359 [00:45<00:00, 118.00it/s]


[32m===Iter 0: PLM Module===[m
[32m===Iter 1: PLM Module===[m
[32m===Iter 1: Local Module===[m
make: 'cate' is up to date.
Starting training using file ../text_classification_methodology/text_classification_methodology.txt
Reading topics from file text_classification_methodology_1/keywords.txt
Vocab size: 5312
Words in train file: 186462
Read 5 topics
naive_bayes	decision_trees	random_forest	
latent_semantic_analysis	topic_modeling	classic_feature_selection	
transfer_learning	active_learning	supervised_learning	
convolutional_neural_networks	recurrent_neural_networks	attention_mechanism	
bagging	boosting	stacking	
Pre-training for 2 epochs, in total 2 + 10 = 12 epochs
Topic mining results written to file text_classification_methodology_1/res_cate.txt
[32m===Iter 1: Ensemble===[m
[32m===Iter 2: PLM Module===[m
[32m===Iter 2: Local Module===[m
make: 'cate' is up to date.
Starting training using file ../text_classification_methodology/text_classification_methodology.txt
Readin

In [55]:
# Read the mined phrases: one line per child node. Everything before the first
# ":" is discarded and the remainder is a comma-separated phrase list.
with open(f"./SeeTopic/{dir_name}/keywords_seetopic.txt", "r") as f:
    children_phrases = [
        # maxsplit=1 keeps phrases that themselves contain ":" intact;
        # blank lines (e.g. a trailing newline) are skipped instead of
        # raising IndexError.
        line.strip().split(":", 1)[1].split(",")
        for line in f
        if line.strip()
    ]

# Fail before any filtering call is made if the file does not cover every child.
if len(children_phrases) < len(taxo.root.children):
    raise ValueError(
        f"keywords_seetopic.txt has {len(children_phrases)} phrase lines "
        f"but the taxonomy root has {len(taxo.root.children)} children"
    )

# The file is closed by now; filtering does not need to hold it open.
filtered_children_phrases = []
for c_id, c in enumerate(taxo.root.children):
    # filter the child phrases
    child_phrases = filter_phrases(c, f"{c}: {children_phrases[c_id]}\n")
    filtered_children_phrases.append(child_phrases)

In [56]:
# Attach each child's filtered phrases to that child; addToParent=True is
# forwarded to addTerms unchanged.
root_children = taxo.root.children
for position, child_node in enumerate(root_children):
    phrases_for_child = filtered_children_phrases[position]
    child_node.addTerms(phrases_for_child, addToParent=True)

**Get initial, exact-matching pool of papers**

In [58]:
# Display the root node of the taxonomy.
taxo.root

Types of Methodology Proposed in Text Classification Research Papers

In [70]:
# Print the titles of the ten entries with the largest first element among the
# papers attached to the child node at index 4.
ranked_entries = sorted(
    taxo.root.children[4].papers,
    key=lambda entry: entry[0],
    reverse=True,
)
for entry in ranked_entries[:10]:
    print(entry[1].title)

comparative_analysis of binary classifiers on an array of scientific_publications
hybrid supervised clustering based ensemble scheme for text_classification
automatic polarity identification on twitter using machine_learning
application of bagging_ensemble classifier based on genetic_algorithm in the text_classification of railway fault hazards
comparison of machine_learning for sentiment_analysis in detecting anxiety based on social_media data
predicting software defect severity_level using sentence_embedding and ensemble_learning
forestexter : an efficient random_forest algorithm for imbalanced text_categorization
enhanced malay sentiment_analysis with an ensemble classification machine_learning approach
sentiment_analysis of chinese_micro-blog using semantic sentiment space model
an ensemble model for stance_detection in social_media texts


**Node-Oriented Sentence Representations**

In [10]:
# Load the embedding file from the SeeTopic working directory into a
# {term: unit-length vector} lookup.
EMB_DIM = 768  # values per vector; a valid line is "<term> <v_1> ... <v_768>"

word2emb = {}
with open(f'./SeeTopic/{dir_name}/embedding_{args.model}.txt') as fin:
    for line in fin:
        data = line.strip().split()
        # Skip any line that is not exactly one term plus EMB_DIM values.
        if len(data) != EMB_DIM + 1:
            continue
        word = data[0]
        emb = np.array([float(x) for x in data[1:]])
        norm = np.linalg.norm(emb)
        # An all-zero vector cannot be normalised; dividing by 0 would store
        # NaNs in the lookup, so such entries are skipped.
        if norm == 0:
            continue
        word2emb[word] = emb / norm

In [27]:
# class representations
# NOTE(review): class_reprs is initialised here but not populated in any
# visible cell.
class_reprs = []
# Inspect the full term list of the child node at index 1.
taxo.root.children[1].all_node_terms

['kmeans',
 'hierarchical_clustering',
 'density_based_clustering',
 'dbscan',
 'apriori',
 'association_rule_learning',
 'frequent_itemset_mining',
 'decision_trees_for_clustering',
 'self_organizing_maps',
 'competitive_learning',
 'non_negative_matrix_factorization',
 'latent_semantic_analysis',
 'topic_modeling',
 'non_negative_factorization',
 'matrix_factorization',
 'collaborative_filtering',
 'content_based_filtering',
 'unsupervised_learning',
 'string_vectors',
 'similarity_measure',
 'numerical_vectors',
 'self_organizing_map',
 'clustering_algorithm',
 'mutual_information',
 'vector_space_model',
 'text_representation',
 'word_vectors',
 'term_frequency',
 'feature_vectors',
 'word_vector',
 'som',
 'text_generation',
 'vector_space',
 'relation_extraction',
 'clustering',
 'vsm']

In [26]:
# Spot check: similarity between the embeddings of two terms from the same
# node's term list. Each vector is wrapped in a one-element list.
cosine_similarity_embeddings([word2emb["kmeans"]], 
                             [word2emb["hierarchical_clustering"]])

array([[0.66561028]])

In [59]:
# Map every paper to each class whose terms it mentions, with a score of
# (mentions of that class's terms) / log(paper length).
# NOTE(review): `collection` is not defined in any visible cell -- presumably
# the papers loaded for the taxonomy; define it explicitly above this cell so
# the notebook survives Restart & Run All.
classes = [[] for _ in taxo.root.children]
unmapped = []

for p in range(len(collection)):
    paper = collection[p]
    vocab = paper.vocabulary

    # how many total mentions of the node terms, one count per class
    class_freq = [
        sum(vocab[term] for term in c.all_node_terms if term in vocab)
        for c in taxo.root.children
    ]

    nonzero_idx = np.nonzero(class_freq)[0]
    if len(nonzero_idx) == 0:
        # No class term occurs in this paper at all.
        unmapped.append(p)
        continue

    # Clamp the length at 2: log(1) == 0 would produce an infinite score and
    # log(0) == -inf a signed-zero one, either of which corrupts the ranking.
    log_len = np.log(max(paper.length, 2))

    for i in nonzero_idx:
        # score: class_i_mentions / log(total_len)
        score = class_freq[i] / log_len
        classes[i].append((score, p))

# Highest score first within each class (ties broken by larger paper index).
classes = [sorted(c, reverse=True) for c in classes]

In [60]:
# Number of papers that matched no term of any class.
len(unmapped)

303

In [55]:
# Inspect the full term list of the last child node.
taxo.root.children[-1].all_node_terms

['bagging',
 'boosting',
 'stacking',
 'voting',
 'weighted_voting',
 'random_forest',
 'gradient_boosting',
 'neural_network_ensemble',
 'decision_tree_ensemble',
 'support_vector_machine_ensemble',
 'k_nearest_neighbors_ensemble',
 'feature_bagging',
 'feature_boosting',
 'model_selection',
 'hyperparameter_tuning',
 'cross_validation',
 'ensemble_methods',
 'random_forests',
 'base_learners',
 'ensemble_learning',
 'feature_combination',
 'ensemble_techniques',
 'cluster_based',
 'rbf',
 'mnb',
 'dt',
 'radial_basis_function',
 'base_classifiers',
 'gaussian_naive_bayes',
 'multilayer_perceptron',
 'c4.5',
 'adaboost',
 'attention_layer',
 'feed-forward',
 'thresholding',
 'multinomial_logistic_regression',
 'ensemble_classifier',
 'memory-based',
 'k-nearest_neighbor',
 'nearest_neighbor',
 'principal_component_analysis']

In [67]:
# For every class: print its node, then the index and title of its first ten
# ranked papers, then a blank separator.
for class_idx, ranked in enumerate(classes):
    print(taxo.root.children[class_idx])
    top_ten = ranked[:10]
    for scored in top_ten:
        paper_idx = scored[-1]
        print(paper_idx, collection[paper_idx].title)
    print("\n")

supervised_learning
15 arabic_text_categorization via binary particle_swarm_optimization and support_vector_machines ; abstract : document_categorization concerns automatically assigning a category label to a text document , and has increasingly many applications , particularly in the domains of organizing , browsing and search in large_document_collections . it is typically achieved via machine_learning , where a model is built on the basis of a ( typically ) large collection of document features . feature_selection is critical in this process , since there are typically several thousand potential features ( distinct words or terms ) . here we explore binary particle_swarm_optimization ( bpso ) hybridized with either k-nearest-neighbour ( knn ) or a support_vector_machine ( svm ) , for feature_selection in arabic document_categorization tasks . comparison between feature_selection methods is done on the basis of using the selected features , in conjunction with each of svm , c4.5 and 