In [1]:
%load_ext autoreload
%autoreload 2
import os
# Set before the `taxonomy` import below: this cell's output shows checkpoint
# shards being loaded during the imports, so presumably the variable must
# already be in place for the model to see only these devices — TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="1,2,3,4"
from taxonomy import Taxonomy, Paper
from utils import filter_phrases
import subprocess
import shutil
import re

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
class Args:
    """Run configuration for the taxonomy-construction notebook.

    All parameters are optional; calling ``Args()`` with no arguments yields
    the same configuration the notebook has always used.

    Attributes:
        track: Name of the research track, passed to ``Taxonomy`` and used to
            derive the SeeTopic working-directory name.
        dim: Dimension of the track, passed to ``Taxonomy`` and used to derive
            the SeeTopic working-directory name.
        input_file: Path to the text file of papers that is read line by line.
        iters: Iteration count handed to ``seetopic.sh`` as a string.
    """

    def __init__(
        self,
        track="Text Classification",
        dim="Methodology",
        input_file="datasets/sample_1k.txt",
        iters=4,
    ):
        self.track = track
        self.dim = dim
        self.input_file = input_file
        self.iters = iters

# Default configuration used by the cells below.
args = Args()

**Reading in Papers**

In [3]:
# Load the corpus: one paper per line of args.input_file.
# The file is read in full and closed before any parsing happens.
with open(args.input_file, "r") as f:
    papers = f.read().strip().splitlines()

# Build one Paper per line, numbered from 0 in file order.
# enumerate() replaces the hand-maintained counter that was named `id` and
# shadowed the builtin for the rest of the kernel session.
collection = []
for paper_id, p in enumerate(papers):
    # Title is whatever follows "title:" up to the last " ; " on the line
    # (the `.*` is greedy); abstract is whatever follows "abstract:".
    # NOTE(review): re.findall returns a list, so Paper receives lists of
    # matches (empty when the pattern is absent), not plain strings —
    # confirm that is what Paper expects.
    title = re.findall(r'title\s*:\s*(.*) ; ', p, re.IGNORECASE)
    abstract = re.findall(r'abstract\s*:\s*(.*)', p, re.IGNORECASE)
    collection.append(Paper(paper_id, title, abstract))


**Base Taxonomy Construction**

In [4]:
# input: track, dimension -> get base taxonomy (2 levels) -> Class Tree, Class Node (description, seed words)
# NOTE(review): the line above says "2 levels" but the call below passes
# levels=1 — confirm which is intended.

# Create the taxonomy object for the configured track and dimension.
taxo = Taxonomy(args.track, args.dim)
# Build the base taxonomy with levels=1 and num_terms=20.
# Presumably num_terms is a per-node term budget — TODO confirm against Taxonomy.
base_taxo = taxo.buildBaseTaxo(levels=1, num_terms=20)

# Print the returned structure for a quick visual check.
print(base_taxo)



{'Types of Methodology Proposed in Text Classification Research Papers': {'description': None, 'seeds': None, 'terms': ['naive_bayes', 'decision_trees', 'random_forest', 'support_vector_machines', 'logistic_regression', 'k_nearest_neighbors', 'gradient_boosting', 'neural_networks', 'feature_selection', 'feature_engineering', 'data_augmentation', 'cross_validation', 'hyperparameter_tuning', 'model_selection', 'ensemble_methods', 'bagging', 'boosting', 'stacking', 'voting', 'weighted_voting', 'kmeans', 'hierarchical_clustering', 'density_based_clustering', 'dbscan', 'apriori', 'association_rule_learning', 'frequent_itemset_mining', 'decision_trees_for_clustering', 'self_organizing_maps', 'competitive_learning', 'non_negative_matrix_factorization', 'latent_semantic_analysis', 'topic_modeling', 'non_negative_factorization', 'matrix_factorization', 'collaborative_filtering', 'content_based_filtering', 'self_training', 'co_training', 'generative_adversarial_networks', 'semi_supervised_neural

In [5]:
# format the input keywords file for seetopic -> get phrases -> filter using LLM
# This cell only prepares SeeTopic's inputs: a working directory, a copy of
# the corpus, and the level-0 keywords file. Later cells run SeeTopic and
# filter the mined phrases.

# Working-directory name derived from track and dimension: lowercased, with
# spaces replaced by underscores.
dir_name = (args.track + "_" + args.dim).lower().replace(" ", "_")
seetopic_dir = f"SeeTopic/{dir_name}"

# exist_ok=True replaces the check-then-create pattern, so the cell is safe to
# re-run and there is no window between the existence check and the creation.
os.makedirs(seetopic_dir, exist_ok=True)

# Copy the corpus next to the keywords file. An existing copy is never
# overwritten, so changing args.input_file later will NOT refresh it.
corpus_path = f"{seetopic_dir}/{dir_name}.txt"
if not os.path.exists(corpus_path):
    shutil.copyfile(args.input_file, corpus_path)

## get first level of children
# Each entry is indexed as c[0] (written first on the line) and c[1] (an
# iterable of strings, comma-joined after it).
children_with_terms = taxo.root.getChildren(terms=True)
with open(f"{seetopic_dir}/keywords_0.txt", "w") as f:
    for idx, c in enumerate(children_with_terms):
        str_c = ",".join(c[1])
        # One line per child: "<idx>:<c[0]>,<term>,<term>,..."
        f.write(f"{idx}:{c[0]},{str_c}\n")

In [6]:
# Bare expression: shows the taxonomy's notebook repr so the tree can be inspected.
taxo

{"Types of Methodology Proposed in Text Classification Research Papers": {"description": null, "seeds": null, "terms": ["naive_bayes", "decision_trees", "random_forest", "support_vector_machines", "logistic_regression", "k_nearest_neighbors", "gradient_boosting", "neural_networks", "feature_selection", "feature_engineering", "data_augmentation", "cross_validation", "hyperparameter_tuning", "model_selection", "ensemble_methods", "bagging", "boosting", "stacking", "voting", "weighted_voting", "kmeans", "hierarchical_clustering", "density_based_clustering", "dbscan", "apriori", "association_rule_learning", "frequent_itemset_mining", "decision_trees_for_clustering", "self_organizing_maps", "competitive_learning", "non_negative_matrix_factorization", "latent_semantic_analysis", "topic_modeling", "non_negative_factorization", "matrix_factorization", "collaborative_filtering", "content_based_filtering", "self_training", "co_training", "generative_adversarial_networks", "semi_supervised_neural

**Phrase Mining for Level 1**

In [7]:
# Run the SeeTopic pipeline script from inside the SeeTopic directory.
# Arguments passed: the working-directory name, the iteration count as a
# string, and the literal "bert_full_ft".
#
# The child's working directory is set with `cwd=` instead of os.chdir() on
# the kernel process. With chdir, a failing script (check_call raises
# CalledProcessError on a non-zero exit) skipped the chdir back, leaving the
# notebook inside SeeTopic/ and breaking every later relative path. On POSIX
# the relative executable path './seetopic.sh' is resolved against `cwd`.
subprocess.check_call(
    ['./seetopic.sh', dir_name, str(args.iters), "bert_full_ft"],
    cwd="./SeeTopic",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32m===Get PLM Embeddings===[m


Some weights of BertModel were not initialized from the model checkpoint at /home/pk36/Comparative-Summarization/bert_full_ft/checkpoint-8346/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5359/5359 [00:43<00:00, 123.17it/s]


[32m===Iter 0: PLM Module===[m
[32m===Iter 1: PLM Module===[m
[32m===Iter 1: Local Module===[m
make: 'cate' is up to date.
Starting training using file ../text_classification_methodology/text_classification_methodology.txt
Reading topics from file text_classification_methodology_1/keywords.txt
Vocab size: 5312
Words in train file: 186462
Read 5 topics
naive_bayes	decision_trees	random_forest	
latent_semantic_analysis	topic_modeling	classic_feature_selection	
transfer_learning	active_learning	supervised_learning	
convolutional_neural_networks	recurrent_neural_networks	attention_mechanism	
bagging	boosting	stacking	
Pre-training for 2 epochs, in total 2 + 10 = 12 epochs
Topic mining results written to file text_classification_methodology_1/res_cate.txt
[32m===Iter 1: Ensemble===[m
[32m===Iter 2: PLM Module===[m
[32m===Iter 2: Local Module===[m
make: 'cate' is up to date.
Starting training using file ../text_classification_methodology/text_classification_methodology.txt
Readin

In [14]:
# Read the phrases SeeTopic produced: one line per topic, with the phrase
# list after the first ":" and phrases separated by ",".
# Presumably the same "<idx>:<label>,<phrase>,..." layout as keywords_0.txt — TODO confirm.
#   - split(":", 1) keeps everything after the FIRST colon, so a phrase that
#     itself contains ":" is no longer truncated.
#   - blank lines (e.g. a trailing newline) are skipped instead of raising
#     IndexError.
with open(f"./SeeTopic/{dir_name}/keywords_seetopic.txt", "r") as f:
    children_phrases = [
        line.strip().split(":", 1)[1].split(",")
        for line in f
        if line.strip()
    ]

# Filtering happens after the file is closed, so the handle is not held open
# across the filter_phrases calls.
# Children are paired with phrase lines by position: the c_id-th child of the
# root gets the c_id-th line of the file.
filtered_children_phrases = []
for c_id, c in enumerate(taxo.root.children):
    # filter the child phrases
    child_phrases = filter_phrases(c, f"{c}: {children_phrases[c_id]}\n")
    filtered_children_phrases.append(child_phrases)

---
supervised_learning: [supervised_learning, naive_bayes, decision_trees, random_forest, support_vector_machines, logistic_regression, k_nearest_neighbors, neural_networks, feature_selection, feature_engineering, data_augmentation, cross_validation, bagging, boosting, stacking, voting, feature_space, knn, document_frequency, weighting, statistical_methods, classification_algorithms, feature_weights, feature_vector, feature_values, membership, category_labels, maximum_entropy, feature_reduction, regularization, determination]
---
---
unsupervised_learning: [unsupervised_learning, latent_semantic_analysis, topic_modeling, string_vectors, similarity_measure, numerical_vectors, self_organizing_map, clustering_algorithm, term_weighting, mutual_information, string_vector, vector_space_model, feature_transformation, vectorization, gibbs_sampling, word_vectors, feature_vectors, lsi, word_vector, som, sentence_similarity, cluster_analysis, vsm, latent_dirichlet_allocation, lsa, text_classific

In [17]:
# Attach each child's filtered phrases to that child; addToParent=True is
# passed through to addTerms unchanged.
# NOTE(review): re-running this cell calls addTerms again with the same
# phrases — confirm addTerms tolerates duplicates.
for c_id, c in enumerate(taxo.root.children):
    phrases_for_child = filtered_children_phrases[c_id]
    c.addTerms(phrases_for_child, addToParent=True)

In [57]:
# Render one "<child>: <all_node_terms>" line per first-level child and
# concatenate them into a single newline-terminated string.
term_lines = []
for c in taxo.root.children:
    term_lines.append(f"{c}: {c.all_node_terms}\n")
phrase_terms = "".join(term_lines)

supervised_learning: ['naive_bayes', 'logistic_regression', 'decision_trees', 'random_forest', 'support_vector_machines', 'k_nearest_neighbors', 'neural_networks', 'gradient_boosting', 'feature_extraction', 'text_features', 'supervised_learning', 'feature_transformation', 'rule_based', 'shot', 'sentence_classification', 'parameterization', 'capsule_network', 'commentary', 'feature_representations', 'opportunities', 'numerical_vectors', 'approaches', 'feature_vectors', '3a', 'phenotype', 'biomedical_abstracts', 'aspect-level_sentiment_classification', 'algerian', 'auto-tagging', 'open_information_extraction', 'maximum_entropy', 'pathways', 'string_vectors', 'tackling', 'bug_triaging', 'network-based', 'statistical_methods', 'online_handwritten', 'feature_vector', 'multi-lingual', 'dialog_act', 'machine_learning', 'multi-modal', 'gene_expression', 'cross-lingual', 'text_mining', 'document-level', 'decision_making', 'classification_algorithms', 'feature_space', 'social_event', 'conceptum'