**Setup**

In [1]:
%load_ext autoreload
%autoreload 2
import os
os.environ['HF_HOME'] = '/shared/data3/pk36/.cache'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
!export HF_HOME=/shared/data3/pk36/.cache

In [3]:
from model_definitions import initializeLLM, promptLLM, constructPrompt
import json
from utils import clean_json_string
from collections import deque
from taxonomy import Node
import re

In [4]:
class Args:
    def __init__(self):
        
        self.topic = "natural language processing"
        self.dimensions = ["tasks", "datasets", "methodologies", "evaluation_methods", "real_world_domains"]
        self.llm = 'vllm'
        self.init_levels = 2

        self.dataset = "Reasoning"
        self.data_dir = f"datasets/multi_dim/{self.dataset.lower().replace(' ', '_')}/"
        self.internal = f"{self.dataset}.txt"
        self.external = f"{self.dataset}_external.txt"
        self.groundtruth = "groundtruth.txt"
        
        self.length = 512
        self.dim = 768

        self.iters = 4

args = Args()

In [5]:
args = initializeLLM(args)

INFO 11-25 07:32:56 config.py:729] Defaulting to use mp for distributed inference
INFO 11-25 07:32:56 config.py:820] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 11-25 07:32:56 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/Meta-

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=3229229)[0;0m INFO 11-25 07:33:02 model_runner.py:732] Loading model weights took 7.5122 GB
INFO 11-25 07:33:02 model_runner.py:732] Loading model weights took 7.5122 GB
INFO 11-25 07:33:03 distributed_gpu_executor.py:56] # GPU blocks: 15716, # CPU blocks: 4096
INFO 11-25 07:33:05 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-25 07:33:05 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
[1;36m(VllmWorkerProcess pid=3229229)[0;0m INFO 11-25 07:33:05 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not sta



**Construct a 2-Level Multi-Dimensional Taxonomy**

In [6]:
from prompts import multi_dim_prompt, NodeListSchema

In [11]:
# we want to make this a directed acyclic graph (DAG) so maintain a list of the nodes
roots = {}
id2node = {}
label2node = {}
idx = 0

for dim in args.dimensions:
    mod_topic = args.topic.replace(' ', '_').lower() + f"_{dim}"
    root = Node(
            id=idx,
            label=mod_topic,
            dimension=dim
        )
    roots[dim] = root
    id2node[idx] = root
    label2node[mod_topic] = root
    idx += 1

In [12]:
queue = deque([node for id, node in id2node.items()])

# if taking long, you can probably parallelize this between the different taxonomies (expand by level)
while queue:
    curr_node = queue.popleft()
    label = curr_node.label
    # expand
    system_instruction, main_prompt, json_output_format = multi_dim_prompt(curr_node)
    prompts = [constructPrompt(args, system_instruction, main_prompt + "\n\n" + json_output_format)]
    outputs = promptLLM(args=args, prompts=prompts, schema=NodeListSchema, max_new_tokens=3000, json_mode=True, temperature=0.1, top_p=0.99)[0]
    outputs = json.loads(clean_json_string(outputs)) if "```" in outputs else json.loads(outputs.strip())
    outputs = outputs['root_topic']

    # add all children
    for key, value in outputs.items():
        key = key.replace(' ', '_').lower()
        if (key not in label2node) or ((key in label2node) and (label2node[key].dimension != curr_node.dimension)):
            child_node = Node(
                    id=len(id2node),
                    label=key,
                    dimension=curr_node.dimension,
                    description=value['description'],
                    parents=[curr_node]
                )
            curr_node.add_child(key, child_node)
            id2node[child_node.id] = child_node
            label2node[key] = child_node
            if child_node.level < args.init_levels:
                queue.append(child_node)
        
        else:
            child_node = label2node[key]
            child_node.add_parent(curr_node)

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.63s/it, est. speed input: 80.14 toks/s, output: 64.57 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.61s/it, est. speed input: 80.81 toks/s, output: 65.49 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it, est. speed input: 75.79 toks/s, output: 64.86 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it, est. speed input: 74.81 toks/s, output: 64.77 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.40s/it, est. speed input: 107.96 toks/s, output: 64.61 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it, est. speed input: 74.01 toks/s, output: 64.67 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it, est. speed input: 81.27 toks/s, output: 64.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.09s/it, est. speed input: 73.88 toks/s, output: 64.80 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<

In [13]:
roots

{'tasks': Node(label=natural_language_processing_tasks, dim=tasks, description=None, level=0),
 'datasets': Node(label=natural_language_processing_datasets, dim=datasets, description=None, level=0),
 'methodologies': Node(label=natural_language_processing_methodologies, dim=methodologies, description=None, level=0),
 'evaluation_methods': Node(label=natural_language_processing_evaluation_methods, dim=evaluation_methods, description=None, level=0),
 'real_world_domains': Node(label=natural_language_processing_real_world_domains, dim=real_world_domains, description=None, level=0)}

In [None]:
roots['evaluation_methods'].display(0, indent_multiplier=5)

Label: natural_language_processing_methodologies
Dimension: methodologies
Description: None
Level: 0
----------------------------------------
Children:
     Label: supervised_learning
     Dimension: methodologies
     Description: Supervised learning is a type of machine learning where the algorithm is trained on labeled data to learn the mapping between input and output.
     Level: 1
     ----------------------------------------
     Children:
          Label: regression
          Dimension: methodologies
          Description: A type of supervised learning where the goal is to predict a continuous output value.
          Level: 2
          ----------------------------------------
          Label: classification
          Dimension: methodologies
          Description: A type of supervised learning where the goal is to predict a categorical output value.
          Level: 2
          ----------------------------------------
          Label: clustering
          Dimension: methodologi

**Read in dataset**

In [15]:
from datasets import load_dataset
from tqdm import tqdm
from paper import Paper

In [16]:
if not os.path.exists(args.data_dir):
    os.makedirs(args.data_dir)

In [17]:
ds = load_dataset("EMNLP/EMNLP2024-papers")

In [18]:
internal_collection = {}

with open(os.path.join(args.data_dir, 'internal.txt'), 'w') as i:
    internal_count = 0
    id = 0
    for p in tqdm(ds['train']):
        temp_dict = {"Title": p['title'], "Abstract": p['abstract']}
        formatted_dict = json.dumps(temp_dict)
        i.write(f'{formatted_dict}\n')
        internal_collection[id] = Paper(id, p['title'], p['abstract'], label_opts=args.dimensions, internal=True)
        internal_count += 1
        id += 1
print(f'Internal: {internal_count}')

  0%|          | 0/2954 [00:00<?, ?it/s]

100%|██████████| 2954/2954 [00:00<00:00, 5190.99it/s]

Internal: 2954





In [19]:
external_ds = load_dataset("TimSchopf/nlp_taxonomy_data")

In [20]:
external_collection = {}

with open(os.path.join(args.data_dir, 'external.txt'), 'w') as e:
    external_count = 0
    id = len(internal_collection)
    for p in tqdm(external_ds['train']):
        temp_dict = {"Title": p['title'], "Abstract": p['abstract']}
        formatted_dict = json.dumps(temp_dict)
        e.write(f'{formatted_dict}\n')
        external_collection[id] = Paper(id, p['title'], p['abstract'], label_opts=args.dimensions, internal=False)
        external_count += 1
        id += 1
print(f'External Count: {external_count}')

100%|██████████| 178521/178521 [00:14<00:00, 12194.10it/s]

External Count: 178521





**Enrich each node with a set of terms and sentences**

In [21]:
from taxonomy import DAG

In [22]:
dags = {dim:DAG(root=root, dim=dim) for dim, root in roots.items()}

In [23]:
enriched_phrases = {dim:[] for dim in args.dimensions}
enriched_sentences = {dim:[] for dim in args.dimensions}

for dim, dag in dags.items():
    all_phrases, all_sentences = dag.enrich_dag(args, id2node)
    enriched_phrases[dim].extend(all_phrases)
    enriched_sentences[dim].extend(all_sentences)

Compiling FSM index for all state transitions: 100%|██████████| 1012/1012 [01:03<00:00, 15.82it/s]
Processed prompts: 100%|██████████| 26/26 [00:10<00:00,  2.40it/s, est. speed input: 1147.85 toks/s, output: 892.82 toks/s]
Processed prompts: 100%|██████████| 27/27 [00:12<00:00,  2.24it/s, est. speed input: 1060.84 toks/s, output: 758.86 toks/s]
Processed prompts: 100%|██████████| 26/26 [00:10<00:00,  2.47it/s, est. speed input: 1142.85 toks/s, output: 860.17 toks/s]
Processed prompts: 100%|██████████| 29/29 [00:10<00:00,  2.65it/s, est. speed input: 1288.17 toks/s, output: 919.46 toks/s]
Processed prompts: 100%|██████████| 26/26 [00:08<00:00,  3.04it/s, est. speed input: 1346.50 toks/s, output: 888.25 toks/s]


In [33]:
roots['tasks'].children

{'text_classification': Node(label=text_classification, dim=tasks, description=The process of assigning a label to a piece of text based on its content, such as spam vs. non-spam emails or positive vs. negative reviews., level=1),
 'sentiment_analysis': Node(label=sentiment_analysis, dim=tasks, description=The process of determining the emotional tone or sentiment of a piece of text, such as whether a review is positive or negative., level=1),
 'named_entity_recognition': Node(label=named_entity_recognition, dim=tasks, description=The process of identifying and categorizing named entities in unstructured text, such as people, places, and organizations., level=1),
 'language_translation': Node(label=language_translation, dim=tasks, description=The process of converting text from one language to another, such as translating English to Spanish., level=1),
 'question_answering': Node(label=question_answering, dim=tasks, description=The process of automatically answering questions based on 

In [36]:
roots['tasks'].children['language_translation'].get_phrases()

['text_localization',
 'language_modeling_approaches',
 'language_generation_models',
 'language_alignment',
 'text_decomposition',
 'text_segmentation_software',
 'text_equivalence',
 'language_compression',
 'text_expansion',
 'sentence_splitting',
 'translation_accuracy',
 'phrase_extraction',
 'text_modification',
 'language_model',
 'text_forecasting',
 'text_parsing',
 'language_prediction',
 'statistical_models',
 'text_segmentation_algorithms',
 'text_rewriting',
 'sequence_prediction',
 'language_pair',
 'machine_translation',
 'text_division',
 'target_language',
 'next_word_prediction',
 'predictive_language',
 'tokenization',
 'machine_learning',
 'language_understanding',
 'translation_error',
 'language_generation',
 'sentence_boundary_detection',
 'translation_speed',
 'text_forecasting_models',
 'translation',
 'word_identification',
 'algorithmic_approach',
 'language_transfer',
 'text_chunking',
 'text_segmentation_techniques',
 'segmentation_techniques',
 'text_gener

**Identify Pseudo-labels for Dimension/Type Classification**

In [35]:
def find_any_match(patterns, input_string):
    """
    Check if any pattern in the list matches the input string.

    :param patterns: List of regex patterns (as strings)
    :param input_string: The string to search within
    :return: True if any pattern matches, otherwise False
    """
    # Compile all the patterns to make matching more efficient
    compiled_patterns = [re.compile(pattern) for pattern in patterns]
    
    # Check if any compiled pattern matches the input string
    for compiled_pattern in compiled_patterns:
        if compiled_pattern.search(input_string):
            return True
    
    return False

In [36]:
args.dimensions

['tasks',
 'datasets',
 'methodologies',
 'evaluation_methods',
 'real_world_applications']

In [37]:
pseudo_labels = {d:[] for d in args.dimensions}
paper_dims = {}

patterns = {"datasets": [r'introduce [\s\w]* benchmark', r'introduce [\s\w]* dataset', r'construct [\s\w]* benchmark', r'construct [\s\w]* dataset', r'propose [\s\w]* dataset', r'propose [\s\w]* benchmark', r'present [\s\w]* benchmark', r'present [\s\w]* dataset', r'develop [\s\w]* benchmark', r'develop [\s\w]* dataset', r'create [\s\w]* benchmark', r'create [\s\w]* dataset', r'provide [\s\w]* benchmark', r'provide [\s\w]* dataset', r'describe [\s\w]* benchmark', r'describe [\s\w]* dataset', r'propose a new benchmark', r'propose a new dataset', r'introduce a new benchmark', r'introduce a new dataset', r'we release [\s\w]* dataset', r'we release [\s\w]* benchmark', r'a new dataset for [\s\w]*', r'a new benchmark for [\s\w]*', r'dataset for [\s\w]* task', r'benchmark for [\s\w]* task', r'we present [\s\w]* dataset', r'we present [\s\w]* benchmark', r'dataset designed for [\s\w]*', r'benchmark designed for [\s\w]*', r'introducing [\s\w]* dataset', r'introducing [\s\w]* benchmark'],
            "methodologies": [r'introduce [\s\w]* method', r'propose [\s\w]* method', r'design [\s\w]* method', r'present [\s\w]* method', r'develop [\s\w]* method', r'introduce [\s\w]* approach', r'propose [\s\w]* approach', r'design [\s\w]* approach', r'present [\s\w]* approach', r'develop [\s\w]* approach', r'we propose [\s\w]* method', r'we propose [\s\w]* approach', r'we introduce [\s\w]* method', r'we introduce [\s\w]* approach', r'we present [\s\w]* method', r'we present [\s\w]* approach', r'propose a novel method', r'propose a novel approach', r'introduce a novel method', r'introduce a novel approach', r'present a novel method', r'present a novel approach', r'propose [\s\w]* framework', r'introduce [\s\w]* framework', r'present [\s\w]* framework', r'design [\s\w]* framework', r'we propose [\s\w]* framework', r'we introduce [\s\w]* framework', r'we present [\s\w]* framework', r'our proposed method [\s\w]*', r'our proposed approach [\s\w]*', r'our proposed framework [\s\w]*', r'this paper proposes [\s\w]* method', r'this paper introduces [\s\w]* method', r'this paper presents [\s\w]* method', r'this paper develops [\s\w]* method', r'this paper proposes [\s\w]* approach', r'this paper introduces [\s\w]* approach', r'this paper presents [\s\w]* approach', r'this paper develops [\s\w]* approach', r'this paper proposes [\s\w]* framework', r'this paper introduces [\s\w]* framework', r'this paper presents [\s\w]* framework', r'this paper develops [\s\w]* framework'],
            "evaluation_methods": [r'construct a [\s\w]* evaluate', r'design a [\s\w]* evaluate', r'propose a [\s\w]* evaluate', r'introduce [\s\w]* evaluation method', r'propose [\s\w]* evaluation method', r'design [\s\w]* evaluation method', r'develop [\s\w]* evaluation method', r'introduce [\s\w]* evaluation metric', r'propose [\s\w]* evaluation metric', r'design [\s\w]* evaluation metric', r'develop [\s\w]* evaluation metric', r'propose a novel evaluation method', r'propose a novel evaluation metric', r'present a novel evaluation framework', r'introduce a framework for evaluation', r'this paper proposes [\s\w]* evaluation', r'this paper introduces [\s\w]* evaluation', r'introduce [\s\w]* automatic evaluation', r'propose [\s\w]* automatic evaluation', r'develop [\s\w]* automatic evaluation', r'design [\s\w]* automatic evaluation', r'propose a novel automatic evaluation method', r'automatic evaluation of [\s\w]* task', r'develop a method for automatic evaluation', r'introduce [\s\w]* human evaluation', r'propose [\s\w]* human evaluation', r'develop [\s\w]* human evaluation', r'design [\s\w]* human evaluation', r'propose a framework for human evaluation', r'introduce a novel human evaluation method', r'conduct human evaluation of [\s\w]*', r'compare human and automatic evaluation', r'comparison of human evaluation and automatic evaluation', r'human evaluation versus automatic evaluation', r'evaluate using both human and automatic methods', r'analyze results from human and automatic evaluation']}

for id, paper in tqdm(internal_collection.items(), total=len(internal_collection)):
    for dim, dim_patterns in patterns.items():
        if find_any_match(dim_patterns, f'{paper.title}: {paper.abstract}'.lower()):
            pseudo_labels[dim].append(paper)
            if id in paper_dims:
                paper_dims[id].append(dim)
            else:
                paper_dims[id] = [dim]
print({dim: len(papers) for dim, papers in pseudo_labels.items()})

for id, paper in tqdm(external_collection.items(), total=len(external_collection)):
    for dim, dim_patterns in patterns.items():
        if find_any_match(dim_patterns, f'{paper.title}: {paper.abstract}'.lower()):
            pseudo_labels[dim].append(paper)
            if id in paper_dims:
                paper_dims[id].append(dim)
            else:
                paper_dims[id] = [dim]

print({dim: len(papers) for dim, papers in pseudo_labels.items()})

100%|██████████| 2954/2954 [00:00<00:00, 6785.56it/s]


{'tasks': 0, 'datasets': 233, 'methodologies': 487, 'evaluation_methods': 23, 'real_world_applications': 0}


100%|██████████| 178521/178521 [00:26<00:00, 6771.40it/s]

{'tasks': 0, 'datasets': 3377, 'methodologies': 24568, 'evaluation_methods': 416, 'real_world_applications': 0}





In [38]:
len(paper_dims), paper_dims

(27488,
 {0: ['datasets', 'methodologies'],
  2: ['methodologies'],
  6: ['methodologies'],
  7: ['methodologies'],
  9: ['datasets'],
  10: ['datasets'],
  13: ['datasets', 'methodologies'],
  16: ['datasets'],
  19: ['methodologies'],
  21: ['datasets'],
  31: ['methodologies'],
  36: ['methodologies'],
  40: ['methodologies'],
  49: ['methodologies'],
  54: ['methodologies'],
  56: ['methodologies'],
  59: ['methodologies'],
  64: ['methodologies'],
  85: ['methodologies'],
  89: ['methodologies'],
  90: ['methodologies'],
  91: ['datasets'],
  94: ['methodologies'],
  97: ['datasets', 'methodologies'],
  106: ['evaluation_methods'],
  109: ['methodologies'],
  112: ['datasets', 'methodologies'],
  122: ['methodologies'],
  123: ['methodologies'],
  131: ['datasets'],
  132: ['methodologies'],
  133: ['methodologies'],
  135: ['methodologies'],
  148: ['methodologies'],
  149: ['methodologies'],
  150: ['methodologies'],
  151: ['methodologies'],
  156: ['methodologies'],
  164: ['m

In [41]:
internal_collection[2494].abstract

'Large Language Models (LLMs) struggle with providing current information due to the outdated pre-training data. Existing methods for updating LLMs, such as knowledge editing and continual fine-tuning, have significant drawbacks in generalizability of new information and the requirements on structured updating corpus. We identify the core challenge behind these drawbacks: the LM-logical discrepancy featuring the difference between language modeling probabilities and logical probabilities. To evaluate and address the core challenge, we propose a new task formulation of the information updating task that only requires the provision of an unstructured updating corpus and evaluates the performance of information updating on the generalizability to question-answer pairs pertaining to the updating information.We further propose a novel and effective pipeline approach for the task, highlighting a self-prompting-based question-answer generation process and a associative distillation methods to

**Loose Classification of Papers**

In [20]:
args.llm = 'vllm'
# initializeLLM(args)

In [21]:
len(internal_collection)

662

In [23]:
dag.classify_dag(args, collection=internal_collection, label2node=label2node)

visiting:  natural_language_processing


Processed prompts: 100%|██████████| 662/662 [02:50<00:00,  3.89it/s, est. speed input: 7856.39 toks/s, output: 163.52 toks/s]


visiting:  text_generation


Processed prompts: 100%|██████████| 36/36 [00:08<00:00,  4.37it/s, est. speed input: 7234.70 toks/s, output: 169.95 toks/s]


visiting:  style_transfer


Processed prompts: 100%|██████████| 8/8 [00:08<00:00,  1.07s/it, est. speed input: 416.70 toks/s, output: 145.10 toks/s]


visiting:  conditional_text_generation


Processed prompts: 100%|██████████| 25/25 [00:10<00:00,  2.42it/s, est. speed input: 1047.19 toks/s, output: 272.59 toks/s]


visiting:  language_modeling


Processed prompts: 100%|██████████| 8/8 [00:10<00:00,  1.31s/it, est. speed input: 310.58 toks/s, output: 138.41 toks/s]


visiting:  machine_translation


Processed prompts: 100%|██████████| 142/142 [00:31<00:00,  4.47it/s, est. speed input: 7600.24 toks/s, output: 196.53 toks/s]


visiting:  multimodal_machine_translation


Processed prompts: 100%|██████████| 3/3 [00:02<00:00,  1.39it/s, est. speed input: 574.18 toks/s, output: 144.58 toks/s]


visiting:  unsupervised_machine_translation


Processed prompts: 100%|██████████| 42/42 [00:21<00:00,  1.92it/s, est. speed input: 864.95 toks/s, output: 469.65 toks/s] 


visiting:  neural_machine_translation


Processed prompts: 100%|██████████| 63/63 [00:29<00:00,  2.11it/s, est. speed input: 882.75 toks/s, output: 399.36 toks/s] 


JSONDecodeError: Unterminated string starting at: line 1 column 2513 (char 2512)

In [33]:
root.children

{'text_classification': Node(label=text_classification, description=Text classification involves categorizing text data into predefined classes or categories., level=1),
 'named_entity_recognition': Node(label=named_entity_recognition, description=Named entity recognition is the task of identifying and classifying named entities in text., level=1),
 'machine_translation': Node(label=machine_translation, description=Machine translation involves translating text from one language to another., level=1),
 'text_generation': Node(label=text_generation, description=Text generation focuses on generating coherent and contextually relevant text., level=1)}

In [40]:
unlabeled = []

for paper_id, paper in tqdm(root.papers.items()):
    add = True
    for c in root.children.values():
        if paper_id in c.papers:
            add = False
    if add:
        unlabeled.append(paper_id)

100%|██████████| 662/662 [00:00<00:00, 426911.02it/s]


In [41]:
len(unlabeled)

51