**Setup**

In [1]:
%load_ext autoreload
%autoreload 2
import os
os.environ['HF_HOME'] = '/shared/data3/pk36/.cache'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
!export HF_HOME=/shared/data3/pk36/.cache

In [3]:
from model_definitions import initializeLLM, promptLLM, constructPrompt
import json
from utils import clean_json_string
from collections import deque
from taxonomy import Node
import re

In [4]:
class Args:
    def __init__(self):
        
        self.topic = "natural language processing"
        self.dimensions = ["tasks", "datasets", "methodologies", "evaluation_methods", "real_world_domains"]
        # self.dimensions = ["evaluation_methods"]
        self.llm = 'gpt'
        self.init_levels = 2

        self.dataset = "Reasoning"
        self.data_dir = f"datasets/multi_dim/{self.dataset.lower().replace(' ', '_')}/"
        self.internal = f"{self.dataset}.txt"
        self.external = f"{self.dataset}_external.txt"
        self.groundtruth = "groundtruth.txt"
        
        self.length = 512
        self.dim = 768

        self.iters = 4

args = Args()

In [5]:
args = initializeLLM(args)

INFO 12-04 07:36:42 config.py:729] Defaulting to use mp for distributed inference
INFO 12-04 07:36:42 config.py:820] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 12-04 07:36:42 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/Meta-

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=3166293)[0;0m INFO 12-04 07:36:47 model_runner.py:732] Loading model weights took 7.5122 GB
INFO 12-04 07:36:47 model_runner.py:732] Loading model weights took 7.5122 GB
INFO 12-04 07:36:49 distributed_gpu_executor.py:56] # GPU blocks: 15716, # CPU blocks: 4096
INFO 12-04 07:36:51 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-04 07:36:51 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
[1;36m(VllmWorkerProcess pid=3166293)[0;0m INFO 12-04 07:36:51 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not sta



**Construct a 2-Level Multi-Dimensional Taxonomy**

In [6]:
from prompts import multi_dim_prompt, NodeListSchema

In [7]:
# we want to make this a directed acyclic graph (DAG) so maintain a list of the nodes
roots = {}
id2node = {}
label2node = {}
idx = 0

for dim in args.dimensions:
    mod_topic = args.topic.replace(' ', '_').lower() + f"_{dim}"
    root = Node(
            id=idx,
            label=mod_topic,
            dimension=dim
        )
    roots[dim] = root
    id2node[idx] = root
    label2node[mod_topic] = root
    idx += 1

In [8]:
queue = deque([node for id, node in id2node.items()])

# if taking long, you can probably parallelize this between the different taxonomies (expand by level)
while queue:
    curr_node = queue.popleft()
    label = curr_node.label
    # expand
    system_instruction, main_prompt, json_output_format = multi_dim_prompt(curr_node)
    prompts = [constructPrompt(args, system_instruction, main_prompt + "\n\n" + json_output_format)]
    outputs = promptLLM(args=args, prompts=prompts, schema=NodeListSchema, max_new_tokens=3000, json_mode=True, temperature=0.1, top_p=0.99)[0]
    outputs = json.loads(clean_json_string(outputs)) if "```" in outputs else json.loads(outputs.strip())
    outputs = outputs['root_topic'] if 'root_topic' in outputs else outputs[label]

    # add all children
    for key, value in outputs.items():
        key = key.replace(' ', '_').lower()
        if (key not in label2node) or ((key in label2node) and (label2node[key].dimension != curr_node.dimension)):
            child_node = Node(
                    id=len(id2node),
                    label=key,
                    dimension=curr_node.dimension,
                    description=value['description'],
                    parents=[curr_node]
                )
            curr_node.add_child(key, child_node)
            id2node[child_node.id] = child_node
            label2node[key] = child_node
            if child_node.level < args.init_levels:
                queue.append(child_node)
        
        else:
            child_node = label2node[key]
            child_node.add_parent(curr_node)

100%|██████████| 1/1 [00:02<00:00,  2.51s/it]
100%|██████████| 1/1 [00:02<00:00,  2.02s/it]
100%|██████████| 1/1 [00:02<00:00,  2.34s/it]
100%|██████████| 1/1 [00:02<00:00,  2.03s/it]
100%|██████████| 1/1 [00:02<00:00,  2.47s/it]
100%|██████████| 1/1 [00:01<00:00,  1.92s/it]
100%|██████████| 1/1 [00:01<00:00,  1.60s/it]
100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
100%|██████████| 1/1 [00:01<00:00,  1.72s/it]
100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
100%|██████████| 1/1 [00:01<00:00,  1.89s/it]
100%|██████████| 1/1 [00:02<00:00,  2.99s/it]
100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
100%|██████████| 1/1 [00:01<00:00,  1.59s/it]
100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
100%|██████████| 1/1 [00:01<00:00,  1.82s/it]
100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
100%|██████████| 1/1 [00:01<00:00,  1.74s/it]
100%|██████████| 1/1 [00:01<00:00,

In [9]:
roots

{'tasks': Node(label=natural_language_processing_tasks, dim=tasks, description=None, level=0),
 'datasets': Node(label=natural_language_processing_datasets, dim=datasets, description=None, level=0),
 'methodologies': Node(label=natural_language_processing_methodologies, dim=methodologies, description=None, level=0),
 'evaluation_methods': Node(label=natural_language_processing_evaluation_methods, dim=evaluation_methods, description=None, level=0),
 'real_world_domains': Node(label=natural_language_processing_real_world_domains, dim=real_world_domains, description=None, level=0)}

In [10]:
roots['evaluation_methods'].display(0, indent_multiplier=5)

Label: natural_language_processing_evaluation_methods
Dimension: evaluation_methods
Description: None
Level: 0
----------------------------------------
Children:
     Label: intrinsic_evaluation_methods
     Dimension: evaluation_methods
     Description: Intrinsic evaluation methods assess the performance of specific components or algorithms in natural language processing tasks, such as part-of-speech tagging or named entity recognition.
     Level: 1
     ----------------------------------------
     Children:
          Label: annotation_based_evaluation
          Dimension: evaluation_methods
          Description: Annotation-based evaluation methods involve manual annotation of data for tasks such as sentiment analysis or named entity recognition.
          Level: 2
          ----------------------------------------
          Label: correlation_evaluation
          Dimension: evaluation_methods
          Description: Correlation evaluation methods assess the correlation between pre

**Read in dataset**

In [11]:
from datasets import load_dataset
from tqdm import tqdm
from paper import Paper

In [12]:
if not os.path.exists(args.data_dir):
    os.makedirs(args.data_dir)

In [13]:
ds = load_dataset("EMNLP/EMNLP2024-papers")

In [14]:
internal_collection = {}

with open(os.path.join(args.data_dir, 'internal.txt'), 'w') as i:
    internal_count = 0
    id = 0
    for p in tqdm(ds['train']):
        temp_dict = {"Title": p['title'], "Abstract": p['abstract']}
        formatted_dict = json.dumps(temp_dict)
        i.write(f'{formatted_dict}\n')
        internal_collection[id] = Paper(id, p['title'], p['abstract'], label_opts=args.dimensions, internal=True)
        internal_count += 1
        id += 1
print(f'Internal: {internal_count}')

100%|██████████| 2954/2954 [00:00<00:00, 5019.14it/s]

Internal: 2954





In [15]:
external_ds = load_dataset("TimSchopf/nlp_taxonomy_data")

In [16]:
external_collection = {}

with open(os.path.join(args.data_dir, 'external.txt'), 'w') as e:
    external_count = 0
    id = len(internal_collection)
    for p in tqdm(external_ds['train']):
        temp_dict = {"Title": p['title'], "Abstract": p['abstract']}
        formatted_dict = json.dumps(temp_dict)
        e.write(f'{formatted_dict}\n')
        external_collection[id] = Paper(id, p['title'], p['abstract'], label_opts=args.dimensions, internal=False)
        external_count += 1
        id += 1
print(f'External Count: {external_count}')

100%|██████████| 178521/178521 [00:15<00:00, 11656.05it/s]

External Count: 178521





**Enrich each node with a set of terms and sentences**

In [17]:
from taxonomy import DAG
args.llm = 'vllm'

In [18]:
dags = {dim:DAG(root=root, dim=dim) for dim, root in roots.items()}

In [19]:
enriched_phrases = {dim:[] for dim in args.dimensions}
enriched_sentences = {dim:[] for dim in args.dimensions}

for dim, dag in dags.items():
    all_phrases, all_sentences = dag.enrich_dag(args, id2node)
    enriched_phrases[dim].extend(all_phrases)
    enriched_sentences[dim].extend(all_sentences)

Compiling FSM index for all state transitions: 100%|██████████| 1012/1012 [01:03<00:00, 15.91it/s]
Processed prompts: 100%|██████████| 30/30 [00:13<00:00,  2.29it/s, est. speed input: 1097.06 toks/s, output: 820.64 toks/s]
Processed prompts: 100%|██████████| 30/30 [00:10<00:00,  2.76it/s, est. speed input: 1328.58 toks/s, output: 941.38 toks/s]
Processed prompts: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s, est. speed input: 1367.89 toks/s, output: 946.61 toks/s]
Processed prompts: 100%|██████████| 28/28 [00:10<00:00,  2.61it/s, est. speed input: 1243.65 toks/s, output: 898.29 toks/s]
Processed prompts: 100%|██████████| 31/31 [00:11<00:00,  2.67it/s, est. speed input: 1268.72 toks/s, output: 881.56 toks/s]


In [20]:
roots['tasks'].children

{'text_classification': Node(label=text_classification, dim=tasks, description=Text classification involves categorizing text documents into predefined classes or categories based on their content., level=1),
 'named_entity_recognition': Node(label=named_entity_recognition, dim=tasks, description=Named Entity Recognition (NER) is the task of identifying and classifying named entities in text into predefined categories such as names of persons, organizations, locations, etc., level=1),
 'sentiment_analysis': Node(label=sentiment_analysis, dim=tasks, description=Sentiment analysis aims to determine the sentiment expressed in a piece of text, whether it is positive, negative, or neutral., level=1),
 'machine_translation': Node(label=machine_translation, dim=tasks, description=Machine translation involves automatically translating text from one language to another, preserving the meaning of the original text., level=1),
 'question_answering': Node(label=question_answering, dim=tasks, descr

In [26]:
roots['tasks'].children['question_answering'].get_phrases()

['question_answering_models',
 'specific_facts',
 'multiple_passage',
 'knowledge_graph',
 'inference_chain',
 'questioning_technique',
 'multi-hop_inference',
 'entity_disambiguation',
 'questioning_strategy',
 'comprehension_assessment',
 'questioning_methodology',
 'question_formulation',
 'pronoun_resolution',
 'entity_resolution',
 'text_to_question',
 'text_understanding',
 'questioning_framework',
 'questioning_tool',
 'anaphora_resolution',
 'contextual_understanding',
 'entity_linking',
 'factual_information',
 'cross_document',
 'answer_selection',
 'questioning_engine',
 'co-reference',
 'information_retrieval',
 'question_chaining',
 'factoid_questions',
 'knowledge_base',
 'question_answering_techniques',
 'passage_retrieval',
 'question_classification',
 'answer_type',
 'natural_language_inference',
 'information_fusion',
 'factual_data',
 'question_generation_system',
 'question_creation',
 'contextual_reasoning',
 'question_answering',
 'inference_network',
 'pronoun_id

**Identify Pseudo-labels for Dimension/Type Classification**

In [28]:
from prompts import type_cls_system_instruction, type_cls_main_prompt, TypeClsSchema

In [32]:
# do for internal collection

prompts = [constructPrompt(args, type_cls_system_instruction, type_cls_main_prompt(paper)) for paper in internal_collection.values()]
outputs = promptLLM(args=args, prompts=prompts, schema=TypeClsSchema, max_new_tokens=500, json_mode=True, temperature=0.1, top_p=0.99)
outputs = [json.loads(clean_json_string(c)) if "```" in c else json.loads(c.strip()) for c in outputs]

# do for external collection

Compiling FSM index for all state transitions: 100%|██████████| 139/139 [00:04<00:00, 29.89it/s]
Processed prompts: 100%|██████████| 2954/2954 [04:08<00:00, 11.87it/s, est. speed input: 7570.80 toks/s, output: 367.93 toks/s]


In [41]:
for r in roots:
    roots[r].papers = {}
type_dist = {dim:[] for dim in args.dimensions}
for p_id, out in enumerate(outputs):
    internal_collection[p_id].labels = {}
    for key, val in out.items():
        if val:
            type_dist[key].append(internal_collection[p_id])
            internal_collection[p_id].labels[key] = []
            roots[key].papers[p_id] = internal_collection[p_id]

In [42]:
for key, p in type_dist.items():
    print(key, len(p))

tasks 2954
datasets 696
methodologies 2241
evaluation_methods 1946
real_world_domains 1408


In [55]:
count = 0
dim_type = 'evaluation_methods'
for paper_id in roots[dim_type].papers:
    if count < 10:
        print(roots[dim_type].papers[paper_id].title, roots[dim_type].papers[paper_id].abstract)
        count += 1

FIZZ: Factual Inconsistency Detection by Zoom-in Summary and Zoom-out Document Through the advent of pre-trained language models, there have been notable advancements in abstractive summarization systems. Simultaneously, a considerable number of novel methods for evaluating factual consistency in abstractive summarization systems has been developed. But these evaluation approaches incorporate substantial limitations, especially on refinement and interpretability. In this work, we propose highly effective and interpretable factual inconsistency detection method FIZZ (Factual Inconsistency Detection by Zoom-in Summary and Zoom-out Document) for abstractive summarization systems that is based on fine-grained atomic facts decomposition. Moreover, we align atomic facts decomposed from the summary with the source document through adaptive granularity expansion. These atomic facts represent a more fine-grained unit of information, facilitating detailed understanding and interpretability of th

In [57]:
count = 0
dim_type = 'real_world_domains'
for paper_id in roots[dim_type].papers:
    if count < 10:
        print(roots[dim_type].papers[paper_id].title, roots[dim_type].papers[paper_id].abstract)
        count += 1

UniGen: Universal Domain Generalization for Sentiment Classification via Zero-shot Dataset Generation Although pre-trained language models have exhibited great flexibility and versatility with prompt-based few-shot learning, they suffer from the extensive parameter size and limited applicability for inference. Recent studies have suggested that PLMs be used as dataset generators and a tiny task-specific model be trained to achieve efficient inference. However, their applicability to various domains is limited because they tend to generate domain-specific datasets. In this work, we propose a novel approach to universal domain generalization that generates a dataset regardless of the target domain. This allows for generalization of the tiny task model to any domain that shares the label space, thus enhancing the real-world applicability of the dataset generation paradigm. Our experiments indicate that the proposed method accomplishes generalizability across various domains while using a 

In [49]:
internal_collection[0].labels

{'tasks': [], 'methodologies': [], 'real_world_domains': []}

In [35]:
def find_any_match(patterns, input_string):
    """
    Check if any pattern in the list matches the input string.

    :param patterns: List of regex patterns (as strings)
    :param input_string: The string to search within
    :return: True if any pattern matches, otherwise False
    """
    # Compile all the patterns to make matching more efficient
    compiled_patterns = [re.compile(pattern) for pattern in patterns]
    
    # Check if any compiled pattern matches the input string
    for compiled_pattern in compiled_patterns:
        if compiled_pattern.search(input_string):
            return True
    
    return False

In [36]:
args.dimensions

['tasks',
 'datasets',
 'methodologies',
 'evaluation_methods',
 'real_world_applications']

In [37]:
pseudo_labels = {d:[] for d in args.dimensions}
paper_dims = {}

patterns = {"datasets": [r'introduce [\s\w]* benchmark', r'introduce [\s\w]* dataset', r'construct [\s\w]* benchmark', r'construct [\s\w]* dataset', r'propose [\s\w]* dataset', r'propose [\s\w]* benchmark', r'present [\s\w]* benchmark', r'present [\s\w]* dataset', r'develop [\s\w]* benchmark', r'develop [\s\w]* dataset', r'create [\s\w]* benchmark', r'create [\s\w]* dataset', r'provide [\s\w]* benchmark', r'provide [\s\w]* dataset', r'describe [\s\w]* benchmark', r'describe [\s\w]* dataset', r'propose a new benchmark', r'propose a new dataset', r'introduce a new benchmark', r'introduce a new dataset', r'we release [\s\w]* dataset', r'we release [\s\w]* benchmark', r'a new dataset for [\s\w]*', r'a new benchmark for [\s\w]*', r'dataset for [\s\w]* task', r'benchmark for [\s\w]* task', r'we present [\s\w]* dataset', r'we present [\s\w]* benchmark', r'dataset designed for [\s\w]*', r'benchmark designed for [\s\w]*', r'introducing [\s\w]* dataset', r'introducing [\s\w]* benchmark'],
            "methodologies": [r'introduce [\s\w]* method', r'propose [\s\w]* method', r'design [\s\w]* method', r'present [\s\w]* method', r'develop [\s\w]* method', r'introduce [\s\w]* approach', r'propose [\s\w]* approach', r'design [\s\w]* approach', r'present [\s\w]* approach', r'develop [\s\w]* approach', r'we propose [\s\w]* method', r'we propose [\s\w]* approach', r'we introduce [\s\w]* method', r'we introduce [\s\w]* approach', r'we present [\s\w]* method', r'we present [\s\w]* approach', r'propose a novel method', r'propose a novel approach', r'introduce a novel method', r'introduce a novel approach', r'present a novel method', r'present a novel approach', r'propose [\s\w]* framework', r'introduce [\s\w]* framework', r'present [\s\w]* framework', r'design [\s\w]* framework', r'we propose [\s\w]* framework', r'we introduce [\s\w]* framework', r'we present [\s\w]* framework', r'our proposed method [\s\w]*', r'our proposed approach [\s\w]*', r'our proposed framework [\s\w]*', r'this paper proposes [\s\w]* method', r'this paper introduces [\s\w]* method', r'this paper presents [\s\w]* method', r'this paper develops [\s\w]* method', r'this paper proposes [\s\w]* approach', r'this paper introduces [\s\w]* approach', r'this paper presents [\s\w]* approach', r'this paper develops [\s\w]* approach', r'this paper proposes [\s\w]* framework', r'this paper introduces [\s\w]* framework', r'this paper presents [\s\w]* framework', r'this paper develops [\s\w]* framework'],
            "evaluation_methods": [r'construct a [\s\w]* evaluate', r'design a [\s\w]* evaluate', r'propose a [\s\w]* evaluate', r'introduce [\s\w]* evaluation method', r'propose [\s\w]* evaluation method', r'design [\s\w]* evaluation method', r'develop [\s\w]* evaluation method', r'introduce [\s\w]* evaluation metric', r'propose [\s\w]* evaluation metric', r'design [\s\w]* evaluation metric', r'develop [\s\w]* evaluation metric', r'propose a novel evaluation method', r'propose a novel evaluation metric', r'present a novel evaluation framework', r'introduce a framework for evaluation', r'this paper proposes [\s\w]* evaluation', r'this paper introduces [\s\w]* evaluation', r'introduce [\s\w]* automatic evaluation', r'propose [\s\w]* automatic evaluation', r'develop [\s\w]* automatic evaluation', r'design [\s\w]* automatic evaluation', r'propose a novel automatic evaluation method', r'automatic evaluation of [\s\w]* task', r'develop a method for automatic evaluation', r'introduce [\s\w]* human evaluation', r'propose [\s\w]* human evaluation', r'develop [\s\w]* human evaluation', r'design [\s\w]* human evaluation', r'propose a framework for human evaluation', r'introduce a novel human evaluation method', r'conduct human evaluation of [\s\w]*', r'compare human and automatic evaluation', r'comparison of human evaluation and automatic evaluation', r'human evaluation versus automatic evaluation', r'evaluate using both human and automatic methods', r'analyze results from human and automatic evaluation']}

for id, paper in tqdm(internal_collection.items(), total=len(internal_collection)):
    for dim, dim_patterns in patterns.items():
        if find_any_match(dim_patterns, f'{paper.title}: {paper.abstract}'.lower()):
            pseudo_labels[dim].append(paper)
            if id in paper_dims:
                paper_dims[id].append(dim)
            else:
                paper_dims[id] = [dim]
print({dim: len(papers) for dim, papers in pseudo_labels.items()})

for id, paper in tqdm(external_collection.items(), total=len(external_collection)):
    for dim, dim_patterns in patterns.items():
        if find_any_match(dim_patterns, f'{paper.title}: {paper.abstract}'.lower()):
            pseudo_labels[dim].append(paper)
            if id in paper_dims:
                paper_dims[id].append(dim)
            else:
                paper_dims[id] = [dim]

print({dim: len(papers) for dim, papers in pseudo_labels.items()})

100%|██████████| 2954/2954 [00:00<00:00, 6785.56it/s]


{'tasks': 0, 'datasets': 233, 'methodologies': 487, 'evaluation_methods': 23, 'real_world_applications': 0}


100%|██████████| 178521/178521 [00:26<00:00, 6771.40it/s]

{'tasks': 0, 'datasets': 3377, 'methodologies': 24568, 'evaluation_methods': 416, 'real_world_applications': 0}





In [38]:
len(paper_dims), paper_dims

(27488,
 {0: ['datasets', 'methodologies'],
  2: ['methodologies'],
  6: ['methodologies'],
  7: ['methodologies'],
  9: ['datasets'],
  10: ['datasets'],
  13: ['datasets', 'methodologies'],
  16: ['datasets'],
  19: ['methodologies'],
  21: ['datasets'],
  31: ['methodologies'],
  36: ['methodologies'],
  40: ['methodologies'],
  49: ['methodologies'],
  54: ['methodologies'],
  56: ['methodologies'],
  59: ['methodologies'],
  64: ['methodologies'],
  85: ['methodologies'],
  89: ['methodologies'],
  90: ['methodologies'],
  91: ['datasets'],
  94: ['methodologies'],
  97: ['datasets', 'methodologies'],
  106: ['evaluation_methods'],
  109: ['methodologies'],
  112: ['datasets', 'methodologies'],
  122: ['methodologies'],
  123: ['methodologies'],
  131: ['datasets'],
  132: ['methodologies'],
  133: ['methodologies'],
  135: ['methodologies'],
  148: ['methodologies'],
  149: ['methodologies'],
  150: ['methodologies'],
  151: ['methodologies'],
  156: ['methodologies'],
  164: ['m

In [41]:
internal_collection[2494].abstract

'Large Language Models (LLMs) struggle with providing current information due to the outdated pre-training data. Existing methods for updating LLMs, such as knowledge editing and continual fine-tuning, have significant drawbacks in generalizability of new information and the requirements on structured updating corpus. We identify the core challenge behind these drawbacks: the LM-logical discrepancy featuring the difference between language modeling probabilities and logical probabilities. To evaluate and address the core challenge, we propose a new task formulation of the information updating task that only requires the provision of an unstructured updating corpus and evaluates the performance of information updating on the generalizability to question-answer pairs pertaining to the updating information.We further propose a novel and effective pipeline approach for the task, highlighting a self-prompting-based question-answer generation process and a associative distillation methods to

**Loose Classification of Papers**

In [20]:
args.llm = 'vllm'
# initializeLLM(args)

In [50]:
len(internal_collection)

2954

In [54]:
dags

{'tasks': <taxonomy.DAG at 0x7f4a46135550>,
 'datasets': <taxonomy.DAG at 0x7f4a46135430>,
 'methodologies': <taxonomy.DAG at 0x7f4a46135bb0>,
 'evaluation_methods': <taxonomy.DAG at 0x7f4a461353d0>,
 'real_world_domains': <taxonomy.DAG at 0x7f4a46135d90>}

In [59]:
dags['methodologies'].classify_dag(args, collection=dags['methodologies'].root.papers, label2node=label2node)

visiting:  natural_language_processing_methodologies


Compiling FSM index for all state transitions: 100%|██████████| 1056/1056 [01:08<00:00, 15.50it/s]
Processed prompts: 100%|██████████| 2241/2241 [11:00<00:00,  3.39it/s, est. speed input: 7712.11 toks/s, output: 168.19 toks/s]


visiting:  topic_modeling


Processed prompts: 0it [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


visiting:  machine_translation


Processed prompts: 100%|██████████| 1313/1313 [06:24<00:00,  3.42it/s, est. speed input: 7576.98 toks/s, output: 194.43 toks/s]


visiting:  sentiment_analysis


Processed prompts: 0it [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


visiting:  named_entity_recognition


Processed prompts: 0it [00:00, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


visiting:  text_classification


Processed prompts: 100%|██████████| 1103/1103 [05:48<00:00,  3.16it/s, est. speed input: 7535.86 toks/s, output: 189.02 toks/s]


In [66]:
len(roots['methodologies'].children['text_classification'].papers)

1103

In [71]:
roots

{'tasks': Node(label=natural_language_processing_tasks, dim=tasks, description=None, level=0),
 'datasets': Node(label=natural_language_processing_datasets, dim=datasets, description=None, level=0),
 'methodologies': Node(label=natural_language_processing_methodologies, dim=methodologies, description=None, level=0),
 'evaluation_methods': Node(label=natural_language_processing_evaluation_methods, dim=evaluation_methods, description=None, level=0),
 'real_world_domains': Node(label=natural_language_processing_real_world_domains, dim=real_world_domains, description=None, level=0)}

In [72]:
roots['datasets'].children

{'text_classification_datasets': Node(label=text_classification_datasets, dim=datasets, description=Datasets specifically curated for training and evaluating text classification models in natural language processing tasks., level=1),
 'named_entity_recognition_datasets': Node(label=named_entity_recognition_datasets, dim=datasets, description=Datasets containing annotated entities such as names, locations, and organizations for training and evaluating named entity recognition models., level=1),
 'sentiment_analysis_datasets': Node(label=sentiment_analysis_datasets, dim=datasets, description=Datasets designed for sentiment analysis tasks, providing labeled data for sentiment polarity classification., level=1),
 'question_answering_datasets': Node(label=question_answering_datasets, dim=datasets, description=Datasets structured to support question answering systems by providing question-answer pairs for training and evaluation., level=1),
 'machine_translation_datasets': Node(label=machine

In [33]:
root.children

{'text_classification': Node(label=text_classification, description=Text classification involves categorizing text data into predefined classes or categories., level=1),
 'named_entity_recognition': Node(label=named_entity_recognition, description=Named entity recognition is the task of identifying and classifying named entities in text., level=1),
 'machine_translation': Node(label=machine_translation, description=Machine translation involves translating text from one language to another., level=1),
 'text_generation': Node(label=text_generation, description=Text generation focuses on generating coherent and contextually relevant text., level=1)}

In [40]:
unlabeled = []

for paper_id, paper in tqdm(root.papers.items()):
    add = True
    for c in root.children.values():
        if paper_id in c.papers:
            add = False
    if add:
        unlabeled.append(paper_id)

100%|██████████| 662/662 [00:00<00:00, 426911.02it/s]


In [41]:
len(unlabeled)

51