**Setup**

In [2]:
# Reload imported modules before each cell runs, so edits to the project's
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import os
# Hugging Face cache/home directory for this kernel process.
# NOTE(review): absolute, user-specific path — consider making it configurable.
os.environ['HF_HOME'] = '/shared/data3/pk36/.cache'
# Turn off the tokenizers library's internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import os
import json

In [19]:
import json
from collections import deque

def bfs_json(data):
    """Copy a taxonomy tree breadth-first, keeping only summary fields.

    Each node of ``data`` is a dict with ``"label"`` and ``"description"``
    keys, an optional ``"paper_ids"`` collection and an optional
    ``"children"`` sequence of nodes in the same format.

    Returns a new nested dict mirroring the tree. Every output node carries
    ``"label"`` and ``"description"``; ``"papers"`` holds ``len(paper_ids)``
    and is omitted (with a notice printed) when the source node has no
    ``"paper_ids"`` key; ``"children"`` is present only for nodes whose
    child sequence is non-empty.
    """
    summary_root = {}
    pending = deque()
    pending.append((data, summary_root))

    while len(pending) > 0:
        source_node, summary_node = pending.popleft()

        summary_node["label"] = source_node["label"]
        summary_node["description"] = source_node["description"]

        if "paper_ids" in source_node:
            summary_node["papers"] = len(source_node["paper_ids"])
        else:
            print(source_node["label"], ": NO PAPERS")

        # Leaf (or empty child list): nothing further to enqueue.
        if "children" not in source_node or len(source_node["children"]) == 0:
            continue

        source_children = source_node["children"]
        # One empty placeholder per child; each is filled in when dequeued.
        summary_children = [{} for _ in range(len(source_children))]
        summary_node["children"] = summary_children
        for position, placeholder in enumerate(summary_children):
            pending.append((source_children[position], placeholder))

    return summary_root

In [20]:
# Load the full taxonomy JSON from disk.
with open('datasets/emnlp_2024/final_taxo_tasks.json', 'r') as f:
    output = json.load(f)

# Reduce every node to label / description / paper count; bfs_json prints the
# label of any node that has no "paper_ids" key.
filtered_output = bfs_json(output)

hate_speech_detection : NO PAPERS


In [21]:
filtered_output

{'label': 'natural_language_processing',
 'description': None,
 'papers': 2954,
 'children': [{'label': 'text_classification',
   'description': 'The task of assigning predefined categories to text documents based on their content, often used in spam detection, sentiment analysis, and topic labeling.',
   'papers': 161,
   'children': [{'label': 'sentiment_analysis',
     'description': 'The task of determining the emotional tone behind a series of words, used to understand the attitudes, opinions, and emotions expressed in text.',
     'papers': 18,
     'children': [{'label': 'aspect_based_sentiment_analysis',
       'description': 'Aspect-based sentiment analysis focuses on determining sentiment towards specific aspects or features of a product or service.',
       'papers': 2},
      {'label': 'emotion_classification',
       'description': 'Emotion classification involves categorizing text based on the emotional states expressed within it.',
       'papers': 5},
      {'label': 'h

In [13]:
# Load the taxonomy JSON stored under datasets/multi_dim/.
# Rebinds `output` and `filtered_output`, so any earlier values are replaced.
with open('datasets/multi_dim/emnlp_2024/final_taxo_tasks.json', 'r') as f:
    output = json.load(f)

# Reduce every node to label / description / paper count.
filtered_output = bfs_json(output)

In [15]:
# Write the summarised taxonomy held in `filtered_output` as JSON.
with open('datasets/multi_dim/emnlp_2024/final_taxo_tasks_filtered.json', 'w') as f:
    json.dump(filtered_output, f)

In [3]:
# `!export ...` runs in a throwaway subshell and never changes the kernel's
# environment, so set the variable on the kernel process itself.
import os
os.environ['HF_HOME'] = '/shared/data3/pk36/.cache'

In [4]:
from model_definitions import initializeLLM, promptLLM, constructPrompt
import json
from utils import clean_json_string
from collections import deque
from taxonomy import Node
import re

In [5]:
class Args:
    """Plain attribute bag holding the notebook's run configuration.

    All values are fixed in ``__init__``; the instance is passed to the
    project helpers used in later cells (e.g. ``initializeLLM``,
    ``constructPrompt``, ``promptLLM``).
    """

    def __init__(self):
        # Taxonomy topic and the dimensions a taxonomy is built for.
        self.topic = "natural language processing"
        self.dimensions = [
            "tasks",
            "datasets",
            "methodologies",
            "evaluation_methods",
            "real_world_domains",
        ]
        self.llm = 'gpt'
        # Nodes below this level are queued for further expansion when the
        # initial taxonomy is built.
        self.init_levels = 2

        # Dataset name and the file locations derived from it.
        self.dataset = "Reasoning"
        self.data_dir = f"datasets/multi_dim/{self.dataset.lower().replace(' ', '_')}/"
        self.internal = f"{self.dataset}.txt"
        self.external = f"{self.dataset}_external.txt"
        self.groundtruth = "groundtruth.txt"

        self.length = 512
        self.dim = 768

        self.iters = 4


args = Args()

In [6]:
# Initialise the LLM backend and rebind `args` to whatever the helper returns.
# NOTE(review): presumably the same Args object with model handles attached — confirm in model_definitions.
args = initializeLLM(args)

INFO 12-18 06:34:48 config.py:729] Defaulting to use mp for distributed inference
INFO 12-18 06:34:48 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, enable_prefix_caching=True)
INFO 12-18 06:34

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=2098263)[0;0m INFO 12-18 06:34:52 model_runner.py:732] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=2098261)[0;0m INFO 12-18 06:34:52 model_runner.py:732] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=2098262)[0;0m INFO 12-18 06:34:52 model_runner.py:732] Loading model weights took 3.7710 GB
INFO 12-18 06:34:53 model_runner.py:732] Loading model weights took 3.7710 GB
INFO 12-18 06:35:06 distributed_gpu_executor.py:56] # GPU blocks: 48911, # CPU blocks: 8192
[1;36m(VllmWorkerProcess pid=2098262)[0;0m INFO 12-18 06:35:08 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
[1;36m(VllmWorkerProcess pid=2098262)[0;0m INFO 12-18 06:35:08 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memor



**Construct a 2-Level Multi-Dimensional Taxonomy**

In [7]:
from prompts import multi_dim_prompt, NodeListSchema

In [8]:
# we want to make this a directed acyclic graph (DAG) so maintain a list of the nodes
# roots: dimension name -> root Node
# id2node / label2node: lookups over every node created so far
roots = {}
id2node = {}
label2node = {}
idx = 0

# One root per dimension, labelled "<topic>_<dimension>" with the topic
# lower-cased and its spaces replaced by underscores.
for dim in args.dimensions:
    mod_topic = args.topic.replace(' ', '_').lower() + f"_{dim}"
    root = Node(
            id=idx,
            label=mod_topic,
            dimension=dim
        )
    roots[dim] = root
    id2node[idx] = root
    label2node[mod_topic] = root
    idx += 1

In [9]:
# Breadth-first expansion: start from every node registered so far (the roots).
queue = deque([node for id, node in id2node.items()])

# if taking long, you can probably parallelize this between the different taxonomies (expand by level)
while queue:
    curr_node = queue.popleft()
    label = curr_node.label
    dim = curr_node.dimension
    # expand
    system_instruction, main_prompt, json_output_format = multi_dim_prompt(curr_node)
    prompts = [constructPrompt(args, system_instruction, main_prompt + "\n\n" + json_output_format)]
    outputs = promptLLM(args=args, prompts=prompts, schema=NodeListSchema, max_new_tokens=3000, json_mode=True, temperature=0.1, top_p=0.99)[0]
    # Parse the reply: go through clean_json_string only when it contains a code fence.
    outputs = json.loads(clean_json_string(outputs)) if "```" in outputs else json.loads(outputs.strip())
    # The child mapping sits under 'root_topic' if present, otherwise under the node's own label.
    outputs = outputs['root_topic'] if 'root_topic' in outputs else outputs[label]

    # add all children
    for key, value in outputs.items():
        # Normalise the label and append the dimension suffix.
        # NOTE(review): when the LLM's key already ends with the dimension name this yields
        # doubled suffixes (e.g. "machine_translation_datasets_datasets") — confirm intended.
        mod_key = key.replace(' ', '_').lower() + f"_{dim}"
        if mod_key not in label2node:
            # New label: create the node, register it, and queue it for further
            # expansion while its level is below args.init_levels.
            child_node = Node(
                    id=len(id2node),
                    label=mod_key,
                    dimension=dim,
                    description=value['description'],
                    parents=[curr_node]
                )
            curr_node.add_child(mod_key, child_node)
            id2node[child_node.id] = child_node
            label2node[mod_key] = child_node
            if child_node.level < args.init_levels:
                queue.append(child_node)
        elif label2node[mod_key] in label2node[label].get_ancestors():
            # Existing node is reported as an ancestor of the current node: skip the
            # edge (presumably to keep the graph acyclic).
            continue
        else:
            # Existing node elsewhere in the graph: link it as an additional child.
            child_node = label2node[mod_key]
            curr_node.add_child(mod_key, child_node)
            child_node.add_parent(curr_node)

100%|██████████| 1/1 [00:02<00:00,  2.55s/it]
100%|██████████| 1/1 [00:02<00:00,  2.54s/it]
100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
100%|██████████| 1/1 [00:01<00:00,  1.82s/it]
100%|██████████| 1/1 [00:02<00:00,  2.24s/it]
100%|██████████| 1/1 [00:01<00:00,  1.81s/it]
100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
100%|██████████| 1/1 [00:01<00:00,  1.74s/it]
100%|██████████| 1/1 [00:01<00:00,  1.72s/it]
100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
100%|██████████| 1/1 [00:02<00:00,  2.03s/it]
100%|██████████| 1/1 [00:01<00:00,  1.77s/it]
100%|██████████| 1/1 [00:01<00:00,  1.82s/it]
100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
100%|██████████| 1/1 [00:01<00:00,  1.51s/it]
100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
100%|██████████| 1/1 [00:01<00:00,  1.47s/it]
100%|██████████| 1/1 [00:01<00:00,  1.78s/it]
100%|██████████| 1/1 [00:01<00:00,  1.71s/it]
100%|██████████| 1/1 [00:01<00:00,

In [10]:
roots

{'tasks': Node(label=natural_language_processing_tasks, dim=tasks, description=None, level=0),
 'datasets': Node(label=natural_language_processing_datasets, dim=datasets, description=None, level=0),
 'methodologies': Node(label=natural_language_processing_methodologies, dim=methodologies, description=None, level=0),
 'evaluation_methods': Node(label=natural_language_processing_evaluation_methods, dim=evaluation_methods, description=None, level=0),
 'real_world_domains': Node(label=natural_language_processing_real_world_domains, dim=real_world_domains, description=None, level=0)}

In [None]:
roots['tasks'].children['text_classification_tasks'].children

In [None]:
roots['tasks'].display(0, indent_multiplier=5)

**Read in dataset**

In [11]:
from datasets import load_dataset
from tqdm import tqdm
from paper import Paper

In [12]:
# Create the dataset directory (and any missing parents). exist_ok=True removes
# the check-then-create race and keeps the cell safe to re-run.
os.makedirs(args.data_dir, exist_ok=True)

In [13]:
ds = load_dataset("EMNLP/EMNLP2024-papers")

In [14]:
# Dump every record of ds['train'] to internal.txt (one JSON object per line)
# and keep an in-memory Paper per record, keyed by its 0-based position.
# NOTE(review): the file name is hard-coded although Args defines `internal` — confirm which is intended.
internal_collection = {}

with open(os.path.join(args.data_dir, 'internal.txt'), 'w') as internal_file:
    # enumerate supplies the ids, so the builtin `id` is no longer shadowed
    # for the rest of the kernel session.
    for paper_id, p in enumerate(tqdm(ds['train'])):
        temp_dict = {"Title": p['title'], "Abstract": p['abstract']}
        formatted_dict = json.dumps(temp_dict)
        internal_file.write(f'{formatted_dict}\n')
        internal_collection[paper_id] = Paper(paper_id, p['title'], p['abstract'], label_opts=args.dimensions, internal=True)

internal_count = len(internal_collection)
print(f'Internal: {internal_count}')

100%|██████████| 2954/2954 [00:00<00:00, 5226.88it/s]


Internal: 2954


In [15]:
external_ds = load_dataset("TimSchopf/nlp_taxonomy_data")

In [16]:
# Dump every record of external_ds['train'] to external.txt (one JSON object
# per line) and keep an in-memory Paper per record. Ids continue after the
# internal collection, so the two collections never share an id.
external_collection = {}

with open(os.path.join(args.data_dir, 'external.txt'), 'w') as external_file:
    # enumerate(start=...) replaces the manual counter and avoids rebinding
    # the builtin `id` at notebook scope.
    for paper_id, p in enumerate(tqdm(external_ds['train']), start=len(internal_collection)):
        temp_dict = {"Title": p['title'], "Abstract": p['abstract']}
        formatted_dict = json.dumps(temp_dict)
        external_file.write(f'{formatted_dict}\n')
        external_collection[paper_id] = Paper(paper_id, p['title'], p['abstract'], label_opts=args.dimensions, internal=False)

external_count = len(external_collection)
print(f'External Count: {external_count}')

100%|██████████| 178521/178521 [00:14<00:00, 12126.21it/s]

External Count: 178521





**Enrich each node with a set of terms and sentences**

In [17]:
from taxonomy import DAG
args.llm = 'vllm'

In [18]:
dags = {dim:DAG(root=root, dim=dim) for dim, root in roots.items()}

In [19]:
# Per-dimension accumulators for what enrich_dag returns.
enriched_phrases = {dim:[] for dim in args.dimensions}
enriched_sentences = {dim:[] for dim in args.dimensions}

# Enrich each dimension's DAG and collect the two sequences it returns.
# NOTE(review): presumably enrich_dag also stores the phrases/sentences on the nodes — confirm in taxonomy.py.
for dim, dag in dags.items():
    all_phrases, all_sentences = dag.enrich_dag(args, id2node)
    enriched_phrases[dim].extend(all_phrases)
    enriched_sentences[dim].extend(all_sentences)

Compiling FSM index for all state transitions: 100%|██████████| 1012/1012 [01:02<00:00, 16.17it/s]
Processed prompts: 100%|██████████| 29/29 [00:10<00:00,  2.65it/s, est. speed input: 1380.46 toks/s, output: 971.57 toks/s]
Processed prompts: 100%|██████████| 30/30 [00:09<00:00,  3.13it/s, est. speed input: 1575.81 toks/s, output: 1026.78 toks/s]
Processed prompts: 100%|██████████| 30/30 [00:10<00:00,  2.79it/s, est. speed input: 1522.69 toks/s, output: 1014.11 toks/s]
Processed prompts: 100%|██████████| 29/29 [00:10<00:00,  2.82it/s, est. speed input: 1515.20 toks/s, output: 972.94 toks/s]
Processed prompts: 100%|██████████| 30/30 [00:09<00:00,  3.02it/s, est. speed input: 1680.07 toks/s, output: 1001.69 toks/s]


In [None]:
roots['tasks'].children

In [None]:
roots['tasks'].children['text_summarization_tasks'].get_phrases()

**Identify Pseudo-labels for Dimension/Type Classification**

In [20]:
from prompts import type_cls_system_instruction, type_cls_main_prompt, TypeClsSchema

In [21]:
# do for internal collection

# One prompt per internal paper, in internal_collection's iteration order.
prompts = [constructPrompt(args, type_cls_system_instruction, type_cls_main_prompt(paper)) for paper in internal_collection.values()]
outputs = promptLLM(args=args, prompts=prompts, schema=TypeClsSchema, max_new_tokens=500, json_mode=True, temperature=0.1, top_p=0.99)
# Parse each reply: go through clean_json_string only when it contains a code fence.
outputs = [json.loads(clean_json_string(c)) if "```" in c else json.loads(c.strip()) for c in outputs]

# do for external collection
# TODO: nothing follows this heading in the cell — the external collection is not classified here.

Compiling FSM index for all state transitions: 100%|██████████| 139/139 [00:04<00:00, 30.37it/s]
Processed prompts: 100%|██████████| 2954/2954 [02:30<00:00, 19.67it/s, est. speed input: 12821.17 toks/s, output: 609.71 toks/s]


Compiling FSM index for all state transitions: 100%|██████████| 1082/1082 [01:01<00:00, 18.26it/s]toks/s, output: 707.01 toks/s]

In [22]:
# Reset the paper assignment on every dimension root.
for r in roots:
    roots[r].papers = {}
# type_dist: dimension -> list of papers the classifier marked for that dimension.
type_dist = {dim:[] for dim in args.dimensions}
# outputs[i] is matched to internal_collection[i] by position.
# NOTE(review): assumes internal ids are 0..n-1 in the same order the prompts were built — confirm.
for p_id, out in enumerate(outputs):
    internal_collection[p_id].labels = {}
    for key, val in out.items():
        # A truthy value means the paper belongs to dimension `key`.
        if val:
            type_dist[key].append(internal_collection[p_id])
            internal_collection[p_id].labels[key] = []
            roots[key].papers[p_id] = internal_collection[p_id]

In [None]:
for key, p in type_dist.items():
    print(key, len(p))

In [None]:
# Preview the first 10 papers assigned to one dimension root.
dim_type = 'datasets'
# Slice up front: the previous counter-based loop kept iterating over every
# remaining paper after the tenth one had been printed.
for paper in list(roots[dim_type].papers.values())[:10]:
    print(paper.title, paper.abstract)

**Zero-Shot Classification of Papers**

In [23]:
dags

{'tasks': <taxonomy.DAG at 0x7f45e454db80>,
 'datasets': <taxonomy.DAG at 0x7f45ee5e2e80>,
 'methodologies': <taxonomy.DAG at 0x7f45e45f5640>,
 'evaluation_methods': <taxonomy.DAG at 0x7f45e45f5eb0>,
 'real_world_domains': <taxonomy.DAG at 0x7f45e45f5f70>}

In [24]:
# Run classification over the DAG of a single dimension.
dim_type = 'datasets'
dag = dags[dim_type]
print(f'classification for {dim_type}; {len(dag.root.papers)} papers')
# NOTE(review): classify_dag's return value and side effects live in taxonomy.py —
# presumably it attaches papers to child nodes as it descends; confirm.
sample_out = dag.classify_dag(args, label2node=label2node)

classification for datasets; 675 papers
visiting: natural_language_processing_datasets; # of papers: 675


Compiling FSM index for all state transitions: 100%|██████████| 1082/1082 [01:08<00:00, 15.78it/s]
Processed prompts:   0%|          | 0/675 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 675/675 [03:55<00:00,  2.87it/s, est. speed input: 6674.48 toks/s, output: 281.13 toks/s] 


visiting: machine_translation_datasets_datasets; # of papers: 54


Processed prompts: 100%|██████████| 54/54 [00:16<00:00,  3.30it/s, est. speed input: 7759.36 toks/s, output: 339.23 toks/s]


visiting: question_answering_datasets_datasets; # of papers: 90


Processed prompts: 100%|██████████| 90/90 [00:29<00:00,  3.05it/s, est. speed input: 7675.73 toks/s, output: 357.42 toks/s]


visiting: sentiment_analysis_datasets_datasets; # of papers: 22


Processed prompts: 100%|██████████| 22/22 [00:07<00:00,  3.09it/s, est. speed input: 6188.88 toks/s, output: 310.14 toks/s]


visiting: named_entity_recognition_datasets_datasets; # of papers: 29


Processed prompts: 100%|██████████| 29/29 [00:10<00:00,  2.66it/s, est. speed input: 6814.21 toks/s, output: 255.63 toks/s]


visiting: text_classification_datasets_datasets; # of papers: 161


Processed prompts: 100%|██████████| 161/161 [00:46<00:00,  3.48it/s, est. speed input: 8331.31 toks/s, output: 336.15 toks/s]


In [None]:
roots['datasets'].display(0, indent_multiplier=5)

In [None]:
for idx, p in roots['datasets'].children['question_answering_datasets_datasets'].children['reading_comprehension_datasets_datasets'].papers.items():
    print(f'- {p.title}\n\t{p.abstract}\n')

**Expansion**

In [25]:
# Papers assigned to the 'datasets' root that none of its direct children holds.
# A comprehension keeps the loop variables local (the old loop rebound the
# notebook-level `idx` counter) and any() stops at the first child that has the paper.
unlabeled_papers = {
    paper_id: paper
    for paper_id, paper in roots['datasets'].papers.items()
    if not any(paper_id in child.papers for child in roots['datasets'].children.values())
}

In [26]:
from prompts import sibling_system_instruction, sibling_main_prompt, SiblingExpansionSchema

In [41]:
# Ask the LLM for a sibling-class suggestion for each paper the 'datasets' root's children did not absorb.
exp_prompts = [constructPrompt(args, sibling_system_instruction, sibling_main_prompt(paper, roots['datasets'])) for paper in unlabeled_papers.values()]
exp_outputs = promptLLM(args=args, prompts=exp_prompts, schema=SiblingExpansionSchema, max_new_tokens=300, json_mode=True, temperature=0.1, top_p=0.99)
# Parse each reply: go through clean_json_string only when it contains a code fence.
exp_outputs = [json.loads(clean_json_string(c)) if "```" in c else json.loads(c.strip()) for c in exp_outputs]

Processed prompts: 100%|██████████| 340/340 [00:02<00:00, 137.69it/s, est. speed input: 80730.26 toks/s, output: 2157.00 toks/s]


In [44]:
# 0, 6, 10
exp_outputs[0]

{'new_class_label': 'vision_language_datasets_datasets'}

In [52]:
# Total vs. distinct proposed labels, computed straight from exp_outputs so the
# cell runs top-to-bottom on a fresh kernel instead of relying on a variable
# left over from an earlier session.
proposed_labels = [i['new_class_label'] for i in exp_outputs]
len(proposed_labels), len(set(proposed_labels))

(340, 251)

In [53]:
# Distinct proposed labels in first-seen order. dict.fromkeys dedupes while
# preserving order, whereas list(set(...)) can come out in a different order
# on every kernel run because string hashing is randomised.
all_exp_options = list(dict.fromkeys(i['new_class_label'] for i in exp_outputs))

In [51]:
print(list(roots['datasets'].children.keys()))

['text_classification_datasets_datasets', 'named_entity_recognition_datasets_datasets', 'sentiment_analysis_datasets_datasets', 'question_answering_datasets_datasets', 'machine_translation_datasets_datasets']


In [54]:
print(all_exp_options)

['video_generation_datasets_datasets', 'sequential_sentence_classification_datasets_datasets', 'color_understanding_datasets_datasets', 'disinformation_detection_datasets_datasets', 'long-form_text_generation_datasets_datasets', 'document_understanding_datasets_datasets', 'multimodal_event_analysis_datasets_datasets', 'multimodal_model_evaluation_datasets_datasets', 'natural_language_processing_tasks', 'long_context_language_models_evaluation_datasets_datasets', 'legal_analysis_datasets_datasets', 'low_level_chart_question_answering_datasets_datasets', 'biomedical_translation_datasets_datasets', 'retrieval_augmentation_datasets_datasets', 'performance_prediction_datasets_datasets', 'zero-shot_commonsense_reasoning_datasets_datasets', 'multimodal_machine_translation_datasets_datasets', 'fact_checking_datasets_datasets', 'online_text_based_counseling_datasets_datasets', 'safety_risk_awareness_datasets_datasets', 'text-to-table_generation_datasets_datasets', 'multi-modal_language_models_d

In [35]:
def find_any_match(patterns, input_string):
    """
    Report whether at least one regex in ``patterns`` is found in ``input_string``.

    :param patterns: List of regex patterns (as strings)
    :param input_string: The string to search within
    :return: True if any pattern matches anywhere in the string, otherwise False
    """
    # Every pattern is compiled before any search runs, so an invalid pattern
    # raises even when an earlier one would have matched.
    matchers = list(map(re.compile, patterns))
    return any(matcher.search(input_string) is not None for matcher in matchers)

In [None]:
args.dimensions

In [None]:
pseudo_labels = {d:[] for d in args.dimensions}
paper_dims = {}

patterns = {"datasets": [r'introduce [\s\w]* benchmark', r'introduce [\s\w]* dataset', r'construct [\s\w]* benchmark', r'construct [\s\w]* dataset', r'propose [\s\w]* dataset', r'propose [\s\w]* benchmark', r'present [\s\w]* benchmark', r'present [\s\w]* dataset', r'develop [\s\w]* benchmark', r'develop [\s\w]* dataset', r'create [\s\w]* benchmark', r'create [\s\w]* dataset', r'provide [\s\w]* benchmark', r'provide [\s\w]* dataset', r'describe [\s\w]* benchmark', r'describe [\s\w]* dataset', r'propose a new benchmark', r'propose a new dataset', r'introduce a new benchmark', r'introduce a new dataset', r'we release [\s\w]* dataset', r'we release [\s\w]* benchmark', r'a new dataset for [\s\w]*', r'a new benchmark for [\s\w]*', r'dataset for [\s\w]* task', r'benchmark for [\s\w]* task', r'we present [\s\w]* dataset', r'we present [\s\w]* benchmark', r'dataset designed for [\s\w]*', r'benchmark designed for [\s\w]*', r'introducing [\s\w]* dataset', r'introducing [\s\w]* benchmark'],
            "methodologies": [r'introduce [\s\w]* method', r'propose [\s\w]* method', r'design [\s\w]* method', r'present [\s\w]* method', r'develop [\s\w]* method', r'introduce [\s\w]* approach', r'propose [\s\w]* approach', r'design [\s\w]* approach', r'present [\s\w]* approach', r'develop [\s\w]* approach', r'we propose [\s\w]* method', r'we propose [\s\w]* approach', r'we introduce [\s\w]* method', r'we introduce [\s\w]* approach', r'we present [\s\w]* method', r'we present [\s\w]* approach', r'propose a novel method', r'propose a novel approach', r'introduce a novel method', r'introduce a novel approach', r'present a novel method', r'present a novel approach', r'propose [\s\w]* framework', r'introduce [\s\w]* framework', r'present [\s\w]* framework', r'design [\s\w]* framework', r'we propose [\s\w]* framework', r'we introduce [\s\w]* framework', r'we present [\s\w]* framework', r'our proposed method [\s\w]*', r'our proposed approach [\s\w]*', r'our proposed framework [\s\w]*', r'this paper proposes [\s\w]* method', r'this paper introduces [\s\w]* method', r'this paper presents [\s\w]* method', r'this paper develops [\s\w]* method', r'this paper proposes [\s\w]* approach', r'this paper introduces [\s\w]* approach', r'this paper presents [\s\w]* approach', r'this paper develops [\s\w]* approach', r'this paper proposes [\s\w]* framework', r'this paper introduces [\s\w]* framework', r'this paper presents [\s\w]* framework', r'this paper develops [\s\w]* framework'],
            "evaluation_methods": [r'construct a [\s\w]* evaluate', r'design a [\s\w]* evaluate', r'propose a [\s\w]* evaluate', r'introduce [\s\w]* evaluation method', r'propose [\s\w]* evaluation method', r'design [\s\w]* evaluation method', r'develop [\s\w]* evaluation method', r'introduce [\s\w]* evaluation metric', r'propose [\s\w]* evaluation metric', r'design [\s\w]* evaluation metric', r'develop [\s\w]* evaluation metric', r'propose a novel evaluation method', r'propose a novel evaluation metric', r'present a novel evaluation framework', r'introduce a framework for evaluation', r'this paper proposes [\s\w]* evaluation', r'this paper introduces [\s\w]* evaluation', r'introduce [\s\w]* automatic evaluation', r'propose [\s\w]* automatic evaluation', r'develop [\s\w]* automatic evaluation', r'design [\s\w]* automatic evaluation', r'propose a novel automatic evaluation method', r'automatic evaluation of [\s\w]* task', r'develop a method for automatic evaluation', r'introduce [\s\w]* human evaluation', r'propose [\s\w]* human evaluation', r'develop [\s\w]* human evaluation', r'design [\s\w]* human evaluation', r'propose a framework for human evaluation', r'introduce a novel human evaluation method', r'conduct human evaluation of [\s\w]*', r'compare human and automatic evaluation', r'comparison of human evaluation and automatic evaluation', r'human evaluation versus automatic evaluation', r'evaluate using both human and automatic methods', r'analyze results from human and automatic evaluation']}

def _collect_pseudo_labels(collection):
    """Regex-match every paper in ``collection`` against ``patterns``.

    For each dimension whose patterns hit the lower-cased "title: abstract"
    text, the paper is appended to ``pseudo_labels[dim]`` and the dimension
    is appended to ``paper_dims[paper_id]``.

    :param collection: mapping of paper id -> object with ``title`` and ``abstract``
    """
    # `paper_id` replaces the old loop variable `id`, which shadowed the builtin.
    for paper_id, paper in tqdm(collection.items(), total=len(collection)):
        # Build the searchable text once per paper rather than once per dimension.
        text = f'{paper.title}: {paper.abstract}'.lower()
        for dim_name, dim_patterns in patterns.items():
            if find_any_match(dim_patterns, text):
                pseudo_labels[dim_name].append(paper)
                paper_dims.setdefault(paper_id, []).append(dim_name)


# Internal papers first; the printed counts cover the internal collection only.
_collect_pseudo_labels(internal_collection)
print({dim: len(papers) for dim, papers in pseudo_labels.items()})

# External papers are added on top; the printed counts are cumulative.
_collect_pseudo_labels(external_collection)

print({dim: len(papers) for dim, papers in pseudo_labels.items()})

In [None]:
len(paper_dims), paper_dims

In [None]:
internal_collection[2494].abstract

**Loose Classification of Papers**

In [20]:
args.llm = 'vllm'
# initializeLLM(args)

In [None]:
len(internal_collection)

In [None]:
dags

In [None]:
dags['methodologies'].classify_dag(args, collection=dags['methodologies'].root.papers, label2node=label2node)

In [None]:
len(roots['methodologies'].children['text_classification'].papers)

In [None]:
roots

In [None]:
roots['datasets'].children

In [None]:
root.children

In [None]:
# Ids of papers held by `root` that none of its direct children holds.
# NOTE(review): `root` is not defined in this cell — it is whatever value an earlier
# cell left bound (e.g. the last root created in the construction loop). Confirm which
# dimension is intended and index `roots[...]` explicitly.
unlabeled = []

for paper_id, paper in tqdm(root.papers.items()):
    add = True
    for c in root.children.values():
        if paper_id in c.papers:
            add = False
    if add:
        unlabeled.append(paper_id)

In [None]:
len(unlabeled)

In [4]:
import pandas as pd
import pingouin as pg
import numpy as np

In [52]:
def _load_episode_annotations(path):
    """Read a tab-separated ``episode<TAB>text`` file, skipping blank lines.

    Returns ``(rows, lookup)``: ``rows`` is the list of split fields per
    non-blank line, and ``lookup`` maps each stripped text to its episode as
    an int, or -1 when the episode field is not all digits.
    """
    with open(path, "r") as f:
        rows = [line.strip().split('\t') for line in f.readlines() if len(line.strip())]
        lookup = {text.strip(): int(ep) if ep.isdigit() else -1 for ep, text in rows}
    return rows, lookup


# Human and Claude annotations of the same segments.
human, human_dict = _load_episode_annotations('/home/pk36/epimine/agreement/2019_legislative_hk_human.txt')
claude, claude_dict = _load_episode_annotations('/home/pk36/epimine/agreement/2019_legislative_hk_claude.txt')

In [66]:
# Long-format table for the ICC computation: one row per (segment, annotator).
# Annotator 0 rows take labels from human_dict, annotator 1 rows from claude_dict,
# both looked up by the text of each human row.
# NOTE(review): lookups use the raw i[1] while the dict keys were .strip()-ed; they only
# agree if the text field has no leading whitespace — confirm.
num_rows = len(human)
data = pd.DataFrame(data={'Segment': np.concatenate((np.arange(num_rows),np.arange(num_rows)), axis=0), 
                   'Annotator': np.array([0]*num_rows + [1]*num_rows),
                   'Episode': [human_dict[i[1]] for i in human] + [claude_dict[i[1]] for i in human]})

In [67]:
ep_icc = pg.intraclass_corr(data=data, targets='Segment', raters='Annotator',
                         ratings='Episode').round(3)
ep_icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.629,4.386,316,317,0.0,"[0.56, 0.69]"
ICC2,Single random raters,0.629,4.412,316,316,0.0,"[0.56, 0.69]"
ICC3,Single fixed raters,0.63,4.412,316,316,0.0,"[0.56, 0.69]"
ICC1k,Average raters absolute,0.772,4.386,316,317,0.0,"[0.72, 0.82]"
ICC2k,Average random raters,0.772,4.412,316,316,0.0,"[0.72, 0.82]"
ICC3k,Average fixed raters,0.773,4.412,316,316,0.0,"[0.72, 0.82]"


In [63]:
from sklearn.metrics import cohen_kappa_score
# Cohen's kappa between the two annotators, aligned on the keys of human_dict
# (one entry per distinct segment text, in insertion order).
# A text missing from claude_dict raises KeyError.
y1 = [human_dict[i] for i in human_dict]
y2 = [claude_dict[i] for i in human_dict]
cohen_kappa_score(y1, y2)

0.6136012364760433

In [43]:
# Human ground truth: one "<segment id>\t<episode id>" record per line.
# Episode ids are shifted by -1; a non-numeric episode field becomes -1.
# Blank lines are skipped, matching the other file readers in this notebook.
human = {}
with open('/shared/data3/yzhan238/episode/gt/2014_moco_shootings_gt.txt', "r") as f:
    for gt_line in f:
        if not len(gt_line.strip()):
            continue
        gt_fields = gt_line.strip().split('\t')
        human[int(gt_fields[0])] = int(gt_fields[1]) - 1 if gt_fields[1].isdigit() else -1  # segment id: episode id

# Automatic labels and segment text come from the same file, so read it once.
# auto:     line index -> episode id (-1 when the first field is not all digits)
# seg_text: line index -> second tab-separated field, for non-blank lines only
auto = {}
seg_text = {}
with open('/home/pk36/epimine/groundtruth/2014_moco_shootings_groundtruth.txt', "r") as f:
    for line_no, auto_line in enumerate(f):
        auto_fields = auto_line.strip().split('\t')
        auto[line_no] = int(auto_fields[0]) if auto_fields[0].isdigit() else -1
        if len(auto_line.strip()):
            seg_text[line_no] = auto_fields[1]

In [34]:
# Long-format table for the ICC computation: one row per (segment, annotator).
# Annotator 0 rows hold the values of `human`; annotator 1 rows hold `auto` looked up
# by the same keys.
# NOTE(review): 'Segment' is a plain 0..n-1 range, so it matches the keys of `human` only
# if those segment ids are themselves 0..n-1 in file order; `auto` is keyed by line
# index — confirm both files use the same id base.
num_rows = len(human)
data = pd.DataFrame(data={'Segment': np.concatenate((np.arange(num_rows),np.arange(num_rows)), axis=0), 
                   'Annotator': np.array([0]*num_rows + [1]*num_rows),
                   'Episode': [human[i] for i in human] + [auto[i] for i in human]})

In [39]:
data.pivot(index='Segment', columns='Annotator').head(40)

Unnamed: 0_level_0,Episode,Episode
Annotator,0,1
Segment,Unnamed: 1_level_2,Unnamed: 2_level_2
0,4,-1
1,4,-1
2,-1,0
3,4,-1
4,3,4
5,-1,-1
6,-1,0
7,5,-1
8,1,1
9,-1,0


In [35]:
ep_icc = pg.intraclass_corr(data=data, targets='Segment', raters='Annotator',
                         ratings='Episode').round(3)
ep_icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,-0.08,0.852,65,66,0.741,"[-0.31, 0.16]"
ICC2,Single random raters,-0.088,0.841,65,65,0.757,"[-0.33, 0.16]"
ICC3,Single fixed raters,-0.087,0.841,65,65,0.757,"[-0.32, 0.16]"
ICC1k,Average raters absolute,-0.174,0.852,65,66,0.741,"[-0.91, 0.28]"
ICC2k,Average random raters,-0.192,0.841,65,65,0.757,"[-0.97, 0.27]"
ICC3k,Average fixed raters,-0.189,0.841,65,65,0.757,"[-0.94, 0.27]"


In [18]:
num_rows = 23
data = pd.DataFrame(data={'Episode': np.concatenate((np.arange(num_rows),np.arange(num_rows)), axis=0), 
                   'Annotator': np.array([0]*num_rows + [1]*num_rows),
                   'precision': [0.2222,0.6667,0,1,0,1,0,0.25,0,0,0.5,1,0,0.5,0,0.5,1,0.6,0,0.29,0.2,0.5,0,0.5,0.5,0.2,0.5,0.5,0,0.3333,0,0,0,0,1,0.3333,0,0,0,0,0,0,0.2857,0,0,0],
                   'recall': [0.5,0.333,0,0.25,0,0.2,0,0.111,0,0,0.222222,0.4,0,0.125,0,0.4,0.125,0.375,0,0.22,0.2,0.14,0,0.5,0.1667,0.25,0.25,0.2857,0,0.2,0,0,0,0,0.2857,0.125,0,0,0,0,0,0,0.2,0,0,0],
                   'f1': [0.3077,0.444,0,0.4,0,0.333,0,0.153,0,0,0.30769,0.75,0,0.2,0,0.44,0.22,0.46,0,0.25,0.2,0.22,0,0.5,0.25,0.2222,0.3333,0.3636,0,0.25,0,0,0,0,0.4444,0.1818,0,0,0,0,0,0,0.2353,0,0,0]})

In [19]:
prec_icc = pg.intraclass_corr(data=data, targets='Episode', raters='Annotator',
                         ratings='precision').round(3)
prec_icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.165,1.394,22,23,0.217,"[-0.25, 0.53]"
ICC2,Single random raters,0.207,1.601,22,22,0.139,"[-0.16, 0.55]"
ICC3,Single fixed raters,0.231,1.601,22,22,0.139,"[-0.19, 0.58]"
ICC1k,Average raters absolute,0.283,1.394,22,23,0.217,"[-0.67, 0.69]"
ICC2k,Average random raters,0.343,1.601,22,22,0.139,"[-0.38, 0.71]"
ICC3k,Average fixed raters,0.375,1.601,22,22,0.139,"[-0.47, 0.74]"


In [20]:
recall_icc = pg.intraclass_corr(data=data, targets='Episode', raters='Annotator',
                         ratings='recall').round(3)
recall_icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.314,1.914,22,23,0.065,"[-0.1, 0.64]"
ICC2,Single random raters,0.329,2.048,22,22,0.05,"[-0.06, 0.64]"
ICC3,Single fixed raters,0.344,2.048,22,22,0.05,"[-0.07, 0.66]"
ICC1k,Average raters absolute,0.478,1.914,22,23,0.065,"[-0.21, 0.78]"
ICC2k,Average random raters,0.495,2.048,22,22,0.05,"[-0.13, 0.78]"
ICC3k,Average fixed raters,0.512,2.048,22,22,0.05,"[-0.15, 0.79]"


In [21]:
f1_icc = pg.intraclass_corr(data=data, targets='Episode', raters='Annotator',
                         ratings='f1').round(3)
f1_icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.247,1.657,22,23,0.118,"[-0.17, 0.59]"
ICC2,Single random raters,0.271,1.812,22,22,0.086,"[-0.11, 0.6]"
ICC3,Single fixed raters,0.289,1.812,22,22,0.086,"[-0.13, 0.62]"
ICC1k,Average raters absolute,0.397,1.657,22,23,0.118,"[-0.4, 0.74]"
ICC2k,Average random raters,0.426,1.812,22,22,0.086,"[-0.25, 0.75]"
ICC3k,Average fixed raters,0.448,1.812,22,22,0.086,"[-0.3, 0.77]"
