In [1]:
from datasets import load_dataset


# Load the generated ACE dataset. Its "train", "validation" and "test" splits
# are the ones read by save() further down in this notebook.
ace_gen = load_dataset("datht/ace-short-generated-dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# System message stored verbatim in every generated record. It is never passed
# through str.format(), so the literal JSON braces below need no escaping.
system_prompt = """You are an open-domain event extraction system. Your task is to identify events expressed or clearly implied in a given text.
IMPORTANT:Output ONLY valid JSON. No explanations, no markdown, no extra text.
Output Format (JSON only, no markdown):

{
  "events": [
    [<trigger span>, <event type>, [[<argument span>, <semantic role>], [<argument span 2>, <semantic role 2>]], <description>],
    [<trigger span 2>, <event type 2>, [], <description 2>]
  ]
}

- If no events are detected, return: {"events": []}"""

# User message template. process_ace_data() fills it with
# user_prompt.format(input=<sentence>), so `{input}` must stay the only brace
# pair in this string; any literal brace added later has to be doubled.
# NOTE(review): the last guideline mentions a "text" field, but the output
# format declared in system_prompt has no field with that name -- confirm
# whether it is meant to refer to the trigger/argument spans.
user_prompt = """Given an input text: {input}

Your task is to extract all events present in the text. The text may contain zero, one, or multiple events.

For each event:
- Identify event type
- Identify a trigger: the specific word or phrase whose presence signals the occurrence of an event instance.
- Extract all relevant arguments participating in the event.
- Each argument must be an exact span from the text and assigned a semantic role.
- Description: explaining what this event type means

Constraints and Guidelines
- Do not invent information not supported by the text.
- Do not paraphrase triggers or arguments.
- The "text" field must exactly match a span in the original input.
"""

In [3]:
from transformers import AutoTokenizer

# Qwen3-4B tokenizer, configured to pad on the right.
# NOTE(review): no other cell visible in this notebook references `tokenizer`;
# presumably it was meant for prompt/response length statistics -- confirm or drop.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-4B', padding_side="right")

In [4]:
import json
import numpy as np
import random

# Event-type name -> integer label id; process_ace_data() uses it to decide
# whether an event belongs to the task stream currently being exported.
with open("data/ext-data/ace/label2id.json", encoding="utf-8") as label_file:
    label2id = json.load(label_file)

# One entry per task stream; each entry is handed to save() as `tasks`.
with open("data/ext-data/ace/streams.json", encoding="utf-8") as stream_file:
    streams = json.load(stream_file)

In [None]:
def _build_example(sentence, events):
    """Build one prompt/response record for a sentence and its event list."""
    return {
        "system_prompt": system_prompt,
        "user_prompt": user_prompt.format(input=sentence),
        "response": json.dumps({"events": events}),
    }


def process_ace_data(raw_data, buffer, is_test=False, tasks=None, eval_task=None,
                     max_buffer_per_type=5):
    """Turn ACE documents into sentence-level prompt/response records.

    Every sentence that carries at least one event whose label id
    (``label2id[event["type"]]``) is in ``tasks`` becomes one record. Sentences
    without such an event become "no event" records with the response
    ``{"events": []}``.

    Args:
        raw_data: Iterable of document dicts. Each has a ``"content"`` list of
            ``{"sentence": ...}`` items and optionally an ``"events"`` list.
            Events are read through the keys ``type_id``, ``type``,
            ``description`` and ``mention``; mentions through ``sent_id``,
            ``trigger_word`` and ``arguments`` (items with ``text``/``role``).
        buffer: Replay list shared between calls. When ``is_test`` is false its
            current content is appended to the returned data and it is then
            extended IN PLACE with records collected during this call.
        is_test: When true, return only the event-bearing records: no
            "no event" samples, no replay records, ``buffer`` left untouched.
        tasks: Collection of label ids to keep. ``None`` (default) behaves like
            an empty collection, i.e. no event is selected.
        eval_task: Unused; kept only for backward compatibility.
        max_buffer_per_type: Maximum number of records per event type that a
            single call may add to ``buffer``.

    Returns:
        A 4-tuple ``(data, full_len, prompt_len, response_len)``. ``data`` is
        the list of records; the three length lists are always empty and only
        preserve the historical return shape.

    Raises:
        KeyError: If an event type is missing from ``label2id`` or a mention
            references a sentence id that is not part of the document.
    """
    active_tasks = () if tasks is None else tasks

    data = []
    none_data = []
    new_replay = {}  # event type -> records gathered for the replay buffer

    for sample in raw_data:
        sent_id_to_sentence = {
            i: content["sentence"] for i, content in enumerate(sample["content"])
        }
        events_by_sent = {}
        replay_type_by_sent = {}  # sentence id -> event type it may be replayed for

        for event in sample.get("events", []):
            if event.get("type_id", -1) == -1:
                continue
            event_type = event.get("type")
            if label2id[event_type] not in active_tasks:
                continue
            description = event.get("description", "")
            collected = new_replay.setdefault(event_type, [])

            for mention in event.get("mention", []):
                sent_id = mention.get("sent_id")
                arguments = [
                    [arg["text"], arg["role"]] for arg in mention.get("arguments", [])
                ]
                events_by_sent.setdefault(sent_id, []).append(
                    [mention.get("trigger_word"), event_type, arguments, description]
                )
                if len(collected) < max_buffer_per_type:
                    replay_type_by_sent[sent_id] = event_type

        for sent_id, events in events_by_sent.items():
            record = _build_example(sent_id_to_sentence[sent_id], events)
            data.append(record)

            # Re-check the cap at insertion time: the check in the mention loop
            # runs before anything from this document has been stored, so on
            # its own it would let one document exceed the per-type limit.
            replay_type = replay_type_by_sent.get(sent_id)
            if (replay_type is not None
                    and len(new_replay[replay_type]) < max_buffer_per_type):
                new_replay[replay_type].append(record)

        for sent_id, sentence in sent_id_to_sentence.items():
            if sent_id not in events_by_sent:
                none_data.append(_build_example(sentence, []))

    if not is_test:
        # Add "no event" sentences worth at most 10% of the positive records.
        n_none = min(len(none_data), len(data) // 10)
        data.extend(random.sample(none_data, n_none))
        # Replay records from earlier streams first, then remember this stream's.
        data.extend(buffer)
        for records in new_replay.values():
            buffer.extend(records)

    return data, [], [], []

In [6]:
import os

def save(data, data_name, tasks, buffer):
    """Export the train/dev/test splits of one task stream as JSON Lines files.

    Each split is run through process_ace_data() and written to
    ``data/<data_name>/{train,dev,test}.jsonl``, one JSON object per line.
    Only the train split receives the shared replay ``buffer``; the
    "validation" and "test" splits are processed in test mode with a
    throw-away empty buffer.

    Returns:
        The tuple ``(train, dev, test)`` of record lists that were written.
    """
    out_dir = f"data/{data_name}"
    os.makedirs(out_dir, exist_ok=True)

    def dump_jsonl(records, file_stem):
        # One JSON document per line; non-ASCII text is written unescaped.
        with open(f"{out_dir}/{file_stem}.jsonl", "w", encoding="utf-8") as sink:
            sink.writelines(
                json.dumps(record, ensure_ascii=False) + "\n" for record in records
            )

    train, _, _, _ = process_ace_data(data["train"], buffer=buffer, tasks=tasks)
    dump_jsonl(train, "train")

    dev, _, _, _ = process_ace_data(data["validation"], [], is_test=True, tasks=tasks)
    dump_jsonl(dev, "dev")

    test, _, _, _ = process_ace_data(data["test"], [], is_test=True, tasks=tasks)
    dump_jsonl(test, "test")

    return train, dev, test

In [None]:
# The "no event" sentences added to each train split are drawn with
# random.sample(); seed the global generator so that a Restart & Run All
# writes the same files every time.
SEED = 42
random.seed(SEED)

# Replay buffer shared by all streams: save() passes it to process_ace_data(),
# which extends it in place, so stream k trains on records kept from streams < k.
buffer = []
for stream_idx, stream_tasks in enumerate(streams):
    save(ace_gen, f"ace/{stream_idx}", stream_tasks, buffer)

