In [17]:
from datasets import load_dataset


# Load the "datht/ace-generated-dataset" dataset through `datasets.load_dataset`.
# NOTE(review): `ace_gen` is not referenced by any of the visible cells below — confirm it is still needed.
ace_gen = load_dataset("datht/ace-generated-dataset")

In [18]:
# ACE event dataset; its splits are later fed to `process_ace_data` and written out by `save`.
ace = load_dataset("datht/ace-event-dataset")

In [19]:
# System message shared by every training example. It is used verbatim (never passed
# through `str.format`), so the JSON braces below are literal characters.
system_prompt = """You are an open-domain event extraction system. Your task is to identify events expressed or clearly implied in a given text.
IMPORTANT:Output ONLY valid JSON. No explanations, no markdown, no extra text.
Output Format (JSON only, no markdown):

{
  "events": [
    {
      "trigger_text": <trigger span>,
      "type": <event type>,
      "arguments": [
        {"text": <argument span>, "role": <semantic role>},
        {"text": <argument span 2>, "role": <semantic role 2>}
      ]
    },
    {
      "trigger_text": <trigger span 2>,
      "type": <event type 2>,
      "arguments": []
    }
  ]
}

- If no events are detected, return: {"events": []}"""

# User-message template. `process_ace_data` fills it with `user_prompt.format(input=<sentence>)`,
# so `{input}` is the only placeholder; any literal brace added to this text later must be
# doubled (`{{` / `}}`) or `str.format` will raise.
user_prompt = """Given an input text: {input}

Your task is to extract all events present in the text. The text may contain zero, one, or multiple events.

For each event:
- Identify event type
- Identify a trigger: the word or phrase that most clearly indicates the event.
- Extract all relevant arguments participating in the event.
- Each argument must be an exact span from the text and assigned a semantic role.
- Argument roles are open-ended and may include: Agent, Patient, Target, Time, Location, Instrument, Victim, Attacker, Artifact, Beneficiary, ...

Constraints and Guidelines
- Do not invent information not supported by the text.
- Do not paraphrase triggers or arguments.
- The "text" field must exactly match a span in the original input.
"""

In [20]:
from transformers import AutoTokenizer

# Tokenizer whose chat template and encoder `process_ace_data` uses to measure
# prompt / response / full-sequence token lengths. Padding is configured on the right.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-4B-Instruct-2507', padding_side="right")

In [21]:
# Inspect the train split: column names and number of rows (documents).
ace["train"]

Dataset({
    features: ['id', 'title', 'content', 'events', 'negative_triggers'],
    num_rows: 529
})

In [22]:
import json
import numpy as np
import random


def _build_example(sentence, events):
    """Return one chat-style record pairing ``sentence`` with its gold ``events``.

    Reads the notebook-level ``system_prompt`` and ``user_prompt`` templates.
    """
    return {
        "system_prompt": system_prompt,
        "user_prompt": user_prompt.format(input=sentence),
        # ensure_ascii=False keeps non-ASCII spans verbatim: the prompt requires every
        # "text" field to match the input exactly, which "\uXXXX" escapes would violate.
        "response": json.dumps({"events": events}, ensure_ascii=False),
    }


def _token_lengths(example):
    """Return ``(full, prompt, response)`` token counts for one record.

    Uses the notebook-level ``tokenizer``: the prompt is rendered with its chat
    template and the response is followed by the tokenizer's EOS token.
    """
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": example["system_prompt"]},
            {"role": "user", "content": example["user_prompt"]},
        ],
        add_generation_prompt=True,
        tokenize=False,
    )
    full = prompt + example["response"] + tokenizer.eos_token

    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    full_tokens = tokenizer.encode(full, add_special_tokens=False)
    # The response length is whatever follows the prompt inside the full encoding.
    response_tokens = full_tokens[len(prompt_tokens):]
    return len(full_tokens), len(prompt_tokens), len(response_tokens)


def process_ace_data(raw_data, negatives_per_doc=1, negative_divisor=100, seed=None):
    """Turn document-level event annotations into sentence-level prompt/response records.

    Every sentence that carries at least one event mention becomes a positive record
    whose response is the JSON list of its events. From the remaining sentences, up to
    ``negatives_per_doc`` per document are drawn as candidates with an empty event
    list, and at most ``len(positives) // negative_divisor`` of those candidates are
    appended after the positives.

    Relies on the notebook-level ``system_prompt``, ``user_prompt`` and ``tokenizer``.

    Args:
        raw_data: Iterable of documents. Each document needs a ``"content"`` list of
            items with a ``"sentence"`` key and may have an ``"events"`` list whose
            entries hold ``"type_id"``, ``"type"`` and a ``"mention"`` list
            (``"sent_id"``, ``"trigger_word"``, ``"arguments"``).
        negatives_per_doc: Maximum event-free sentences sampled per document.
        negative_divisor: Positive-to-negative cap; negatives kept is
            ``len(positives) // negative_divisor``.
        seed: Optional seed for reproducible sampling. ``None`` uses the global
            ``random`` module state, exactly as before.

    Returns:
        ``(data, full_len, prompt_len, response_len)``: the records plus three parallel
        lists of token counts (prompt + response + EOS, prompt only, response + EOS).

    Raises:
        ValueError: If ``negatives_per_doc`` is negative or ``negative_divisor`` is not
            positive.
    """
    if negatives_per_doc < 0:
        raise ValueError("negatives_per_doc must be non-negative")
    if negative_divisor <= 0:
        raise ValueError("negative_divisor must be a positive integer")

    # The module and a Random instance both expose .sample(), so the default path
    # consumes the global random state in the same order as the original code.
    rng = random if seed is None else random.Random(seed)

    data = []
    none_data = []

    for sample in raw_data:
        sent_id_to_sentence = {i: content["sentence"] for i, content in enumerate(sample["content"])}

        sent_to_existing_events = {}
        for event in sample.get("events") or []:
            # Events without a valid type id are not part of the label set.
            if event.get("type_id", -1) == -1:
                continue
            event_type = event.get("type")

            for mention in event.get("mention") or []:
                sent_id = mention.get("sent_id")
                # Skip mentions pointing at a missing / unknown sentence instead of
                # failing later with a KeyError on the sentence lookup.
                if sent_id not in sent_id_to_sentence:
                    continue

                arguments = mention.get("arguments")
                if arguments is None:
                    # The prompt schema promises a list, never null.
                    arguments = []

                sent_to_existing_events.setdefault(sent_id, []).append(
                    {
                        "trigger_text": mention.get("trigger_word"),
                        "type": event_type,
                        "arguments": arguments,
                    }
                )

        for sent_id, events in sent_to_existing_events.items():
            data.append(_build_example(sent_id_to_sentence[sent_id], events))

        # Event-free sentences in document order (dicts preserve insertion order), so a
        # fixed seed always picks the same negatives.
        negative_ids = [i for i in sent_id_to_sentence if i not in sent_to_existing_events]
        for sent_id in rng.sample(negative_ids, min(len(negative_ids), negatives_per_doc)):
            none_data.append(_build_example(sent_id_to_sentence[sent_id], []))

    # Keep negatives rare relative to positives.
    keep = min(len(none_data), len(data) // negative_divisor)
    data.extend(rng.sample(none_data, keep))

    full_len = []
    prompt_len = []
    response_len = []
    for example in data:
        n_full, n_prompt, n_response = _token_lengths(example)
        full_len.append(n_full)
        prompt_len.append(n_prompt)
        response_len.append(n_response)

    return data, full_len, prompt_len, response_len

In [23]:
# Build prompt/response records and per-record token lengths for the ACE train split.
data, full_len, prompt_len, response_len = process_ace_data(ace["train"])

In [24]:
# Longest response (JSON answer plus EOS), in tokens.
np.max(response_len)

np.int64(346)

In [25]:
# Median prompt length (chat-templated system + user message), in tokens.
np.median(prompt_len)

np.float64(360.0)

In [26]:
# Median full-sequence length (prompt + response + EOS), in tokens.
np.median(full_len)

np.float64(405.0)

In [27]:
# Number of records whose prompt is longer than 460 tokens.
# NOTE(review): 460 is presumably a candidate prompt-length cap — confirm and lift it into a named constant.
np.sum(np.array(prompt_len) > 460)

np.int64(4)

In [28]:
# Longest full sequence (prompt + response + EOS), in tokens.
np.max(full_len)

np.int64(754)

In [29]:
# Total number of records: sentences with events plus the sampled event-free sentences.
len(data)

3167

In [30]:
# Mean full-sequence length (prompt + response + EOS), in tokens.
np.mean(full_len)

np.float64(415.62898642248183)

In [31]:
def save(data, data_name):
    """Convert each split of ``data`` with ``process_ace_data`` and write it as JSONL.

    Writes ``data/<data_name>/train.jsonl``, ``dev.jsonl`` and ``test.jsonl``, one JSON
    object per line, UTF-8 encoded with non-ASCII characters kept verbatim. The output
    directory is created when it does not exist yet.

    Args:
        data: Mapping indexable by ``"train"``, ``"validation"`` and ``"test"``; each
            value is passed unchanged to ``process_ace_data``.
        data_name: Sub-directory of ``data/`` that receives the three files.
    """
    import os  # local import: the visible notebook cells do not import os

    out_dir = f"data/{data_name}"
    # Without this, open(..., "w") fails with FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # (split name in the dataset, file stem on disk) — "validation" is stored as "dev".
    for split, stem in (("train", "train"), ("validation", "dev"), ("test", "test")):
        # Only the records are written; the token-length statistics are discarded.
        records, _, _, _ = process_ace_data(data[split])
        with open(f"{out_dir}/{stem}.jsonl", "w", encoding="utf-8") as f:
            for item in records:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [32]:
# Write the ACE train/dev/test records to data/ace/*.jsonl.
save(ace, "ace")

In [29]:
# Load the "datht/geneva-generated-dataset" dataset through `datasets.load_dataset`.
# NOTE(review): this cell's recorded execution count (29) is out of sequence with its
# neighbours; re-run the notebook top to bottom to confirm the cell order still works.
geneva = load_dataset("datht/geneva-generated-dataset")

In [33]:
# Same record building and length statistics, now for the GENEVA train split.
# NOTE(review): this rebinds `data`, `full_len`, `prompt_len` and `response_len`, so the
# ACE statistics computed earlier are no longer reachable under these names.
data, full_len, prompt_len, response_len = process_ace_data(geneva["train"])

In [34]:
# Total number of GENEVA train records (positives plus sampled event-free sentences).
len(data)

1968

In [37]:
# Median full-sequence length (prompt + response + EOS) for GENEVA, in tokens.
np.median(full_len)

np.float64(596.0)

In [45]:
# Number of GENEVA records whose full sequence is longer than 1200 tokens.
# NOTE(review): 1200 is presumably a candidate max sequence length — confirm and name it.
np.sum(np.array(full_len) > 1200)

np.int64(21)

In [41]:
# Number of GENEVA records whose prompt is longer than 384 tokens.
# NOTE(review): 384 is presumably a candidate prompt-length cap — confirm and name it.
np.sum(np.array(prompt_len) > 384)

np.int64(34)