In [1]:
from datasets import load_dataset


ace_gen = load_dataset("datht/ace-short-generated-dataset")

In [None]:
# ace = load_dataset("datht/ace-event-dataset")

In [None]:
system_prompt = """You are an open-domain event extraction system. Your task is to identify events expressed or clearly implied in a given text.
IMPORTANT:Output ONLY valid JSON. No explanations, no markdown, no extra text.
Output Format (JSON only, no markdown):

{
  "events": [
    [<trigger span>, <event type>, [[<argument span>, <semantic role>], [<argument span 2>, <semantic role 2>]], <description>],
    [<trigger span 2>, <event type 2>, [], <description 2>]
  ]
}

- If no events are detected, return: {"events": []}"""

user_prompt = """Given an input text: {input}

Your task is to extract all events present in the text. The text may contain zero, one, or multiple events.

For each event:
- Identify event type
- Identify a trigger: the word or phrase that most clearly indicates the event.
- Extract all relevant arguments participating in the event.
- Each argument must be an exact span from the text and assigned a semantic role.
- Description: explaining what this event type means

Constraints and Guidelines
- Do not invent information not supported by the text.
- Do not paraphrase triggers or arguments.
- The "text" field must exactly match a span in the original input.
"""

In [42]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-4B', padding_side="right")

In [None]:
tokenizer.eos_token_id = 151645

151645

In [40]:
tokenizer.bos_token

In [43]:
tokenizer.apply_chat_template(
                [{"role": "system", "content": "sample['system_prompt']"},
                {"role": "user", "content": "sample['user_prompt']"}],
                add_generation_prompt=True,
                tokenize=False 
            )

"<|im_start|>system\nsample['system_prompt']<|im_end|>\n<|im_start|>user\nsample['user_prompt']<|im_end|>\n<|im_start|>assistant\n"

In [39]:
tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-0.5B', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, norma

In [59]:
import json
import numpy as np
import random


def process_ace_data(raw_data, is_test=False):
    full_len = []
    sent_total = 0
    data = []
    prompt_len = []
    response_len = []
    none_data = []

    for sample in raw_data:
        sent_id_to_sentence = {i: content['sentence'] for i, content in enumerate(sample["content"])}
        sent_id_set = set(sent_id_to_sentence.keys())
        sent_total += len(sent_id_to_sentence)
        sent_to_existing_events = {}
        for event in sample.get("events", []):
            if event.get("type_id", -1) == -1: continue
            event_type = event.get("type")
            description = event.get("description")
            
            for mention in event.get("mention", []):
                sent_id = mention.get("sent_id")
                             
                if sent_id not in sent_to_existing_events:
                    sent_to_existing_events[sent_id] = []
                args = [{"text": arg["text"], "role": arg["role"]} for arg in mention.get("arguments", [])]
                if description:
                    event_info = {
                        "description": description,
                        "trigger_text": mention.get("trigger_word"),
                        "type": event_type,
                        "arguments": args 
                    }
                else:
                    event_info = {
                        "trigger_text": mention.get("trigger_word"),
                        "type": event_type,
                        "arguments": args 
                    }
                    print("no desc")

                sent_to_existing_events[sent_id].append(event_info)

        for sent_id, events in sent_to_existing_events.items():
            sent_txt = sent_id_to_sentence[sent_id]
            sent_id_set.remove(sent_id)

            response = json.dumps({"events": events})

            data.append({"system_prompt": system_prompt, "user_prompt": user_prompt.format(input=sent_txt), "response": response})

        for sent_id in random.sample(list(sent_id_set), min(len(sent_id_set), 1)):
            sent_txt = sent_id_to_sentence[sent_id]

            response = json.dumps({"events": []})
            
            none_data.append({"system_prompt": system_prompt, "user_prompt": user_prompt.format(input=sent_txt), "response": response})


    if is_test:
        data.extend(none_data)
    else:
        data.extend(random.sample(list(none_data), min(len(none_data), len(data) // 100)))
    for sample in data:
        prompt = tokenizer.apply_chat_template(
                [{"role": "system", "content": sample['system_prompt']},
                {"role": "user", "content": sample['user_prompt']}],
                add_generation_prompt=True,
                tokenize=False 
            )
        full = prompt + sample['response'] + tokenizer.eos_token
        
        prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
        full_tokens = tokenizer.encode(full, add_special_tokens=False)
        response_tokens = full_tokens[len(prompt_tokens):]

        full_len.append(len(full_tokens))
        prompt_len.append(len(prompt_tokens))
        response_len.append(len(response_tokens))

    return data, full_len, prompt_len, response_len

In [54]:
data, full_len, prompt_len, response_len = process_ace_data(ace_gen["train"])

In [55]:
type_set = set()
for item in data:
    events = json.loads(item['response'])["events"]
    for trigger in events:
        type_set.add(trigger['trigger_text'])

In [57]:
len (data)

3167

In [56]:
len(type_set)

918

In [47]:
np.max(response_len)

np.int64(513)

In [48]:
np.max(prompt_len)

np.int64(480)

In [8]:
np.median(full_len)

np.float64(359.0)

In [51]:
np.sum(np.array(prompt_len) > 460)

np.int64(1)

In [12]:
np.max(full_len)

np.int64(823)

In [13]:
len(data)

3167

In [46]:
np.mean(full_len)

np.float64(415.7120303125987)

In [21]:
import os

def save(data, data_name): 
    os.makedirs(f"data/{data_name}", exist_ok=True)
    train, _, _, _ = process_ace_data(data["train"])
    with open(f"data/{data_name}/train.jsonl", "w", encoding="utf-8") as f:
        for item in train:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    dev, _, prompt_len, _ = process_ace_data(data["validation"], is_test=True)
    with open(f"data/{data_name}/dev.jsonl", "w", encoding="utf-8") as f:
        for item in dev:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    test, _, prompt_len, _ = process_ace_data(data["test"], is_test=True)
    with open(f"data/{data_name}/test.jsonl", "w", encoding="utf-8") as f:
        for item in test:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    return train, dev, test

In [60]:
save(ace_gen, "ace")

no desc
no desc
no desc
no desc


([{'system_prompt': 'You are an open-domain event extraction system. Your task is to identify events expressed or clearly implied in a given text.\nIMPORTANT:Output ONLY valid JSON. No explanations, no markdown, no extra text.\nOutput Format (JSON only, no markdown):\n\n{\n  "events": [\n    {\n      "trigger_text": <trigger span>,\n      "type": <event type>,\n      "description": <description>,\n      "arguments": [\n        {"text": <argument span>, "role": <semantic role>},\n        {"text": <argument span 2>, "role": <semantic role 2>}\n      ]\n    },\n    {\n      "trigger_text": <trigger span 2>,\n      "type": <event type 2>,\n      "description": <description 2>,\n      "arguments": []\n    }\n  ]\n}\n\n- If no events are detected, return: {"events": []}',
   'user_prompt': 'Given an input text: Even as the secretary of homeland security was putting his people on high alert last month , a 30-foot Cuban patrol boat with four heavily armed men landed on American shores , utterl

In [None]:
maven = load_dataset("datht/maven-event-dataset")

In [60]:
maven_train, maven_dev, maven_test = save(maven, "maven")

In [44]:
geneva = load_dataset("datht/geneva-event-dataset")

geneva_train, geneva_dev, geneva_test = save(geneva, "geneva")

In [65]:
with open(f"data/train.jsonl", "w", encoding="utf-8") as f:
    for item in maven_train + geneva_train:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open(f"data/dev.jsonl", "w", encoding="utf-8") as f:
    for item in geneva_dev + maven_dev:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open(f"data/test.jsonl", "w", encoding="utf-8") as f:
    for item in maven_test + geneva_test:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [None]:
data, full_len, prompt_len, response_len = process_ace_data(geneva["train"])

In [21]:
len(data)

26278

In [22]:
np.median(full_len)

np.float64(414.0)

In [23]:
np.sum(np.array(full_len) > 768)

np.int64(3)

In [18]:
np.sum(np.array(prompt_len) > 460)

np.int64(5)

In [19]:
np.max(response_len)

np.int64(296)

In [50]:
rams = load_dataset("datht/rams-event-dataset")

In [51]:
data, full_len, prompt_len, response_len = process_ace_data(rams["train"])

In [52]:
len(data)

7402

In [7]:
geneva = load_dataset("datht/geneva-event-dataset")

In [8]:
data, full_len, prompt_len, response_len = process_ace_data(geneva["train"])

In [9]:
len(data)

1968

In [10]:
np.median(full_len)

np.float64(448.0)

In [12]:
np.sum(np.array(full_len) > 768)

np.int64(8)

In [1]:
print("{\"events\": [{\"trigger_text\": \"sponsored\", \"type\": \"Supply\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"tour\", \"type\": \"Traveling\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"promotion\", \"type\": \"Cause_change_of_position_on_a_scale\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"concert\", \"type\": \"Social_event\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"sponsored\", \"type\": \"Supply\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"tour\", \"type\": \"Traveling\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"sponsored\", \"type\": \"Supply\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"suggested\", \"type\": \"Convincing\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"tour\", \"type\": \"Traveling\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"tour\", \"type\": \"Traveling\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"sponsored\", \"type\": \"Supply\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"suggested\", \"type\": \"Convincing\", \"arguments\": []}]}{\"events\": [{\"trigger_text\": \"tour\", \"")

{"events": [{"trigger_text": "sponsored", "type": "Supply", "arguments": []}]}{"events": [{"trigger_text": "tour", "type": "Traveling", "arguments": []}]}{"events": [{"trigger_text": "promotion", "type": "Cause_change_of_position_on_a_scale", "arguments": []}]}{"events": [{"trigger_text": "concert", "type": "Social_event", "arguments": []}]}{"events": [{"trigger_text": "sponsored", "type": "Supply", "arguments": []}]}{"events": [{"trigger_text": "tour", "type": "Traveling", "arguments": []}]}{"events": [{"trigger_text": "sponsored", "type": "Supply", "arguments": []}]}{"events": [{"trigger_text": "suggested", "type": "Convincing", "arguments": []}]}{"events": [{"trigger_text": "tour", "type": "Traveling", "arguments": []}]}{"events": [{"trigger_text": "tour", "type": "Traveling", "arguments": []}]}{"events": [{"trigger_text": "sponsored", "type": "Supply", "arguments": []}]}{"events": [{"trigger_text": "suggested", "type": "Convincing", "arguments": []}]}{"events": [{"trigger_text": "t