In [6]:
from datasets import load_dataset


ace_gen = load_dataset("datht/ace-short-generated-dataset")

In [7]:
# ace = load_dataset("datht/ace-event-dataset")

In [8]:
# system_prompt = """You are an open-domain event extraction system. Your task is to identify events expressed or clearly implied in a given text.
# IMPORTANT:Output ONLY valid JSON. No explanations, no markdown, no extra text.
# Output Format (JSON only, no markdown):

# {"events": [{"trigger_text": <trigger span>, "type": <event type>, "arguments": [{"text": <argument span>, "role": <semantic role>}, {"text": <argument span 2>, "role": <semantic role 2>}], "description": <description>}, {"trigger_text": <trigger span 2>, "type": <event type 2>, "arguments": [], "description": <description 2>}]} 

# - If no events are detected, return: {"events": []}"""

# user_prompt = """Given an input text: 
# <input>
# {input}
# </input>

# Your task is to extract all events present in the text. The text may contain zero, one, or multiple events.

# For each event:
# - Identify a trigger: the word or phrase that most clearly indicates the event.
# - Identify event type
# - Extract all relevant arguments participating in the event.
# - Each argument must be an exact span from the text and assigned a semantic role.
# - Description: explaining what this event type means

# Constraints and Guidelines
# - Do not invent information not supported by the text.
# - Do not paraphrase triggers or arguments.
# - The "text" field must exactly match a span in the original input (from <input>...</input>).
# """

In [9]:
# system_prompt = """You are an open-domain event extraction system. Your task is to identify events expressed or clearly implied in a given text.
# IMPORTANT:Output ONLY valid JSON. No explanations, no markdown, no extra text.
# Output Format (JSON only, no markdown):

# {"events": [{"trigger_text": <trigger span>, "type": <event type>, "arguments": [{"text": <argument span>, "role": <semantic role>}, {"text": <argument span 2>, "role": <semantic role 2>}]}, {"trigger_text": <trigger span 2>, "type": <event type 2>, "arguments": []}]} 

# - If no events are detected, return: {"events": []}"""

# user_prompt = """Given an input text: 
# <input>
# {input}
# </input>

# Your task is to extract all events present in the text. The text may contain zero, one, or multiple events.

# For each event:
# - Identify a trigger: the word or phrase that most clearly indicates the event.
# - Identify event type
# - Extract all relevant arguments participating in the event.
# - Each argument must be an exact span from the text and assigned a semantic role.

# Constraints and Guidelines
# - Do not invent information not supported by the text.
# - Do not paraphrase triggers or arguments.
# - The "text" field must exactly match a span in the original input (from <input>...</input>).
# """

In [10]:
system_prompt = """You are an open-domain event extraction system. Your task is to identify events expressed or clearly implied in a given text.
IMPORTANT:Output ONLY valid JSON. No explanations, no markdown, no extra text.
Output Format (JSON only, no markdown):

{"events": [[<trigger span>, <event type>, <description>], [<trigger span 2>, <event type 2>, <description 2>]]} 

- If no events are detected, return: {"events": []}"""

user_prompt = """Given an input text: 
<input>
{input}
</input>

Your task is to extract all events present in the text. The text may contain zero, one, or multiple events.

For each event:
- Identify a trigger: the word or phrase that most clearly indicates the event.
- Identify event type
- Description: explaining what this event type means

Constraints and Guidelines
- Do not invent information not supported by the text.
- Do not paraphrase triggers or arguments.
- The "text" field must exactly match a span in the original input (from <input>...</input>).
"""

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B', padding_side="right")

In [None]:
import json
import numpy as np
import random


def process_data(raw_data, is_test=False):
    full_len = []
    sent_total = 0
    data = []
    prompt_len = []
    response_len = []
    none_data = []

    for sample in raw_data:
        sent_id_to_sentence = {i: content['sentence'] for i, content in enumerate(sample["content"])}
        sent_id_set = set(sent_id_to_sentence.keys())
        sent_total += len(sent_id_to_sentence)
        sent_to_existing_events = {}
        for event in sample.get("events", []):
            if event.get("type_id", -1) == -1: continue
            event_type = event.get("type")
            description = event.get("description")
            
            for mention in event.get("mention", []):
                sent_id = mention.get("sent_id")
                             
                if sent_id not in sent_to_existing_events:
                    sent_to_existing_events[sent_id] = []

                # args = [[arg["text"], arg["role"]] for arg in mention.get("arguments", [])]
                # args = [{"text": arg["text"], "role": arg["role"]} for arg in mention.get("arguments", [])]
                # event_info = [mention.get("trigger_word"), event_type, args, description]
                event_info = [mention.get("trigger_word"), event_type, description]  
                # if description:
                #     event_info = { 
                #             # "description": description, 
                #             "trigger_text": mention.get("trigger_word"),
                #             "type": event_type,
                #             # "arguments": args,
                #             "description": description, 
                #         }
                # else:
                #     event_info = { 
                #             "trigger_text": mention.get("trigger_word"),
                #             "type": event_type,
                #             # "arguments": args,
                #         }

                sent_to_existing_events[sent_id].append(event_info)
                

                # keys = list(event_info.keys())
                # random.shuffle(keys)
                # random_event_info = {k: event_info[k] for k in keys}
                # sent_to_existing_events[sent_id].append(random_event_info)

        for sent_id, events in sent_to_existing_events.items():
            sent_txt = sent_id_to_sentence[sent_id]
            sent_id_set.remove(sent_id)

            # trigger_texts = [event['trigger_text'] for event in events]
            # response = json.dumps({"triggers": trigger_texts, "events": events})

            response = json.dumps({"events": events})

            data.append({"system_prompt": system_prompt, "user_prompt": user_prompt.format(input=sent_txt), "response": response})

        for sent_id in sent_id_set:
            sent_txt = sent_id_to_sentence[sent_id]

            # response = json.dumps({"triggers": [], "events": []})
            response = json.dumps({"events": []})
            
            none_data.append({"system_prompt": system_prompt, "user_prompt": user_prompt.format(input=sent_txt), "response": response})


    if is_test:
        data.extend(none_data)
    else:
        data.extend(random.sample(list(none_data), min(len(none_data), len(data) // 100)))
    for sample in data:
        prompt = tokenizer.apply_chat_template(
                [{"role": "system", "content": sample['system_prompt']},
                {"role": "user", "content": sample['user_prompt']}],
                add_generation_prompt=True,
                tokenize=False 
            )
        full = prompt + sample['response'] + tokenizer.eos_token
        
        prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
        full_tokens = tokenizer.encode(full, add_special_tokens=False)
        response_tokens = full_tokens[len(prompt_tokens):]

        full_len.append(len(full_tokens))
        prompt_len.append(len(prompt_tokens))
        response_len.append(len(response_tokens))

    return data, full_len, prompt_len, response_len

In [28]:
data, full_len, prompt_len, response_len = process_data(ace_gen["train"])

In [29]:
np.max(response_len)

np.int64(354)

In [30]:
np.max(prompt_len)

np.int64(538)

In [31]:
np.median(full_len)

np.float64(473.0)

In [32]:
np.sum(np.array(prompt_len) > 460)

np.int64(51)

In [None]:
np.max(full_len)

np.int64(823)

In [None]:
len(data)

3167

In [None]:
np.mean(full_len)

np.float64(415.7120303125987)

In [14]:
import os

def save(data, data_name): 
    os.makedirs(f"data/{data_name}", exist_ok=True)
    train, _, _, _ = process_data(data["train"])
    with open(f"data/{data_name}/train.jsonl", "w", encoding="utf-8") as f:
        for item in train:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    dev, _, prompt_len, _ = process_data(data["validation"], is_test=True)
    with open(f"data/{data_name}/dev.jsonl", "w", encoding="utf-8") as f:
        for item in dev:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    test, _, prompt_len, _ = process_data(data["test"], is_test=True)
    with open(f"data/{data_name}/test.jsonl", "w", encoding="utf-8") as f:
        for item in test:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    return train, dev, test

In [None]:
save(ace_gen, "ace")

In [None]:
maven = load_dataset("datht/maven-event-dataset")

In [60]:
maven_train, maven_dev, maven_test = save(maven, "maven")

In [37]:
geneva = load_dataset("datht/geneva-short-generated-dataset")

geneva_train, geneva_dev, geneva_test = save(geneva, "geneva")

In [34]:
len(geneva_train)

1968

In [65]:
with open(f"data/train.jsonl", "w", encoding="utf-8") as f:
    for item in maven_train + geneva_train:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open(f"data/dev.jsonl", "w", encoding="utf-8") as f:
    for item in geneva_dev + maven_dev:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open(f"data/test.jsonl", "w", encoding="utf-8") as f:
    for item in maven_test + geneva_test:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [30]:
data, full_len, prompt_len, response_len = process_data(geneva["train"])

In [10]:
len(data)

1968

In [21]:
np.median(full_len)

np.float64(501.0)

In [31]:
np.sum(np.array(full_len) > 768)

np.int64(31)

In [32]:
np.sum(np.array(prompt_len) > 460)

np.int64(1)

In [34]:
np.max(full_len)

np.int64(1065)

In [15]:
maven = load_dataset("datht/maven-short-generated-dataset")

maven_train, maven_dev, maven_test = save(maven, "maven")

In [16]:
len(maven_train)

26278

In [82]:
rams = load_dataset("datht/rams-event-dataset")
maven = load_dataset("datht/maven-short-generated-dataset")

In [17]:
data, full_len, prompt_len, response_len = process_data(maven["train"])

In [84]:
len(data)

26278

In [86]:
np.median(full_len)

np.float64(374.0)

In [19]:
np.sum(np.array(full_len) > 768)

np.int64(18)

In [None]:
import json
from datasets import load_dataset

def extract_key_info(doc):
    """
    Tr√≠ch xu·∫•t c√°c tr∆∞·ªùng: type, trigger_word, arguments (text, role) 
    t·ª´ m·ªôt document v√† format l·∫°i ƒë·ªÉ d·ªÖ so s√°nh.
    """
    info_list = []
    
    # Duy·ªát qua c√°c event
    for event in doc.get('events', []):
        event_type = event.get('type')
        
        # Duy·ªát qua c√°c mention trong event
        for mention in event.get('mention', []):
            trigger_word = mention.get('trigger_word')
            
            # Tr√≠ch xu·∫•t list arguments (ch·ªâ l·∫•y text v√† role)
            args = []
            for arg in mention.get('arguments', []):
                args.append({
                    'text': arg.get('text'),
                    'role': arg.get('role')
                })
            
            # S·∫Øp x·∫øp arguments theo text ƒë·ªÉ ƒë·∫£m b·∫£o th·ª© t·ª± kh√¥ng l√†m sai k·∫øt qu·∫£ so s√°nh
            args = sorted(args, key=lambda x: (x['text'] or "", x['role'] or ""))
            
            info_list.append({
                'type': event_type,
                'trigger_word': trigger_word,
                'arguments': args
            })
            
    # S·∫Øp x·∫øp to√†n b·ªô danh s√°ch event theo type v√† trigger_word
    info_list = sorted(info_list, key=lambda x: (x['type'] or "", x['trigger_word'] or ""))
    return info_list

def compare_json_lists(list1, list2):
    """
    So s√°nh 2 list d·ªØ li·ªáu d·ª±a tr√™n document ID chung.
    """
    # Chuy·ªÉn list th√†nh dictionary v·ªõi key l√† document id ƒë·ªÉ tra c·ª©u nhanh (O(1))
    dict1 = {doc['id']: doc for doc in list1}
    dict2 = {doc['id']: doc for doc in list2}
    
    # T√¨m c√°c document ID c√≥ m·∫∑t ·ªü c·∫£ 2 list
    common_ids = set(dict1.keys()).intersection(set(dict2.keys()))
    
    if not common_ids:
        print("Kh√¥ng t√¨m th·∫•y Document ID n√†o chung gi·ªØa 2 list!")
        return

    print("total: ", len(dict1), len(dict2))
    print(f"T√¨m th·∫•y {len(common_ids)} Document ID chung. ƒêang ti·∫øn h√†nh so s√°nh...\n")
    print("-" * 50)
    
    failed_count = 0
    for doc_id in common_ids:
        # Tr√≠ch xu·∫•t th√¥ng tin tr·ªçng t√¢m t·ª´ 2 list
        info1 = extract_key_info(dict1[doc_id])
        info2 = extract_key_info(dict2[doc_id])
        
        # So s√°nh tr·ª±c ti·∫øp 2 c·∫•u tr√∫c ƒë√£ ƒë∆∞·ª£c chu·∫©n h√≥a
        if info1 == info2:
            # print(f"‚úÖ Document ID: {doc_id} -> GI·ªêNG NHAU HO√ÄN TO√ÄN")
            pass
        else:
            print(f"‚ùå Document ID: {doc_id} -> C√ì S·ª∞ KH√ÅC BI·ªÜT")
            print("  üîª ·ªû List 1:")
            print(f"    {json.dumps(info1, indent=2, ensure_ascii=False)}")
            print("  üîª ·ªû List 2:")
            print(f"    {json.dumps(info2, indent=2, ensure_ascii=False)}")
            failed_count += 1 
        # print("-" * 50)

    print("failed_count: ", failed_count)


data = load_dataset("datht/geneva-event-dataset")
data_gen = load_dataset("datht/geneva-short-generated-dataset")
compare_json_lists(data['validation'], data_gen['validation'])

In [None]:
data = load_dataset("datht/geneva-event-dataset")
data_gen = load_dataset("datht/geneva-short-generated-dataset")

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'content', 'events', 'negative_triggers', 'doc_idx'],
        num_rows: 1968
    })
    validation: Dataset({
        features: ['id', 'title', 'content', 'events', 'negative_triggers', 'doc_idx'],
        num_rows: 783
    })
    test: Dataset({
        features: ['id', 'title', 'content', 'events', 'negative_triggers', 'doc_idx'],
        num_rows: 933
    })
})

In [24]:
compare_json_lists(data['validation'], data_gen['validation'])

total:  783 783
T√¨m th·∫•y 783 Document ID chung. ƒêang ti·∫øn h√†nh so s√°nh...

--------------------------------------------------
‚ùå Document ID: 5ce2acc44adfebf9cc66507e8c8f2618 -> C√ì S·ª∞ KH√ÅC BI·ªÜT
  üîª ·ªû List 1:
    [
  {
    "type": "Adducing",
    "trigger_word": "cited",
    "arguments": [
      {
        "text": "The United States",
        "role": "Speaker"
      },
      {
        "text": "numerous cases of Iran receiving assistance in its missile production from countries such as North Korea and Russia",
        "role": "Specified_entity"
      }
    ]
  },
  {
    "type": "Assistance",
    "trigger_word": "aiding",
    "arguments": [
      {
        "text": "Iran",
        "role": "Benefited_party"
      },
      {
        "text": "private companies",
        "role": "Helper"
      }
    ]
  },
  {
    "type": "Assistance",
    "trigger_word": "assistance",
    "arguments": [
      {
        "text": "Iran",
        "role": "Benefited_party"
      },
      {
    