# Extract a Subset from the EnronQA Dataset

EnronQA contains emails from different people.

This notebook creates a separate dataset consisting only of dasovich-j's emails.

In [3]:
from datasets import load_dataset, DatasetDict
import tiktoken

# Load the EnronQA dataset; the result maps split names to datasets.
ds = load_dataset("MichaelR207/enron_qa_0922")

# Rebuild the split -> dataset mapping, keeping in every split only the
# rows whose "user" field is dasovich-j.
filtered_ds = DatasetDict(
    (split, dataset.filter(lambda example: example["user"] == "dasovich-j"))
    for split, dataset in ds.items()
)

def add_id(example, idx):
    """Return the extra column for one row: its positional index as ``id``.

    Written as a callback for ``map(..., with_indices=True)``. ``example``
    is accepted to match that callback signature but is never read.

    Args:
        example: The dataset row (unused).
        idx: Index of the row as supplied by ``map``.

    Returns:
        A one-key dict ``{"id": idx}``.
    """
    return dict(id=idx)

# Attach an index-based "id" column to every row.
filtered_ds = filtered_ds.map(add_id, with_indices=True)

# Walk the train split once, collecting each email body and every question
# flagged retrieval-only (include_email == 0) under the row's id.
email_list = []
question_list = []
for row in filtered_ds["train"]:
    email_list.append({"dataset_id": row["id"], "email_body": row["email"]})
    question_list.extend(
        {"dataset_id": row["id"], "question": question}
        for flag, question in zip(row["include_email"], row["rephrased_questions"])
        if flag == 0
    )

# Remove emails longer than MAX_EMAIL_TOKENS tokens, together with the
# questions that refer to them.
MAX_EMAIL_TOKENS = 2048
encoding = tiktoken.get_encoding("cl100k_base")

email_token_counts = []  # token count of every email, whether kept or removed
emails_to_keep = []
removed_email_ids = []

for email in email_list:
    token_count = len(encoding.encode(email["email_body"]))
    email_token_counts.append(token_count)
    if token_count <= MAX_EMAIL_TOKENS:
        emails_to_keep.append(email)
    else:
        removed_email_ids.append(email["dataset_id"])

# Filter question_list to remove questions associated with removed emails.
# Membership is tested against a set so the filter stays linear in the number
# of questions instead of rescanning the removed-id list for every question.
removed_id_lookup = set(removed_email_ids)
questions_to_keep = [
    question
    for question in question_list
    if question["dataset_id"] not in removed_id_lookup
]

# NOTE: the filtered questions are bound to `questions_list` (plural);
# `question_list` still holds the unfiltered questions.
email_list = emails_to_keep
questions_list = questions_to_keep

print(len(email_list))
print(len(questions_list))

4891
11782


In [4]:
import json
import os

# Create the output directory if it doesn't exist; every file below is
# written inside it.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Save emails to a separate JSON file
emails_output_file = os.path.join(output_dir, "enron_emails.json")
with open(emails_output_file, "w", encoding="utf-8") as f:
    json.dump(email_list, f, indent=2)

# Save questions to a separate JSON file
questions_output_file = os.path.join(output_dir, "enron_questions.json")
with open(questions_output_file, "w", encoding="utf-8") as f:
    json.dump(questions_list, f, indent=2)

# Create a reference file that maps questions to their corresponding emails
reference_data = []
email_dict = {email["dataset_id"]: email["email_body"] for email in email_list}

# question_id is the question's zero-based position in questions_list, i.e.
# its position in the questions JSON file. Unlike id(question), which is a
# per-process object identity, this value is reproducible across runs and can
# be resolved by anyone reading the saved files.
for question_index, question in enumerate(questions_list):
    dataset_id = question["dataset_id"]
    if dataset_id in email_dict:
        reference_data.append({
            "dataset_id": dataset_id,
            "question_id": question_index
        })

reference_output_file = os.path.join(output_dir, "enron_qa_reference.json")
with open(reference_output_file, "w", encoding="utf-8") as f:
    json.dump(reference_data, f, indent=2)

print(f"Saved {len(email_list)} emails to {emails_output_file}")
print(f"Saved {len(questions_list)} questions to {questions_output_file}")
print(f"Saved {len(reference_data)} question-email references to {reference_output_file}")

Saved 4891 emails to data/enron_emails.json
Saved 11782 questions to data/enron_questions.json
Saved 11782 question-email references to data/enron_qa_reference.json
