In [2]:
import re
import spacy
import json
import csv
# spaCy pipeline; its entity recogniser is applied to each header line below.
nlp = spacy.load("en_core_web_sm")

with open('Hiring_Problem_Statement/Hiring_Problem_Statement/conversation.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# A header looks like "On <...>, <...> <address> wrote:"; group 1 captures the
# address found between the angle brackets.
header_pattern = re.compile(r"On\s.+?,\s.+?\s<(.+?)>\s+wrote:")

emails = []           # finished messages, in the order their headers appear
current_email = None  # message currently being collected (None before the first header)
body_lines = []       # stripped body lines gathered for current_email

for raw_line in lines:
    line = raw_line.strip()
    header_match = header_pattern.search(line)

    if header_match is None:
        # Not a header: the line belongs to the open message. Anything that
        # appears before the first header is dropped.
        if current_email is not None:
            body_lines.append(line)
        continue

    # A new header starts here, so close off the message collected so far.
    if current_email is not None:
        current_email["body"] = "\n".join(body_lines).strip()
        emails.append(current_email)
        body_lines = []

    # Run NER on the header with every <...> span removed. When several
    # entities share a label, the last one in the header wins.
    found = {"DATE": "", "TIME": "", "PERSON": ""}
    for ent in nlp(re.sub(r'<.+?>', '', line)).ents:
        if ent.label_ in found:
            found[ent.label_] = ent.text

    current_email = {
        "name": found["PERSON"],
        "email": header_match.group(1),
        "date": found["DATE"],
        "time": found["TIME"],
        "header": line,
        "body": "",
        "replies": [],
    }

# The final message has no following header to trigger the flush above.
if current_email is not None:
    current_email["body"] = "\n".join(body_lines).strip()
    emails.append(current_email)
# Build JSON hierarchical thread 
def build_thread(emails):
    """Chain the messages into a single nested reply thread.

    Each message is appended to the ``"replies"`` list of the message that
    precedes it in *emails*, so the result is a linear chain rooted at the
    first message. The dictionaries are modified in place.

    Args:
        emails: Sequence of message dicts, each holding a ``"replies"`` list.

    Returns:
        The first message (now carrying the whole chain), or an empty dict
        when *emails* is empty.
    """
    if not emails:
        return {}
    # Pair every message with its successor and hang the successor under it.
    for parent, child in zip(emails, emails[1:]):
        parent["replies"].append(child)
    return emails[0]
# Nest the parsed messages into one reply chain and save it as indented JSON.
threaded_json = build_thread(emails)
with open("emails_threaded.json", "w", encoding="utf-8") as json_out:
    json.dump(threaded_json, fp=json_out, indent=4)
print("JSON file saved as emails_threaded.json")

# Flat export: one CSV row per message; the nested "replies" list is left out.
csv_columns = ["name", "email", "date", "time", "header", "body"]
with open("emails_flat.csv", "w", newline="", encoding="utf-8") as csv_out:
    writer = csv.DictWriter(csv_out, fieldnames=csv_columns)
    writer.writeheader()
    for email in emails:
        # Strict lookup: a message missing any column raises KeyError.
        writer.writerow({column: email[column] for column in csv_columns})
print("CSV file saved as emails_flat.csv")


JSON file saved as emails_threaded.json
CSV file saved as emails_flat.csv
