This notebook builds a JSONL file from a folder of `.txt` files, each containing one motivation letter, so that the result can be uploaded to doccano for manual annotation. Each JSONL line also records the letter's letter id, profile id, and model id.

In [None]:
import os
import json
import re

# --- Configuration -----------------------------------------------------------
# Directory scanned for the letter .txt files, and the JSONL file to produce.
folder_path = "/content/sample_data/manual_annotation_task"
output_file = "feature-matching.jsonl"

# Maps the model label stored in the output to the regex searched for in each
# file name. For most models the label itself is the pattern ...
model_patterns = {
    label: label
    for label in (
        "gpt-4-1",
        "gpt-4o",
        "llama-3-3-70b-versatile",
        "kimi-k2-instruct",
        "o3",
    )
}
# ... except qwen, whose pattern carries an extra "qwen-" prefix.
model_patterns["qwen3-32b"] = r"qwen-qwen3-32b"

# Regexes used to pull the letter number (capture group 1) out of a letter's
# heading line, listed in priority order.
# NOTE(review): when these are tried in order and the first hit wins, the last
# two entries can never be the deciding match, because any line they match is
# already matched by an earlier, looser entry ("Letter (\d)" / "\[(\d)\]").
letter_patterns = [
    r"^\s*([1-5])\.",          # "3." at the very start (leading whitespace allowed)
    r"Letter (\d)",            # "Letter 3"
    r"\[(\d)\]",               # "[3]"
    r"\*\*Letter (\d)\*\*",    # "**Letter 3**"
    r"Letter \[(\d)\]",        # "Letter [3]"
]

# Convert every .txt letter in `folder_path` into one JSON object per line of
# `output_file`. Each object carries the full letter text plus the model name
# and profile id parsed from the file name, and the letter id parsed from the
# letter's heading line; any field that cannot be parsed is set to "unknown".

# os.listdir() returns entries in arbitrary order, so sort the names: the
# running "id" assigned below is then the same on every run and machine.
txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))

# Encodings are stated explicitly so the result does not depend on the
# platform's default locale encoding.
with open(output_file, 'w', encoding='utf-8') as outfile:
    for i, txt_file in enumerate(txt_files):
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, 'r', encoding='utf-8') as infile:
            text_content = infile.read()

        # The letter number is looked up in the heading line only. Use the
        # first non-blank line for that, so a file that starts with an empty
        # line is not classified as "unknown" by accident ("" if the file has
        # no non-blank line at all).
        first_line = next(
            (line for line in text_content.split('\n') if line.strip()),
            ""
        )

        # Model: first entry of model_patterns whose regex occurs in the file name.
        model_name = "unknown"
        for name, pattern in model_patterns.items():
            if re.search(pattern, txt_file):
                model_name = name
                break

        # Profile id: the digits between the last underscore and ".txt".
        profile_id_match = re.search(r'_(\d+)\.txt$', txt_file)
        profile_id = profile_id_match.group(1) if profile_id_match else "unknown"

        # Letter id: capture group 1 of the first letter_patterns regex that
        # matches the heading line.
        letter_id = "unknown"
        for pattern in letter_patterns:
            letter_match = re.search(pattern, first_line)
            if letter_match:
                letter_id = letter_match.group(1)
                break

        json_object = {
            "id": i,
            "text": text_content,
            "model": model_name,
            "profile_id": profile_id,
            "letter_id": letter_id
        }
        # One JSON document per line (JSONL).
        json.dump(json_object, outfile)
        outfile.write('\n')

print(f"JSONL file '{output_file}' generated successfully.")

JSONL file 'feature-matching.jsonl' generated successfully.
