In [None]:
import os
from collections import defaultdict

import pandas as pd

In [None]:
def process_data(path: str, *, encoding: str = "utf-8") -> pd.DataFrame:
    """Consolidate the tab-separated ``.txt`` files in *path* into one DataFrame.

    Every file name is split at its last hyphen: the part before it becomes
    the ``directory`` value and the part after it (without ``.txt``) becomes
    the column name.  The first two lines of each file are skipped.  Each
    remaining non-blank line is split on tabs; its first field identifies the
    row, its second field is ignored, and the first non-blank field after
    that is cleaned and stored as the cell value.

    Args:
        path: Directory containing the ``.txt`` files.
        encoding: Text encoding used to read the files.

    Returns:
        A DataFrame with ``directory`` and ``file_name`` columns plus one
        column per distinct file-name suffix, holding one row per
        ``(directory, first field)`` pair.  ``None`` is stored when a line
        has no non-blank value field.  The frame is empty (no columns) when
        no data lines are found.
    """
    suffix = ".txt"
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting row and column order reproducible between runs.
    files = sorted(f for f in os.listdir(path) if f.endswith(suffix))
    data_dict = defaultdict(dict)

    for file_name in files:
        # Drop only the trailing suffix, then split at the last hyphen.
        directory, _, column = file_name[: -len(suffix)].rpartition("-")
        file_path = os.path.join(path, file_name)
        with open(file_path, encoding=encoding) as f:
            for line_no, line in enumerate(f):
                if line_no < 2:  # Skip the first two lines
                    continue
                stripped = line.strip()
                if not stripped:
                    # A blank line would otherwise create a row with an
                    # empty identifier.
                    continue
                values = stripped.split("\t")
                key = (directory, values[0].strip())
                data_dict[key][column] = values[2:]

    rows = []
    for (directory, record_id), columns in data_dict.items():
        row = {"directory": directory, "file_name": record_id}
        for col_name, values in columns.items():
            # Trim whitespace, then colons, then commas, then whitespace again.
            clean_values = [
                v.strip().strip(":").strip(",").strip() for v in values if v.strip()
            ]
            row[col_name] = clean_values[0] if clean_values else None
        rows.append(row)
    return pd.DataFrame(rows)


# Write one "<folder>_labels_true.json" file per sub-folder of the
# ground-truth directory, keyed by "<directory>-<file_name>".
path = "src/data/true/swde/sourceCode/groundtruth"
folders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
for folder_name in folders:
    folder_path = os.path.join(path, folder_name)
    df = process_data(folder_path)
    if df.empty:
        # An empty frame has no "directory"/"file_name" columns, so building
        # the index below would raise KeyError; there is nothing to export.
        continue
    df.index = df["directory"] + "-" + df["file_name"]
    df.to_json(f"{folder_name}_labels_true.json", orient="index")

Unnamed: 0,directory,file_name,location,title,date_posted,company
job-jobtarget-0000,job-jobtarget,0000,"Grand Rapids, Michigan, United States",Sr SW Engineer (Sr Embedded SW/Test Engineer),"June 25, 2010","ENSCO, Inc."
job-jobtarget-0001,job-jobtarget,0001,"Cincinnati, Ohio, United States",Software Engineer (Embedded Controls Software ...,"September 20, 2010","ENSCO, Inc."
job-jobtarget-0002,job-jobtarget,0002,"Bellevue, Washington, United States",Sr. Systems/Software design/development Engineer,"October 1, 2010",HP
job-jobtarget-0003,job-jobtarget,0003,"Kanata, Ontario, Canada",Intermediate Developer - Mobile Clients,"October 7, 2010",TrueContext Corporation
job-jobtarget-0004,job-jobtarget,0004,"Endicott, New York, United States",Software Engineer (Embedded Controls Software ...,"October 12, 2010","ENSCO, Inc."
...,...,...,...,...,...,...
job-hotjobs-1995,job-hotjobs,1995,"Palo Alto, CA",> Confidential &gt; Sr. SW Engineer/Developer ...,"December 5, 2010",Confidential
job-hotjobs-1996,job-hotjobs,1996,"San Bruno, CA",&gt; SENIOR SOFTWARE ENGINEER,"December 5, 2010",ENCLARA HEALTH
job-hotjobs-1997,job-hotjobs,1997,"Montrose, CO",&gt; Polystrand Plant Technician,"December 5, 2010",Gordon Composites
job-hotjobs-1998,job-hotjobs,1998,"Milwaukee, WI",&gt; Software Programmer,"December 5, 2010","MJ Care, Inc."
