In [32]:
import json
import os
from collections import defaultdict

import pandas as pd

In [None]:
def process_data(path: str) -> pd.DataFrame:
    """Consolidate the tab-separated ``.txt`` files in ``path`` into one DataFrame.

    Every file name is split at its last ``-``: the part before it becomes the
    ``directory`` value and the part after it (with ``.txt`` removed) becomes a
    column name. The first two lines of each file are skipped, as are blank
    lines. On every remaining line the first tab-separated field identifies a
    record, the second field is ignored, and the first non-blank field after
    that -- stripped of surrounding whitespace, ``:`` and ``,`` -- is stored as
    the record's value for that column.

    Args:
        path: Directory containing the ``.txt`` files to read.

    Returns:
        A DataFrame with one row per ``(directory, record)`` pair. It has the
        columns ``directory`` and ``file_name`` (the record identifier) plus one
        column per distinct file-name suffix. A cell is null when the line had
        no usable value or the record was absent from that column's file. The
        frame has no columns at all when no data lines were found.
    """
    # Sorted so that row and column order do not depend on the order in which
    # the operating system happens to list the directory.
    files = sorted(f for f in os.listdir(path) if f.endswith(".txt"))
    data_dict = defaultdict(dict)

    for file_name in files:
        # "<directory>-<column>.txt"; a name without "-" gives an empty directory.
        directory, _, tail = file_name.rpartition("-")
        column = tail.replace(".txt", "")
        file_path = os.path.join(path, file_name)
        # NOTE(review): open() uses the platform default encoding here --
        # confirm the encoding of the input files and pass it explicitly.
        with open(file_path) as f:
            for line_number, line in enumerate(f):
                if line_number < 2:  # Skip the first two lines
                    continue
                stripped = line.strip()
                if not stripped:
                    # A blank line would otherwise register a record whose
                    # identifier is the empty string.
                    continue
                values = stripped.split("\t")
                key = (directory, values[0].strip())
                # values[1] is not used; everything after it is a candidate value.
                data_dict[key][column] = values[2:]

    rows = []
    for (directory, file_name), columns in data_dict.items():
        row = {"directory": directory, "file_name": file_name}
        for col_name, values in columns.items():
            # Keep only the first non-blank candidate, or None if there is none.
            row[col_name] = next(
                (
                    v.strip().strip(":").strip(",").strip()
                    for v in values
                    if v.strip()
                ),
                None,
            )
        rows.append(row)
    return pd.DataFrame(rows)


# Write one "<folder>_labels_true.json" file per sub-folder of the ground-truth
# directory; the files are placed next to those sub-folders.
path = "src/data/swde/sourceCode/groundtruth"
folders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
for folder_name in folders:
    folder_path = os.path.join(path, folder_name)
    df = process_data(folder_path)
    if df.empty:
        # Without any records the frame has no "directory" / "file_name"
        # columns, so building the index below would raise KeyError and stop
        # the remaining folders from being processed.
        print(f"Skipping {folder_path}: no records found")
        continue
    # Row labels of the form "<directory>-<file_name>" become the JSON keys.
    df.index = df["directory"] + "-" + df["file_name"]
    df.to_json(
        f"{path}/{folder_name}_labels_true.json",
        orient="index",
    )

In [34]:
# Draw a reproducible sample of the "auto" label records, copy the matching
# pages (with empty lines removed) into src/data/true/auto and store their
# labels in labels.json inside that folder.
df = pd.read_json(
    "src/data/swde/sourceCode/groundtruth/auto_labels_true.json",
    orient="index",
    dtype={"file_name": str},
)
# Cap the sample size so that fewer than 500 available records do not raise.
df = df.sample(n=min(500, len(df)), random_state=1337)

source_dir = "src/data/swde/sourceCode/auto"
output_dir = "src/data/true/auto"

# Create the output folder if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Find all directories inside the source folder.
folders = [
    f for f in os.listdir(source_dir) if os.path.isdir(os.path.join(source_dir, f))
]
labels = {}
for _, row in df.iterrows():
    # The source folder is the first one whose name contains the record's
    # "directory" value.
    matches = [d for d in folders if row["directory"] in d]
    if not matches:
        raise FileNotFoundError(
            f"No folder in {source_dir} matches directory {row['directory']!r}"
        )
    directory = matches[0]
    file_path = os.path.join(source_dir, directory, f"{row['file_name']}.htm")
    # NOTE(review): both files are opened with the platform default encoding --
    # confirm the encoding of the pages and pass it explicitly.
    with open(file_path) as f:
        content = f.read()
    # Remove empty lines from the content.
    content = "\n".join(line for line in content.split("\n") if line.strip())
    output_name = f"{row['directory']}-{row['file_name']}.html"
    with open(f"{output_dir}/{output_name}", "w") as f:
        f.write(content)
    # "<NULL>" and missing values both become None; a NaN left in place would
    # be written by json.dump as the non-standard token NaN.
    labels[output_name] = {
        column: (
            None
            if pd.isna(row[column]) or row[column] == "<NULL>"
            else row[column]
        )
        for column in ("model", "fuel_economy", "engine", "price")
    }
with open(f"{output_dir}/labels.json", "w") as f:
    json.dump(labels, f, indent=4)