In [8]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read data
df = pd.read_csv("data.csv")
train = pd.read_csv("train.csv")
participant_id = "1"
sequence_id = 0

# Landmark details
landmark_counts = {"face": 478, "pose": 33, "left_hand": 21, "right_hand": 21}
landmark_threshold = 468
output_dir = Path("data/asl-signs/train_landmark_files") / participant_id

# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Prepare new rows for train in bulk
new_train_rows = []

# Process each sign and file
for (sign, filename), df_temp in df.groupby(["sign", "filename"]):
    df_filtered = df_temp[["frame", "type", "landmark_index", "x", "y", "z"]].copy()

    # Add missing landmarks for each frame
    frames = df_filtered["frame"].unique()
    for frame in frames:
        missing_data = []
        for type_, count in landmark_counts.items():
            if type_ not in df_filtered[df_filtered["frame"] == frame]["type"].unique():
                missing_data.append(pd.DataFrame({
                    "landmark_index": range(count),
                    "x": np.nan,
                    "y": np.nan,
                    "z": np.nan,
                    "type": type_,
                    "frame": frame
                }))
        if missing_data:
            df_filtered = pd.concat([df_filtered] + missing_data, ignore_index=True)
    
    if len(frames) == 1:
        df_filtered_duplicate = df_filtered.copy()
        for i in range(2, 50):
            df_filtered_duplicate["frame"] = i
            df_filtered = pd.concat([df_filtered, df_filtered_duplicate], ignore_index=True)

    # Filter landmarks and reset index
    df_filtered = df_filtered[df_filtered["landmark_index"] < landmark_threshold].sort_values(by = ["type", "landmark_index"]).reset_index(drop=True)

    # Prepare file path
    sanitized_filename = filename.replace(".jpg", "").replace("\\", "_")
    file_path = output_dir / f"{sign}_{sanitized_filename}.parquet"

    # Add row to new train DataFrame
    new_train_rows.append({
        "path": str(file_path.relative_to("data/asl-signs")),
        "participant_id": participant_id,
        "sequence_id": sequence_id,
        "sign": sign
    })

    # Save filtered data to Parquet
    df_filtered.to_parquet(file_path, index=False)

# Update and save train DataFrame
if new_train_rows:
    train = pd.concat([train, pd.DataFrame(new_train_rows)], ignore_index=True)

train.to_csv("train.csv", index=False, header=True)

train

Unnamed: 0,path,participant_id,sequence_id,sign
0,train_landmark_files/26734/1000035562.parquet,26734,1.000036e+09,blow
1,train_landmark_files/28656/1000106739.parquet,28656,1.000107e+09,wait
2,train_landmark_files/16069/100015657.parquet,16069,1.000157e+08,cloud
3,train_landmark_files/25571/1000210073.parquet,25571,1.000210e+09,bird
4,train_landmark_files/62590/1000240708.parquet,62590,1.000241e+09,owie
...,...,...,...,...
133718,train_landmark_files\1\Z_asl-sign-language-alp...,1,0.000000e+00,Z
133719,train_landmark_files\1\Z_asl-sign-language-alp...,1,0.000000e+00,Z
133720,train_landmark_files\1\Z_asl-sign-language-alp...,1,0.000000e+00,Z
133721,train_landmark_files\1\Z_asl-sign-language-alp...,1,0.000000e+00,Z
