In [None]:
import os
import pandas as pd
from glob import glob
from scipy.spatial import KDTree
import numpy as np

# Define base directory
base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1"

# Collect all image paths
rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)
thermal_paths = glob(os.path.join(base_dir, "**", "T", "*.png"), recursive=True)

# Debugging: Check if files are found
print(f"Found {len(rgb_paths)} RGB images, {len(depth_paths)} Depth images, {len(thermal_paths)} Thermal images")

# Adjust extraction of subject, trial, and ID based on your file path structure
def extract_info(filepath, image_type):
    parts = filepath.split(os.sep)
    
    # Extract Subject and Trial from the relevant folder structure
    subject = parts[6]  # Adjust based on your structure; should point to Sub1, Sub2, etc.
    trial = parts[7]    # Adjust based on your structure; should point to Annotated_data_Sub01_Trial01, etc.

    filename = os.path.basename(filepath)

    # Extract ID based on image type and filename structure
    if image_type == 'RGB':
        # Extract RGB ID from filename format RGB-xx-xx-xx-xxxx.jpg
        identifier = filename.split("-")[3]  # Extract the numeric ID part
    elif image_type == 'D':
        # Extract Depth ID from filename format D-xx-xx-xx-xxxx.png
        identifier = filename.split("-")[3]  # Extract the numeric ID part
    elif image_type == 'T':
        # Extract Thermal ID from filename format Txxxx.png (number only)
        identifier = filename.replace("T", "").split(".")[0]  # Remove 'T' and extract number

    return subject, trial, identifier, filepath

# Convert lists into DataFrames
df_rgb = pd.DataFrame([extract_info(p, 'RGB') for p in rgb_paths], columns=["Subject", "Trial", "ID", "RGB_Path"])
df_depth = pd.DataFrame([extract_info(p, 'D') for p in depth_paths], columns=["Subject", "Trial", "ID", "Depth_Path"])
df_thermal = pd.DataFrame([extract_info(p, 'T') for p in thermal_paths], columns=["Subject", "Trial", "ID", "Thermal_Path"])

# Ensure that all ID columns are of the same type
df_rgb["ID"] = pd.to_numeric(df_rgb["ID"], errors="coerce", downcast="integer")
df_depth["ID"] = pd.to_numeric(df_depth["ID"], errors="coerce", downcast="integer")
df_thermal["ID"] = pd.to_numeric(df_thermal["ID"], errors="coerce", downcast="integer")

# Build KDTree for fast nearest match
rgb_keys = df_rgb["ID"].dropna().values.reshape(-1, 1)
rgb_tree = KDTree(rgb_keys)

# Find nearest RGB match for each Thermal image
def match_nearest(rgb_tree, rgb_keys, thermal_id):
    if len(rgb_keys) == 0:
        return None  # Return None if no RGB IDs exist
    _, idx = rgb_tree.query([[thermal_id]])
    return rgb_keys[idx[0]]

# Apply the nearest neighbor matching for thermal IDs
df_thermal["Matched_RGB_ID"] = df_thermal["ID"].apply(lambda x: match_nearest(rgb_tree, rgb_keys, x))

# Ensure Matched_RGB_ID is also an integer for merging
df_thermal["Matched_RGB_ID"] = pd.to_numeric(df_thermal["Matched_RGB_ID"], errors="coerce", downcast="integer")

# Now proceed with merging the datasets
df_final = df_rgb.merge(df_depth, on=["Subject", "Trial", "ID"], how="inner")\
                 .merge(df_thermal, left_on=["Subject", "Trial", "ID"], right_on=["Subject", "Trial", "Matched_RGB_ID"], how="left")

# Drop redundant columns
df_final.drop(columns=["Matched_RGB_ID"], inplace=True)

# Rename columns
df_final.rename(columns={"ID": "Timestamp"}, inplace=True)

# Save the dataset
df_final.to_csv("synced_pain_dataset.csv", index=False)


Found 8656 RGB images, 8656 Depth images, 11758 Thermal images


In [20]:
df_final.head()

Unnamed: 0,Subject,Trial,ID_x,RGB_Path,Depth_Path,ID_y,Thermal_Path
0,MintPain,Annotated_data_part1,54,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...,,
1,MintPain,Annotated_data_part1,54,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...,,
2,MintPain,Annotated_data_part1,54,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...,,
3,MintPain,Annotated_data_part1,54,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...,,
4,MintPain,Annotated_data_part1,54,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...,,


In [22]:
df_final[df_final['Thermal_Path'].notna()]

Unnamed: 0,Subject,Trial,ID_x,RGB_Path,Depth_Path,ID_y,Thermal_Path


In [None]:
import os
import pandas as pd
from glob import glob
import numpy as np

# Define base directory
base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1"

# Collect all image paths
rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)
thermal_paths = glob(os.path.join(base_dir, "**", "T", "*.png"), recursive=True)

# Debugging: Check if files are found
print(f"Found {len(rgb_paths)} RGB images, {len(depth_paths)} Depth images, {len(thermal_paths)} Thermal images")

# Adjust extraction of subject, trial, and ID based on your file path structure
def extract_info(filepath, image_type):
    parts = filepath.split(os.sep)
    
    # Extract Subject and Trial from the relevant folder structure
    subject = parts[6]  # Adjust based on your structure; should point to Sub1, Sub2, etc.
    trial = parts[7]    # Adjust based on your structure; should point to Annotated_data_Sub01_Trial01, etc.

    filename = os.path.basename(filepath)

    # Extract ID based on image type and filename structure
    if image_type == 'RGB':
        # Extract RGB ID from filename format RGB-xx-xx-xx-xxxx.jpg
        identifier = filename.split("-")[3]  # Extract the numeric ID part
    elif image_type == 'D':
        # Extract Depth ID from filename format D-xx-xx-xx-xxxx.png
        identifier = filename.split("-")[3]  # Extract the numeric ID part
    elif image_type == 'T':
        # Extract Thermal ID from filename format Txxxx.png (number only)
        identifier = filename.replace("T", "").split(".")[0]  # Remove 'T' and extract number

    return subject, trial, identifier, filepath

# Convert lists into DataFrames
df_rgb = pd.DataFrame([extract_info(p, 'RGB') for p in rgb_paths], columns=["Subject", "Trial", "ID", "RGB_Path"])
df_depth = pd.DataFrame([extract_info(p, 'D') for p in depth_paths], columns=["Subject", "Trial", "ID", "Depth_Path"])
df_thermal = pd.DataFrame([extract_info(p, 'T') for p in thermal_paths], columns=["Subject", "Trial", "ID", "Thermal_Path"])

# Ensure that all ID columns are of the same type
df_rgb["ID"] = pd.to_numeric(df_rgb["ID"], errors="coerce", downcast="integer")
df_depth["ID"] = pd.to_numeric(df_depth["ID"], errors="coerce", downcast="integer")
df_thermal["ID"] = pd.to_numeric(df_thermal["ID"], errors="coerce", downcast="integer")

# Match Thermal images to RGB images sequentially
def match_sequentially(rgb_df, thermal_df):
    thermal_ids = thermal_df["ID"].values
    rgb_ids = rgb_df["ID"].values
    
    # Create an empty list to store matched RGB IDs
    matched_rgb_ids = []
    
    # Iterate over the thermal IDs and match sequentially with RGB IDs
    for i, thermal_id in enumerate(thermal_ids):
        if i < len(rgb_ids):  # If there are still RGB images left
            matched_rgb_ids.append(rgb_ids[i])  # Match with the RGB ID at the same index
        else:
            matched_rgb_ids.append(np.nan)  # If no more RGB images, append NaN
    
    thermal_df["Matched_RGB_ID"] = matched_rgb_ids
    return thermal_df

# Apply sequential matching to the thermal dataframe
df_thermal = match_sequentially(df_rgb, df_thermal)

# Now proceed with merging the datasets
df_final = df_rgb.merge(df_depth, on=["Subject", "Trial", "ID"], how="inner")\
                 .merge(df_thermal, left_on=["Subject", "Trial", "ID"], right_on=["Subject", "Trial", "Matched_RGB_ID"], how="left")

# Drop redundant columns
df_final.drop(columns=["Matched_RGB_ID"], inplace=True)

# Rename columns
df_final.rename(columns={"ID": "Timestamp"}, inplace=True)

# Save the dataset
df_final.to_csv("synced_pain_dataset.csv", index=False)

# Preview the dataframe
print(df_final.head())


Found 8656 RGB images, 8656 Depth images, 11758 Thermal images


: 

In [2]:
# import os
# import pandas as pd
# from glob import glob
# import numpy as np

# # Define base directory
# base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1"

# # Collect all image paths
# rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
# depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)
# thermal_paths = glob(os.path.join(base_dir, "**", "T", "*.png"), recursive=True)

# # Debugging: Check if files are found
# print(f"Found {len(rgb_paths)} RGB images, {len(depth_paths)} Depth images, {len(thermal_paths)} Thermal images")

# # Adjust extraction of subject, trial, and ID based on your file path structure
# def extract_info(filepath, image_type):
#     parts = filepath.split(os.sep)
    
#     # Extract Subject and Trial from the relevant folder structure
#     subject = parts[6]  # Adjust based on your structure; should point to Sub1, Sub2, etc.
#     trial = parts[7]    # Adjust based on your structure; should point to Annotated_data_Sub01_Trial01, etc.

#     filename = os.path.basename(filepath)

#     # Extract ID based on image type and filename structure
#     if image_type == 'RGB':
#         identifier = filename.split("-")[3]  # Extract the numeric ID part
#     elif image_type == 'D':
#         identifier = filename.split("-")[3]  # Extract the numeric ID part
#     elif image_type == 'T':
#         identifier = filename.replace("T", "").split(".")[0]  # Remove 'T' and extract number

#     return subject, trial, identifier, filepath

# # Convert lists into DataFrames
# def create_dataframe(file_paths, image_type):
#     data = [extract_info(p, image_type) for p in file_paths]
#     df = pd.DataFrame(data, columns=["Subject", "Trial", "ID", f"{image_type}_Path"])
#     # Optimize memory usage by converting 'Subject' and 'Trial' to category dtype if possible
#     df["Subject"] = df["Subject"].astype("category")
#     df["Trial"] = df["Trial"].astype("category")
#     df["ID"] = pd.to_numeric(df["ID"], errors="coerce", downcast="integer")
#     return df

# df_rgb = create_dataframe(rgb_paths, 'RGB')
# df_depth = create_dataframe(depth_paths, 'D')
# df_thermal = create_dataframe(thermal_paths, 'T')

# # Sequentially match thermal images with RGB images in smaller chunks
# def match_sequentially_in_batches(rgb_df, thermal_df, batch_size=500):
#     rgb_keys = rgb_df["ID"].values
#     thermal_ids = thermal_df["ID"].values
#     matched_rgb_ids = []
    
#     # Process in batches
#     for i in range(0, len(thermal_ids), batch_size):
#         batch_ids = thermal_ids[i:i+batch_size]
#         current_index = 0  # Start with the first RGB image
        
#         for thermal_id in batch_ids:
#             if current_index < len(rgb_keys):
#                 matched_rgb_id = rgb_keys[current_index]
#                 matched_rgb_ids.append(matched_rgb_id)
#                 current_index += 1  # Increment for next match
#             else:
#                 matched_rgb_ids.append(np.nan)  # No more RGB images to match, append NaN
    
#     # Create a new column for matched RGB IDs
#     thermal_df["Matched_RGB_ID"] = matched_rgb_ids
#     return thermal_df

# # Apply sequential matching logic
# df_thermal = match_sequentially_in_batches(df_rgb, df_thermal)

# # Now proceed with merging the datasets in smaller chunks to avoid memory overload
# chunk_size = 500  # Adjust chunk size for merging
# final_df = pd.DataFrame()

# # Merge in smaller chunks
# for start in range(0, len(df_thermal), chunk_size):
#     end = min(start + chunk_size, len(df_thermal))
#     temp_thermal_chunk = df_thermal.iloc[start:end]
    
#     # Merge this chunk with RGB and Depth DataFrames
#     merged_chunk = df_rgb.merge(df_depth, on=["Subject", "Trial", "ID"], how="inner") \
#                          .merge(temp_thermal_chunk, left_on=["Subject", "Trial", "ID"], 
#                                 right_on=["Subject", "Trial", "Matched_RGB_ID"], how="left")
    
#     # Append the chunk to the final DataFrame
#     final_df = pd.concat([final_df, merged_chunk], ignore_index=True)

# # Drop redundant columns
# final_df.drop(columns=["Matched_RGB_ID"], inplace=True)

# # Rename columns
# final_df.rename(columns={"ID": "Timestamp"}, inplace=True)

# # Save the dataset
# final_df.to_csv("synced_pain_dataset.csv", index=False)

# # Preview the dataframe
# print(final_df.head())


In [None]:
import os
import pandas as pd
from glob import glob
import numpy as np

# Define base directory
base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1"

# Collect all image paths
rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)

# Debugging: Check if files are found
print(f"Found {len(rgb_paths)} RGB images, {len(depth_paths)} Depth images")

def extract_info(filepath, image_type):
    # Get relative path from base_dir to correctly identify subject and trial
    relative_path = os.path.relpath(filepath, base_dir)
    parts = relative_path.split(os.sep)
    
    # The first two directories under base_dir are Subject and Trial
    subject = parts[0]
    trial = parts[1]

    filename = os.path.basename(filepath)

    # Extract timestamp ID from filename
    if image_type in ['RGB', 'D']:
        # Split filename to get the numeric identifier (last part before extension)
        identifier_part = filename.split("-")[4]  # Example: '0121.jpg' from RGB-...-0121.jpg
        identifier = identifier_part.split(".")[0]  # '0121'
    else:
        identifier = None

    return subject, trial, identifier, filepath

# Convert lists into DataFrames
df_rgb = pd.DataFrame([extract_info(p, 'RGB') for p in rgb_paths], columns=["Subject", "Trial", "Timestamp", "RGB_Path"])
df_depth = pd.DataFrame([extract_info(p, 'D') for p in depth_paths], columns=["Subject", "Trial", "Timestamp", "Depth_Path"])

# Ensure Timestamp is numeric
df_rgb["Timestamp"] = pd.to_numeric(df_rgb["Timestamp"], errors="coerce")
df_depth["Timestamp"] = pd.to_numeric(df_depth["Timestamp"], errors="coerce")

# Merge the datasets on Subject, Trial, and Timestamp
df_final = df_rgb.merge(df_depth, on=["Subject", "Trial", "Timestamp"], how="inner")

# Save the dataset
df_final.to_csv("synced_rgb_depth_dataset.csv", index=False)

# Preview the dataframe
print(df_final.head())

In [5]:
import os
import pandas as pd
from glob import glob
import numpy as np

# Define base directory
base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1"

# Collect all image paths
rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)

# Debugging: Check if files are found
print(f"Found {len(rgb_paths)} RGB images, {len(depth_paths)} Depth images")

def extract_info(filepath, image_type):
    # Get relative path from base_dir to correctly identify subject and trial
    relative_path = os.path.relpath(filepath, base_dir)
    parts = relative_path.split(os.sep)
    
    # The first two directories under base_dir are Subject and Trial
    subject = parts[0]
    trial = parts[1]

    filename = os.path.basename(filepath)

    # Extract timestamp ID from filename
    if image_type in ['RGB', 'D']:
        # Split filename to get the numeric identifier (last part before extension)
        identifier_part = filename.split("-")[4]  # Example: '0121.jpg' from RGB-...-0121.jpg
        identifier = identifier_part.split(".")[0]  # '0121'
    else:
        identifier = None

    return subject, trial, identifier, filepath

# Convert lists into DataFrames
df_rgb = pd.DataFrame([extract_info(p, 'RGB') for p in rgb_paths], columns=["Subject", "Trial", "Timestamp", "RGB_Path"])
df_depth = pd.DataFrame([extract_info(p, 'D') for p in depth_paths], columns=["Subject", "Trial", "Timestamp", "Depth_Path"])

# Ensure Timestamp is numeric
df_rgb["Timestamp"] = pd.to_numeric(df_rgb["Timestamp"], errors="coerce")
df_depth["Timestamp"] = pd.to_numeric(df_depth["Timestamp"], errors="coerce")

# Merge the datasets on Subject, Trial, and Timestamp
df_final = df_rgb.merge(df_depth, on=["Subject", "Trial", "Timestamp"], how="inner")

# Save the dataset
df_final.to_csv("synced_rgb_depth_dataset.csv", index=False)

# Preview the dataframe
print(df_final.head())

Found 8656 RGB images, 8656 Depth images
            Subject                         Trial  Timestamp  \
0  Sub2 Fabrio Jure  Annotated_data_Sub02_Trial02       8112   
1  Sub2 Fabrio Jure  Annotated_data_Sub02_Trial02       3457   
2  Sub2 Fabrio Jure  Annotated_data_Sub02_Trial02       4113   
3  Sub2 Fabrio Jure  Annotated_data_Sub02_Trial02       4463   
4  Sub2 Fabrio Jure  Annotated_data_Sub02_Trial02       5795   

                                            RGB_Path  \
0  /Users/suryanshpatel/Projects/Directed Reading...   
1  /Users/suryanshpatel/Projects/Directed Reading...   
2  /Users/suryanshpatel/Projects/Directed Reading...   
3  /Users/suryanshpatel/Projects/Directed Reading...   
4  /Users/suryanshpatel/Projects/Directed Reading...   

                                          Depth_Path  
0  /Users/suryanshpatel/Projects/Directed Reading...  
1  /Users/suryanshpatel/Projects/Directed Reading...  
2  /Users/suryanshpatel/Projects/Directed Reading...  
3  /Users/suryans

In [10]:
df_final.head()

Unnamed: 0,Subject,Trial,Timestamp,RGB_Path,Depth_Path
0,Sub2 Fabrio Jure,Annotated_data_Sub02_Trial02,8112,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
1,Sub2 Fabrio Jure,Annotated_data_Sub02_Trial02,3457,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
2,Sub2 Fabrio Jure,Annotated_data_Sub02_Trial02,4113,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
3,Sub2 Fabrio Jure,Annotated_data_Sub02_Trial02,4463,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
4,Sub2 Fabrio Jure,Annotated_data_Sub02_Trial02,5795,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...


In [11]:
import os
import pandas as pd
from glob import glob

# Define base directory
base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1"

# Collect all image paths
rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)

def extract_metadata(filepath):
    # Get relative path from base directory
    relative_path = os.path.relpath(filepath, base_dir)
    parts = relative_path.split(os.sep)
    
    # Extract metadata from directory structure
    subject_folder = parts[0]  # Sub1 Daniel Simonsen
    trial_folder = parts[1]     # Annotated_data_Sub01_Trial01
    sweep_folder = parts[2]     # Sub01_Trial01_Sweep38_Label0
    
    # Extract numeric identifiers using split
    subject = subject_folder.split()[0].replace("Sub", "Subject_")
    trial = trial_folder.split("_")[-1].replace("Trial", "Trial_")
    
    # Extract sweep and label from sweep_folder
    sweep_parts = sweep_folder.split('_')
    sweep = sweep_parts[2].replace("Sweep", "Sweep_")
    label = sweep_parts[3].replace("Label", "Label_")
    
    # Extract timestamp from filename
    filename = os.path.basename(filepath)
    timestamp = filename.split('-')[4].split('.')[0]  # 0252 from RGB-xx-xx-xx-0252.jpg
    
    # Convert to numeric values
    sub_num = int(subject.split('_')[1])
    trial_num = int(trial.split('_')[1])
    sweep_num = int(sweep.split('_')[1])
    label_num = int(label.split('_')[1])
    timestamp_num = int(timestamp)
    
    return {
        'Subject_ID': subject,
        'Subject_Number': sub_num,
        'Trial_ID': trial,
        'Trial_Number': trial_num,
        'Sweep_ID': sweep,
        'Sweep_Number': sweep_num,
        'Label_ID': label,
        'Label_Number': label_num,
        'Timestamp': timestamp_num,
        'Full_Path': filepath,
        'Modality': 'RGB' if filepath.endswith('.jpg') else 'Depth'
    }

# Create DataFrames
df_rgb = pd.DataFrame([extract_metadata(p) for p in rgb_paths])
df_depth = pd.DataFrame([extract_metadata(p) for p in depth_paths])

# Merge RGB and Depth data
merge_keys = ['Subject_ID', 'Subject_Number', 
             'Trial_ID', 'Trial_Number',
             'Sweep_ID', 'Sweep_Number',
             'Label_ID', 'Label_Number',
             'Timestamp']

df_merged = pd.merge(
    df_rgb[df_rgb['Modality'] == 'RGB'],
    df_depth[df_depth['Modality'] == 'Depth'],
    on=merge_keys,
    suffixes=('_RGB', '_Depth')
)

# Cleanup and final columns
final_df = df_merged[[
    'Subject_ID', 'Subject_Number',
    'Trial_ID', 'Trial_Number',
    'Sweep_ID', 'Sweep_Number',
    'Label_ID', 'Label_Number',
    'Timestamp',
    'Full_Path_RGB', 'Full_Path_Depth'
]]

# Save to CSV
final_df.to_csv("structured_dataset.csv", index=False)

# Display sample
final_df.head()

Unnamed: 0,Subject_ID,Subject_Number,Trial_ID,Trial_Number,Sweep_ID,Sweep_Number,Label_ID,Label_Number,Timestamp,Full_Path_RGB,Full_Path_Depth
0,Subject_2,2,Trial_02,2,Sweep_10,10,Label_0,0,8112,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
1,Subject_2,2,Trial_02,2,Sweep_10,10,Label_0,0,3457,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
2,Subject_2,2,Trial_02,2,Sweep_10,10,Label_0,0,4113,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
3,Subject_2,2,Trial_02,2,Sweep_10,10,Label_0,0,4463,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
4,Subject_2,2,Trial_02,2,Sweep_10,10,Label_0,0,5795,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...


In [None]:
import os
import pandas as pd
from glob import glob

# Define base directory
base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1"

# Collect all image paths
rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)

def extract_metadata(filepath):
    relative_path = os.path.relpath(filepath, base_dir)
    parts = relative_path.split(os.sep)
    
    # Extract numeric identifiers directly
    subject_num = int(parts[0].split()[0].replace("Sub", ""))
    trial_num = int(parts[1].split("Trial")[-1])
    
    # Extract sweep and label numbers
    sweep_num = int(parts[2].split("Sweep")[-1].split("_")[0])
    label_num = int(parts[2].split("Label")[-1])
    
    # Extract timestamp from filename
    filename = os.path.basename(filepath)
    timestamp = int(filename.split('-')[4].split('.')[0])

    return {
        'Subject': subject_num,
        'Trial': trial_num,
        'Sweep': sweep_num,
        'Label': label_num,
        'Timestamp': timestamp,
        'Full_Path': filepath,
        'Modality': 'RGB' if filepath.endswith('.jpg') else 'Depth'
    }

# Create DataFrames
df_rgb = pd.DataFrame([extract_metadata(p) for p in rgb_paths])
df_depth = pd.DataFrame([extract_metadata(p) for p in depth_paths])

# Merge RGB and Depth data
merge_keys = ['Subject', 'Trial', 'Sweep', 'Label', 'Timestamp']

df_merged = pd.merge(
    df_rgb[df_rgb['Modality'] == 'RGB'],
    df_depth[df_depth['Modality'] == 'Depth'],
    on=merge_keys,
    suffixes=('_RGB', '_Depth')
)

# Final cleaned dataframe
final_df = df_merged[[
    'Subject', 'Trial', 'Sweep', 'Label', 'Timestamp',
    'Full_Path_RGB', 'Full_Path_Depth'
]]

# Save to CSV
final_df.to_csv("structured_dataset1.csv", index=False)


   Subject  Trial  Sweep  Label  Timestamp  \
0        2      2     10      0       8112   
1        2      2     10      0       3457   
2        2      2     10      0       4113   

                                       Full_Path_RGB  \
0  /Users/suryanshpatel/Projects/Directed Reading...   
1  /Users/suryanshpatel/Projects/Directed Reading...   
2  /Users/suryanshpatel/Projects/Directed Reading...   

                                     Full_Path_Depth  
0  /Users/suryanshpatel/Projects/Directed Reading...  
1  /Users/suryanshpatel/Projects/Directed Reading...  
2  /Users/suryanshpatel/Projects/Directed Reading...  


In [13]:

final_df.head()

Unnamed: 0,Subject,Trial,Sweep,Label,Timestamp,Full_Path_RGB,Full_Path_Depth
0,2,2,10,0,8112,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
1,2,2,10,0,3457,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
2,2,2,10,0,4113,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
3,2,2,10,0,4463,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
4,2,2,10,0,5795,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...


In [15]:
set(final_df.Label)

{0, 1, 2, 3, 4}

In [16]:
final_df.shape

(8784, 7)

In [17]:
final_df.columns

Index(['Subject', 'Trial', 'Sweep', 'Label', 'Timestamp', 'Full_Path_RGB',
       'Full_Path_Depth'],
      dtype='object')

In [26]:
import os
import pandas as pd
from glob import glob

# Define base directory
base_dir = "/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part7"

# Collect all image paths
rgb_paths = glob(os.path.join(base_dir, "**", "RGB", "*.jpg"), recursive=True)
depth_paths = glob(os.path.join(base_dir, "**", "D", "*.png"), recursive=True)

def extract_metadata(filepath):
    relative_path = os.path.relpath(filepath, base_dir)
    parts = relative_path.split(os.sep)
    
    # Extract numeric identifiers
    subject_num = int(parts[0].split()[0].replace("Sub", ""))
    trial_num = int(parts[1].split("Trial")[-1])
    sweep_num = int(parts[2].split("Sweep")[-1].split("_")[0])
    label_num = int(parts[2].split("Label")[-1])
    
    # Extract FULL timestamp from filename
    filename = os.path.basename(filepath)
    filename_parts = filename.split('-')
    
    # Combine time components (exclude modality prefix and file extension)
    timestamp_str = '-'.join(filename_parts[1:5]).split('.')[0]
    
    return {
        'Subject': subject_num,
        'Trial': trial_num,
        'Sweep': sweep_num,
        'Label': label_num,
        'Timestamp': timestamp_str,  # Now returns "11-59-53-0580"
        'Full_Path': filepath,
        'Modality': 'RGB' if filepath.endswith('.jpg') else 'Depth'
    }

# Create DataFrames
df_rgb = pd.DataFrame([extract_metadata(p) for p in rgb_paths])
df_depth = pd.DataFrame([extract_metadata(p) for p in depth_paths])

# Merge datasets
df_merged = pd.merge(
    df_rgb[df_rgb['Modality'] == 'RGB'],
    df_depth[df_depth['Modality'] == 'Depth'],
    on=['Subject', 'Trial', 'Sweep', 'Label', 'Timestamp'],
    suffixes=('_RGB', '_Depth')
)

# Final dataframe
final_df = df_merged[[
    'Subject', 'Trial', 'Sweep', 'Label', 'Timestamp',
    'Full_Path_RGB', 'Full_Path_Depth'
]]

# Save to CSV
final_df.to_csv("/Users/suryanshpatel/Projects/Directed Readings/Technical/src/data/Finaldata-part7.csv", index=False)

final_df.head(3)

Unnamed: 0,Subject,Trial,Sweep,Label,Timestamp,Full_Path_RGB,Full_Path_Depth
0,20,1,20,2,04-13-30-3001,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
1,20,1,20,2,04-13-30-4342,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...
2,20,1,20,2,04-13-30-0669,/Users/suryanshpatel/Projects/Directed Reading...,/Users/suryanshpatel/Projects/Directed Reading...


In [20]:
final_df['Full_Path_Depth'][0]

'/Users/suryanshpatel/Projects/Directed Readings/Technical/MintPain/Annotated_data_part1/Sub2 Fabrio Jure/Annotated_data_Sub02_Trial02/Sub02_Trial02_Sweep10_Label0/D/D-03-32-54-8112.png'

In [27]:
import pandas as pd
import glob

# Path to the data folder
data_folder = "/Users/suryanshpatel/Projects/Directed Readings/Technical/src/data"  # Adjust if needed

# Get all CSV files that match the pattern
csv_files = sorted(glob.glob(f"{data_folder}/Finaldata-part*.csv"))

# Combine all CSV files into one
df_combined = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Save the combined CSV
output_file = f"{data_folder}/Finaldata_combined.csv"
df_combined.to_csv(output_file, index=False)

print(f"Combined CSV saved to {output_file}")


Combined CSV saved to /Users/suryanshpatel/Projects/Directed Readings/Technical/src/data/Finaldata_combined.csv
