In [13]:
import os
import json
import time
import threading

import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from scipy.interpolate import CubicSpline
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor

In [14]:
sample_video = {
    "positions": {
        "1.0": {
            "Nose": [1, 1, 1],
            "Left_eye": [1, 1, 1],
        },
        "2.0": {
            "Nose": [2, 2, 2],
            "Left_eye": [2, 2, 2],
        },
        "3.0": {
            "Nose": [3, 3, 3],
            "Left_eye": [3, 3, 3],
        },
        "4.0": {
            "Nose": [4, 4, 4],
            "Left_eye": [4, 4, 4],
        },
    }
}

sample_video["positions"]["1.0"]["Nose"]

[1, 1, 1]

  <!-- for column in row.index:
            # Extract joint name and position (e.g., Nose_x)
            joint = '_'.join(column.split('_')[1:])
            if joint not in mean_data_series:
                mean_data_series[joint] = []
            mean_data_series[joint].append(row[column]) -->

Let say we have
a[1_a_x:5, 1_a_y:4, 2_a_x:3, 2_a_y:7]
The above code give:
result = {
a_x:[5,3],
a_y:[4,7]
}


In [15]:
def getFilesPath(directory):
    files = []
    for root, dirs, file in os.walk(directory):
        for f in file:
            files.append(os.path.join(root, f))
    return files


def loadFiles(files_path, activities, joints_mapping, target_frames=30, max_workers=4):
    """
    Load and process activity data files.

    This function loads JSON files containing activity data, resamples the data to a target number of frames,
    flattens the data, and combines it into a single DataFrame. The function filters files based on the specified
    activities and processes only those files.

    Parameters:
    - files_path (list of str): List of file paths to be processed.
    - activities (list of str): List of activity names to filter the files.
    - joints_mapping (list): Dictionary mapping joint names to their indices.
    - target_frames (int, optional): The target number of frames to resample the data to. Default is 30.

    Returns:
    - pd.DataFrame: A DataFrame containing the processed and combined activity data. If no data is processed, 
    an empty DataFrame is returned.
    """
    activity_dfs = []
    for activity in activities:
        for file_path in files_path:
            if file_path.endswith('.json') and activity in file_path:
                with open(file_path, 'r') as file:
                    file_data = json.load(file)
                    resample_file_data = resample_video(file_data, joints_mapping, target_frames)
                    if resample_file_data is None:
                        return None
                    file_df = flatternData(resample_file_data, joints_mapping)
                    file_df.insert(0, 'Group', file_path.split('\\')[-1].split('-')[0])
                    file_df.insert(1, 'Activity', activity)
                    return file_df
    if activity_dfs:
        final_df = pd.concat(activity_dfs, ignore_index=True)
        return final_df
    else:
        return pd.DataFrame()


def flatternData(data, joints_mapping):
    # Create a list of dictionaries for each timestamp and joint
    flattened_data = [
        {f'{timestamp}_{joint}_{axis}': position[joint][i]
            for joint in joints_mapping if joint in position
            for i, axis in enumerate(['x', 'y', 'z'])}
        for timestamp, position in data.items()
    ]

    # Flatten the list of dictionaries into a single dictionary
    flattened_data = {k: v for d in flattened_data for k, v in d.items()}
    return pd.DataFrame([flattened_data])


def resample_video(video_data, joint_mapping, target_frames=30):
    # Get the original frame numbers and convert to float
    original_frames = np.array([float(k) for k in video_data['positions'].keys()])
    # check that video has at least 4 frames

    if len(original_frames) < 4:
        return None
    # Create new evenly spaced frames
    new_frames = np.linspace(min(original_frames), max(original_frames), target_frames)

    # Get all unique joints
    joints = list(next(iter(video_data['positions'].values())).keys())

    # Initialize the resampled data structure
    resampled_data = {str(float(i)): {} for i in range(1, target_frames + 1)}

    # Interpolate each joint's coordinates
    for joint in joint_mapping:
        # Extract x, y, z coordinates for the current joint across all frames
        coords = np.array([video_data['positions'][str(frame)][joint] for frame in original_frames])
        x_coords, y_coords, z_coords = coords[:, 0], coords[:, 1], coords[:, 2]

        # Create cubic interpolation functions for each coordinate
        x_interp = interp1d(original_frames, x_coords, kind='cubic')
        y_interp = interp1d(original_frames, y_coords, kind='cubic')
        z_interp = interp1d(original_frames, z_coords, kind='cubic')

        # Apply interpolation to get new coordinates
        for i, frame in enumerate(new_frames, 1):  # Start counting from 1
            resampled_data[str(float(i))][joint] = [
                float(x_interp(frame)),
                float(y_interp(frame)),
                float(z_interp(frame))
            ]
    return resampled_data


def calculateMeanForAllVideos(df):
    # Store the group and activity columns
    labed_df = df[['Group', 'Activity']]

    # Ignore the group and activity columns for mean calculation
    df = df.drop(columns=['Group', 'Activity'])

    # Extract joint names and coordinates
    joint_names = df.columns.str.split('_').str[1:].str.join('_')

    # Calculate mean for each joint
    mean_df = df.T.groupby(joint_names).mean().T

    # Add back the group and activity columns
    mean_df.insert(0, 'Group', labed_df['Group'])
    mean_df.insert(1, 'Activity', labed_df['Activity'])

    return mean_df


def group_by_activity(dfs):
    # Concatenate all DataFrames in the list
    merged_df = pd.concat(dfs, ignore_index=True)

    # Get unique activities
    activities = merged_df['Activity'].unique()

    # Split the merged DataFrame by activity
    split_dfs = {activity: merged_df[merged_df['Activity'] == activity] for activity in activities}

    return split_dfs

In [16]:
# List of joint names in OpenPose
openpose_joint_mapping = [
    "Nose",
    "Left_eye",
    "Right_eye",
    "Left_ear",
    "Right_ear",
    "Left_shoulder",
    "Right_shoulder",
    "Left_elbow",
    "Right_elbow",
    "Left_wrist",
    "Right_wrist",
    "Left_hip",
    "Right_hip",
    "Left_knee",
    "Right_knee",
    "Left_ankle",
    "Right_ankle"
]

In [25]:
# path to data
root_path = os.getcwd()
group1A_path = os.path.join(root_path, 'Group1A', 'blazepose')
group2A_path = os.path.join(root_path, 'Group2A', 'blazepose')
group3_path = os.path.join(root_path, 'Group3', 'blazepose')

# group1B_path = os.path.join(root_path, 'Group1B', 'blazepose')
group2B_path = os.path.join(root_path, 'Group2B', 'blazepose')

# activities
activities = ['CTK', 'ELK', 'RTK']
# activities = ['ELK']

# Get paths
group1A_files_path = getFilesPath(group1A_path)
group2A_files_path = getFilesPath(group2A_path)
group3_files_path = getFilesPath(group3_path)

# group1B_files_path = getFilesPath(group1B_path)
group2B_files_path = getFilesPath(group2B_path)

# Load data
# group1A_df = loadFiles(group1A_files_path, activities, openpose_joint_mapping, 200)
group2A_df = loadFiles(group2A_files_path, activities, openpose_joint_mapping, 200, 4)
# group3_df = loadFiles(group3_files_path, activities, openpose_joint_mapping, 200)

# Store data to csv
# group1A_df.to_csv('group1A.csv', index=False)
group2A_df.to_csv('group2A.csv', index=False)
# group3_df.to_csv('group3.csv', index=False)

# group1B_df = loadFiles(group1B_files_path, activities,openpose_joint_mapping )
# group2B_df = loadFiles(group2B_files_path, activities,openpose_joint_mapping)

G2A-BP-CTK-S1-Brest-029.json on ThreadPoolExecutor-9_0
G2A-BP-CTK-S1-Brest-034.json on ThreadPoolExecutor-9_1
G2A-BP-CTK-S1-Brest-090.json on ThreadPoolExecutor-9_2
G2A-BP-CTK-S1-Brest-093.json on ThreadPoolExecutor-9_3
G2A-BP-CTK-S1-Brest-096.json on ThreadPoolExecutor-9_3
G2A-BP-CTK-S1-Roscoff-009.json on ThreadPoolExecutor-9_0
G2A-BP-CTK-S1-Roscoff-012.json on ThreadPoolExecutor-9_1
G2A-BP-CTK-S1-Roscoff-013.json on ThreadPoolExecutor-9_2
G2A-BP-CTK-S1-Roscoff-043.json on ThreadPoolExecutor-9_1
G2A-BP-CTK-S1-Roscoff-051.json on ThreadPoolExecutor-9_3
G2A-BP-CTK-S1-Roscoff-059.json on ThreadPoolExecutor-9_0
G2A-BP-CTK-S2-Roscoff-003.json on ThreadPoolExecutor-9_2
G2A-BP-CTK-S2-Roscoff-018.json on ThreadPoolExecutor-9_1
G2A-BP-CTK-S2-Roscoff-021.json on ThreadPoolExecutor-9_3
G2A-BP-CTK-S2-Roscoff-025.json on ThreadPoolExecutor-9_0
G2A-BP-CTK-S3-Roscoff-077.json on ThreadPoolExecutor-9_1
G2A-BP-CTK-S3-Roscoff-091.json on ThreadPoolExecutor-9_3
G2A-BP-CTK-S4-Roscoff-001.json on ThreadP

In [18]:
# Load data from csv
group1A_df = pd.read_csv('group1A.csv')
group2A_df = pd.read_csv('group2A.csv')
group3_df = pd.read_csv('group3.csv')
# group1B_df = pd.read_csv('group1B.csv')
# group2B_df = pd.read_csv('group2B.csv')
group2A_df.head(2)

FileNotFoundError: [Errno 2] No such file or directory: 'group1A.csv'

In [162]:
# Calculate mean for all videos
group1A_mean_df = calculateMeanForAllVideos(group1A_df)
group2A_mean_df = calculateMeanForAllVideos(group2A_df)
group3_mean_df = calculateMeanForAllVideos(group3_df)

# Add labels
group1A_mean_df.insert(2, 'Backpain', 1)
group2A_mean_df.insert(2, 'Backpain', 0)
group3_mean_df.insert(2, 'Backpain', 0)

grouped_activities = group_by_activity([group1A_mean_df, group2A_mean_df, group3_mean_df])

grouped_activities['CTK'].head(2)

Unnamed: 0,Group,Activity,Backpain,Left_ankle_x,Left_ankle_y,Left_ankle_z,Left_ear_x,Left_ear_y,Left_ear_z,Left_elbow_x,...,Right_hip_z,Right_knee_x,Right_knee_y,Right_knee_z,Right_shoulder_x,Right_shoulder_y,Right_shoulder_z,Right_wrist_x,Right_wrist_y,Right_wrist_z
0,G1A,CTK,1,0.470761,0.758881,0.037198,0.469344,0.370807,-0.036697,0.457347,...,0.004945,0.47637,0.66242,0.017133,0.474796,0.424234,-0.017197,0.489964,0.433962,-0.027319
1,G1A,CTK,1,0.498472,0.794799,0.05396,0.505062,0.423639,-0.022104,0.485954,...,-0.008531,0.514236,0.704524,0.005048,0.517393,0.474906,-0.037333,0.531627,0.480544,-0.046474


In [163]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Assuming merged_df_by_activity is already defined and contains the merged DataFrame for each activity
df_ctk = grouped_activities['CTK']

# Prepare the data
X = df_ctk.drop(columns=['Group', 'Activity', 'Backpain'])  # Features
y = df_ctk['Backpain']  # Labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Define the parameter grid for the KNeighborsClassifier
param_grid = {
    'n_neighbors': [3, 4, 5],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Initialize the KNeighborsClassifier
knn_clf = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=knn_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=3)

# Perform grid search and cross-validation
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Print the best parameters and best score
print(f"Best parameters: {best_params}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# Evaluate the model
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters: {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'}
Best cross-validation score: 0.9824154589371981
Test set accuracy: 0.9827586206896551


In [164]:
group2B_df = loadFiles(group2B_files_path, activities, openpose_joint_mapping)

group2B_mean_df = calculateMeanForAllVideos(group2B_df)
group2B_mean_df.insert(2, 'Backpain', 0)

grouped2B_activities = group_by_activity([group2B_mean_df])

In [165]:
# use the above model to predict the backpain for group2B
df_2B = grouped2B_activities['CTK']
X_2B = df_2B.drop(columns=['Group', 'Activity', 'Backpain'])
y_2B = df_2B['Backpain']

y_pred_2B = grid_search.predict(X_2B)
accuracy_2B = accuracy_score(y_2B, y_pred_2B)

print(f"Group 2B test set accuracy: {accuracy_2B}")
# print(group2B_mean_df)

Group 2B test set accuracy: 0.40350877192982454


In [166]:
a = flatternData(sample_video['positions'], openpose_joint_mapping)
a

Unnamed: 0,1.0_Nose_x,1.0_Nose_y,1.0_Nose_z,1.0_Left_eye_x,1.0_Left_eye_y,1.0_Left_eye_z,2.0_Nose_x,2.0_Nose_y,2.0_Nose_z,2.0_Left_eye_x,...,3.0_Nose_z,3.0_Left_eye_x,3.0_Left_eye_y,3.0_Left_eye_z,4.0_Nose_x,4.0_Nose_y,4.0_Nose_z,4.0_Left_eye_x,4.0_Left_eye_y,4.0_Left_eye_z
0,1,1,1,1,1,1,2,2,2,2,...,3,3,3,3,4,4,4,4,4,4


In [167]:
def calculateStatisticalFeatures(df):
    features = {}
    for column in df.columns:
        features[f'{column}_mean'] = df[column].mean()
        features[f'{column}_std'] = df[column].std()
        features[f'{column}_min'] = df[column].min()
        features[f'{column}_max'] = df[column].max()
    return features


def labelData(dataframes, label):
    labeled_data = []
    for df in dataframes:
        features = calculateStatisticalFeatures(df)
        features['label'] = label
        labeled_data.append(features)
    return labeled_data


def combineData(labeled_data):
    return pd.DataFrame(labeled_data)

In [168]:
group2A_labeled_data = labelData(group2A_df, label=0)

AttributeError: 'str' object has no attribute 'columns'