In [1]:
import pandas as pd
import os
from glob import glob

def load(pose_path: str, labels_path: str):
    """
    load data from csv files in the given path
    """
    # get labels
    df_labels = pd.read_csv(labels_path)

    # Get all CSV files in the folder
    csv_files = [ p for p in glob(os.path.join(pose_path, '*.csv'))]
    movie_names = [n.removeprefix(pose_path).removeprefix('/').removesuffix('_clicked.csv') + ".mp4" for n in csv_files]

    # filter out files that are not in the labels
    df_labels = df_labels[df_labels['movieName'].isin(movie_names)]

    # get labels for the movies
    movie_labels_dict = df_labels.set_index('movieName')['SKIER_LEVEL'].to_dict()
    movie_labels = [movie_labels_dict.get(n, 'Unknown') for n in movie_names]

    # Load and concatenate all CSV files into one DataFrame
    df = pd.concat(
        [pd.read_csv(f).assign(movie_name=n, style=l) for f,n,l in zip(csv_files, movie_names, movie_labels)],
        ignore_index=True
    )
    return df

df = load('../pose_outputs_loss', '../data/labeledFilms.csv')
df.head()


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,movie_name,style
0,677.0,297.0,677.0,297.0,677.0,297.0,832.282715,424.533691,850.483032,428.466125,...,792.604858,561.561401,863.404846,569.667847,771.782104,610.205078,878.055664,619.091309,trimmed_noaudio20240224173004004_0.csv.mp4,Unknown
1,851.287964,437.780212,680.0,294.0,849.370361,432.482971,680.0,294.0,840.466736,434.131042,...,851.289124,573.8927,813.992798,569.179565,869.699585,622.658081,802.528687,616.703125,trimmed_noaudio20240224173004004_0.csv.mp4,Unknown
2,670.0,301.0,670.0,301.0,670.0,301.0,846.901794,435.319244,865.920776,438.161194,...,804.543274,573.923096,871.583496,579.96814,783.53595,622.502197,888.99176,631.533447,trimmed_noaudio20240224173004004_0.csv.mp4,Unknown
3,686.0,301.0,686.0,301.0,686.0,301.0,856.71936,440.720337,686.0,301.0,...,814.11499,579.237671,883.382263,586.953979,792.45813,627.463928,900.482117,634.007812,trimmed_noaudio20240224173004004_0.csv.mp4,Unknown
4,690.0,304.0,690.0,304.0,690.0,304.0,879.94519,443.308289,690.0,304.0,...,826.695679,588.380005,893.711548,596.905518,796.821045,634.917725,905.081299,645.603027,trimmed_noaudio20240224173004004_0.csv.mp4,Unknown


# Normalizacja
Usunięcie pustych wierszy

TODO wywalić jak jest za mało pod rząd

In [47]:
def remove_empty_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove rows with NaN values in the DataFrame.
    Remove smaller than 40 records parts of data
    """
    keypoints_indexes = [str(i) for i in range(0,34)]
    delimiter_mask = df[keypoints_indexes].isnull().all(axis=1)

    # Get the indexes of delimiter rows
    delimiter_indices = df[delimiter_mask].index.tolist()

    # Add start and end for slicing
    all_indices = [-1] + delimiter_indices + [len(df)]

    # Split into chunks
    chunks = []
    for i in range(len(all_indices) - 1):
        start = all_indices[i] + 1
        end = all_indices[i + 1]
        chunk = df.iloc[start:end]
        if len(chunk) >= 40:
            chunks.append(chunk.reset_index(drop=True))

    # Concatenate all chunks into a single DataFrame
    return pd.concat(chunks, ignore_index=True)

df = load('../pose_outputs_clicked', '../data/labeledFilms.csv')
print("Before cleanup:", df.last_valid_index())

df = remove_empty_rows(df)
print("After cleanup:", df.last_valid_index())
print(f"Are any nan values left? {df.isnull().values.any()}")

Before cleanup: 4701
After cleanup: 4440
Are any nan values left? False


Przesunięcie punktów pozycji na relatywne względem punktu pomiędzy prawym a lewym biodrem

In [3]:
def normalize_translation(df: pd.DataFrame):
    LEFT_HIP_X = "22"
    LEFT_HIP_Y = "23"
    RIGHT_HIP_X = "24"
    RIGHT_HIP_Y = "25"

    # calculate hip_center
    hip_center_x = (df[[LEFT_HIP_X, RIGHT_HIP_X]].mean(axis=1)).to_frame(name='hip_center_x')
    hip_center_y = (df[[LEFT_HIP_Y, RIGHT_HIP_Y]].mean(axis=1)).to_frame(name='hip_center_y')
    df = df.assign(hip_center_x=hip_center_x, hip_center_y=hip_center_y)

    # shift all keypoints relative to hip_center
    keypoints_indexes_x = [str(i) for i in range(0,34,2)]  # Assuming 17 keypoints
    keypoints_indexes_y = [str(i) for i in range(1,34,2)]  # Assuming 17 keypoints
    normalized = df.copy()
    normalized[keypoints_indexes_x] = df[keypoints_indexes_x] - hip_center_x.values
    normalized[keypoints_indexes_y] = df[keypoints_indexes_y] - hip_center_y.values
    return normalized


df = load('../pose_outputs_clicked', '../data/labeledFilms.csv')
df.head()
translated_df = normalize_translation(df)
# Display the first few rows of the left and right hip columns to verify translation
translated_df.loc[:,["22", "24", "hip_center_x"]].head()
translated_df.loc[:,["23", "25", "hip_center_y"]].head()

Unnamed: 0,23,25,hip_center_y
0,-0.430756,0.430756,323.137238
1,-0.310852,0.310852,325.179321
2,-0.078369,0.078369,326.03479
3,0.130646,-0.130646,327.742035
4,-0.656021,0.656021,333.289993


Przeskalowanie

In [4]:
import numpy as np

def normalize_scale(df: pd.DataFrame):
    LEFT_HIP_X = "22"
    LEFT_HIP_Y = "23"
    RIGHT_HIP_X = "24"
    RIGHT_HIP_Y = "25"

    # Compute reference distance (hip width)
    distance_x = df[RIGHT_HIP_X] - df[LEFT_HIP_X]
    distance_y = df[RIGHT_HIP_Y] - df[LEFT_HIP_Y]
    ref_distance = np.sqrt(distance_x**2 + distance_y**2)

    # # Avoid division by zero
    epsilon = 1e-6
    scale_factor = 1.0 / (ref_distance + epsilon)

    # Scale all keypoints
    keypoints_indexes = [str(i) for i in range(0,34)]  # Assuming 17 keypoints
    normalized = df.copy()
    normalized[keypoints_indexes] = df[keypoints_indexes].mul(scale_factor, axis=0)
    return normalized

df = load('../pose_outputs_clicked', '../data/labeledFilms.csv')
df = remove_empty_rows(df)
normalized = normalize_scale(df)
normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,movie_name,style
0,88.82205,18.850522,88.736792,18.663842,88.809815,18.66875,88.488415,18.768178,89.097343,18.780323,...,88.303893,24.815439,89.68167,25.062006,87.767392,27.124232,89.526768,27.448161,trimmed_noaudio20240320142001028.mp4,intermediate
1,77.72967,16.422781,77.631624,16.248943,77.758449,16.248532,77.336516,16.36749,78.117747,16.358029,...,77.320602,21.675615,78.730237,21.84054,76.902724,23.614452,78.605154,23.888757,trimmed_noaudio20240320142001028.mp4,intermediate
2,75.705635,16.076459,75.829542,15.912142,75.554597,15.945213,76.03543,15.962492,75.364774,16.03191,...,76.745324,21.148593,75.325617,21.227505,76.562345,22.950963,75.106047,23.048736,trimmed_noaudio20240320142001028.mp4,intermediate
3,86.04508,18.443988,86.063844,18.265247,86.171268,18.279824,86.220501,18.392563,86.883651,18.341875,...,86.423245,24.285231,87.753525,24.401708,86.064626,26.423989,87.775178,26.570533,trimmed_noaudio20240320142001028.mp4,intermediate
4,83.498154,18.049917,83.338918,17.870794,83.494173,17.854367,82.851757,17.972473,83.663308,17.958643,...,83.130301,23.694397,84.508631,23.828682,82.938851,25.729682,84.634351,26.010848,trimmed_noaudio20240320142001028.mp4,intermediate


Rotacja

In [5]:
def normalize_rotation(df: pd.DataFrame):
    LEFT_HIP_X = "22"
    LEFT_HIP_Y = "23"
    RIGHT_HIP_X = "24"
    RIGHT_HIP_Y = "25"

    vx = df[RIGHT_HIP_X] - df[LEFT_HIP_X]
    vy = df[RIGHT_HIP_Y] - df[LEFT_HIP_Y]
    angle = np.arctan2(vy, vx)


    # Extract X and Y columns
    x_cols = [str(i) for i in range(0, 34, 2)]
    y_cols = [str(i) for i in range(1, 34, 2)]

    x = df[x_cols].values  # shape (N, 17)
    y = df[y_cols].values  # shape (N, 17)

    # Compute sin and cos of angles
    cos_a = np.cos(angle.values.reshape(-1, 1))  # shape (N, 1)
    sin_a = np.sin(angle.values.reshape(-1, 1))  # shape (N, 1)

    # Apply rotation:
    # x' = x * cos - y * sin
    # y' = x * sin + y * cos
    x_rot = x * cos_a - y * sin_a
    y_rot = x * sin_a + y * cos_a

    # Build rotated DataFrame
    rotated = df.copy()
    rotated[x_cols] = x_rot
    rotated[y_cols] = y_rot

    return rotated

df = load('../pose_outputs_clicked', '../data/labeledFilms.csv')
normalized = normalize_rotation(df)
normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,movie_name,style
0,1236.246469,342.333826,1235.20507,339.627985,1236.230538,339.760102,1231.612803,340.885262,1240.188869,341.58111,...,1223.801059,425.99918,1243.016769,430.663004,1214.246768,458.093431,1238.776788,464.176899,trimmed_noaudio20240320142001028.mp4,intermediate
1,1252.270863,315.062747,1250.786477,312.178328,1252.846626,312.250497,1245.919655,313.920298,1258.614243,314.252326,...,1242.361094,400.124672,1265.15378,403.67975,1234.368567,431.355364,1261.848802,436.869005,trimmed_noaudio20240320142001028.mp4,intermediate
2,-1270.302768,-257.353895,-1272.351996,-254.582794,-1267.752889,-255.179697,-1275.807735,-255.393687,-1264.587662,-256.661291,...,-1288.508625,-342.129995,-1264.746278,-343.673993,-1285.726928,-372.341557,-1261.354771,-374.207133,trimmed_noaudio20240320142001028.mp4,intermediate
3,1266.327179,247.924679,1266.555578,245.299251,1268.134332,245.484893,1268.885599,247.124903,1278.59481,246.20849,...,1273.397731,333.464398,1292.931417,334.824483,1268.698861,364.91445,1293.815567,366.615991,trimmed_noaudio20240320142001028.mp4,intermediate
4,1246.413228,384.111489,1244.226087,381.177909,1246.609232,381.131745,1236.68245,382.08539,1249.04514,382.939809,...,1233.411997,469.487384,1254.201661,473.338427,1227.829457,500.19503,1253.2509,506.696441,trimmed_noaudio20240320142001028.mp4,intermediate


Zamiana na wartości od 0 do 1

In [6]:
def normalize_value(df: pd.DataFrame):
    """
    Normalize the values in the DataFrame to be between 0 and 1.
    """
    keypoints_indexes = [str(i) for i in range(0,34)]

    # Assuming the DataFrame contains only numeric columns
    min_values = df[keypoints_indexes].min(axis=1)
    max_values = df[keypoints_indexes].max(axis=1)
    # Avoid division by zero
    epsilon = 1e-6
    normalized_df = df.copy()
    subtracted_df = df[keypoints_indexes].sub(min_values, axis=0)

    normalized_df[keypoints_indexes] = subtracted_df.div(max_values - min_values + epsilon, axis=0)
    print(normalized_df.head())

    return normalized_df

df = load('../pose_outputs_clicked', '../data/labeledFilms.csv')
df = remove_empty_rows(df)
normalized = normalize_value(df)
normalized.head()

          0         1         2         3         4         5         6  \
0  0.967382  0.002574  0.966207  0.000000  0.967213  0.000068  0.962782   
1  0.965892  0.002738  0.964352  0.000006  0.966344  0.000000  0.959716   
2  0.966504  0.002656  0.968507  0.000000  0.964063  0.000535  0.971835   
3  0.956760  0.002523  0.957025  0.000000  0.958541  0.000206  0.959236   
4  0.967359  0.002882  0.965013  0.000242  0.967301  0.000000  0.957834   

          7         8         9  ...        26        27        28        29  \
0  0.001439  0.971178  0.001606  ...  0.960237  0.084822  0.979235  0.088222   
1  0.001869  0.971989  0.001720  ...  0.959466  0.085262  0.981612  0.087853   
2  0.000814  0.960995  0.001936  ...  0.983310  0.084642  0.960362  0.085918   
3  0.001797  0.968597  0.001082  ...  0.962098  0.084976  0.980876  0.086620   
4  0.001740  0.969793  0.001537  ...  0.961938  0.086062  0.982250  0.088040   

         30        31        32        33  \
0  0.952840  0.116657  

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,movie_name,style
0,0.967382,0.002574,0.966207,0.0,0.967213,6.8e-05,0.962782,0.001439,0.971178,0.001606,...,0.960237,0.084822,0.979235,0.088222,0.95284,0.116657,0.977099,0.121123,trimmed_noaudio20240320142001028.mp4,intermediate
1,0.965892,0.002738,0.964352,6e-06,0.966344,0.0,0.959716,0.001869,0.971989,0.00172,...,0.959466,0.085262,0.981612,0.087853,0.952901,0.115721,0.979646,0.120031,trimmed_noaudio20240320142001028.mp4,intermediate
2,0.966504,0.002656,0.968507,0.0,0.964063,0.000535,0.971835,0.000814,0.960995,0.001936,...,0.98331,0.084642,0.960362,0.085918,0.980352,0.113776,0.956812,0.115356,trimmed_noaudio20240320142001028.mp4,intermediate
3,0.95676,0.002523,0.957025,0.0,0.958541,0.000206,0.959236,0.001797,0.968597,0.001082,...,0.962098,0.084976,0.980876,0.08662,0.957036,0.115166,0.981182,0.117235,trimmed_noaudio20240320142001028.mp4,intermediate
4,0.967359,0.002882,0.965013,0.000242,0.967301,0.0,0.957834,0.00174,0.969793,0.001537,...,0.961938,0.086062,0.98225,0.08804,0.959117,0.116055,0.984103,0.120198,trimmed_noaudio20240320142001028.mp4,intermediate


# Normalizacja kończyn
Każde osobno

Podsumowanie

In [46]:
def normalize_pose(frame_keypoints):
    steps = [
        remove_empty_rows,
        normalize_translation,
        normalize_scale,
        normalize_rotation,
        # Optional: normalize_limb for each limb
        normalize_value
    ]
    normalized = frame_keypoints.copy()
    for step in steps:
        normalized = step(normalized)
    return normalized
df = load('../pose_outputs_clicked', '../data/labeledFilms.csv')
normalized = normalize_pose(df)


Delimiter mask created: 261
Delimiter indices found: [2820, 2821, 4443, 4444, 4445, 4446, 4447, 4448, 4449, 4450, 4451, 4452, 4453, 4454, 4455, 4456, 4457, 4458, 4459, 4460, 4461, 4462, 4463, 4464, 4465, 4466, 4467, 4468, 4469, 4470, 4471, 4472, 4473, 4474, 4475, 4476, 4477, 4478, 4479, 4480, 4481, 4482, 4483, 4484, 4485, 4486, 4487, 4488, 4489, 4490, 4491, 4492, 4493, 4494, 4495, 4496, 4497, 4498, 4499, 4500, 4501, 4502, 4503, 4504, 4505, 4506, 4507, 4508, 4509, 4510, 4511, 4512, 4513, 4514, 4515, 4516, 4517, 4518, 4519, 4520, 4521, 4522, 4523, 4524, 4525, 4526, 4527, 4528, 4529, 4530, 4531, 4532, 4533, 4534, 4535, 4536, 4537, 4538, 4539, 4540, 4541, 4542, 4543, 4544, 4545, 4546, 4547, 4548, 4549, 4550, 4551, 4552, 4553, 4554, 4555, 4556, 4557, 4558, 4559, 4560, 4561, 4562, 4563, 4564, 4565, 4566, 4567, 4568, 4569, 4570, 4571, 4572, 4573, 4574, 4575, 4576, 4577, 4578, 4579, 4580, 4581, 4582, 4583, 4584, 4585, 4586, 4587, 4588, 4589, 4590, 4591, 4592, 4593, 4594, 4595, 4596, 4597, 4598

# LSTM
## Trening

In [8]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# load the data
df = load('../pose_outputs_loss', '../data/labeledFilms.csv')
df['style'] = df['style'].astype(str)  # ensure it's str (for label encoding)

# Prepare the data
SEQUENCE_LENGTH = 40  # truncate or pad to this length
FEATURE_SIZE = len(df.columns) - 2  # exclude 'style' and 'movie_name'
sequences = []
labels = []

for movie_name, group in df.groupby("movie_name"):
    group = group.drop(columns=["movie_name"])  # keep only keypoints + label
    label = group['style'].iloc[0]  # assume label same for the whole clip
    keypoints = group.drop(columns=['style']).values.astype(np.float32)

    # Truncate or pad
    if len(keypoints) >= SEQUENCE_LENGTH:
        keypoints = keypoints[:SEQUENCE_LENGTH]
    else:
        pad_len = SEQUENCE_LENGTH - len(keypoints)
        padding = np.zeros((pad_len, keypoints.shape[1]), dtype=np.float32)
        keypoints = np.vstack((keypoints, padding))

    sequences.append(keypoints)
    labels.append(label)

# Encode labels
le = LabelEncoder()
y = le.fit_transform(labels)
X = np.array(sequences, dtype=np.float32)
y = np.array(y, dtype=np.int64)
print(f"X shape: {X.shape}, y shape: {y.shape}")  # (n_clips, seq_len, features)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

class PoseSequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X)
        self.y = torch.tensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(PoseSequenceDataset(X_train, y_train), batch_size=32, shuffle=True)
val_loader = DataLoader(PoseSequenceDataset(X_val, y_val), batch_size=32)

class LSTMPoseClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # last time step
        return self.fc(out)

# Initialize the model, loss function, and optimizer
input_size = X.shape[2]
model = LSTMPoseClassifier(input_size=input_size, hidden_size=64, num_classes=len(le.classes_))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    for X_batch, y_batch in train_loader:
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


X shape: (77, 40, 34), y shape: (77,)
Epoch 1, Loss: nan
Epoch 2, Loss: nan
Epoch 3, Loss: nan
Epoch 4, Loss: nan
Epoch 5, Loss: nan
Epoch 6, Loss: nan
Epoch 7, Loss: nan
Epoch 8, Loss: nan
Epoch 9, Loss: nan
Epoch 10, Loss: nan


## Ewaluacja

In [9]:
model.eval()
correct = total = 0
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        preds = model(X_batch)
        predicted = preds.argmax(dim=1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

print("Validation Accuracy:", correct / total)

Validation Accuracy: 1.0


# DTW + k-NN
## Train

In [10]:
from tslearn.metrics import dtw
from tslearn.neighbors import KNeighborsTimeSeriesClassifier

# Prepare the data
SEQUENCE_LENGTH = 40  # truncate or pad to this length
FEATURE_SIZE = len(df.columns) - 2  # exclude 'style' and 'movie_name'
df = load('../pose_outputs_loss', '../data/labeledFilms.csv')
df['style'] = df['style'].astype(str)  # ensure it's str (for label encoding)

# Suppose each sample is a (T, D) array: T=timesteps, D=features (e.g. 34 keypoints)
# You need to group rows per movie into a sequence
X_sequences = []
y = []

for movie_name, group in df.groupby("movie_name"):
    coords = group.drop(columns=["movie_name", "style"]).values  # shape (T, D)
    X_sequences.append(coords)
    y.append(group["style"].iloc[0])

# Convert to proper time series format (num_samples, T, D)
from tslearn.utils import to_time_series_dataset
X_ts = to_time_series_dataset(X_sequences)  # handles padding internally

# Train a classifier
knn = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
knn.fit(X_ts, y)

# Predict on a new clip
new_clip_sequence = np.random.rand(SEQUENCE_LENGTH, FEATURE_SIZE).astype(np.float32)  # Example new clip
y_pred = knn.predict([new_clip_sequence])
print("Predicted label for new clip:", y_pred)

ValueError: One of the input time series contains only nans or has zero length.

## Evaluate

In [None]:
# TODO