In [None]:
# Mount Google Drive so that files stored under MyDrive are reachable through
# the local filesystem of the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np

# Raw motion-capture array. The indexing below treats axis 1 as the frame axis
# and the last axis as the 3 coordinates.
data = np.load("/content/drive/MyDrive/Gsoc_dance/mariel_betternot_and_retrograde.npy")

# Express every track relative to index 0 of the first axis, frame by frame
# (presumably the root joint -- confirm against the dataset description).
reference_track = data[0, :, :].reshape(1, -1, 3)
data -= reference_track

# Number of frames per extracted window.
sequence_length = 40

# Only complete windows are kept; trailing frames that do not fill a whole
# window are dropped by the integer division.
num_sequences = data.shape[1] // sequence_length
window_starts = range(0, num_sequences * sequence_length, sequence_length)
sequences = np.array(
    [data[:, start:start + sequence_length, :] for start in window_starts]
)

np.save("/content/dance_sequences.npy", sequences)
print(f"Extracted {len(sequences)} sequences, each of shape {sequences.shape[1:]}")


Extracted 273 sequences, each of shape (55, 40, 3)


In [None]:
import json

sequences = np.load("/content/dance_sequences.npy")

# Number of leading sequences to label by hand. Capped at the number of
# sequences actually available so that no label is recorded for an index that
# does not exist (the clustering step later indexes by these keys).
NUM_MANUAL_LABELS = 30
num_to_label = min(NUM_MANUAL_LABELS, len(sequences))

manual_labels = {}
for idx in range(num_to_label):
    # Strip surrounding whitespace so "stand" and "stand " do not become two
    # different vocabulary entries downstream.
    manual_labels[idx] = input(f"Enter label for sequence {idx}: ").strip()

# json.dump turns the integer keys into strings; readers must convert back.
with open("/content/manual_labels.json", "w") as f:
    json.dump(manual_labels, f)

print("Manual labeling complete. Labels saved.")


Enter label for sequence 0: stand
Enter label for sequence 1: stand
Enter label for sequence 2: stand
Enter label for sequence 3: stand
Enter label for sequence 4: stand
Enter label for sequence 5: stand
Enter label for sequence 6: walk
Enter label for sequence 7: walk
Enter label for sequence 8: walk
Enter label for sequence 9: lie_down
Enter label for sequence 10: lie_down
Enter label for sequence 11: lie_down
Enter label for sequence 12: lie_down
Enter label for sequence 13: lie_down
Enter label for sequence 14: lie_down
Enter label for sequence 15: lie_down
Enter label for sequence 16: flip_horizontaly
Enter label for sequence 17: lie_down
Enter label for sequence 18: raise_hand
Enter label for sequence 19: flip_horizontaly
Enter label for sequence 20: flip_horizontaly
Enter label for sequence 21: sleep_dance
Enter label for sequence 22: stand
Enter label for sequence 23: stand_dance
Enter label for sequence 24: stand_dance
Enter label for sequence 25: stand_dance
Enter label for s

In [None]:
from collections import Counter, defaultdict

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Convert sequences to features using motion differences: the frame-to-frame
# difference along axis 2 is averaged over that axis, then everything but the
# sample axis is flattened.
features = np.mean(np.diff(sequences, axis=2), axis=2).reshape(len(sequences), -1)

# Reduce dimensionality
pca = PCA(n_components=10)
features_reduced = pca.fit_transform(features)

# Applying KMeans Clustering
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(features_reduced)

# Assign labels based on cluster similarity
with open("/content/manual_labels.json", "r") as f:
    manual_labels = json.load(f)

# Tally the manual labels that fall into each cluster. A plain assignment
# (cluster -> label) would let the last labelled sequence of a cluster win,
# even when most labelled members of that cluster disagree with it.
cluster_votes = defaultdict(Counter)
for idx, label in manual_labels.items():
    # JSON object keys are strings; convert back to a sequence index.
    cluster_id = int(clusters[int(idx)])
    cluster_votes[cluster_id][label] += 1

# Majority label per cluster; ties go to the label encountered first.
cluster_labels = {
    cluster_id: votes.most_common(1)[0][0]
    for cluster_id, votes in cluster_votes.items()
}

# Clusters that contain no manually labelled sequence fall back to "unknown".
sequence_labels = [cluster_labels.get(int(c), "unknown") for c in clusters]


with open("/content/auto_labels.json", "w") as f:
    json.dump(sequence_labels, f)

print("Auto-labeling complete. Labels saved.")


Auto-labeling complete. Labels saved.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import json

class DanceTextContrastiveModel(nn.Module):
    """Two-tower model producing 256-d embeddings for a dance clip and a token sequence.

    The dance tower is an MLP over a flattened 55 * 40 * 3 input; the text
    tower is an embedding table (1000 entries) followed by a batch-first LSTM
    whose output at the final time step is used as the text embedding.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as the attribute list below so
        # that parameter initialisation consumes the RNG identically.
        dance_layers = [
            nn.Linear(55 * 40 * 3, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
        ]
        self.dance_encoder = nn.Sequential(*dance_layers)
        self.text_embedding = nn.Embedding(1000, 256)
        self.text_encoder = nn.LSTM(256, 256, batch_first=True)

    def forward(self, dance, text):
        """Return ``(dance_embedding, text_embedding)`` for a batch.

        ``dance`` is fed straight to the MLP, so its last dimension must be
        55 * 40 * 3. ``text`` holds integer token ids laid out as
        (batch, sequence) because the LSTM is batch-first.
        """
        # Text tower: look up token vectors, run the LSTM, keep the last step.
        token_vectors = self.text_embedding(text)
        hidden_states, _ = self.text_encoder(token_vectors)
        final_step = hidden_states[:, -1, :]

        # Dance tower: a single pass through the MLP.
        return self.dance_encoder(dance), final_step

# Loss used to pull paired dance/text embeddings together
def contrastive_loss(dance_emb, text_emb):
    """Return the negated mean cosine similarity between paired embeddings.

    Row i of ``dance_emb`` is compared with row i of ``text_emb``. Only these
    matched pairs contribute; there is no term that pushes mismatched pairs
    apart.
    """
    pair_similarity = torch.cosine_similarity(dance_emb, text_emb)
    return pair_similarity.mean().neg()

# Stored shape is [num_samples, 55, 40, 3]; each sample is flattened into one
# vector because the dance encoder is a plain MLP.
sequences = np.load("/content/dance_sequences.npy")
sequences = sequences.reshape(len(sequences), -1)

# Labels (one per sequence, produced by the auto-labelling step)
with open("/content/auto_labels.json", "r") as f:
    labels = json.load(f)

# Convert Text Labels to Tokens.
# The vocabulary is sorted so that each label always maps to the same index.
# Iterating a bare set of strings follows hash order, which changes between
# Python processes, so the ids baked into the saved checkpoint would not match
# a vocabulary rebuilt after a kernel restart.
word_to_index = {word: idx for idx, word in enumerate(sorted(set(labels)))}
text_inputs = torch.tensor(
    [word_to_index[label] for label in labels], dtype=torch.long
).unsqueeze(1)  # one token per sample, laid out as (num_samples, 1)

dance_data = torch.tensor(sequences, dtype=torch.float32)

# Initialize Model and Optimizer (seeded so weight initialisation is repeatable)
torch.manual_seed(42)
model = DanceTextContrastiveModel()
if len(word_to_index) > model.text_embedding.num_embeddings:
    raise ValueError(
        f"Vocabulary has {len(word_to_index)} labels but the embedding table "
        f"only holds {model.text_embedding.num_embeddings}"
    )
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training: full-batch gradient steps over the whole dataset
num_epochs = 20
for epoch in range(num_epochs):
    optimizer.zero_grad()
    dance_emb, text_emb = model(dance_data, text_inputs)
    loss = contrastive_loss(dance_emb, text_emb)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

print("Training complete!")

torch.save(model.state_dict(), "/content/dance_text_model.pth")
print("Model saved!")


Epoch 1, Loss: -0.063943050801754
Epoch 2, Loss: -0.5137917995452881
Epoch 3, Loss: -0.6824116706848145
Epoch 4, Loss: -0.7670229077339172
Epoch 5, Loss: -0.81571364402771
Epoch 6, Loss: -0.8505261540412903
Epoch 7, Loss: -0.8784983158111572
Epoch 8, Loss: -0.9011996984481812
Epoch 9, Loss: -0.9189996719360352
Epoch 10, Loss: -0.9324604272842407
Epoch 11, Loss: -0.9424901008605957
Epoch 12, Loss: -0.950090765953064
Epoch 13, Loss: -0.9560830593109131
Epoch 14, Loss: -0.9610079526901245
Epoch 15, Loss: -0.96517014503479
Epoch 16, Loss: -0.968743085861206
Epoch 17, Loss: -0.9718512892723083
Epoch 18, Loss: -0.9745963215827942
Epoch 19, Loss: -0.9770501255989075
Epoch 20, Loss: -0.9792491793632507
Training complete!
Model saved!


In [None]:
# Offer the trained checkpoint as a browser download from the Colab runtime.
from google.colab import files

CHECKPOINT_PATH = "/content/dance_text_model.pth"
files.download(CHECKPOINT_PATH)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# state_dict = torch.load("dance_text_model.pth")
# print(state_dict.keys())


odict_keys(['dance_encoder.0.weight', 'dance_encoder.0.bias', 'dance_encoder.2.weight', 'dance_encoder.2.bias', 'text_embedding.weight', 'text_encoder.weight_ih_l0', 'text_encoder.weight_hh_l0', 'text_encoder.bias_ih_l0', 'text_encoder.bias_hh_l0'])


In [None]:
# The checkpoint was written with model.state_dict() from the
# DanceTextContrastiveModel defined in this notebook, so its keys
# (dance_encoder.*, text_embedding.weight, text_encoder.*) already match the
# model one-to-one. Renaming them to "text_encoder.0"/"text_encoder.1" targets
# a layout this notebook does not define; together with strict=False that
# silently skipped every text-tower weight.
state_dict = torch.load("dance_text_model.pth")

# Strict loading (the default) raises on any missing or unexpected key, so a
# layout mismatch cannot go unnoticed.
model.load_state_dict(state_dict)


_IncompatibleKeys(missing_keys=['text_encoder.2.weight', 'text_encoder.2.bias'], unexpected_keys=[])

In [None]:
import torch
import torch.nn as nn

class DanceTextContrastiveModel(nn.Module):
    """Two-tower model producing 256-d embeddings for a dance clip and a token sequence.

    Must mirror the architecture used at training time so the saved state
    dict loads: an MLP over a flattened 55 * 40 * 3 input, a 1000-entry
    embedding table and a single-layer LSTM.
    """

    def __init__(self):
        super().__init__()
        self.dance_encoder = nn.Sequential(
            nn.Linear(55 * 40 * 3, 512),  # 6600 = 55*40*3
            nn.ReLU(),
            nn.Linear(512, 256),
        )
        self.text_embedding = nn.Embedding(1000, 256)  # This must match
        # batch_first=True matches the training-time definition and the
        # [:, -1, :] indexing in forward(). Without it the LSTM reads the
        # batch axis as time and samples in a batch leak into each other.
        # The flag does not change parameter names or shapes, so existing
        # checkpoints still load.
        self.text_encoder = nn.LSTM(256, 256, batch_first=True)

    def forward(self, dance, text):
        """Return ``(dance_embedding, text_embedding)`` for a batch.

        ``dance`` must have 55 * 40 * 3 features in its last dimension;
        ``text`` holds integer token ids laid out as (batch, sequence).
        """
        dance_embedding = self.dance_encoder(dance)
        text_embedded = self.text_embedding(text)
        text_output, _ = self.text_encoder(text_embedded)
        # Output at the final time step of every sample.
        text_embedding = text_output[:, -1, :]
        return dance_embedding, text_embedding

# Rebuild the network and restore the trained parameters from disk
model = DanceTextContrastiveModel()
trained_weights = torch.load("dance_text_model.pth")
model.load_state_dict(trained_weights)
model.eval()  # switch to inference mode
print("Model loaded successfully!")


Model loaded successfully!


In [None]:
import torch
import numpy as np

model = DanceTextContrastiveModel()
model.load_state_dict(torch.load("dance_text_model.pth"))
model.eval()

# Keep the first 40 frames. Frames live on axis 1 (the training data was
# windowed along axis 1), so a leading [:500] slice would cut axis 0 instead
# and is a no-op for a 55-entry axis.
test_dance_sequence = np.load("/content/mariel_penelope.npy")[:, :40, :]

# Apply the same centring the training data received: subtract the track at
# index 0 of the first axis from every track, frame by frame. Without this the
# encoder sees inputs in a different coordinate frame than it was trained on.
test_dance_sequence = test_dance_sequence - test_dance_sequence[0, :, :].reshape(1, -1, 3)

# Convert dance sequence to a single flattened sample
dance_tensor = torch.tensor(test_dance_sequence.reshape(1, -1), dtype=torch.float32)

# NOTE: word_to_index comes from the training cell; it must be the same
# mapping that was used when the checkpoint was trained.
vocabulary = list(word_to_index.keys())

all_text_embeddings = []
with torch.no_grad():
    # The text argument is a placeholder; only the dance embedding is used.
    dance_embedding, _ = model(dance_tensor, torch.zeros(1, 1, dtype=torch.long))

    # Embed every label in the vocabulary (the dance argument is a dummy).
    for label in vocabulary:
        text_tensor = torch.tensor([[word_to_index[label]]], dtype=torch.long)
        _, text_emb = model(torch.zeros(1, 6600), text_tensor)
        all_text_embeddings.append(text_emb.numpy())

# Stack to (num_labels, embedding_dim). Concatenating along axis 0 keeps the
# array 2-D even when the vocabulary holds a single label, where squeeze()
# would collapse it and break the axis=1 norm below.
all_text_embeddings = np.concatenate(all_text_embeddings, axis=0)
dance_embedding_np = dance_embedding.numpy().squeeze()

# Find closest text label using cosine similarity
cosine_sim = np.dot(all_text_embeddings, dance_embedding_np) / (
    np.linalg.norm(all_text_embeddings, axis=1) * np.linalg.norm(dance_embedding_np)
)
closest_text = vocabulary[int(np.argmax(cosine_sim))]

print("Predicted Text Label:", closest_text)

Predicted Text Label: stand_dance


In [None]:
import torch
import numpy as np

# Fresh model instance carrying the trained weights, set to inference mode.
model = DanceTextContrastiveModel()
trained_weights = torch.load("dance_text_model.pth")
model.load_state_dict(trained_weights)
model.eval()

def generate_dance_from_text(input_text, decoder=None):
    """Map a vocabulary label to an array shaped (55, 40, 3).

    The label is embedded with the text tower of the global ``model`` and the
    embedding is passed through ``decoder`` to obtain 55 * 40 * 3 values.

    Args:
        input_text: A label present in the global ``word_to_index`` mapping.
        decoder: Optional callable mapping a (1, embedding_dim) tensor to a
            (1, 55 * 40 * 3) tensor, e.g. a trained ``torch.nn.Linear``. When
            omitted, a freshly initialised linear layer is used; it is
            untrained, so the result is a random projection of the text
            embedding that differs on every call.

    Returns:
        numpy.ndarray of shape (55, 40, 3).

    Raises:
        ValueError: If ``input_text`` is not in the vocabulary.
    """
    import warnings

    if input_text not in word_to_index:
        raise ValueError(f"Text '{input_text}' not in vocabulary!")

    # Single-token sequence laid out as (batch=1, sequence=1).
    text_tensor = torch.tensor([[word_to_index[input_text]]], dtype=torch.long)

    with torch.no_grad():
        # The dance input is a dummy; only the text embedding is used.
        _, text_embedding = model(torch.zeros(1, 6600), text_tensor)

        if decoder is None:
            warnings.warn(
                "No trained decoder supplied; using a randomly initialised "
                "linear layer, so the generated sequence is not meaningful.",
                RuntimeWarning,
                stacklevel=2,
            )
            # torch.nn is referenced through torch so the function does not
            # depend on an `nn` alias imported in another cell.
            decoder = torch.nn.Linear(text_embedding.shape[1], 55 * 40 * 3)

        decoded = decoder(text_embedding)

    generated_dance_sequence = decoded.detach().reshape(1, 55, 40, 3).cpu().numpy()[0]

    return generated_dance_sequence

# Generate a sequence for one label and persist it next to the notebook.
input_text = "stand_dance"
generated_dance = generate_dance_from_text(input_text)
np.save("generated_dance.npy", generated_dance)
print(f"Generated dance sequence for '{input_text}' saved as 'generated_dance.npy'")

Generated dance sequence for 'stand_dance' saved as 'generated_dance.npy'
