In [36]:
import os
from pathlib import Path
from datasets import load_dataset
import pandas as pd
os.environ["HF_HOME"] = "/workspace/.cache/huggingface"

In [19]:
data_dir = Path("data/vox2/")
audio = data_dir / "aac" / "dev" / "aac"
video = data_dir / "mp4" / "dev" / "mp4"
txt = data_dir / "txt"

ids = {"id00012", "id00015", "id00016", "id00018", "id00019", "id00020"}

# Create a list to store all audio-video pairs
audio_video_pairs = []

# Iterate through each ID
for id_name in ids:
    # Get all scene directories for this ID
    audio_id_dir = audio / id_name
    video_id_dir = video / id_name
    txt_id_dir = txt / id_name
    
    # Ensure both directories exist
    if not audio_id_dir.exists() or not video_id_dir.exists():
        continue
    
    # Get all scene directories
    audio_scenes = [d for d in audio_id_dir.iterdir() if d.is_dir()]
    
    # Iterate through each scene
    for audio_scene in audio_scenes:
        scene_name = audio_scene.name
        video_scene = video_id_dir / scene_name
        
        # Ensure the corresponding video scene directory exists
        if not video_scene.exists():
            continue
        
        # Get all audio files in this scene
        audio_files = list(audio_scene.glob("*.m4a"))
        
        # For each audio file, find the corresponding video file
        for audio_file in audio_files:
            # Get the base name without extension
            base_name = audio_file.stem
            # Look for the corresponding video file
            video_file = video_scene / f"{base_name}.mp4"
            
            # If the video file exists, add the pair to our list
            if video_file.exists():
                audio_video_pairs.append((audio_file, video_file))

print(f"Found {len(audio_video_pairs)} audio-video pairs")

Found 1657 audio-video pairs


In [24]:
audio_video_pairs[0][1].parent.parent.name

'id00015'

In [25]:
# Create a directory to store all audio and video files
import shutil

output_dir = Path("data/combined_files")
os.makedirs(output_dir, exist_ok=True)

# Copy all files to the output directory
for audio_file, video_file in audio_video_pairs:
    # Extract filenames
    speaker_id = audio_file.parent.parent.name
    audio_filename = f"{speaker_id}_{audio_file.name}"
    video_filename = f"{speaker_id}_{video_file.name}"
    
    # Copy files to the output directory
    shutil.copy2(audio_file, output_dir / audio_filename)
    shutil.copy2(video_file, output_dir / video_filename)

print(f"Copied {len(audio_video_pairs)} audio files and {len(audio_video_pairs)} video files to {output_dir}")

# Display the ori

Copied 1657 audio files and 1657 video files to data/combined_files


In [4]:
# Load the dataset and remove the audio_path field
ds = load_dataset("acul3/voxceleb2")

In [5]:
ds['train'][0]

{'audio_path': {'path': '00100.wav',
  'array': array([0.02627907, 0.03061463, 0.02636903, ..., 0.        , 0.        ,
         0.        ], shape=(155136,)),
  'sampling_rate': 24000},
 'transcription': ' What simply sounds good and you wind up in some way trying to, you know, eventually accessing the truth about your condition.',
 'language': 'en',
 'speaker_id': 'id03701',
 'gender': 'male'}

In [15]:
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from tqdm.auto import tqdm

def process_range(range_tuple):
    start_idx, end_idx = range_tuple
    results = []
    for i in range(start_idx, end_idx):
        entry = ds['train'][i]
        speaker_id = entry['speaker_id']
        if i % 1000 == 0:
            print(f"Processing entry {i} of {len(ds['train'])}")
        if speaker_id in ids:
            results.append((speaker_id, entry['audio_path']['path'], entry['transcription']))
    return results

# Define the ranges directly without creating batches first
total_items = len(ds['train'])
num_workers = 4
chunk_size = total_items // num_workers
ranges = [(i, min(i + chunk_size, total_items)) for i in range(0, total_items, chunk_size)]

print(f"Processing {total_items} items in {len(ranges)} chunks")

speaker_videos = []
# Process ranges in parallel
with ProcessPoolExecutor(max_workers=num_workers) as executor:
    results = list(tqdm(
        executor.map(process_range, ranges),
        total=len(ranges),
        desc="Processing data chunks"
    ))
    
    # Flatten results
    for chunk_result in results:
        speaker_videos.extend(chunk_result)

print(f"Found {len(speaker_videos)} matching speaker videos")


Processing 462850 items in 5 chunks


Processing data chunks:   0%|          | 0/5 [00:00<?, ?it/s]

Processing entry 0 of 462850
Processing entry 116000 of 462850
Processing entry 232000 of 462850
Processing entry 1000 of 462850
Processing entry 348000 of 462850
Processing entry 117000 of 462850
Processing entry 233000 of 462850
Processing entry 118000 of 462850
Processing entry 349000 of 462850
Processing entry 2000 of 462850
Processing entry 234000 of 462850
Processing entry 350000 of 462850
Processing entry 3000 of 462850
Processing entry 119000 of 462850
Processing entry 235000 of 462850
Processing entry 351000 of 462850
Processing entry 4000 of 462850
Processing entry 120000 of 462850
Processing entry 236000 of 462850
Processing entry 352000 of 462850
Processing entry 5000 of 462850
Processing entry 121000 of 462850
Processing entry 237000 of 462850
Processing entry 6000 of 462850
Processing entry 238000 of 462850
Processing entry 353000 of 462850
Processing entry 122000 of 462850
Processing entry 354000 of 462850
Processing entry 7000 of 462850
Processing entry 239000 of 462850

Processing data chunks: 100%|██████████| 5/5 [05:54<00:00, 70.85s/it] 


Found 973 matching speaker videos


In [26]:
speaker_videos = sorted(speaker_videos, key=lambda x: x[1])

In [52]:
video_path_and_transcripts = []

for speaker_id, audio_path, transcription in speaker_videos:
    video_path = audio_path.replace('.wav', '.mp4')
    file_name = speaker_id + "_" + video_path
    video_path_and_transcripts.append({"file_name": file_name, "text": transcription.strip()})

In [53]:
videos_df = pd.DataFrame.from_dict(video_path_and_transcripts)

In [54]:
videos_df.to_csv("data/metadata.csv", index=False)

In [5]:
import torch
from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData


model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
model_manager.load_models([
    "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
    "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
    "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
])
model_manager.load_lora("models/lightning_logs/version_2/checkpoints/epoch=4-step=625.ckpt", lora_alpha=1.0)
pipe = WanVideoPipeline.from_model_manager(model_manager, device="cuda")
pipe.enable_vram_management(num_persistent_param_in_dit=None)

video = pipe(
    prompt="i was telling my friend about how much i wanted to see him",
    negative_prompt="low quality, unclear facial expressions, blurry",
    num_inference_steps=50,
    seed=0, tiled=True
)
save_video(video, "video.mp4", fps=30, quality=5)

Loading models from: models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
    model_name: wan_video_dit model_class: WanModel
        This model is initialized with extra kwargs: {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
    The following models are loaded: ['wan_video_dit'].
Loading models from: models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
    model_name: wan_video_text_encoder model_class: WanTextEncoder
    The following models are loaded: ['wan_video_text_encoder'].
Loading models from: models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
    model_name: wan_video_vae model_class: WanVideoVAE
    The following models are loaded: ['wan_video_vae'].
Loading LoRA models from file: models/lightning_logs/version_2/checkpoints/epoch=4-step=625.ckpt
    Adding LoRA to wan_video_dit (models/Wan-AI/Wan2.1-T2V-1.3B/diff

100%|██████████| 50/50 [01:23<00:00,  1.68s/it]
VAE decoding: 100%|██████████| 9/9 [00:04<00:00,  2.13it/s]
Saving video:   0%|          | 0/81 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Saving video: 100%|██████████| 81/81 [00:01<00:00, 41.72it/s]
