In [None]:
# take images_and_labels

### CONFIG ###
# Directory that is globbed for "<split>-*.parquet" shard files.
DATA_DIR = "/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/imagenet-1k/data"
# Shard filename prefix used by count_labels (get_class_names always reads a "train-*" shard).
SPLIT = "train"
# Half-open label range [START_LABEL, END_LABEL) that count_labels tallies.
START_LABEL = 0
END_LABEL = 100

### IMPORTS ###
from pathlib import Path
import pyarrow.parquet as pq
from collections import Counter
import json

### FUNCTIONS ###
def count_labels(data_dir, split, start_label, end_label):
    """Tally samples per label across every shard of a split.

    Args:
        data_dir: Directory containing "<split>-*.parquet" files.
        split: Filename prefix selecting which shards to scan.
        start_label: Lowest label to count (inclusive).
        end_label: Upper bound of the label range (exclusive).

    Returns:
        A Counter mapping each label seen in [start_label, end_label) to the
        number of rows carrying it. Empty when no shard matches.
    """
    tally = Counter()

    # Shards are visited in sorted filename order; only the 'label' column is read.
    for shard in sorted(Path(data_dir).glob(f"{split}-*.parquet")):
        label_column = pq.read_table(shard, columns=['label'])['label']
        tally.update(
            value
            for value in label_column.to_pylist()
            if start_label <= value < end_label
        )

    return tally

def get_class_names(data_dir):
    """Return the list of class names stored in a train shard's schema metadata.

    The names are read from the JSON document stored under the b'huggingface'
    schema-metadata key, at info -> features -> label -> names.

    Args:
        data_dir: Directory containing "train-*.parquet" files.

    Returns:
        The 'names' value from that metadata (indexed by label elsewhere in this cell).

    Raises:
        FileNotFoundError: If data_dir contains no "train-*.parquet" file.
    """
    parquet_file = next(Path(data_dir).glob("train-*.parquet"), None)
    if parquet_file is None:
        raise FileNotFoundError(f"No train-*.parquet files found in {data_dir}")

    # Only the schema is needed; read_schema avoids loading the shard's rows
    # (including every encoded image) just to reach the metadata.
    schema = pq.read_schema(parquet_file)
    hf_meta = json.loads(schema.metadata[b'huggingface'])
    return hf_meta['info']['features']['label']['names']

### MAIN EXECUTION ###
# Load the class-name list and count samples per label in the configured range.
class_names = get_class_names(DATA_DIR)
label_counts = count_labels(DATA_DIR, SPLIT, START_LABEL, END_LABEL)

# Summary header: range, number of distinct labels found, total rows counted.
header_lines = (
    f"Label distribution for labels {START_LABEL}-{END_LABEL-1}:",
    f"Total labels in range: {len(label_counts)}",
    f"Total samples: {sum(label_counts.values())}",
    "\nPer-label breakdown:",
)
for line in header_lines:
    print(line)

# One row per label in ascending order; class names are cut/padded to 40 characters.
for label, n_samples in sorted(label_counts.items()):
    print(f"Label {label:3d} ({class_names[label][:40]:40s}): {n_samples:5d} samples")

Label distribution for labels 0-99:
Total labels in range: 100
Total samples: 129395

Per-label breakdown:
Label   0 (tench, Tinca tinca                      ):  1300 samples
Label   1 (goldfish, Carassius auratus             ):  1300 samples
Label   2 (great white shark, white shark, man-eate):  1300 samples
Label   3 (tiger shark, Galeocerdo cuvieri         ):  1300 samples
Label   4 (hammerhead, hammerhead shark            ):  1300 samples
Label   5 (electric ray, crampfish, numbfish, torpe):  1300 samples
Label   6 (stingray                                ):  1300 samples
Label   7 (cock                                    ):  1300 samples
Label   8 (hen                                     ):  1300 samples
Label   9 (ostrich, Struthio camelus               ):  1300 samples
Label  10 (brambling, Fringilla montifringilla     ):  1300 samples
Label  11 (goldfinch, Carduelis carduelis          ):  1300 samples
Label  12 (house finch, linnet, Carpodacus mexicanu):  1300 samples
Label  13

In [None]:
### CONFIG ###
# Directory that is globbed for "<split>-*.parquet" shard files.
DATA_DIR = "/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/imagenet-1k/data"
# Root folder; one "<label:03d>_<class name>" sub-folder per target label is created inside it.
OUTPUT_DIR = "/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/images_10_class"
# Shard filename prefix used when globbing DATA_DIR.
SPLIT = "train"
TARGET_LABELS = list(range(10))  # Labels 0-9
# Side length, in pixels, of the square images that are written out.
IMAGE_SIZE = 512
MAX_PER_CLASS = None  # None = all, or set a number like 100

### IMPORTS ###
from pathlib import Path
import pyarrow.parquet as pq
from PIL import Image
import io
import json
from tqdm import tqdm

### FUNCTIONS ###

def get_class_names(data_dir):
    """Get class names from parquet metadata.

    Reads the JSON document stored under the b'huggingface' schema-metadata
    key of one "train-*.parquet" file and returns info -> features -> label -> names.

    Args:
        data_dir: Directory containing "train-*.parquet" files.

    Returns:
        The 'names' value from that metadata (indexed by label by the callers in this cell).

    Raises:
        FileNotFoundError: If data_dir contains no "train-*.parquet" file.
    """
    parquet_file = next(Path(data_dir).glob("train-*.parquet"), None)
    if parquet_file is None:
        raise FileNotFoundError(f"No train-*.parquet files found in {data_dir}")

    # read_schema fetches just the schema, so the shard's image data is not
    # loaded into memory merely to look up the class names.
    schema = pq.read_schema(parquet_file)
    hf_meta = json.loads(schema.metadata[b'huggingface'])
    return hf_meta['info']['features']['label']['names']


def sanitize_filename(name):
    """Turn a class name into a short filesystem-friendly slug.

    Only the text before the first comma is used. After trimming surrounding
    whitespace, spaces and hyphens become underscores, every other character
    that is neither alphanumeric nor an underscore is dropped, and the result
    is cut to at most 30 characters.
    """
    primary = name.partition(',')[0].strip()

    kept = []
    for ch in primary:
        if ch in (' ', '-'):
            kept.append('_')
        elif ch.isalnum() or ch == '_':
            kept.append(ch)

    return ''.join(kept)[:30]


def _image_bytes(img_data):
    """Return the encoded bytes held by one 'image' cell, or None for an unrecognised layout."""
    if isinstance(img_data, dict) and 'bytes' in img_data:
        return img_data['bytes']
    if isinstance(img_data, bytes):
        return img_data
    return None


def _center_crop_resize(img, image_size):
    """Center-crop img to a square on its shorter side, then resize to image_size x image_size."""
    w, h = img.size
    min_dim = min(w, h)
    left = (w - min_dim) // 2
    top = (h - min_dim) // 2
    img = img.crop((left, top, left + min_dim, top + min_dim))
    return img.resize((image_size, image_size), Image.LANCZOS)


def extract_and_save_images(
    data_dir,
    output_dir,
    split,
    target_labels,
    image_size=512,
    max_per_class=None
):
    """
    Extract images from parquet files, resize to image_size×image_size,
    and save organized by class.

    Args:
        data_dir: Directory containing "<split>-*.parquet" shards with
            'image' and 'label' columns.
        output_dir: Root folder; one "<label:03d>_<sanitized class name>"
            sub-folder is created per target label.
        split: Filename prefix selecting which shards to read.
        target_labels: Iterable of labels to extract.
        image_size: Side length of the saved square images.
        max_per_class: Maximum images saved per label. Any falsy value
            (None or 0) means no limit.

    Returns:
        Dict mapping each target label to the number of images saved for it.

    Notes:
        Files are named by a per-label counter starting at 00000, so running
        again into the same output_dir overwrites earlier files. Images that
        fail to decode or save are reported on stdout and skipped.
    """
    data_dir = Path(data_dir)
    output_dir = Path(output_dir)

    # Get class names
    class_names = get_class_names(data_dir)

    # Create output directories
    label_to_dirname = {}
    for label in target_labels:
        class_name = sanitize_filename(class_names[label])
        dir_name = f"{label:03d}_{class_name}"
        label_to_dirname[label] = dir_name
        (output_dir / dir_name).mkdir(parents=True, exist_ok=True)

    print(f"Output directories created in {output_dir}")
    for label, dirname in label_to_dirname.items():
        print(f"  Label {label}: {dirname}/")

    # Track counts per label
    counts = {label: 0 for label in target_labels}

    # Set membership keeps the per-row check O(1) however many labels are requested.
    wanted = set(target_labels)

    # Find all parquet files
    parquet_files = sorted(data_dir.glob(f"{split}-*.parquet"))
    print(f"\nFound {len(parquet_files)} parquet files")

    # Nothing requested: skip scanning the shards altogether.
    if not wanted:
        return counts

    # Ask the parquet reader to drop rows with other labels, so their image
    # bytes are never turned into Python objects.
    row_filter = [('label', 'in', list(wanted))]

    # Process each parquet file
    for pq_file in tqdm(parquet_files, desc="Processing parquet files"):
        table = pq.read_table(pq_file, columns=['image', 'label'], filters=row_filter)

        images = table['image'].to_pylist()
        labels = table['label'].to_pylist()

        for img_data, label in zip(images, labels):
            # Safety net in case the reader filtered only coarsely.
            if label not in wanted:
                continue

            # Skip if we have enough for this class
            if max_per_class and counts[label] >= max_per_class:
                continue

            img_bytes = _image_bytes(img_data)
            if img_bytes is None:
                print(f"Unknown image format: {type(img_data)}")
                continue

            try:
                img = Image.open(io.BytesIO(img_bytes))

                # Convert to RGB if necessary (JPEG output cannot store alpha/palette modes)
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                # Center crop to square, then resize (avoids aspect-ratio distortion)
                img = _center_crop_resize(img, image_size)

                # Save under the per-label running index
                save_path = output_dir / label_to_dirname[label] / f"{counts[label]:05d}.jpg"
                img.save(save_path, 'JPEG', quality=95)

                # Only count images that were actually written.
                counts[label] += 1

            except Exception as e:
                # Best effort: report the failure and carry on with the next image.
                print(f"Error processing image: {e}")
                continue

        # Check if we have enough for all classes
        if max_per_class and all(c >= max_per_class for c in counts.values()):
            print("\nReached max_per_class for all labels, stopping early.")
            break

    return counts


### MAIN EXECUTION ###
if __name__ == "__main__":
    print("=" * 60)
    print("IMAGENET IMAGE EXTRACTION")
    print("=" * 60)

    # Run the extraction with the values from the CONFIG section.
    counts = extract_and_save_images(
        data_dir=DATA_DIR,
        output_dir=OUTPUT_DIR,
        split=SPLIT,
        target_labels=TARGET_LABELS,
        image_size=IMAGE_SIZE,
        max_per_class=MAX_PER_CLASS
    )

    print("\n" + "=" * 60)
    print("EXTRACTION COMPLETE")
    print("=" * 60)

    # The class-name list does not change between labels, so load it once
    # rather than re-reading the parquet metadata on every loop iteration.
    class_names = get_class_names(DATA_DIR)

    # Per-label summary in ascending label order, plus a running total.
    total = 0
    for label in sorted(counts.keys()):
        print(f"Label {label:3d} ({class_names[label][:30]:30s}): {counts[label]:5d} images")
        total += counts[label]

    print(f"\nTotal images extracted: {total}")
    print(f"Output directory: {OUTPUT_DIR}")
# ```

# ---

# ## Resulting directory structure
# ```
# /home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/images_10_class/
# ├── 000_tench/
# │   ├── 00000.jpg
# │   ├── 00001.jpg
# │   └── ... (1300 images)
# ├── 001_goldfish/
# │   ├── 00000.jpg
# │   └── ...
# ├── 002_great_white_shark/
# │   └── ...
# ├── 003_tiger_shark/
# │   └── ...
# ├── 004_hammerhead/
# │   └── ...
# ├── 005_electric_ray/
# │   └── ...
# ├── 006_stingray/
# │   └── ...
# ├── 007_cock/
# │   └── ...
# ├── 008_hen/
# │   └── ...
# └── 009_ostrich/
#     └── ...

## PIPER

In [1]:
# 1. download
from huggingface_hub import snapshot_download
from huggingface_hub import login
import dotenv
import os
from pathlib import Path
# Load variables from a local .env file (if present) into the process environment.
dotenv.load_dotenv()

# Log in only when a token is actually configured. login(token=None) falls
# back to an interactive prompt, which stalls an unattended "Run All".
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)


# Hub repository to download from, and the absolute local folder to download into.
repo_id = "rhasspy/piper-voices"
local_dir = Path("piper_tts").resolve()

def download_model_repo(repo_id, local_dir, allow_patterns=None):
    """Download part of a Hugging Face Hub repository into a local folder.

    Args:
        repo_id: Hub repository id passed to snapshot_download.
        local_dir: Destination folder (converted to str before the call).
        allow_patterns: Patterns selecting which repository files to fetch.
            Defaults to ["en/en_US/*"], the selection this function always
            used before the parameter existed.

    Returns:
        None. Progress is reported on stdout.
    """
    if allow_patterns is None:
        allow_patterns = ["en/en_US/*"]

    print(f"Downloading repository {repo_id}...")
    snapshot_download(
        repo_id=repo_id,
        allow_patterns=allow_patterns,
        local_dir=str(local_dir)
    )
    print(f"Repository downloaded to {local_dir}")

# Download the selected files of repo_id into local_dir (this contacts the Hugging Face Hub).
download_model_repo(repo_id, local_dir)

# # 2. generate speech
# import wave
# from piper import PiperVoice

# # Simple CPU TTS inference

# voice_path = "piper_tts/en/en_US/amy/medium/en_US-amy-medium.onnx"
# text = "Сәлеметсіз бе, бұл мысал сөйлем."
# output_wav = "audio.wav"

# voice = PiperVoice.load(voice_path, use_cuda=False)

# with wave.open(output_wav, "wb") as wav_file:
#     voice.synthesize_wav(text, wav_file)

# print("Done:", output_wav)


Downloading repository rhasspy/piper-voices...


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching ... files: 0it [00:00, ?it/s]

Repository downloaded to /scratch/vladimir_albrekht/projects/i2m/to_revise/piper_tts


In [1]:
# generate_audio.py

import wave
from pathlib import Path
from piper import PiperVoice
import torch
import torchaudio
import soundfile as sf

### CONFIG ###
# Path of the .onnx voice file handed to PiperVoice.load.
VOICE_PATH = "/scratch/vladimir_albrekht/projects/i2m/large_files/ILSVRC_images_10_class/piper_tts/en/en_US/amy/medium/en_US-amy-medium.onnx"
# Root folder; main() creates one sub-folder per class here and writes description.wav into it.
OUTPUT_DIR = Path("/scratch/vladimir_albrekht/projects/i2m/large_files/ILSVRC_images_10_class/audios_10_class")
# Forwarded to PiperVoice.load as use_cuda.
USE_CUDA = False

### CLASS DESCRIPTIONS (~2 sec each) ###
# Text synthesized for each label (key = label index).
# The trailing comments presumably record clip durations measured in an earlier
# run: "was X → ~Y sec" marks a sentence shortened from X seconds toward a
# ~Y second target, "✓ OK" marks one kept as it was. TODO confirm against a fresh run.
CLASS_DESCRIPTIONS = {
    0: "A tench, a freshwater fish",                        # was 3.52 → ~1.5 sec
    1: "A goldfish, bright orange fish",                    # was 3.10 → ~1.5 sec
    2: "A great white shark, ocean predator",               # 2.93 ✓ OK
    3: "A tiger shark with dark stripes",                   # was 3.11 → ~2 sec
    4: "A hammerhead shark with unique head",               # 2.87 ✓ OK
    5: "An electric ray with electric shocks",              # was 3.25 → ~2 sec
    6: "A stingray gliding through water",                  # 2.41 ✓ OK
    7: "A rooster with colorful feathers",                  # 2.67 ✓ OK
    8: "A hen pecking at seeds",                            # 2.55 ✓ OK
    9: "An ostrich, the largest bird",                      # was 3.49 → ~2 sec
}

# Output sub-folder name for each label; the list position is the label index.
CLASS_DIRNAMES = dict(enumerate([
    "000_tench",
    "001_goldfish",
    "002_great_white_shark",
    "003_tiger_shark",
    "004_hammerhead",
    "005_electric_ray",
    "006_stingray",
    "007_cock",
    "008_hen",
    "009_ostrich",
]))

### MAIN ###
def main():
    """Synthesize one description.wav per class with Piper and report each clip's length.

    For every entry of CLASS_DESCRIPTIONS the text is rendered into
    OUTPUT_DIR/<CLASS_DIRNAMES[label]>/description.wav, then the file is read
    back to print its duration and sample rate.
    """
    banner = "=" * 60

    print(banner)
    print("AUDIO GENERATION WITH PIPER TTS")
    print(banner)

    # Load the voice once and reuse it for every class.
    print(f"Loading voice from {VOICE_PATH}...")
    voice = PiperVoice.load(VOICE_PATH, use_cuda=USE_CUDA)
    print("✓ Voice loaded")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for label in CLASS_DESCRIPTIONS:
        description = CLASS_DESCRIPTIONS[label]
        dirname = CLASS_DIRNAMES[label]

        target_dir = OUTPUT_DIR / dirname
        target_dir.mkdir(parents=True, exist_ok=True)
        wav_path = target_dir / "description.wav"

        print(f"\nLabel {label}: {dirname}")
        print(f'  Text: "{description}"')

        # Piper writes into an open wave writer.
        with wave.open(str(wav_path), "wb") as wav_file:
            voice.synthesize_wav(description, wav_file)

        # Read the file back to measure what was actually written.
        samples, rate = sf.read(wav_path)
        seconds = len(samples) / rate
        print(f"  ✓ Saved: {wav_path.name} ({seconds:.2f} sec, {rate} Hz)")

    print("\n" + banner)
    print("✓ ALL AUDIO FILES GENERATED")
    print(f"  Output: {OUTPUT_DIR}")
    print(banner)


if __name__ == "__main__":
    main()

AUDIO GENERATION WITH PIPER TTS
Loading voice from /scratch/vladimir_albrekht/projects/i2m/large_files/ILSVRC_images_10_class/piper_tts/en/en_US/amy/medium/en_US-amy-medium.onnx...
✓ Voice loaded

Label 0: 000_tench
  Text: "A tench, a freshwater fish"
  ✓ Saved: description.wav (2.21 sec, 22050 Hz)

Label 1: 001_goldfish
  Text: "A goldfish, bright orange fish"
  ✓ Saved: description.wav (2.33 sec, 22050 Hz)

Label 2: 002_great_white_shark
  Text: "A great white shark, ocean predator"
  ✓ Saved: description.wav (2.48 sec, 22050 Hz)

Label 3: 003_tiger_shark
  Text: "A tiger shark with dark stripes"
  ✓ Saved: description.wav (2.35 sec, 22050 Hz)

Label 4: 004_hammerhead
  Text: "A hammerhead shark with unique head"
  ✓ Saved: description.wav (2.38 sec, 22050 Hz)

Label 5: 005_electric_ray
  Text: "An electric ray with electric shocks"
  ✓ Saved: description.wav (2.67 sec, 22050 Hz)

Label 6: 006_stingray
  Text: "A stingray gliding through water"
  ✓ Saved: description.wav (2.17 sec, 

In [None]:
# # verify_audio.py

# import torchaudio
# from pathlib import Path

# AUDIO_DIR = Path("/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/audio_10_class")

# print("Audio Duration Check:")
# print("=" * 50)

# total_duration = 0
# for class_dir in sorted(AUDIO_DIR.iterdir()):
#     if not class_dir.is_dir():
#         continue
    
#     audio_file = class_dir / "description.wav"
#     if audio_file.exists():
#         waveform, sr = torchaudio.load(str(audio_file))
#         duration = waveform.shape[1] / sr
#         total_duration += duration
        
#         status = "✓" if 1.5 <= duration <= 3.0 else "⚠️"
#         print(f"{status} {class_dir.name}: {duration:.2f} sec @ {sr} Hz")

# print("=" * 50)
# print(f"Average duration: {total_duration / 10:.2f} sec")

### Pairs_json

In [2]:
# create_pairs_json.py

import json
from pathlib import Path

### CONFIG ###
# Folder whose sub-directories (one per class) are scanned for *.jpg images.
IMAGES_DIR = Path("/scratch/vladimir_albrekht/projects/i2m/large_files/ILSVRC_images_10_class/images_10_class")
# Folder expected to contain <class directory name>/description.wav for each class.
AUDIO_DIR = Path("/scratch/vladimir_albrekht/projects/i2m/large_files/ILSVRC_images_10_class/audios_10_class")
# JSON file that receives the list of image/audio pairs.
OUTPUT_PATH = Path("/scratch/vladimir_albrekht/projects/i2m/src/data/pairs.json")

### MAIN ###
def create_pairs_json():
    """Pair every class image with that class's audio clip and write the list as JSON.

    Each sub-directory of IMAGES_DIR is treated as a class. A class is skipped
    (with a warning) when AUDIO_DIR/<class>/description.wav does not exist.
    Otherwise every *.jpg in it becomes one {"image": ..., "audio": ...} entry.
    The entries are written to OUTPUT_PATH and a few samples are printed.

    Returns:
        None.
    """
    print("=" * 60)
    print("CREATING pairs.json")
    print("=" * 60)

    pairs = []

    # Get all class directories
    class_dirs = sorted([d for d in IMAGES_DIR.iterdir() if d.is_dir()])

    print(f"Found {len(class_dirs)} class directories\n")

    for class_dir in class_dirs:
        class_name = class_dir.name

        # Audio folders are expected to share the image folder's name.
        audio_path = AUDIO_DIR / class_name / "description.wav"

        if not audio_path.exists():
            print(f"⚠️ No audio for {class_name}, skipping...")
            continue

        # Get all images in this class
        image_files = sorted(class_dir.glob("*.jpg"))

        print(f"{class_name}: {len(image_files)} images")

        # Create pairs (each image paired with the class audio)
        for img_path in image_files:
            pairs.append({
                "image": str(img_path),
                "audio": str(audio_path)
            })

    # Save pairs.json
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

    with open(OUTPUT_PATH, 'w') as f:
        json.dump(pairs, f, indent=2)

    print("\n" + "=" * 60)
    print(f"✓ Created {OUTPUT_PATH}")
    print(f"  Total pairs: {len(pairs)}")
    print("=" * 60)

    # Show sample
    print("\nSample entries:")
    if not pairs:
        # Indexing an empty list would raise IndexError.
        print("  (none - no image/audio pairs were found)")
        return

    # First, second, middle and last entries. Indices are clamped to the list
    # and de-duplicated (order kept) so very short lists neither raise nor repeat.
    last = len(pairs) - 1
    for i in dict.fromkeys([0, min(1, last), len(pairs) // 2, last]):
        p = pairs[i]
        print(f"  {Path(p['image']).name} → {Path(p['audio']).parent.name}/{Path(p['audio']).name}")

if __name__ == "__main__":
    create_pairs_json()

CREATING pairs.json
Found 10 class directories

000_tench: 1300 images
001_goldfish: 1300 images
002_great_white_shark: 1300 images
003_tiger_shark: 1300 images
004_hammerhead: 1300 images
005_electric_ray: 1300 images
006_stingray: 1300 images
007_cock: 1300 images
008_hen: 1300 images
009_ostrich: 1300 images

✓ Created /scratch/vladimir_albrekht/projects/i2m/src/data/pairs.json
  Total pairs: 13000

Sample entries:
  00000.jpg → 000_tench/description.wav
  00001.jpg → 000_tench/description.wav
  00000.jpg → 005_electric_ray/description.wav
  01299.jpg → 009_ostrich/description.wav


In [4]:
import soundfile as sf, torch, torchaudio

# Resample one generated clip to 24 kHz and write the result next to it as temp.wav.
src_wav = "/scratch/vladimir_albrekht/projects/i2m/large_files/ILSVRC_images_10_class/audios_10_class/000_tench/description.wav"
dst_wav = "/scratch/vladimir_albrekht/projects/i2m/large_files/ILSVRC_images_10_class/audios_10_class/000_tench/temp.wav"

x, sr = sf.read(src_wav, dtype="float32")

# Bring the signal to a 2-D tensor with a leading axis of size 1:
# 1-D input gets that axis added, 2-D input is averaged over its second axis.
if x.ndim == 1:
    x = torch.from_numpy(x).unsqueeze(0)
else:
    x = torch.from_numpy(x).mean(dim=1, keepdim=True).T

resampler = torchaudio.transforms.Resample(sr, 24000)
y = resampler(x).squeeze(0).numpy()
sf.write(dst_wav, y, 24000)
