# Imports

In [1]:
import time
import os
import pickle
import torch
import speechbrain as sb
from speechbrain.dataio.dataio import read_audio
from loquacious_set_prepare import load_datasets
# from tqdm import tqdm
from tqdm.notebook import tqdm
from hyperpyyaml import load_hyperpyyaml
from speechbrain.dataio.sampler import DynamicBatchSampler

import torchaudio
import torchaudio.transforms as T

import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from transformers.models.seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor



# Load dataset

In [19]:
# HyperPyYAML recipe read by the next cell. Later cells pull "wav2vec2", "model",
# "dynamic_batch_sampler_train" and "num_workers" out of the loaded result.
# NOTE(review): this and the paths below are absolute, machine-specific locations.
hparams_file = "/local_disk/apollon/rwhetten/sss_data_selection/sense/data_select.yaml"
# Values handed to `load_hyperpyyaml` as overrides for the recipe.
overrides = {
    # Dataset selection, forwarded to `load_datasets`.
    "tls_subset": "small",
    "hf_hub": "speechbrain/LoquaciousSet",
    "hf_caching_dir": "/local_disk/apollon/rwhetten/hf_root/datasets",
    # NOTE(review): the next three keys are not read anywhere in the visible cells;
    # presumably they are consumed inside the recipe itself - confirm.
    "save_int": 5,
    "ckpt_path": "ckpt.pkl",
    "feature_function_name": "text_emb",
    # Directory holding the SENSE checkpoint files (wav2vec2.ckpt, model.ckpt, ...).
    "sense_location": "/data/coros3/smdhaffar/SENSE/CKPT+checkpoint_epoch10/",
    "output_folder": "/local_disk/apollon/rwhetten/sss_data_selection/sense/pt_store",
    # Presumably feeds `max_batch_length` of the dynamic batch sampler - confirm in the recipe.
    "max_batch_length_train": 100,
}

# Files found in the SENSE checkpoint directory (`sense_location`):
#   brain.ckpt  counter.ckpt           lr_annealing_adam.ckpt     model.ckpt
#   CKPT.yaml   dataloader-TRAIN.ckpt  lr_annealing_wav2vec.ckpt  wav2vec2.ckpt

In [20]:
# Parse the recipe with the notebook-level overrides applied.
with open(hparams_file, encoding="utf-8") as fin:
    hparams = load_hyperpyyaml(fin, overrides)

In [21]:
# tls_subset="small"
# hf_hub="speechbrain/LoquaciousSet"
# hf_caching_dir="/local_disk/apollon/rwhetten/hf_root/datasets"

# Fetch the dataset splits selected by the hyperparameters.
hf_data_dict = load_datasets(
    *(hparams[key] for key in ("tls_subset", "hf_hub", "hf_caching_dir"))
)

# The "ID" column has to be renamed: SpeechBrain already uses the name "id"
# for its own sampling index, and the column is not really an id anyway but
# an audio path / identifier.
train_data = hf_data_dict["train"].rename_column("ID", "audio_id")

# Pull the durations out once, up front, so the dynamic batch sampler can be
# built quickly from a plain list.
train_len_list = list(train_data.select_columns("duration")["duration"])

# Wrap the Arrow dataset in a SpeechBrain dataset object.
train_data = sb.dataio.dataset.DynamicItemDataset.from_arrow_dataset(train_data)

print(train_data)


Resolving data files:   0%|          | 0/6323 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/66 [00:00<?, ?it/s]

<speechbrain.dataio.dataset.DynamicItemDataset object at 0x7f2186d23d10>


In [22]:
# create and add pipeline to datasets
datasets = [train_data]

# Sampling rate shared by the feature extractor's configuration and its call.
AUDIO_SAMPLING_RATE = 16000

# Built once for the whole notebook rather than once per example: its settings
# are constant, so rebuilding it inside the pipeline only adds per-item
# overhead to data loading.
_feature_extractor = SeamlessM4TFeatureExtractor(
    feature_size=80,
    sampling_rate=AUDIO_SAMPLING_RATE,
    num_mel_bins=80,
    padding_value=0.0,
    stride=2,
)


@sb.utils.data_pipeline.takes("wav")
@sb.utils.data_pipeline.provides("sig")
def audio_pipeline(wav):
    """Turn one dataset audio entry into a feature tensor.

    Args:
        wav: The example's "wav" field; its "bytes" entry is handed to
            ``read_audio`` for decoding.

    Returns:
        torch.Tensor: float32 tensor holding the first (and only) item of the
        extractor's "input_features" output, exposed under the "sig" key.
    """
    sig = read_audio(wav["bytes"])
    features = _feature_extractor(sig, sampling_rate=AUDIO_SAMPLING_RATE)["input_features"][0]
    # as_tensor with an explicit dtype yields the same float32 values as the
    # legacy torch.Tensor(...) constructor, without the extra copy.
    return torch.as_tensor(features, dtype=torch.float32)


sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)


# 3. Define text pipeline:
@sb.utils.data_pipeline.takes("text")
@sb.utils.data_pipeline.provides("wrd")
def text_pipeline(wrd):
    """Yield the lower-cased transcript of one example.

    The decorators feed the dataset's "text" field in as ``wrd`` and expose
    the yielded string under the "wrd" key.
    """
    yield wrd.lower()

# Register the text pipeline on every dataset in `datasets`.
sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)

# 4. Set output: the keys each example dict contains when the dataset is indexed.
sb.dataio.dataset.set_output_keys(
    datasets,
    ["id", "audio_id", "sig", "wrd"],
)


In [23]:
# Take the single prepared dataset and sanity-check it: print its size, then
# display the first example as the cell output.
dataset = datasets[0]
print(len(dataset))
dataset[0]

107303


{'id': 0,
 'audio_id': '20091124-0900-PLENARY-18-en_20091124-22:35:55_9',
 'sig': tensor([[-0.9030, -1.0864, -2.8961,  ..., -1.2175, -1.3774, -1.0768],
         [-0.3821, -0.0228, -0.3508,  ..., -1.2550, -0.8651, -0.9142],
         [-0.9417, -0.8630, -1.0086,  ..., -0.9313, -0.4498, -0.4927],
         ...,
         [ 0.7211,  1.6580,  1.1608,  ..., -2.1000, -0.3901, -0.3237],
         [-0.1314,  0.9283,  0.4485,  ..., -1.2711, -1.0480, -1.0994],
         [-0.9046,  0.1171,  0.2605,  ..., -0.4674, -1.5554, -1.6531]]),
 'wrd': 'and what about interoperability in the rail sector are national barriers preventing progress in this area as well or is there an unwillingness on the part of the rail industry to embrace the concept of interoperability'}

In [24]:
# Dynamic-batching settings from the recipe; they are unpacked into
# DynamicBatchSampler when the sampler is created.
dynamic_hparams_train = hparams["dynamic_batch_sampler_train"]
dynamic_hparams_train

{'max_batch_length': 100,
 'num_buckets': 200,
 'shuffle': False,
 'batch_ordering': 'ascending',
 'max_batch_ex': 256}

In [25]:
# create dynamic batch sampler
# Group examples into batches by duration, using the precomputed length list.
train_batch_sampler = DynamicBatchSampler(
    train_data,
    **dynamic_hparams_train,
    lengths_list=train_len_list,
    length_func=lambda example: example["duration"],
)

# Keyword arguments forwarded to the dataloader factory.
train_loader_kwargs = dict(
    batch_sampler=train_batch_sampler,
    num_workers=hparams["num_workers"],
)

# Build the dataloader that drives feature extraction.
dataloader = sb.dataio.dataloader.make_dataloader(dataset, **train_loader_kwargs)

In [26]:
# Show the worker count from the recipe (the value passed to the dataloader as `num_workers`).
hparams["num_workers"]

2

In [27]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


# SENSE Features

In [28]:
# Sanity check: display one feed-forward weight matrix (encoder layer 17) of the
# "wav2vec2" module, to see which parameter values are currently loaded.
hparams["wav2vec2"].model.encoder.layers[17].ffn1.intermediate_dense.weight

Parameter containing:
tensor([[ 0.0107, -0.0148, -0.0010,  ...,  0.0458, -0.0136,  0.0232],
        [ 0.0074, -0.0686, -0.0094,  ...,  0.0350, -0.0281, -0.0436],
        [ 0.0521, -0.0670,  0.0395,  ...,  0.0693, -0.0990, -0.0329],
        ...,
        [ 0.0097,  0.0675,  0.0066,  ..., -0.0865, -0.0520, -0.0685],
        [ 0.0313,  0.0098, -0.0862,  ..., -0.0320, -0.0016, -0.0751],
        [-0.0229, -0.0170, -0.0042,  ..., -0.0045,  0.0349,  0.0505]],
       requires_grad=True)

In [29]:
# Same encoder-weight inspection run a second time; the recorded output is
# identical to the previous run, i.e. nothing changed the weights in between.
hparams["wav2vec2"].model.encoder.layers[17].ffn1.intermediate_dense.weight

Parameter containing:
tensor([[ 0.0107, -0.0148, -0.0010,  ...,  0.0458, -0.0136,  0.0232],
        [ 0.0074, -0.0686, -0.0094,  ...,  0.0350, -0.0281, -0.0436],
        [ 0.0521, -0.0670,  0.0395,  ...,  0.0693, -0.0990, -0.0329],
        ...,
        [ 0.0097,  0.0675,  0.0066,  ..., -0.0865, -0.0520, -0.0685],
        [ 0.0313,  0.0098, -0.0862,  ..., -0.0320, -0.0016, -0.0751],
        [-0.0229, -0.0170, -0.0042,  ..., -0.0045,  0.0349,  0.0505]],
       requires_grad=True)

In [30]:
# Sanity check: display the attention-pooling weight of the first module in
# hparams["model"], to see which parameter values are currently loaded.
hparams["model"][0].attn_pooling_w.weight

Parameter containing:
tensor([[ 0.0162,  0.0143, -0.0193,  ...,  0.0233, -0.0049, -0.0124]],
       requires_grad=True)

In [31]:
# Same attention-pooling inspection run a second time; the recorded output is
# identical to the previous run.
hparams["model"][0].attn_pooling_w.weight

Parameter containing:
tensor([[ 0.0162,  0.0143, -0.0193,  ...,  0.0233, -0.0049, -0.0124]],
       requires_grad=True)

In [32]:
# When the recipe declares a pretrainer, gather its checkpoint files and load
# them into the modules it manages.
if "pretrainer" in hparams:
    hparams["pretrainer"].collect_files()
    hparams["pretrainer"].load_collected()


In [33]:
# Short aliases for the two modules used for feature extraction: the
# "wav2vec2" encoder and the first entry of the "model" list.
w2v_bert = hparams["wav2vec2"]
attention_pool = hparams["model"][0]

# Move both modules to the selected device and switch them to eval mode.
# NOTE(review): the recorded run of this cell stopped with a CUDA out-of-memory
# error (the GPU was already almost fully allocated), so the modules may not
# have reached the device in that session.
w2v_bert.to(device)
w2v_bert.eval()
attention_pool.to(device)
attention_pool.eval()

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 23.69 GiB of which 14.94 MiB is free. Including non-PyTorch memory, this process has 23.67 GiB memory in use. Of the allocated memory 22.39 GiB is allocated by PyTorch, and 33.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Minimum wall-clock time, in minutes, between two periodic checkpoint saves
# during feature extraction.
SAVE_INTERVAL_MINUTES = 5
# Pickle file used to store and resume extraction progress (relative to the
# notebook's working directory).
CHECKPOINT_PATH = "sense_checkpoint_test.pkl"

def load_checkpoint(path):
    """Load previously saved extraction progress, if there is any.

    Args:
        path: Location of the pickle file written by ``save_checkpoint``.

    Returns:
        tuple: ``(results, last_batch_index)``. Without a checkpoint on disk
        this is ``([], -1)``: an empty list ready to be extended with
        ``(audio_id, feature)`` pairs, and ``-1`` so that no batch is skipped.
    """
    if not os.path.exists(path):
        # Must be a list, not a dict: the extraction loop grows the results
        # with `results.extend(...)`, which a dict does not provide.
        return [], -1

    # SECURITY: unpickling can execute arbitrary code. Only load checkpoint
    # files that this notebook produced itself.
    with open(path, "rb") as f:
        state = pickle.load(f)
    print(f"Resumed from batch {state['last_batch_index']}")
    return state["results"], state["last_batch_index"]

def save_checkpoint(path, results, last_batch_index):
    """Persist extraction progress so an interrupted run can resume.

    The state is written to a temporary sibling file and then moved over
    ``path``. An interruption in the middle of a save therefore leaves the
    previous checkpoint intact instead of a truncated pickle.

    Args:
        path: Destination of the checkpoint file.
        results: Collected results so far (anything picklable with ``len``).
        last_batch_index: Index of the last batch contained in ``results``.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump({
            "results": results,
            "last_batch_index": last_batch_index
        }, f)
    # Replaces the old checkpoint in a single step.
    os.replace(tmp_path, path)
    print(f"[Checkpoint] Saved batch {last_batch_index}, total items: {len(results)}")

In [34]:
results, last_saved_batch = load_checkpoint(CHECKPOINT_PATH)

# The loop below appends (audio_id, feature) pairs, so `results` must be a
# list. Guard against a dict-shaped value (e.g. from a checkpoint saved when
# results were collected with `dict.update`).
if isinstance(results, dict):
    results = list(results.items())

# Index of the last batch whose features are actually stored in `results`.
# Only this value is ever written to a checkpoint, so a resumed run never
# skips a batch that was not saved.
last_done_batch = last_saved_batch

start_time = time.time()

# Inference only. Without no_grad each forward pass keeps its autograd graph
# (the displayed model parameters have requires_grad=True), which wastes GPU
# memory and makes `.numpy()` fail on outputs that require grad.
with torch.no_grad():
    for i, batch in enumerate(tqdm(dataloader)):
        if i <= last_saved_batch:
            continue  # Skip already processed batches

        batch = batch.to(device)
        # NOTE(review): `wl` (second element of the padded batch, presumably the
        # lengths) is not passed to either module, so padded frames may
        # influence the pooled features - confirm whether that is intended.
        w, wl = batch.sig
        feats = hparams["wav2vec2"](w)
        feats = hparams["model"][0](feats)
        if i == 100:
            # Debug cap for test runs: show the output shape and stop. This
            # batch is NOT added to `results`.
            print(feats.shape)
            break
        audio_ids = batch.audio_id
        features = feats.cpu().numpy()
        results.extend(zip(audio_ids, features))
        last_done_batch = i
        # results.update(dict(zip(batch.audio_id, list(feats))))

        # Checkpoint every SAVE_INTERVAL_MINUTES minutes
        if (time.time() - start_time) >= SAVE_INTERVAL_MINUTES * 60:
            save_checkpoint(CHECKPOINT_PATH, results, last_done_batch)
            start_time = time.time()  # Reset timer

# Final save. Uses the last stored batch rather than the raw loop variable,
# which is undefined for an empty loader and points at an unsaved batch after
# the debug break.
save_checkpoint(CHECKPOINT_PATH, results, last_done_batch)


Resumed from batch 0


  0%|          | 0/9807 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 23.69 GiB of which 14.94 MiB is free. Including non-PyTorch memory, this process has 23.67 GiB memory in use. Of the allocated memory 22.39 GiB is allocated by PyTorch, and 30.24 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Test Loading

In [79]:
from speechbrain.nnet.pooling import AttentionPooling
from torch.nn import ModuleList

# Step 1: Define the model architecture
# Assumes the same hyperparameters as at training time.
# NOTE(review): input_dim=1024 is a guess kept from the original cell; the
# recorded "All keys matched successfully" output only shows the parameter
# names line up with the checkpoint.
attn_pooling1 = AttentionPooling(input_dim=1024)  # adjust sizes if known
attn_pooling2 = AttentionPooling(input_dim=1024)

# Step 2: Wrap in a ModuleList
model = ModuleList([attn_pooling1, attn_pooling2])

# Step 3: Load weights (state dict is read onto the CPU first)
x = torch.load("/data/coros3/smdhaffar/SENSE/CKPT+checkpoint_epoch10/model.ckpt", map_location=torch.device('cpu'))
model.load_state_dict(x)

<All keys matched successfully>

In [80]:
from speechbrain.lobes.models.w2v_bert import HuggingFaceWav2Vec2

# Build the encoder by hand, using the same arguments as the recipe entry quoted
# in the comment that follows this call (source, output_norm, save_path).
# NOTE: this rebinds `w2v_bert`, which an earlier cell set to hparams["wav2vec2"].
w2v_bert = HuggingFaceWav2Vec2(
    source="facebook/w2v-bert-2.0",
    output_norm=True,
    save_path="/local_disk/apollon/rwhetten/sss_data_selection/sense/pt_store/wav2vec2_checkpoint"
)
# wav2vec2: !new:speechbrain.lobes.models.w2v_bert.HuggingFaceWav2Vec2
#     source: !ref <wav2vec_url>
#     output_norm: True
#     save_path: !ref <output_folder>/wav2vec2_checkpoint

In [17]:
# Read the SENSE wav2vec2 checkpoint onto the CPU for inspection.
# NOTE: this reuses the name `x`, which previously held the model.ckpt contents.
x = torch.load("/data/coros3/smdhaffar/SENSE/CKPT+checkpoint_epoch10/wav2vec2.ckpt", map_location=torch.device('cpu'))


In [24]:
from speechbrain.utils.parameter_transfer import Pretrainer

# Modules that will receive the pretrained parameters: the manually built
# encoder (aliased as `wav2vec2`) and the attention-pooling `model` defined in
# the earlier test-loading cells.
wav2vec2 = w2v_bert

# Where the SENSE checkpoint lives and where the Pretrainer collects its files.
sense_location = "/data/coros3/smdhaffar/SENSE/CKPT+checkpoint_epoch10/"
output_folder = "/local_disk/apollon/rwhetten/sss_data_selection/sense/pt_store"

# Create the Pretrainer instance. The keys of `loadables` and `paths` must
# match: each module is paired with the checkpoint file of the same name.
pretrainer = Pretrainer(
    collect_in=output_folder,
    loadables={
        "wav2vec2": wav2vec2,
        "model": model,
    },
    paths={
        # os.path.join avoids the doubled "/" that an f-string produces when
        # `sense_location` already ends with a separator.
        "wav2vec2": os.path.join(sense_location, "wav2vec2.ckpt"),
        "model": os.path.join(sense_location, "model.ckpt"),
    }
)

In [26]:
# Gather the checkpoint files configured on the Pretrainer, then load them into
# the registered modules.
pretrainer.collect_files()
pretrainer.load_collected()

In [27]:
# Display the encoder's module tree as the cell output.
wav2vec2

HuggingFaceWav2Vec2(
  (model): Wav2Vec2BertModel(
    (feature_projection): Wav2Vec2BertFeatureProjection(
      (layer_norm): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=160, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Wav2Vec2BertEncoder(
      (dropout): Dropout(p=0.0, inplace=False)
      (layers): ModuleList(
        (0-23): 24 x Wav2Vec2BertEncoderLayer(
          (ffn1_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (ffn1): Wav2Vec2BertFeedForward(
            (intermediate_dropout): Dropout(p=0.0, inplace=False)
            (intermediate_dense): Linear(in_features=1024, out_features=4096, bias=True)
            (intermediate_act_fn): SiLU()
            (output_dense): Linear(in_features=4096, out_features=1024, bias=True)
            (output_dropout): Dropout(p=0.0, inplace=False)
          )
          (self_attn_layer_norm): LayerNorm((1024,), e