In [None]:
import torch 
import torchaudio
import os
import pandas as pd
import tqdm 
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor

In [None]:
# --- Paths -------------------------------------------------------------
# CSV manifest that is read as input, and the directory that receives
# the per-utterance feature files.
MANIFEST = "/kaggle/input/libri-manifest/librispeech_manifest (1).csv"
FEATURE_DIR = "/kaggle/working/wav2vec_features"

# --- Model -------------------------------------------------------------
# Checkpoint identifier used when loading the pretrained weights.
MODEL_NAME = "facebook/wav2vec2-large-960h"

# --- Compute device ----------------------------------------------------
# Use CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Create the output directory up front; exist_ok makes re-running harmless.
os.makedirs(FEATURE_DIR, exist_ok=True)

In [None]:
# Extract wav2vec 2.0 hidden states for the first MAX_UTTERANCES manifest
# rows, save one .pt tensor per utterance under FEATURE_DIR, and write a new
# manifest CSV with an added `feature_path` column.

# Load the pretrained feature extractor and model; the model is moved to
# DEVICE and put in evaluation mode since it is only used for inference.
extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model     = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE).eval()

TARGET_SR          = 16_000  # sampling rate the audio is resampled to and declared as
MAX_UTTERANCES     = 8500    # only the first N manifest rows are processed
OVERWRITE_EXISTING = False   # set True to recompute features that already exist on disk

df = pd.read_csv(MANIFEST)
df = df.iloc[:MAX_UTTERANCES].copy()
feature_paths = []

for _, row in tqdm.tqdm(df.iterrows(), total=len(df), desc="Extracting"):
    # The output name is derived purely from the manifest ids, so it can be
    # computed (and recorded) before any audio is touched.
    out_path = os.path.join(
        FEATURE_DIR,
        f"{row.speaker_id}-{row.chapter_id}-{row.utterance_id}.pt")
    feature_paths.append(out_path)

    # Resume support: an interrupted run can be restarted without redoing
    # finished utterances. NOTE: files left over from a different MODEL_NAME
    # would also be reused -- flip OVERWRITE_EXISTING (or clear FEATURE_DIR)
    # after changing the model.
    if os.path.exists(out_path) and not OVERWRITE_EXISTING:
        continue

    wav_path = row["audio_path"]
    wav, sr  = torchaudio.load(wav_path)
    if wav.shape[0] > 1:
        # More than one channel: average them down to a single channel.
        wav = wav.mean(dim=0, keepdim=True)
    if sr != TARGET_SR:
        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)

    # squeeze(0) drops only the channel axis; a bare squeeze() would also
    # collapse a one-sample waveform into a 0-d array.
    inputs = extractor(
        wav.squeeze(0).numpy(), sampling_rate=TARGET_SR,
        return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        h = model(**inputs).last_hidden_state.cpu()

    # Write to a temporary file and rename it into place, so an interrupted
    # save never leaves a truncated file that the resume check above would
    # mistake for a finished one.
    tmp_path = out_path + ".tmp"
    torch.save(h, tmp_path)
    os.replace(tmp_path, out_path)

df["feature_path"] = feature_paths
df.to_csv("/kaggle/working/librispeech_manifest_with_feats.csv", index=False)
print("Features saved")