In [3]:
%load_ext autoreload
%autoreload 2
import os, sys
import glob
import numpy as np
import pandas as pd
import torch

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/utils/')
sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/modeling/joint-clm-prosody/')

from config import *
from src.data.components.audio_text_dataset import AudioTextDataset, load_audio, parse_textgrid, process_wavelet_file, extract_word_segment, pool_embeddings

from torch.utils.data import DataLoader
from src.data.components.collators import audio_text_collator

KeyboardInterrupt: 

### Set up dataset and create dataloader

In [None]:
# Corpus and split to load (alternative used previously: 'pfka-moth-stories' / 'train')
DATASET = 'gigaspeech/m'
split = 'test'

# Short names of the text and audio encoders handed to AudioTextDataset
text_model_name, audio_model_name = 'gpt2', 'wav2vec2'

# Source data sits under DATASETS_DIR; the cache uses the same sub-path under SCRATCH_DIR
dataset_dir, cache_dir = (
    os.path.join(root_dir, 'nlp-datasets', DATASET)
    for root_dir in (DATASETS_DIR, SCRATCH_DIR)
)

# Build the dataset, then call its preprocess_data() and _initialize_models() steps
dataset = AudioTextDataset(
    split=split,
    text_model_name=text_model_name,
    audio_model_name=audio_model_name,
    cache_dir=cache_dir,
    dataset_dir=dataset_dir,
)
dataset.preprocess_data()
dataset._initialize_models()

Metadata exists. Processing only new files...
No new files to process


### Create segments for the current item

In [None]:
 # Load the data for the first file listed by the dataset.
 # Later cells read file_data['words'] and file_data['waveform'] from the result.
fn = dataset.file_names[0]
file_data = dataset._load_file_data(fn)

#### Get text tokens

In [17]:
# Join the per-word 'text' fields with single spaces into one transcript string
# and run it through the dataset's text tokenizer.
text = " ".join([word['text'] for word in file_data['words']])
text_tokens = dataset.text_tokenizer(text)

# word_ids() gives, for every token, the id of the word it belongs to.
# np.unique then yields the sorted distinct word ids and how many tokens map to each.
word_ids, token_counts = np.unique(text_tokens.word_ids(), return_counts=True)

#### Segment audio into word-level segments

In [24]:
# Build one audio segment per text token: words that map to a single token are
# extracted whole, words that map to several tokens are passed to
# extract_word_segment together with per-token ratios.
words = file_data['words']
waveform = file_data['waveform']
sample_rate = 16000  # sampling rate handed to extract_word_segment and, later, the audio processor

# Sorted distinct word ids and the number of tokens the tokenizer produced for each
word_ids, token_counts = np.unique(text_tokens.word_ids(), return_counts=True)

# Position of each word's first token inside text_tokens['input_ids'].
# Tokens are laid out word by word, so a word starts where all earlier words' tokens end.
# (Indexing by the word id itself is only correct while every earlier word has one token.)
token_starts = np.cumsum(token_counts) - token_counts

segments = []

for word, token_start, n_tokens in zip(words, token_starts, token_counts):
    if n_tokens > 1:
        # Weight each token by the length of its decoded string, normalised to sum to 1.
        # Uses dataset.text_tokenizer: there is no `self` at notebook scope.
        token_ids = text_tokens['input_ids'][token_start:token_start + n_tokens]
        ratios = torch.tensor([len(x) for x in dataset.text_tokenizer.batch_decode(token_ids)])
        ratios = ratios / ratios.sum()
        word_segments = extract_word_segment(waveform, sample_rate,
                                             word["start"], word["end"], ratios=ratios)
    else:
        word_segments = extract_word_segment(waveform, sample_rate,
                                             word["start"], word["end"])
    segments.extend(word_segments)

### Check padded inputs

#### Load models

#### wav2vec2 comparison 

The problem (model outputs that differ depending on whether the input batch is padded) is described in [this GitHub issue](https://github.com/huggingface/transformers/issues/21534).

In [84]:
from transformers import AutoProcessor, AutoModel

# Choose which wav2vec2 checkpoint to test for padding sensitivity.

## This version exhibits differences based on padding
# audio_model_name = "facebook/wav2vec2-base-960h"

# This version does not exhibit differences
audio_model_name = "facebook/wav2vec2-large-960h-lv60"


# Load the matching processor and model for the selected checkpoint.
# Note: this rebinds audio_model_name, which an earlier cell set to 'wav2vec2'.
processor = AutoProcessor.from_pretrained(audio_model_name)
audio_model = AutoModel.from_pretrained(audio_model_name)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60 and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### data2vec comparison

In [None]:
from transformers import AutoProcessor, AutoModel

# Choose which data2vec checkpoint to test for padding sensitivity.

## This version exhibits differences based on padding
# audio_model_name = "facebook/data2vec-audio-base-960h"

## Both models exhibit differences
audio_model_name = "patrickvonplaten/data2vec-base-960h" #"facebook/data2vec-audio-large-100h"

# Load the matching processor and model; this overwrites the `processor` and
# `audio_model` created in the wav2vec2 cell.
processor = AutoProcessor.from_pretrained(audio_model_name)
audio_model = AutoModel.from_pretrained(audio_model_name)

preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/373M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/373M [00:00<?, ?B/s]

#### Run with padded

In [87]:
# Pad the batch
padded_features = processor(segments, sampling_rate=sample_rate, padding=True, return_attention_mask=True, return_tensors="pt")

with torch.no_grad():
    padded_outs = audio_model(**padded_features).last_hidden_state

attention_mask = audio_model._get_feature_vector_attention_mask(
    padded_outs.shape[1], 
    padded_features['attention_mask']
)

padded_embeds = pool_embeddings(padded_outs, attention_mask)

#### Run without padding

In [88]:
# Encode only the first segment, on its own and without the padding argument,
# so it can be compared with the same segment from the padded batch.
no_pad_features = processor(segments[:1], sampling_rate=sample_rate,return_attention_mask=True, return_tensors="pt")

# Forward pass without gradient tracking; keep only the final hidden states
with torch.no_grad():
    no_pad_outs = audio_model(**no_pad_features).last_hidden_state

# Derive a mask matching the model's output sequence length (no_pad_outs.shape[1]).
# Note: this overwrites the `attention_mask` computed for the padded batch.
attention_mask = audio_model._get_feature_vector_attention_mask(
    no_pad_outs.shape[1], 
    no_pad_features['attention_mask']
)

# Pool the hidden states with the mask, using the same helper as the padded run
no_pad_embed = pool_embeddings(no_pad_outs, attention_mask)

#### Compare embeddings

In [89]:
# Cosine similarity between the first item's embedding from the padded batch and
# the embedding from the unpadded single-segment run.
torch.cosine_similarity(padded_embeds[:1], no_pad_embed[:1])

tensor([0.9360])

## Test whisper transcription pipeline

In [1]:
%load_ext autoreload
%autoreload 2
import os, sys
import glob
import numpy as np
import pandas as pd
import torch

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/utils/')
sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/modeling/preproc-datasets/')

from config import *
import utils

### Load model

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# First GPU in half precision when CUDA is available, otherwise CPU in full precision
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Small checkpoint for quick tests; "openai/whisper-large-v3" is the larger alternative
model_id = "openai/whisper-tiny"

# Load the speech-to-text model in the chosen precision and move it to the device
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
).to(device)

# The processor supplies both the tokenizer and the feature extractor used below
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline over 30-second chunks
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
    batch_size=16,  # batch size for inference - set based on your device
    chunk_length_s=30,
)


### Create dataset

In [None]:
from datasets import load_dataset

# Directory holding the LibriTTS-R 'test-clean' audio files
audio_dir = os.path.join(DATASETS_DIR, 'nlp-datasets', 'libritts-r', 'audio', 'test-clean')

# Load the directory with the generic "audiofolder" builder.
# Note: this rebinds `dataset`, which an earlier cell bound to an AudioTextDataset.
dataset = load_dataset("audiofolder", data_dir=audio_dir)
# dataset = load_dataset("audiofolder", data_dir="/path/to/folder")

In [42]:
import itertools
def chunk(it, size):
    """Lazily split an iterable into consecutive tuples of at most ``size`` items.

    Args:
        it: Any iterable; it is consumed as the returned iterator advances.
        size: Maximum number of items per tuple.

    Returns:
        An iterator of tuples. Every tuple holds ``size`` items except possibly
        the last, which holds the remainder. Iteration ends at the first empty
        tuple, i.e. once the source is exhausted.
    """
    source = iter(it)

    def next_batch():
        # Pull up to `size` further items from the shared source iterator
        return tuple(itertools.islice(source, size))

    # Two-argument iter(): keep calling next_batch until it returns the sentinel ()
    return iter(next_batch, ())

In [52]:
# Transcribe a small batch with the ASR pipeline, forcing English and disabling timestamps.
# NOTE(review): `x` is not assigned in any visible cell — it appears to come from a removed cell; confirm.
pipe(list(x), batch_size=3, generate_kwargs={"language": "english", "return_timestamps": False})



[{'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flower fatten sauce. Stuffed into you, his belly, counseled him.'},
 {'text': ' It would be a gloomy secret night.'},
 {'text': ' After early nightfall, the yellow lamps would light up here and there the squalid quarter of the brothels.'}]

In [59]:
from tqdm import tqdm

# Lowercase every VoxCeleb2 'val' transcript file in place.
transcripts_dir = os.path.join(DATASETS_DIR, 'nlp-datasets', 'voxceleb2', 'transcripts', 'val')

# Every entry in the transcripts directory, in sorted order
transcripts_fns = sorted(glob.glob(os.path.join(transcripts_dir, '*')))

for transcript_fn in tqdm(transcripts_fns):
    # Explicit encoding so the result does not depend on the platform's locale default
    with open(transcript_fn, 'r', encoding='utf-8') as file:
        content = file.read()

    lowered = content.lower()

    # Already lowercase: leave the file untouched, so re-running this cell
    # does not rewrite every transcript again.
    if lowered == content:
        continue

    # Overwrite the same file with the lowercased text
    with open(transcript_fn, 'w', encoding='utf-8') as file:
        file.write(lowered)

    # print(f"Processed and saved: {transcript_fn}")

100%|██████████| 91011/91011 [17:37<00:00, 86.07it/s]  


In [67]:
# Every entry in the VoxCeleb2 'test' audio directory, in sorted order
audio_dir = os.path.join(DATASETS_DIR, 'nlp-datasets', 'voxceleb2', 'audio', 'test')
audio_fns = sorted(glob.glob(os.path.join(audio_dir, '*')))

# Group the file names into tuples of up to 64; list() materialises the lazy
# iterator so batches can be indexed and counted.
audio_batches = list(chunk(audio_fns, 64))

In [68]:
# Number of batches (each holds up to 64 file names)
len(audio_batches)

567

In [71]:
# Pick a single batch (index 414) to inspect on its own
batch = audio_batches[414]

# for fn in batch:

In [79]:
# Display the tokenizer output held in `tokens`.
# NOTE(review): `tokens` is not assigned in any visible cell — this depends on hidden kernel state; confirm.
tokens

{'input_ids': [50258, 50363, 14, 67, 446, 16883, 14, 81, 66, 14, 44990, 14, 37, 14, 37, 7729, 37880, 14, 20367, 296, 1385, 14, 77, 75, 79, 12, 20367, 296, 1385, 14, 3080, 87, 384, 28512, 17, 14, 46069, 14, 31636, 14, 327, 12791, 2009, 24, 62, 41, 86, 42, 81, 35, 22, 32, 64, 62, 23, 78, 62, 1360, 28052, 13, 86, 706, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
import soundfile as sf

# Read every file of the selected batch with soundfile.
for fn in tqdm(batch):
    # Only the read is active; the rewrite as 16-bit PCM (meant to fix format
    # issues) is commented out below, so no file is modified.
    data, samplerate = sf.read(fn)
    # sf.write(fn, data, samplerate, subtype='PCM_16')

100%|██████████| 64/64 [00:06<00:00, 10.57it/s]


In [2]:
import math

# Directory holding the VoxCeleb2 'train' videos
video_dir = os.path.join(DATASETS_DIR, 'nlp-datasets', 'voxceleb2', 'video', 'train')

# Get all .mp4 files directly inside video_dir, in sorted order.
# `recursive=True` was dropped: glob only descends into sub-directories when the
# pattern contains "**", so with "*.mp4" the flag had no effect. To search nested
# folders, use os.path.join(video_dir, "**", "*.mp4") together with recursive=True.
video_files = sorted(glob.glob(os.path.join(video_dir, "*.mp4")))

NameError: name 'os' is not defined

In [128]:
# Check whether the AVSpeech metadata file is present on disk.
# NOTE(review): `av_dataset_info` is only assigned in the following cell, so this cell
# fails on a fresh kernel when the notebook is run top to bottom.
os.path.exists(av_dataset_info)

False

In [129]:
import pandas as pd    

# Path to the AVSpeech metadata file (.jsonl)
av_dataset_info = os.path.join(DATASETS_DIR, 'nlp-datasets', 'avspeech', 'metadata.jsonl')
# lines=True makes pandas parse each line of the file as a separate JSON object
jsonObj = pd.read_json(path_or_buf=av_dataset_info, lines=True)

In [139]:
# Inspect the 'metadata' entry of the row at position 10
jsonObj['metadata'].iloc[10]

{'view_count': 12354,
 'description': '山梨簿記学院\u3000http://yboki.com/\n体験講義動画\u3000「為替手形の基本」',
 'format': {'tags': {'major_brand': 'mp42',
   'compatible_brands': 'isommp42',
   'minor_version': '0'},
  'start_time': 0.0,
  'nb_streams': 2,
  'format_name': ['3gp', '3g2', 'mov', 'mp4', 'mj2', 'm4a'],
  'bit_rate': 1116993,
  'nb_programs': 0,
  'duration': 1044.3639,
  'probe_score': 100,
  'size': 145818419},
 'video_id': 'w0Q5gH4hb7I',
 'creation_time': '2016-08-27T15:10:23.000000Z',
 'height': 720,
 'dislike_count': 2,
 'channel_id': 'UCZ-aFwJHTPY3gLQYN2S-Y3Q',
 'like_count': 30,
 'subtitles': {},
 'duration': 1044,
 'title': '為替手形の基本\u3000簿記会計',
 'tags': ['為替手形', '名宛人', '振出人', '引受人'],
 'width': 1280,
 'categories': ['Education']}