In [None]:
import torch as t
import os
import random
import re
import csv
import tqdm
from transformers import AutoProcessor, WhisperForConditionalGeneration
from pydub import AudioSegment
from util import audiosegment_to_array, transcribe_batch, create_filler

device = t.device("cuda" if t.cuda.is_available() else "mps" if t.backends.mps.is_available() else "cpu")
model_id = "openai/whisper-small"
sr=16000
random.seed(100) 


In [6]:
processor = AutoProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)
model.generation_config.language = "english"
model.generation_config.task = "transcribe"
processor.feature_extractor.return_attention_mask = True

In [7]:

def create_trial(condition, target, frames, talker, n_continuum):
    endpoints = target.split("_")
    frames = [f for f in frames if endpoints[0] not in f and endpoints[1] not in f]
    random.shuffle(frames)

    silence = AudioSegment.silent(duration=500)
    combined = AudioSegment.silent(duration=0)
    for f in frames:
        audio = AudioSegment.from_wav(f'audio/MP/{talker}/{condition}/{f}')
        combined += audio + silence
    
    trials = [combined + AudioSegment.from_wav(f'audio/MP/{talker}/continuum/{target}_1_{i}.wav') + silence for i in range(n_continuum)]
    return trials, frames


In [None]:
def selective_adaptation(output_path, talkers, n_continuum, n_trials):
    with open(output_path, "w", newline="") as f:
        # Prep csv
        writer = csv.writer(f)
        writer.writerow(["id", "talker", "condition", "context", "transcript", "target", "target_step"])
        for t in talkers:
            # Get all the audio paths
            targets = os.listdir(f'audio/MP/{t}/continuum')
            targets = list(set([re.search(r'([a-zA-Z]+_[a-zA-Z]+)_[0-9]', target).group(1) for target in targets]))
            voiced = os.listdir(f'audio/MP/{t}/voiced')
            voiced = [v for v in voiced if v != '.DS_Store']
            voiceless = os.listdir(f'audio/MP/{t}/voiceless')
            voiceless = [v for v in voiceless if v != '.DS_Store']
            fillers = os.listdir(f'audio/fillers/{t}')
            fillers = [f for f in fillers if f != '.DS_Store']

            # A "trial" here represents every time a target is selected. Iterate throught the three conditions and every step of the continuum for each.
            for n in tqdm.tqdm(range(n_trials)):
                target = targets[n % len(targets)]

                for c in ['voiced', 'voiceless', 'control']:
                    if c == 'voiced':
                        audio, frames = create_trial(c, target, voiced, t, n_continuum)
                    elif c == 'voiceless':
                        audio, frames = create_trial(c, target, voiceless, t, n_continuum)
                    else:
                        random.shuffle(fillers)
                        control_frames = fillers[:len(voiced)-1]
                        audio, frames = create_filler(target, control_frames, t, n_continuum)

                    transcript = transcribe_batch(audio, processor, model, sr, device, batch_size=n_continuum)
                    for i in range(n_continuum):
                        
                        row = [n, t, c, frames, transcript[i], target, i]
                        writer.writerow(row)

# selective_adaptation('data/SelAd.csv', ['hope'], 13, 300)



  0%|          | 0/1 [00:00<?, ?it/s]You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
100%|██████████| 1/1 [00:10<00:00, 10.70s/it]
