# ASR

In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset

In [2]:
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small.en")
model.config.forced_decoder_ids = None

In [3]:
# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

Generating validation split:   0%|          | 0/73 [00:00<?, ? examples/s]

2024-07-03 13:13:09.440679: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [4]:
# load audio file from local
import librosa
file_path = "audio/n1.wav"
audio_input, sampling_rate = librosa.load(file_path, sr=16000)

input_features = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

[' This I read with great attention while they sat silent.']

In [5]:
file_path = "audio/a1.wav"
audio_input, sampling_rate = librosa.load(file_path, sr=16000)

input_features = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

[' Except in the way we use also our eyes to make']

In [6]:
processor = WhisperProcessor.from_pretrained("/home/sabina/SpeechConversion/train/whisper-small-voice-conversion")
model = WhisperForConditionalGeneration.from_pretrained("/home/sabina/SpeechConversion/train/whisper-small-voice-conversion")
model.config.forced_decoder_ids = None

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# load audio file from local
file_path = "audio/n1.wav"
audio_input, sampling_rate = librosa.load(file_path, sr=16000)

input_features = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription



[' this i read with great attention while they sat silent']

In [8]:
file_path = "audio/a1.wav"
audio_input, sampling_rate = librosa.load(file_path, sr=16000)

input_features = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

['except in the winter we use also our experiment']

In [10]:
for i in range(1, 6):
    file_path = f"audio/a{i}.wav"
    audio_input, sampling_rate = librosa.load(file_path, sr=16000)

    input_features = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    print(transcription)

['except in the winter we use also our experiment']
['stick']
['cheer']
['he slowly takes a short walk in the open air each day']
['meat']


# TTS

In [2]:
import os
from pathlib import Path


import torch
import torchaudio
from icefall.utils import AttributeDict, str2bool

from valle.data import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text,
)
from valle.data.collation import get_text_token_collater
from valle.models import get_model


def load_model(checkpoint, device):
    if not checkpoint:
        return None

    checkpoint = torch.load(checkpoint, map_location=device)

    args = AttributeDict(checkpoint)
    model = get_model(args)

    missing_keys, unexpected_keys = model.load_state_dict(
        checkpoint["model"], strict=True
    )
    assert not missing_keys
    model.to(device)
    model.eval()

    text_tokens = args.text_tokens

    return model, text_tokens

def tts_infer(model, text_tokenizer, audio_tokenizer, text_collater, device, args):
    text_prompts = " ".join(args.text_prompts.split("|"))

    audio_prompts = []
    if args.audio_prompts:
        for n, audio_file in enumerate(args.audio_prompts.split("|")):
            encoded_frames = tokenize_audio(audio_tokenizer, audio_file)
            audio_prompts.append(encoded_frames[0][0])

        assert len(args.text_prompts.split("|")) == len(audio_prompts)
        audio_prompts = torch.concat(audio_prompts, dim=-1).transpose(2, 1)
        audio_prompts = audio_prompts.to(device)
    results = []
    for text in args.text.split("|"):
        print(f"synthesize text: {text}")
        text_tokens, text_tokens_lens = text_collater(
            [
                tokenize_text(
                    text_tokenizer, text=f"{text_prompts} {text}".strip()
                )
            ]
        )
        # synthesis
        if args.continual:
            assert text == ""
            encoded_frames = model.continual(
                text_tokens.to(device),
                text_tokens_lens.to(device),
                audio_prompts,
            )
        else:
            enroll_x_lens = None
            if text_prompts:
                _, enroll_x_lens = text_collater(
                    [
                        tokenize_text(
                            text_tokenizer, text=f"{text_prompts}".strip()
                        )
                    ]
                )
            encoded_frames = model.inference(
                text_tokens.to(device),
                text_tokens_lens.to(device),
                audio_prompts,
                enroll_x_lens=enroll_x_lens,
                top_k=args.top_k,
                temperature=args.temperature,
            )

        if audio_prompts != []:
            samples = audio_tokenizer.decode(
                [(encoded_frames.transpose(2, 1), None)]
            )
            # store
            results.append(samples[0].cpu())
        else:  # Transformer
            results.append(None)
    return results

In [None]:
def fix_random_seed(seed=13):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

fix_random_seed()

In [4]:
from types import SimpleNamespace

def get_args():
    return SimpleNamespace(
        text_prompts="This I read with great attention while they sat silent.",
        audio_prompts="audio/n1.wav",
        text="To get up and running quickly just follow the steps below.",
        text_extractor="espeak",
        checkpoint="checkpoint/best-valid-loss-stage2-base.pt",
        output_dir="results",
        top_k=-100,
        temperature=1.0,
        continual=False
    )

# Get the arguments
args = get_args()


device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda", 0)

# initialize everything
text_tokenizer = TextTokenizer(backend=args.text_extractor)
model, text_tokens = load_model(args.checkpoint, device)
text_collater = get_text_token_collater(text_tokens)
audio_tokenizer = AudioTokenizer()

In [15]:
audio_prompts = "audio/a1.wav|audio/a2.wav|audio/a3.wav|audio/a4.wav|audio/a5.wav"
text_prompts = "Except in the winter when the ooze or snow or ice prevents.|stick.|tear.|He slowly takes a short walk in the open air each day.|meat."

for audio_prompt, text_prompt in zip(audio_prompts.split("|"), text_prompts.split("|")):

    args.audio_prompts = audio_prompt
    args.text_prompts = text_prompt
    args.text = text_prompt

    results = tts_infer(model, text_tokenizer, audio_tokenizer, text_collater, device, args)
    for n, result in enumerate(results):
        if result is not None:
            torchaudio.save(
                            f"{args.output_dir}/tts_{args.audio_prompts.split('/')[-1].split('.')[n]}.wav", result, 24000
                        )

synthesize text: Except in the winter when the ooze or snow or ice prevents.
VALL-E EOS [777 -> 1100]
synthesize text: stick.
VALL-E EOS [135 -> 137]
synthesize text: tear.
VALL-E EOS [158 -> 180]
synthesize text: He slowly takes a short walk in the open air each day.
VALL-E EOS [518 -> 878]
synthesize text: meat.
VALL-E EOS [57 -> 106]


## Vanilla Voice Conversion

In [10]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
import sys
sys.path.append('MeloTTS')
from melo.api import TTS

from time import time


def asr_infer_whisper_hf(model, processor, audio_path):
    audio_input, sampling_rate = librosa.load(audio_path, sr=16000)

    input_features = processor(audio_input, sampling_rate=sampling_rate, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription


def tts_infer_openvoice(tts, tone_color_converter, text, reference, speed=1, output_dir='results', file_name='output', device='cuda:0'):
    os.makedirs(output_dir, exist_ok=True)
    src_path = f'{output_dir}/tmp_{file_name}.wav'
    speaker_ids = tts.hps.data.spk2id
    assert len(speaker_ids) == 1
    speaker_key = list(speaker_ids.keys())[0]
    speaker_id = speaker_ids[speaker_key]

    # produce speech
    tts.tts_to_file(text, speaker_id, src_path, speed=speed)
    if reference is None:
        return
    
    # convert speech
    speaker_key = speaker_key.lower().replace('_', '-')
    target_se, audio_name = se_extractor.get_se(reference, tone_color_converter, vad=False)
    source_se = torch.load(f'checkpoint/{speaker_key}.pth', map_location=device)
    
    save_path = f'{output_dir}/{file_name}.wav'

    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path, 
        src_se=source_se, 
        tgt_se=target_se, 
        output_path=save_path,
        message=encode_message)

In [8]:
speed = 1
output_dir = 'results'
device = 'cuda:0'
tts = 'EN_NEWEST'

In [6]:
processor = WhisperProcessor.from_pretrained("ghost613/whisper-small-voice-conversion", revision='48c188e60459eee6df57cae31620b5b04112d1e2')
model = WhisperForConditionalGeneration.from_pretrained("ghost613/whisper-small-voice-conversion", revision='48c188e60459eee6df57cae31620b5b04112d1e2')
model.config.forced_decoder_ids = None

tone_color_converter = ToneColorConverter(f'checkpoint/config.json', device=device)
tone_color_converter.load_ckpt(f'checkpoint/converter.pth')
tts = TTS(language=tts, device=device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded checkpoint 'checkpoint/converter.pth'
missing/unexpected keys: [] []


In [10]:
start = time()
for i in range(1, 10):
    audio_path = 'audio/a{}.wav'.format(i)
    text = asr_infer_whisper_hf(model, processor, audio_path)[0]
    print("ASR Result:", text)

    file_name = 'ra{}'.format(i)
    try:
        tts_infer_openvoice(tts, tone_color_converter, text, audio_path, speed=speed, device=device, file_name=file_name)
        print(f"Output saved to results/{file_name}.wav")
    except Exception as e:
        print(f"Failed to convert {file_name}: {e}")

print(f"Average time taken: {(time() - start) / 9} seconds")

ASR Result: except in the winter we use also our experiment
 > Text split to sentences.
except in the winter we use also our experiment


100%|██████████| 1/1 [00:00<00:00,  4.80it/s]


OpenVoice version: v2
Output saved to results/a1.wav
ASR Result: stick
 > Text split to sentences.
stick


100%|██████████| 1/1 [00:00<00:00,  7.17it/s]

OpenVoice version: v2





Output saved to results/a2.wav
ASR Result: cheer
 > Text split to sentences.
cheer


100%|██████████| 1/1 [00:00<00:00,  6.98it/s]

OpenVoice version: v2





Failed to convert a3: No audio segments found!
ASR Result: he slowly takes a short walk in the open air each day
 > Text split to sentences.
he slowly takes a short walk in the open air each day


100%|██████████| 1/1 [00:00<00:00,  5.24it/s]


OpenVoice version: v2
Output saved to results/a4.wav
ASR Result: meat
 > Text split to sentences.
meat


100%|██████████| 1/1 [00:00<00:00,  6.87it/s]

OpenVoice version: v2





Failed to convert a5: No audio segments found!
ASR Result: the museum hires musicians every evening
 > Text split to sentences.
the museum hires musicians every evening


100%|██████████| 1/1 [00:00<00:00,  5.57it/s]


OpenVoice version: v2
Output saved to results/a6.wav
ASR Result: she wore warm fleecy woolen overalls
 > Text split to sentences.
she wore warm fleecy woolen overalls


100%|██████████| 1/1 [00:00<00:00,  5.82it/s]


OpenVoice version: v2
Output saved to results/a7.wav
ASR Result: i was conscious all the time
 > Text split to sentences.
i was conscious all the time


100%|██████████| 1/1 [00:00<00:00,  6.32it/s]

OpenVoice version: v2





Output saved to results/a8.wav
ASR Result: hulip
 > Text split to sentences.
hulip


100%|██████████| 1/1 [00:00<00:00,  1.92it/s]


OpenVoice version: v2
Output saved to results/a9.wav
Average time taken: 6.699695825576782 seconds


In [10]:
ref_audio_path = 'audio/a4.wav'
for i in range(1, 6):
    audio_path = 'audio/a{}.wav'.format(i)
    text = asr_infer_whisper_hf(model, processor, audio_path)[0]
    print("ASR Result:", text)

    file_name = 'ref_a{}'.format(i)
    try:
        tts_infer_openvoice(tts, tone_color_converter, text, ref_audio_path, speed=speed, device=device, file_name=file_name)
        print(f"Output saved to results/{file_name}.wav")
    except Exception as e:
        print(f"Failed to convert {file_name}: {e}")




ASR Result: except in the winter we use also our experiment
 > Text split to sentences.
except in the winter we use also our experiment


100%|██████████| 1/1 [00:00<00:00,  5.39it/s]


OpenVoice version: v2
Output saved to results/ref_a1.wav
ASR Result: stick
 > Text split to sentences.
stick


100%|██████████| 1/1 [00:00<00:00,  5.96it/s]

OpenVoice version: v2





Output saved to results/ref_a2.wav
ASR Result: cheer
 > Text split to sentences.
cheer


100%|██████████| 1/1 [00:00<00:00,  6.71it/s]

OpenVoice version: v2





Output saved to results/ref_a3.wav
ASR Result: he slowly takes a short walk in the open air each day
 > Text split to sentences.
he slowly takes a short walk in the open air each day


100%|██████████| 1/1 [00:00<00:00,  5.45it/s]


OpenVoice version: v2
Output saved to results/ref_a4.wav
ASR Result: meat
 > Text split to sentences.
meat


100%|██████████| 1/1 [00:00<00:00,  7.14it/s]

OpenVoice version: v2





Output saved to results/ref_a5.wav


# Korean ASR

In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperFeatureExtractor
from datasets import Audio, load_dataset
from pydub import AudioSegment
import librosa

In [2]:
device = 'cuda:0'
#model_name = "jiwon65/whisper-small_korean-zeroth"  # You can change this to other variants
#model_name = "openai/whisper-small"
model_name = "/home/sabina/SpeechConversion/train/whisper-small-voice-conversion-korean-20min"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
audio_path = '/home/sabina/korean_data/13_CUJ/ID-01-13-N-CUJ-02-01-F-36-kk_0.wav'
audio_input, sr = librosa.load(audio_path, sr=16000)

In [8]:
audio = AudioSegment.from_file(audio_path)
audio

In [9]:

input_features = feature_extractor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
input_features = input_features.to(device)
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

2024-08-20 02:28:00.359660: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['나는 바리를 입고 단추는 채운다.']

In [2]:
speed = 1
output_dir = 'results'
device = 'cuda:0'
tts = 'KR'

In [3]:
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
import sys
sys.path.append('MeloTTS')
from melo.api import TTS

In [4]:
model_name = "/home/sabina/SpeechConversion/train/whisper-small-voice-conversion-korean-10min"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

model.config.forced_decoder_ids = None

tone_color_converter = ToneColorConverter(f'checkpoint/config.json', device=device)
tone_color_converter.load_ckpt(f'checkpoint/converter.pth')
tts = TTS(language=tts, device=device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded checkpoint 'checkpoint/converter.pth'
missing/unexpected keys: [] []


In [5]:
def asr_infer_whisper_hf_ko(model, feature_extractor, processor, audio_path):
    audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
    input_features = feature_extractor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
    input_features = input_features.to(device)
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription



def tts_infer_openvoice(tts, tone_color_converter, text, reference, speed=1, output_dir='results', file_name='output', device='cuda:0'):
    os.makedirs(output_dir, exist_ok=True)
    src_path = f'{output_dir}/tmp_{file_name}.wav'
    speaker_ids = tts.hps.data.spk2id
    assert len(speaker_ids) == 1
    speaker_key = list(speaker_ids.keys())[0]
    speaker_id = speaker_ids[speaker_key]

    # produce speech
    tts.tts_to_file(text, speaker_id, src_path, speed=speed)
    if reference is None:
        return
    
    # convert speech
    speaker_key = speaker_key.lower().replace('_', '-')
    target_se, audio_name = se_extractor.get_se(reference, tone_color_converter, vad=False)
    source_se = torch.load(f'checkpoint/{speaker_key}.pth', map_location=device)
    
    save_path = f'{output_dir}/{file_name}.wav'

    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path, 
        src_se=source_se, 
        tgt_se=target_se, 
        output_path=save_path,
        message=encode_message)

In [8]:
for i in range(0, 5):
    audio_path = '/home/sabina/korean_data/13_CUJ/ID-01-13-N-CUJ-02-01-F-36-kk_{}.wav'.format(i)
    text = asr_infer_whisper_hf_ko(model, feature_extractor, processor, audio_path)[0]
    print("ASR Result:", text)

    file_name = 'korean20_{}'.format(i)
    tts_infer_openvoice(tts, tone_color_converter, text, audio_path, speed=speed, device=device, file_name=file_name)

    try:
        tts_infer_openvoice(tts, tone_color_converter, text, audio_path, speed=speed, device=device, file_name=file_name)
        print(f"Output saved to results/{file_name}.wav")
    except Exception as e:
        print(f"Failed to convert {file_name}: {e}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


ASR Result: 땅콩.땅콩.
 > Text split to sentences.
땅콩. 땅콩.


  0%|          | 0/1 [00:00<?, ?it/s]Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


OpenVoice version: v2
 > Text split to sentences.
땅콩. 땅콩.


100%|██████████| 1/1 [00:00<00:00,  3.18it/s]


OpenVoice version: v2
Output saved to results/korean20_0.wav
ASR Result: 책상위에가이금다.
 > Text split to sentences.
책상위에가이금다.


100%|██████████| 1/1 [00:00<00:00,  3.71it/s]


OpenVoice version: v2
 > Text split to sentences.
책상위에가이금다.


100%|██████████| 1/1 [00:00<00:00,  4.41it/s]


OpenVoice version: v2
Output saved to results/korean20_1.wav
ASR Result: 가방을사.가연필이머리 말려?
 > Text split to sentences.
가방을사. 가연필이머리 말려?


100%|██████████| 1/1 [00:00<00:00,  2.65it/s]


OpenVoice version: v2
 > Text split to sentences.
가방을사. 가연필이머리 말려?


100%|██████████| 1/1 [00:00<00:00,  2.41it/s]


OpenVoice version: v2
Output saved to results/korean20_2.wav
ASR Result: 아파자전화를 카
 > Text split to sentences.
아파자전화를 카


100%|██████████| 1/1 [00:00<00:00,  3.15it/s]


OpenVoice version: v2
 > Text split to sentences.
아파자전화를 카


100%|██████████| 1/1 [00:00<00:00,  3.45it/s]


OpenVoice version: v2
Output saved to results/korean20_3.wav
ASR Result: 전물원에 합니다. 합니다.
 > Text split to sentences.
전물원에 합니다. 합니다.


100%|██████████| 1/1 [00:00<00:00,  2.53it/s]


OpenVoice version: v2
 > Text split to sentences.
전물원에 합니다. 합니다.


100%|██████████| 1/1 [00:00<00:00,  2.91it/s]


OpenVoice version: v2
Output saved to results/korean20_4.wav


# New one

In [1]:
from openai import OpenAI
import evaluate
from dotenv import load_dotenv
import os
load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))



def generate_corrected_transcript(temperature, system_prompt, prompt):
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=temperature,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    )
    return response.choices[0].message.content

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
system_prompt ="You are a helpful assistant."

def create_prompt1(transcription):
    prompt = f"""Please correct the following transcription if it has a typo error. If it's already correct, return as it is. Return the final transcription without any comments.


Transcription: {transcription}
Corrected Transcription: """
    return prompt

def create_prompt2(transcription):
    prompt = f"""Please correct the following transcription if it has a typo error. If it's already correct, return as it is. Return the final transcription without any comments.

EXAMPLE 1:
Transcription: "저는 한구그어 배우고 있어요."
Corrected Transcription: "저는 한국어 배우고 있어요."
  
EXAMPLE 2:
Transcription: "오늘 날씨가 추울거예요."
Corrected Transcription: "오늘 날씨가 추울 거예요."

EXAMPLE 3:
Transcription: "어제 친구랑 맛있는 저녁을 먹었어요."
Corrected Transcription: "어제 친구랑 맛있는 저녁을 먹었어요."


Transcription: {transcription}
Corrected Transcription: """
    return prompt


def create_prompt3(transcription):
    prompt = f""""주어지는 전사에 오타가 있을 경우 수정해 주세요. 만약 이미 정확하다면 그대로 반환해 주세요. 최종 출력은 어떠한 설명 없이 반환해 주세요.

    
전사: {transcription}
수정된 전사: """
    return prompt

def create_prompt4(transcription):
    prompt = f"""주어지는 전사에 오타가 있을 경우 수정해 주세요. 만약 이미 정확하다면 그대로 반환해 주세요. 최종 출력은 어떠한 설명 없이 반환해 주세요.

예시 1:
전사: "저는 한구그어 배우고 있어요."
수정된 전사: "저는 한국어 배우고 있어요."

예시 2:
전사: "오늘 날씨가 추울거예요."
수정된 전사: "오늘 날씨가 추울 거예요."

예시 3:
전사: "어제 친구랑 맛있는 저녁을 먹었어요."
수정된 전사: "어제 친구랑 맛있는 저녁을 먹었어요."

전사: {transcription}
수정된 전사: """
    return prompt

# Evaluate the model
metric = evaluate.load("wer")
metric_cer = evaluate.load("cer")

file_name = 'VC-01-22-0.77-medium'


with open(f'eval/predictions_{file_name}.txt', 'r') as f:
    all_predictions = f.readlines()

with open('eval/references_original.txt', 'r') as f:
    all_references = f.readlines()


In [3]:

llm_predictions = []

for transcription in all_predictions:
    if transcription == "":
        llm_predictions.append(transcription)
        continue
    llm_prediction = generate_corrected_transcript(0, system_prompt, create_prompt1(transcription))
    llm_prediction = llm_prediction.split(':')[-1].strip()
    llm_predictions.append(llm_prediction)

with open(f'eval/llm_predictions_{file_name}.txt', 'w') as f:
    f.writelines("\n".join(llm_predictions))




llm_wer = 100 * metric.compute(predictions=https://teams.live.com/v2/#/meet/9552493582187?p=NR228qdWFmieEtPkUt&anon=true&v=.jlw&launchType=web&deeplinkId=d40316fb-364a-4732-92a1-7eb013a6bf97, references=all_references)
llm_cer = 100 * metric_cer.compute(predictions=llm_predictions, references=all_references)

print(f"LLM WER: {llm_wer:.2f}%")
print(f"LLM CER: {llm_cer:.2f}%")

LLM WER: 54.50%
LLM CER: 28.12%


In [4]:
with open('eval/llm.txt', 'w') as f:
    f.write(f)

FileNotFoundError: [Errno 2] No such file or directory: 'eval/llm.txt'

In [7]:
import difflib

def highlight_differences(prediction, reference):
    d = difflib.Differ()
    diff = list(d.compare(reference.split(), prediction.split()))
    
    highlighted = []
    for word in diff:
        if word.startswith('+ '):
            highlighted.append(f"\033[1m\033[91m{word[2:]}\033[0m")  # Bold and red for additions
        elif word.startswith('- '):
            continue  # Skip deletions as they're not in the prediction
        else:
            highlighted.append(word[2:])  # Unchanged words
    
    return ' '.join(highlighted)

def compare_files(predictions_file, references_file):
    with open(predictions_file, 'r') as f:
        predictions = f.readlines()
    
    with open(references_file, 'r') as f:
        references = f.readlines()
    
    if len(predictions) != len(references):
        print(f"Warning: Files have different number of lines. Predictions: {len(predictions)}, References: {len(references)}")
    
    for i, (pred, ref) in enumerate(zip(predictions, references), 1):
        pred = pred.strip()
        ref = ref.strip()
        
        if pred != ref:
            print(f"\nLine {i}:")
            print(f"Before LLM: {ref}")
            print(f"After LLM: {highlight_differences(pred, ref)}")

# Usage
file_name = 'VC-01-22-0.77-medium'
references_file = f'eval/predictions_{file_name}.txt'  # Replace with your actual file name
predictions_file = f'eval/llm_predictions_{file_name}.txt'

compare_files(predictions_file, references_file)


Line 1:
Before LLM: 엄마한테 엄마한테 갖고갔다 니네 엄마 한번 전화 해봐
After LLM: 엄마한테 엄마한테 [1m[91m갖고[0m [1m[91m갔다[0m 니네 엄마 한번 [1m[91m전화해[0m   +
 [1m[91m봐[0m

Line 2:
Before LLM: 난 구류자인가 구류자인가
After LLM: 난 [1m[91m굴욕자인가[0m [1m[91m굴욕자인가[0m

Line 7:
Before LLM: 열끼기 싫어서 그래
After LLM: [1m[91m열기기[0m 싫어서 그래

Line 8:
Before LLM: 윤희언니는 그런거 잘하더라 들여주는거
After LLM: [1m[91m윤희[0m --
 [1m[91m언니는[0m   -
 [1m[91m그런[0m [1m[91m거[0m 잘하더라     -
 [1m[91m들여주는[0m [1m[91m거[0m

Line 9:
Before LLM: 실급여봐도
After LLM:    --
 [1m[91m실급여[0m [1m[91m봐도[0m

Line 10:
Before LLM: 아무것도 안하거든 나 나 연락 안하잖아 사람들한테
After LLM: 아무것도 [1m[91m안[0m -
 [1m[91m하거든[0m 나 나 연락 [1m[91m안[0m -
 [1m[91m하잖아[0m 사람들한테

Line 12:
Before LLM: 급행2번째 맛있지
After LLM: [1m[91m급행[0m --
 [1m[91m2번째[0m 맛있지

Line 13:
Before LLM: 애들은 척 하는 거 되게 좋아해요.
After LLM: 애들은 [1m[91m척하는[0m +
 거 되게 좋아해요.

Line 15:
Before LLM: 응 그래서 내비 두려고 빨리 시간 갔으면 좋겠어 빨리 말까지
After LLM: 응 그래서 [1m[91m내버려[0m 두려고 빨리 시간 갔으면 좋겠어 빨리 말까지

Line 16:
Be

In [8]:
llm_predictions = []

for transcription in all_predictions:
    if transcription == "":
        llm_predictions.append(transcription)
        continue
    llm_prediction = generate_corrected_transcript(0, system_prompt, create_prompt2(transcription))
    llm_prediction = llm_prediction.split(':')[-1].strip()
    llm_predictions.append(llm_prediction)

with open(f'eval/llm_predictions_{file_name}.txt', 'w') as f:
    f.writelines("\n".join(llm_predictions))




llm_wer = 100 * metric.compute(predictions=llm_predictions, references=all_references)
llm_cer = 100 * metric_cer.compute(predictions=llm_predictions, references=all_references)

print(f"LLM WER: {llm_wer:.2f}%")
print(f"LLM CER: {llm_cer:.2f}%")

LLM WER: 55.64%
LLM CER: 28.45%


In [9]:

llm_predictions = []

for transcription in all_predictions:
    if transcription == "":
        llm_predictions.append(transcription)
        continue
    llm_prediction = generate_corrected_transcript(0, system_prompt, create_prompt3(transcription))
    llm_prediction = llm_prediction.split(':')[-1].strip()
    llm_predictions.append(llm_prediction)

with open(f'eval/llm_predictions_{file_name}.txt', 'w') as f:
    f.writelines("\n".join(llm_predictions))




llm_wer = 100 * metric.compute(predictions=llm_predictions, references=all_references)
llm_cer = 100 * metric_cer.compute(predictions=llm_predictions, references=all_references)

print(f"LLM WER: {llm_wer:.2f}%")
print(f"LLM CER: {llm_cer:.2f}%")

LLM WER: 56.63%
LLM CER: 29.16%


In [10]:

llm_predictions = []

for transcription in all_predictions:
    if transcription == "":
        llm_predictions.append(transcription)
        continue
    llm_prediction = generate_corrected_transcript(0, system_prompt, create_prompt4(transcription))
    llm_prediction = llm_prediction.split(':')[-1].strip()
    llm_predictions.append(llm_prediction)

with open(f'eval/llm_predictions_{file_name}.txt', 'w') as f:
    f.writelines("\n".join(llm_predictions))




llm_wer = 100 * metric.compute(predictions=llm_predictions, references=all_references)
llm_cer = 100 * metric_cer.compute(predictions=llm_predictions, references=all_references)

print(f"LLM WER: {llm_wer:.2f}%")
print(f"LLM CER: {llm_cer:.2f}%")

LLM WER: 55.95%
LLM CER: 28.56%


In [4]:
import difflib

def highlight_differences(prediction, reference):
    d = difflib.Differ()
    diff = list(d.compare(reference.split(), prediction.split()))
    
    highlighted = []
    for word in diff:
        if word.startswith('+ '):
            highlighted.append(f"\033[1m\033[91m{word[2:]}\033[0m")  # Bold and red for additions
        elif word.startswith('- '):
            continue  # Skip deletions as they're not in the prediction
        else:
            highlighted.append(word[2:])  # Unchanged words
    
    return ' '.join(highlighted)

def compare_files(predictions_file, references_file):
    with open(predictions_file, 'r') as f:
        predictions = f.readlines()
    
    with open(references_file, 'r') as f:
        references = f.readlines()
    
    if len(predictions) != len(references):
        print(f"Warning: Files have different number of lines. Predictions: {len(predictions)}, References: {len(references)}")
    
    for i, (pred, ref) in enumerate(zip(predictions, references), 1):
        pred = pred.strip()
        ref = ref.strip()
        
        if pred != ref:
            print(f"\nLine {i}:")
            print(f"Reference: {ref}")
            print(f"Prediction: {highlight_differences(pred, ref)}")

# Usage
file_name = 'VC-01-22-0.77-medium'
predictions_file = f'eval/predictions_{file_name}.txt'  # Replace with your actual file name
references_file = 'eval/references_original.txt'

compare_files(predictions_file, references_file)


Line 1:
Reference: 엄마한테 엄마도 갈 것 같다. 니네엄마 한번 전화해봐
Prediction: 엄마한테 [1m[91m엄마한테[0m [1m[91m갖고갔다[0m [1m[91m니네[0m   -
 [1m[91m엄마[0m 한번 [1m[91m전화[0m [1m[91m해봐[0m

Line 2:
Reference: 난 불효 불효자인가 불효자인가?
Prediction: 난 [1m[91m구류자인가[0m [1m[91m구류자인가[0m

Line 3:
Reference: 남이 강의하라고 하잖아. 되게 색다른 사람이야. 색다른 사람
Prediction: 남이 강의하라고    -
 [1m[91m하잖아[0m 되게 색다른     -
 [1m[91m사람이야[0m 색다른 사람

Line 4:
Reference: 그러게요. 똑바로 안 걷는다고
Prediction:     -
 [1m[91m그러게요[0m 똑바로 안 걷는다고

Line 5:
Reference: 엑셀도
Prediction: [1m[91mXS도[0m

Line 6:
Reference: 아니야 미국이 일자리가 없어졌대
Prediction: 아니야 미국이 [1m[91m이체기가[0m 없어졌대

Line 7:
Reference: 엮기기 싫어서 그래
Prediction: [1m[91m열끼기[0m 싫어서 그래

Line 8:
Reference: 윤희 언니는 그런거 잘 하더라 들어주는거
Prediction: [1m[91m윤희언니는[0m ++
 그런거 [1m[91m잘하더라[0m +
  ^
 [1m[91m들여주는거[0m  ^


Line 9:
Reference: 실업급여 받고
Prediction: [1m[91m실급여봐도[0m

Line 10:
Reference: 어 아무것도 안하거든 나. 나 연락 안 하잖아 잘 사람들한테
Prediction: 아무것도 안하거든 [1m[91m나[0m 나 연락 [1m[91m안하잖아[0m +
 사람들한테

## CBJ code

In [4]:
import pickle
import itertools

from soynlp.hangle import compose, decompose # 초성,중성, 종성 나눌 수 있음



def search_from_standard_kor_v1(sentence):
    
    # 어절 분리
    # whitespace 기준으로 진행
    
    # charator 1개씩 변환
    try:
        whitespace_split_list = sentence.split(' ')
        result = [is_phrase_in_standard_v1(x) for x in whitespace_split_list]
        
        return ' '.join(result)    
    except Exception as e:
        print(e)
        return sentence

def search_from_standard_kor_v2(sentence):
    
    # 어절 분리
    # whitespace 기준으로 진행
    
    # charator 1개씩 변환
    try:
        whitespace_split_list = sentence.split(' ')
        result = [is_phrase_in_standard_v2(x) for x in whitespace_split_list]
        
        return ' '.join(result)    
    except Exception as e:
        print(e)
        return sentence



def is_phrase_in_standard_v2(phrase): # ?
    # ph_origin = list(re.sub('[^가-힣]', '', phrase)) # kor filtering. Original
    ph_origin = list(phrase) # kor filtering. Original
    
    try:
        if ''.join(ph_origin) in kor_stand_set: # if phrase in kor standard dict, return origin
            return phrase  
        
        else:
            # change 1 character and check if transform pharase in kor dict
            # ph_origin = ph_list[:] # save origin
            for i in range(len(ph_origin)):
                candidate_character_list = candidate_from_character(ph_origin[i])
                for c in candidate_character_list:
                    ph_1ch_changed = ph_origin[:]
                    ph_1ch_changed[i] = c
                    
                    # print(''.join(ph_1ch_changed))
                    # if changed pharse is True, return it
                    if ''.join(ph_1ch_changed) in kor_stand_set:
                        return ''.join(ph_1ch_changed)
                    
            # change 2 character and check 
            if len(ph_origin) > 1:
                for a,b in itertools.combinations(range(len(ph_origin)), 2):
                    candidate_character_list_a = candidate_from_character(ph_origin[a])
                    candidate_character_list_b = candidate_from_character(ph_origin[b])
                    
                    for ca in candidate_character_list_a:
                        for cb in candidate_character_list_b:
                            ph_1ch_changed = ph_origin[:]
                            ph_1ch_changed[a] = ca
                            ph_1ch_changed[b] = cb
                            
                            # print(''.join(ph_1ch_changed))
                            if ''.join(ph_1ch_changed) in kor_stand_set:
                                return ''.join(ph_1ch_changed)
            
    except Exception as e:
        print(e)
        return phrase
                
    return phrase # if above code not working, return original 


def is_phrase_in_standard_v1(phrase): # ?
    # ph_origin = list(re.sub('[^가-힣]', '', phrase)) # kor filtering. Original
    ph_origin = list(phrase) # kor filtering. Original
    
    try:
        if ''.join(ph_origin) in kor_stand_set: # if phrase in kor standard dict, return origin
            return phrase  
        
        else:
            # change 1 character and check if transform pharase in kor dict
            # ph_origin = ph_list[:] # save origin
            for i in range(len(ph_origin)):
                candidate_character_list = candidate_from_character(ph_origin[i])
                for c in candidate_character_list:
                    ph_1ch_changed = ph_origin[:]
                    ph_1ch_changed[i] = c
                    
                    # print(''.join(ph_1ch_changed))
                    # if changed pharse is True, return it
                    if ''.join(ph_1ch_changed) in kor_stand_set:
                        return ''.join(ph_1ch_changed)
            
    except Exception as e:
        print(e)
        return phrase
                
    return phrase # if above code not working, return original

def candidate_from_character(character):
    # 한국어유사발음 참고
    # 중성, 종성만 먼저 해보자
    # 초성까지 하기에는 좀...
    candidate = []
    first, second, third = decompose(character) # 초성, 중성, 종성 분리
    
    # 초성, 중성, 종성은 실제 STT 데이터 확인 후 커스텀하는게 좋아보임
    chosung = {'ㄱ': ['ㄲ', 'ㅋ'], 
               'ㄴ': ['ㄴ'], 
               'ㄷ': ['ㄷ', 'ㄸ'], 
               'ㄹ': ['ㄹ'], 
               'ㅁ': ['ㅂ'],
               'ㅂ': ['ㅃ', 'ㅍ'], 
               'ㅅ': ['ㅆ', 'ㅊ'], 
               'ㅇ': ['ㅇ'], 
               'ㅈ': ['ㅉ', 'ㅊ', 'ㄴ'],
               'ㅊ': ['ㅈ', 'ㅅ'], 
               'ㅋ': ['ㄱ', 'ㄲ'], 
               'ㅌ': ['ㄷ', 'ㄸ'], 
               'ㅍ': ['ㅂ', 'ㅃ'], 
               'ㅎ': ['ㅎ']
               }
    
    # 종성(받침 변환. 이중받침 고려는 어려움)
    jungsung = {'ㅏ': ['ㅑ', 'ㅓ'], 
                'ㅑ': ['ㅏ', 'ㅕ'], 
                'ㅓ': ['ㅕ', 'ㅏ'], 
                'ㅕ': ['ㅓ', 'ㅑ'],
                'ㅗ': ['ㅛ'], 
                'ㅛ': ['ㅗ'], 
                'ㅜ': ['ㅠ'],
                'ㅠ': ['ㅜ'],
                'ㅡ': ['ㅢ'],
                'ㅣ': [],
                'ㅐ': ['ㅔ', 'ㅖ', 'ㅒ'], 
                'ㅒ': ['ㅖ', 'ㅔ', 'ㅐ'], 
                'ㅔ': ['ㅐ', 'ㅖ', 'ㅒ'], 
                'ㅖ': ['ㅔ', 'ㅐ', 'ㅒ'],
                'ㅚ': ['ㅙ'], 
                'ㅢ': ['ㅡ'], 
                'ㅟ': ['ㅟ', 'ㅜ', 'ㅣ']
                }
    
    jongsung = [
        ['ㅂ', 'ㅍ'], 
                ['ㄷ', 'ㅌ', 'ㅅ', 'ㅆ', 'ㅈ', 'ㅊ', 'ㅎ'], 
                ['ㄱ', 'ㄲ', 'ㅋ'], 
                ['ㅁ'], 
                ['ㄴ'],
                ['ㅇ'], 
                ['ㄹ'] 
                ]
    
    # 초성 처리
    candidate_chosung = chosung[first] + [first]

    # 중성 처리
    candidate_jungsung = jungsung[second] + [second] # 중성 후보군
    
    
    # 종성 처리
    candidate_jongsung = None
    for j in jongsung:
        if third in j:
            candidate_jongsung = j
            break
    
    # 종성 후보군이 없으면(이중자음) 그대로 사용
    if candidate_jongsung is None:
        candidate_jongsung = [third]
    else:
        # 종성 없는것도 추가
        candidate_jongsung += [' ']
    
    
    # print(candidate_chosung)
    # 초성, 중성, 종성을 조합하여 후보음절 생성
    for c in candidate_chosung:
        for s in candidate_jungsung:
            for t in candidate_jongsung:
                candidate.append(compose(c, s, t))
    
    return candidate
    
    
    


In [7]:
len(all_predictions)

287

In [8]:

with open('kor_standard_words_list.pickle', 'rb') as f:
    kor_stand_set = pickle.load(f)
    

file_name = 'VC-01-22-0.77-medium'


with open(f'eval/predictions_{file_name}.txt', 'r') as f:
    all_predictions = f.readlines()

with open('eval/references_original.txt', 'r') as f:
    all_references = f.readlines()




for s in all_predictions:
    print(search_from_standard_kor_v1(s))
    print('--------')
    print(search_from_standard_kor_v2(s))

'ㅘ'
엄마한테 엄마한테 갖고갔다 니네 엄마 한번 전화 해봐

--------
'ㅘ'
엄마한테 엄마한테 갖고갔다 니네 엄마 한번 전화 해봐

cannot unpack non-iterable NoneType object
난 구류자인가 구류자인가 

--------
cannot unpack non-iterable NoneType object
난 구류자인가 구류자인가 

cannot unpack non-iterable NoneType object
남이 강의하라고 하잖아 되게 색다른 사람이야 색다른 사람

--------
cannot unpack non-iterable NoneType object
남이 강의하라고 하잖아 되게 색다른 사람이야 색다른 사람

cannot unpack non-iterable NoneType object
그러게요 똑바로 안 걷는다고

--------
cannot unpack non-iterable NoneType object
그러게요 똑바로 안 걷는다고

cannot unpack non-iterable NoneType object
XS도

--------
cannot unpack non-iterable NoneType object
XS도

cannot unpack non-iterable NoneType object
아니야 미국이 이체기가 없어졌대

--------
cannot unpack non-iterable NoneType object
아니야 미국이 이체기가 없어졌대

'ㄲ'
cannot unpack non-iterable NoneType object
열끼기 싫어서 그래

--------
'ㄲ'
cannot unpack non-iterable NoneType object
열끼기 싫어서 그래

cannot unpack non-iterable NoneType object
윤희언니는 그런거 잘하더라 들여주는거

--------
cannot unpack non-iterable NoneType object
윤희언니는 그런거 잘하더라 들여주는거



In [None]:

with open('kor_standard_words_list.pickle', 'rb') as f:
    kor_stand_set = pickle.load(f)
    

file_name = 'VC-01-22-0.77-medium'


with open(f'eval/predictions_{file_name}.txt', 'r') as f:
    all_predictions = f.readlines()

with open('eval/references_original.txt', 'r') as f:
    all_references = f.readlines()




for s in all_predictions:
    print(search_from_standard_kor_v1(s))
    print('--------')
    print(search_from_standard_kor_v2(s))

'ㅘ'
엄마한테 엄마한테 갖고갔다 니네 엄마 한번 전화 해봐

--------
'ㅘ'
엄마한테 엄마한테 갖고갔다 니네 엄마 한번 전화 해봐

cannot unpack non-iterable NoneType object
난 구류자인가 구류자인가 

--------
cannot unpack non-iterable NoneType object
난 구류자인가 구류자인가 

cannot unpack non-iterable NoneType object
남이 강의하라고 하잖아 되게 색다른 사람이야 색다른 사람

--------
cannot unpack non-iterable NoneType object
남이 강의하라고 하잖아 되게 색다른 사람이야 색다른 사람

cannot unpack non-iterable NoneType object
그러게요 똑바로 안 걷는다고

--------
cannot unpack non-iterable NoneType object
그러게요 똑바로 안 걷는다고

cannot unpack non-iterable NoneType object
XS도

--------
cannot unpack non-iterable NoneType object
XS도

cannot unpack non-iterable NoneType object
아니야 미국이 이체기가 없어졌대

--------
cannot unpack non-iterable NoneType object
아니야 미국이 이체기가 없어졌대

'ㄲ'
cannot unpack non-iterable NoneType object
열끼기 싫어서 그래

--------
'ㄲ'
cannot unpack non-iterable NoneType object
열끼기 싫어서 그래

cannot unpack non-iterable NoneType object
윤희언니는 그런거 잘하더라 들여주는거

--------
cannot unpack non-iterable NoneType object
윤희언니는 그런거 잘하더라 들여주는거



## Inference

In [22]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" 
import torch
from faster_whisper import WhisperModel

# fix seed
#torch.manual_seed(0)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False


class STT(WhisperModel):
    def __init__(self, model_name, temperature, best_of, beam_size, verbose=False):
        '''
        Args:
            model_name (str): The name of the model to use for speech-to-text.
            verbose (bool): Whether to print the transcribed text and other information

        Possible model names:
            "neoALI/whisper-medium-quanted-handicapped"
        '''
        super().__init__(model_name)
        self.verbose = verbose
        self.temperature = temperature
        self.best_of = best_of
        self.beam_size = beam_size
    
    def transcribe(self, audio_path, initial_prompt=None, condition_on_previous_text=True):
        segments, info = super().transcribe(audio_path, temperature=self.temperature, best_of=self.best_of, 
                                            beam_size=self.beam_size, initial_prompt=initial_prompt,
                                            condition_on_previous_text=condition_on_previous_text)
        texts = [segment.text for segment in segments]
        if self.verbose:
            print(info)
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

        transcription = ' '.join(texts)
        return transcription
        


In [29]:
!ct2-transformers-converter --model ghost613/VC-01-22-4.30-turbo --output_dir whisper-turbo-faster-imijeong --copy_files tokenizer_config.json preprocessor_config.json --quantization float16 --force

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
model_name = "neoALI/whisper-medium-quanted-handicapped"

model = STT(model_name, temperature=0.0, best_of=5, beam_size=5, verbose=True)

In [25]:
from time import time

#audio_path = "test.wav"
#gt_text = "난 불효자인가. 불효자인가?"
#audio_path = "/home/sabina/speech_handicap_dataset/imijeong/divided_audio/240810_2139_part383.wav"
#gt_text = "머리카락 때문에 금메달을 못 땄대"
#audio_path = "/home/sabina/speech_handicap_dataset/imijeong/divided_audio/240812_1942_part100.wav"
#gt_text = "안됐나봐 공부할려고 했는데"
audio_path = "/home/sabina/speech_handicap_dataset/imijeong/divided_audio/240811_2034_part305.wav"
gt_text = "왜 이렇게 많아 12월까지 써야겠네"

In [26]:
#prompt = '난 불효자인가. 불효자인가.'
transcription = model.transcribe(audio_path, condition_on_previous_text=True)
transcription

TranscriptionInfo(language='ko', language_probability=1.0, duration=5.7050625, duration_after_vad=5.7050625, all_language_probs=[('ko', 1.0), ('ja', 1.0967254638671875e-05), ('nn', 8.940696716308594e-06), ('en', 4.947185516357422e-06), ('jw', 4.76837158203125e-06), ('zh', 2.384185791015625e-06), ('uk', 1.1920928955078125e-06), ('ms', 8.344650268554688e-07), ('vi', 6.556510925292969e-07), ('haw', 5.960464477539062e-07), ('ru', 5.364418029785156e-07), ('id', 4.76837158203125e-07), ('tl', 4.76837158203125e-07), ('tr', 4.76837158203125e-07), ('pl', 3.5762786865234375e-07), ('ar', 3.5762786865234375e-07), ('th', 2.980232238769531e-07), ('pt', 2.980232238769531e-07), ('mi', 2.980232238769531e-07), ('de', 2.384185791015625e-07), ('ta', 2.384185791015625e-07), ('fr', 2.384185791015625e-07), ('la', 2.384185791015625e-07), ('it', 1.7881393432617188e-07), ('el', 1.1920928955078125e-07), ('ur', 1.1920928955078125e-07), ('km', 1.1920928955078125e-07), ('nl', 1.1920928955078125e-07), ('cy', 5.960464

'잠깐만 시비할 때까지 써야겠네'

In [28]:
model.temperature = 0.2
for beam_size in [1, 3, 5, 7, 10, 15]:
    model.beam_size = beam_size
    print('beam_size=', beam_size)
    # Print headers with formatting
    print(f"{'Best of':<10} {'GT Text':<30} {'Transcription':<30} {'Time Taken (s)':<10}")
    print('-' * 95)

    # Loop through different beam sizes and print the results in a formatted table
    for best_of in [1, 3, 5, 7, 10, 15]:
        model.best_of = best_of
        start = time()
        transcription = model.transcribe(audio_path)
        elapsed_time = time() - start
        print(f"{best_of:<10} {gt_text:<30} {transcription:<30} {elapsed_time:<10.2f}")


beam_size= 1
Best of    GT Text                        Transcription                  Time Taken (s)
-----------------------------------------------------------------------------------------------
1          왜 이렇게 많아 12월까지 써야겠네            응 왜이렇게 많지? 피할 때까지 써야겠네         0.63      
3          왜 이렇게 많아 12월까지 써야겠네            응 왜이렇게 강아지 시계였는데 시계였다가 짓어야겠네   0.69      
5          왜 이렇게 많아 12월까지 써야겠네            응 왜이렇게 강아지 시계였는데 시계였다가 짓어야겠네   0.68      
7          왜 이렇게 많아 12월까지 써야겠네            응 왜이렇게 강아지 시비할 때 가즈아겠네         0.68      
10         왜 이렇게 많아 12월까지 써야겠네            응 왜이렇게 많지? 피할 때까지 써야겠네         0.83      
15         왜 이렇게 많아 12월까지 써야겠네            응 왜이렇게 많지? 피할 때까지 써야겠네         0.93      
beam_size= 3
Best of    GT Text                        Transcription                  Time Taken (s)
-----------------------------------------------------------------------------------------------
1          왜 이렇게 많아 12월까지 써야겠네            응 왜이렇게 동안 수급을 했다가 짓어야겠네.       0.70      
3          왜 이렇게 많

In [3]:

audio_path = "test.wav"
gt_text = "난 불효자인가. 불효자인가?"
#audio_path = "/home/sabina/speech_handicap_dataset/imijeong/divided_audio/240810_2139_part383.wav"
#gt_text = "머리카락 때문에 금메달을 못 땄대"
#audio_path = "/home/sabina/speech_handicap_dataset/imijeong/divided_audio/240812_1942_part100.wav"
#gt_text = "안됐나봐 공부할려고 했는데"

In [6]:
model.best_of = 5
for temperature in [0, 0.2, 0.4]:
    model.temperature = temperature
    for beam_size in [1, 3, 5, 7, 10, 15]:
        model.beam_size = beam_size
        print('temperature=', temperature, 'beam_size=', beam_size)
        for i in range(3):
            print(f"Transcription {i+1}: {model.transcribe(audio_path)}")
    print('-' * 15)

temperature= 0 beam_size= 1
Transcription 1: 아 난 구류장 가 구류장 가
Transcription 2: 아 난 구류장 가 구류장 가
Transcription 3: 아 난 구류장 가 구류장 가
temperature= 0 beam_size= 3
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
temperature= 0 beam_size= 5
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
temperature= 0 beam_size= 7
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
temperature= 0 beam_size= 10
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
temperature= 0 beam_size= 15
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
---------------
temperature= 0.2 beam_size= 1
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
temperature= 0.2 beam_size= 3
Transcription 1: 아 난 구류장 가 구류장 가
Transcription 2: 아 난 구류장 가 구

In [24]:
model.temperatire = 0.2
for beam_size in [1, 3, 5, 7, 10, 15]:
    model.beam_size = beam_size
    for best_of in [1, 3, 5, 7, 10, 15]:
        model.best_of = best_of
        print('beam_size=', beam_size, 'best_of=', best_of)
        for i in range(3):
            print(f"Transcription {i+1}: {model.transcribe(audio_path)}")
    print('-' * 15)

beam_size= 1 best_of= 1
Transcription 1: 아 난 그루장과 그루장과
Transcription 2: 아 난 구려자인가 구려자인가
Transcription 3: 아 난 불효자인가 불효자인가
beam_size= 1 best_of= 3
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 구류장 가 구류장 가
Transcription 3: 아 난 구류장가 구류장가
beam_size= 1 best_of= 5
Transcription 1: 아 난 구료장가 구료장가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
beam_size= 1 best_of= 7
Transcription 1: 아 난 그료자인가 그료자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
beam_size= 1 best_of= 10
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 불효자인가 불효자인가
Transcription 3: 아 난 불효자인가 불효자인가
beam_size= 1 best_of= 15
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 구류장 가 구류장 가
Transcription 3: 아 난 불효자인가 불효자인가
---------------
beam_size= 3 best_of= 1
Transcription 1: 아 난 필요자인가 필요자인가
Transcription 2: 아 난 구류자인가 구류자인가
Transcription 3: 아 난 구류장가를 구류장가
beam_size= 3 best_of= 3
Transcription 1: 아 난 불효자인가 불효자인가
Transcription 2: 아 난 구류장가 구류장가
Transcription 3: 아 난 그료자인가 그료자인가
beam_si

In [12]:
from typing import Tuple, List, Iterable
import ctranslate2
from faster_whisper.tokenizer import Tokenizer
import zlib
from typing import Optional, List, Union, NamedTuple


class TranscriptionOptions(NamedTuple):
    beam_size: int
    best_of: int
    patience: float
    length_penalty: float
    log_prob_threshold: Optional[float]
    no_speech_threshold: Optional[float]
    compression_ratio_threshold: Optional[float]
    condition_on_previous_text: bool
    temperatures: List[float]
    initial_prompt: Optional[Union[str, Iterable[int]]]
    prefix: Optional[str]
    suppress_blank: bool
    suppress_tokens: Optional[List[int]]
    without_timestamps: bool
    max_initial_timestamp: float
    word_timestamps: bool
    prepend_punctuations: str
    append_punctuations: str


def get_compression_ratio(text: str) -> float:
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))


class STTTopK(WhisperModel):
    def __init__(self, model_name, temperature, best_of, beam_size, top_k=1, verbose=False):
        super().__init__(model_name)
        self.verbose = verbose
        self.temperature = temperature
        self.best_of = best_of
        self.beam_size = beam_size
        self.top_k = top_k

    def transcribe(self, audio_path):
        segments, info = super().transcribe(audio_path, temperature=self.temperature, best_of=self.best_of, beam_size=self.beam_size)
        texts = [segment.text for segment in segments]
        if self.verbose:
            print(info)
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

        transcription = ' '.join(texts)
        return transcription
    


    def generate_with_fallback(
        self,
        encoder_output: ctranslate2.StorageView,
        prompt: List[int],
        tokenizer: Tokenizer,
        options: TranscriptionOptions,
    ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]:
        decode_result = None
        all_results = []
        below_cr_threshold_results = []

        max_initial_timestamp_index = int(
            round(options.max_initial_timestamp / self.time_precision)
        )

        for temperature in options.temperatures:
            if temperature > 0:
                kwargs = {
                    "beam_size": 1,
                    "num_hypotheses": options.best_of,
                    "sampling_topk": 0,
                    "sampling_temperature": temperature,
                }
            else:
                kwargs = {
                    "beam_size": options.beam_size,
                    "patience": options.patience,
                }

            result = self.model.generate(
                encoder_output,
                [prompt],
                length_penalty=options.length_penalty,
                max_length=self.max_length,
                return_scores=True,
                return_no_speech_prob=True,
                suppress_blank=options.suppress_blank,
                suppress_tokens=options.suppress_tokens,
                max_initial_timestamp_index=max_initial_timestamp_index,
                **kwargs,
            )
            print(result)
            result = result[0]

            tokens = result.sequences_ids[0]

            # Recover the average log prob from the returned score.
            seq_len = len(tokens)
            cum_logprob = result.scores[0] * (seq_len**options.length_penalty)
            avg_logprob = cum_logprob / (seq_len + 1)

            text = tokenizer.decode(tokens).strip()
            compression_ratio = get_compression_ratio(text)

            decode_result = (
                result,
                avg_logprob,
                temperature,
                compression_ratio,
            )
            all_results.append(decode_result)

            needs_fallback = False

            if options.compression_ratio_threshold is not None:
                if compression_ratio > options.compression_ratio_threshold:
                    needs_fallback = True  # too repetitive

                    self.logger.debug(
                        "Compression ratio threshold is not met with temperature %.1f (%f > %f)",
                        temperature,
                        compression_ratio,
                        options.compression_ratio_threshold,
                    )
                else:
                    below_cr_threshold_results.append(decode_result)

            if (
                options.log_prob_threshold is not None
                and avg_logprob < options.log_prob_threshold
            ):
                needs_fallback = True  # average log probability is too low

                self.logger.debug(
                    "Log probability threshold is not met with temperature %.1f (%f < %f)",
                    temperature,
                    avg_logprob,
                    options.log_prob_threshold,
                )

            if (
                options.no_speech_threshold is not None
                and result.no_speech_prob > options.no_speech_threshold
            ):
                needs_fallback = False  # silence

            if not needs_fallback:
                break
        else:
            # all failed, select the result with the highest average log probability
            decode_result = max(
                below_cr_threshold_results or all_results, key=lambda x: x[1]
            )

        return decode_result



In [16]:
model_name = "neoALI/whisper-medium-quanted-handicapped"

audio_path = "test.wav"
gt_text = "난 불효자인가. 불효자인가?"

model = STTTopK(model_name, verbose=False, temperature=0.2, best_of=5, beam_size=5, top_k=2)

In [17]:
model.transcribe(audio_path)

[WhisperGenerationResult(sequences=[['<|0.00|>', 'ìķĦ', 'ĠëĤľ', 'Ġë¶Ī', 'íļ', '¨', 'ìŀĲ', 'ìĿ¸ê°Ģ', 'Ġë¶Ī', 'íļ', '¨', 'ìŀĲ', 'ìĿ¸ê°Ģ', '<|7.50|>'], ['<|0.00|>', 'ìķĦ', 'ĠëĤľ', 'Ġë¶Ī', 'íļ', '¨', 'ìŀĲ', 'ìĿ¸ê°Ģ', 'Ġë¶Ī', 'íļ', '¨', 'ìŀĲ', 'ìĿ¸ê°Ģ', '<|7.50|>'], ['<|0.00|>', 'ìķĦ', 'ĠëĤľ', 'Ġêµ¬ë', '¥ĺ', 'ìŀ¥', 'Ġê°Ģ', 'Ġêµ¬ë', '¥ĺ', 'ìŀ¥', 'Ġê°Ģ', '<|7.50|>'], ['<|0.00|>', 'ìķĦ', 'ĠëĤľ', 'Ġêµ¬ë', '¥ĺ', 'ìŀ¥', 'Ġê°Ģ', 'Ġêµ¬ë', '¥ĺ', 'ìŀ¥', 'Ġê°Ģ', '<|7.00|>'], ['<|0.00|>', 'ìķĦ', 'ĠëĤľ', 'Ġêµ¬ë', '¥ĺ', 'ìŀ¥', 'ê°Ģ', 'Ġêµ¬ë', '¥ĺ', 'ìŀ¥', 'ê°Ģ', '<|7.00|>']], sequences_ids=[[50364, 2230, 19252, 16285, 11193, 101, 4264, 41755, 16285, 11193, 101, 4264, 41755, 50739], [50364, 2230, 19252, 16285, 11193, 101, 4264, 41755, 16285, 11193, 101, 4264, 41755, 50739], [50364, 2230, 19252, 17386, 46014, 4573, 4147, 17386, 46014, 4573, 4147, 50739], [50364, 2230, 19252, 17386, 46014, 4573, 4147, 17386, 46014, 4573, 4147, 50714], [50364, 2230, 19252, 17386, 46014, 4573, 1453, 17386, 46014, 4573, 1453, 

'아 난 불효자인가 불효자인가'

In [33]:
from huggingface_hub import HfApi

api = HfApi()



api.upload_folder(
    folder_path="whisper-turbo-faster-imijeong",
    path_in_repo=".",
    repo_id="neoALI/whisper-turbo-faster-imijeong",
    repo_type="model",
    #commit_message="grebanniy tokenizer i bil problemoy. Nujno skopirovat ego s originalnogo"
)

CommitInfo(commit_url='https://huggingface.co/neoALI/whisper-turbo-faster-imijeong/commit/f74769b51c5649006807b60dc23694319d4b2917', commit_message='Upload folder using huggingface_hub', commit_description='', oid='f74769b51c5649006807b60dc23694319d4b2917', pr_url=None, pr_revision=None, pr_num=None)