Тестирование проводится на NVIDIA GeForce RTX 4070 Laptop GPU 8GB

<a name='1-1'></a>
## Тестирование Whisper-Base
Размер 290 MB

2050 примеров:

856.9 секунд

0.4 секунды на 1 пример

WER 49.35%

In [9]:
import json
import os
import numpy as np
import torch
from ruamel.yaml import YAML
import torchaudio
import time
from omegaconf import DictConfig, OmegaConf, open_dict
import whisper
import nemo.collections.asr as nemo_asr

  from .autonotebook import tqdm as notebook_tqdm
[NeMo W 2024-06-12 12:13:31 transformer_bpe_models:59] Could not import NeMo NLP collection which is required for speech translation model.


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Загрузите модель и процессор
model = whisper.load_model("base")

# Перенесите модель на GPU, если CUDA доступен
model.to(device)

print("Device:", device)

# Дополнительная информация о GPU, если доступно
if device.type == 'cuda':
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("CUDA Version:", torch.version.cuda)

Device: cuda
GPU Name: NVIDIA GeForce RTX 4070 Laptop GPU
CUDA Version: 11.8


In [4]:
manifest_paths = [
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset\manifestValidationRuDevices.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val\manifestValidationSTTcalls.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val\manifestValidationSTTaudiobooks.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val\manifestValidationSTTyoutube.jsonl"
]

base_dirs = [
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val",
]

In [5]:
# необходимо выполнить для соотвествия датасету

import re

def clean_string(input_str):
    # Привести строку к нижнему регистру
    lower_str = input_str.lower()
    
    # Удалить все знаки препинания
    no_punctuation_str = re.sub(r'[^\w\s]', '', lower_str)
    
    # Удалить пробелы в начале и в конце строки
    cleaned_str = no_punctuation_str.strip()
    
    return cleaned_str

In [6]:
num_of_audio_all = 0

hypotheses = []
y_target = []

total_time = 0
start_time = time.time()

for manifest_path, base_dir in zip(manifest_paths, base_dirs):
    print(manifest_path)
    with open(manifest_path, 'r', encoding='utf-8') as f:
        manifest_data = [json.loads(line) for line in f]
    
    num_of_audio = len(manifest_data)
    num_of_audio_all += num_of_audio

    for audio_index in range(num_of_audio):
        current_audio_entry = manifest_data[audio_index]
        y_target.append(current_audio_entry.get('text', ''))

        current_audio_filename = current_audio_entry.get('audio_filepath', '')
        file_path = os.path.join(base_dir, current_audio_filename)
        #print(file_path)

        current_hypotheses = model.transcribe(file_path)["text"]
        hypotheses.append(clean_string(current_hypotheses))

total_time = time.time() - start_time

C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset\manifestValidationRuDevices.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val\manifestValidationSTTcalls.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val\manifestValidationSTTaudiobooks.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val\manifestValidationSTTyoutube.jsonl


In [10]:
print(f"Число аудио: {num_of_audio_all}")
print(f"Общее время: {total_time:.2f} секунд")
print(f"Среднее время на одно аудио: {total_time/num_of_audio_all:.2f} секунд")

Число аудио: 2050
Общее время: 856.91 секунд
Среднее время на одно аудио: 0.42 секунд


In [11]:
# from jiwer import wer не подходит, т.к. он не работает, если в гипотезе есть пустые ("") аудио
print(f"Word Error Rate (WER): {nemo_asr.metrics.wer.word_error_rate(hypotheses, y_target) * 100:.2f}%")

Word Error Rate (WER): 49.35%


<a name='1-2'></a>
## Тестирование Quartznet15x5 Nvidia
Размер 70 MB

2050 примеров:

62.4 секунды

0.03 секунды на 1 пример

WER 72.12%

In [1]:
import json
import os
import numpy as np
import torch
from ruamel.yaml import YAML
import torchaudio
import time
from omegaconf import DictConfig, OmegaConf, open_dict
import nemo
import nemo.collections.asr as nemo_asr

  from .autonotebook import tqdm as notebook_tqdm
[NeMo W 2024-06-12 16:05:27 transformer_bpe_models:59] Could not import NeMo NLP collection which is required for speech translation model.


In [25]:
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="stt_ru_quartznet15x5", map_location='cuda')

[NeMo I 2024-06-12 16:50:57 cloud:58] Found existing object C:\Users\vdovichev\.cache\torch\NeMo\NeMo_1.21.0\stt_ru_quartznet15x5\92506570b7206ea395e295b3fbbf07e3\stt_ru_quartznet15x5.nemo.
[NeMo I 2024-06-12 16:50:57 cloud:64] Re-using file from: C:\Users\vdovichev\.cache\torch\NeMo\NeMo_1.21.0\stt_ru_quartznet15x5\92506570b7206ea395e295b3fbbf07e3\stt_ru_quartznet15x5.nemo
[NeMo I 2024-06-12 16:50:57 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-06-12 16:50:57 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /raid/noneval.json
    sample_rate: 16000
    labels:
    - ' '
    - а
    - б
    - в
    - г
    - д
    - е
    - ё
    - ж
    - з
    - и
    - й
    - к
    - л
    - м
    - н
    - о
    - п
    - р
    - с
    - т
    - у
    - ф
    - х
    - ц
    - ч
    - ш
    - щ
    - ъ
    - ы
    - ь
    - э
    - ю
    - я
    batch_size: 16
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    num_workers: 8
    pin_memory: true
    
[NeMo W 2024-06-12 16:50:57 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation da

[NeMo I 2024-06-12 16:50:57 features:289] PADDING: 16
[NeMo I 2024-06-12 16:50:58 save_restore_connector:249] Model EncDecCTCModel was successfully restored from C:\Users\vdovichev\.cache\torch\NeMo\NeMo_1.21.0\stt_ru_quartznet15x5\92506570b7206ea395e295b3fbbf07e3\stt_ru_quartznet15x5.nemo.


In [26]:
# Получение конфигурации модели
config_model = asr_model.cfg

# Сохранение конфигурации в YAML-файл на Google Диск
config_yaml = OmegaConf.to_yaml(config_model)
with open(r'C:\Users\vdovichev\Documents\VKR\models_and_configs\QuartzNet15x5_Nvidia_config.yaml', 'w') as yaml_file:
    yaml_file.write(config_yaml)

In [27]:
yaml = YAML(typ='safe')
with open(r'C:\Users\vdovichev\Documents\VKR\models_and_configs\QuartzNet15x5_Nvidia_config.yaml') as f:
        config_model = yaml.load(f)
preprocessor = nemo_asr.models.EncDecCTCModel.from_config_dict(config_model['preprocessor'])
decoder = nemo_asr.metrics.wer.CTCDecoding(config_model['decoding'], vocabulary=config_model['decoder']['vocabulary'])

[NeMo I 2024-06-12 16:51:10 features:289] PADDING: 16


In [10]:
print(asr_model.device) # проверим что cuda подключилась

cuda:0


In [11]:
manifest_paths = [
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset\manifestValidationRuDevices.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val\manifestValidationSTTcalls.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val\manifestValidationSTTaudiobooks.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val\manifestValidationSTTyoutube.jsonl"
]

base_dirs = [
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val",
]

In [12]:
asr_model.eval()
asr_model.encoder.freeze()
asr_model.decoder.freeze()

num_of_audio_all = 0

hypotheses = []
y_target = []

total_time = 0
start_time = time.time() 

for manifest_path, base_dir in zip(manifest_paths, base_dirs):
    print(manifest_path)
    with open(manifest_path, 'r', encoding='utf-8') as f:
        manifest_data = [json.loads(line) for line in f]
    
    num_of_audio = len(manifest_data)
    num_of_audio_all += num_of_audio

    for audio_index in range(num_of_audio):
        current_audio_entry = manifest_data[audio_index]
        y_target.append(current_audio_entry.get('text', ''))

        current_audio_filename = current_audio_entry.get('audio_filepath', '')
        file_path = os.path.join(base_dir, current_audio_filename)
        #print(file_path)

        waveform, sample_rate = torchaudio.load(file_path)
        processed_signal, processed_signal_len = preprocessor(input_signal=waveform, length=torch.tensor([len(waveform[0])]))

        processed_signal = processed_signal.to('cuda')
        processed_signal_len = processed_signal_len.to('cuda')

        encoder_output = asr_model.encoder(audio_signal=processed_signal, length=processed_signal_len)
        logits = asr_model.decoder(encoder_output=encoder_output[0])
        current_hypotheses, _ = decoder.ctc_decoder_predictions_tensor(logits, decoder_lengths=processed_signal_len)
        hypotheses.extend(current_hypotheses)

total_time = time.time() - start_time

C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset\manifestValidationRuDevices.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val\manifestValidationSTTcalls.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val\manifestValidationSTTaudiobooks.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val\manifestValidationSTTyoutube.jsonl


In [13]:
print(f"Число аудио: {num_of_audio_all}")
print(f"Общее время: {total_time:.2f} секунд")
print(f"Среднее время на одно аудио: {total_time/num_of_audio_all:.2f} секунд")

Число аудио: 2050
Общее время: 62.39 секунд
Среднее время на одно аудио: 0.03 секунд


In [14]:
# from jiwer import wer не подходит, т.к. он не работает, если в гипотезе есть пустые ("") аудио
print(f"Word Error Rate (WER): {nemo_asr.metrics.wer.word_error_rate(hypotheses, y_target) * 100:.2f}%")

Word Error Rate (WER): 72.12%


<a name='1-3'></a>
## Тестирование Quartznet15x5 Golos
Размер 70 MB

2050 примеров:

61.5 секунды

0.03 секунды на 1 пример

WER 50.47%

In [16]:
import nemo.collections.asr as nemo_asr
from nemo.core.classes import ModelPT
import json
import os
import numpy as np
import torch
from ruamel.yaml import YAML
import torchaudio
import time
from omegaconf import DictConfig, OmegaConf, open_dict

In [17]:
# загружаем модель NeMo
asr_model = ModelPT.restore_from(r"C:\Users\vdovichev\Documents\VKR\models_and_configs\QuartzNet15x5_golos.nemo", map_location = 'cuda')

[NeMo W 2024-06-12 16:09:16 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: train/golos_and_mcv.jsonl
    sample_rate: 16000
    labels:
    - ' '
    - а
    - б
    - в
    - г
    - д
    - е
    - ж
    - з
    - и
    - й
    - к
    - л
    - м
    - н
    - о
    - п
    - р
    - с
    - т
    - у
    - ф
    - х
    - ц
    - ч
    - ш
    - щ
    - ъ
    - ы
    - ь
    - э
    - ю
    - я
    batch_size: 64
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    num_workers: 20
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    parser: ru
    
[NeMo W 2024-06-12 16:09:16 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a

[NeMo I 2024-06-12 16:09:16 features:289] PADDING: 16
[NeMo I 2024-06-12 16:09:16 save_restore_connector:249] Model EncDecCTCModel was successfully restored from C:\Users\vdovichev\Documents\VKR\models_and_configs\QuartzNet15x5_golos.nemo.


In [18]:
print(asr_model.device) # проверим что cuda подключилась

cuda:0


In [19]:
# Получение конфигурации модели
config_model = asr_model.cfg

# Сохранение конфигурации в YAML-файл на Google Диск
config_yaml = OmegaConf.to_yaml(config_model)
with open(r'C:\Users\vdovichev\Documents\VKR\models_and_configs\QuartzNet15x5_golos_config.yaml', 'w') as yaml_file:
    yaml_file.write(config_yaml)

In [20]:
yaml = YAML(typ='safe')
with open(r'C:\Users\vdovichev\Documents\VKR\models_and_configs\QuartzNet15x5_golos_config.yaml') as f:
        config_model = yaml.load(f)
preprocessor = nemo_asr.models.EncDecCTCModel.from_config_dict(config_model['preprocessor'])
decoder = nemo_asr.metrics.wer.CTCDecoding(config_model['decoding'], vocabulary=config_model['decoder']['vocabulary'])

[NeMo I 2024-06-12 16:09:29 features:289] PADDING: 16


In [21]:
manifest_paths = [
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset\manifestValidationRuDevices.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val\manifestValidationSTTcalls.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val\manifestValidationSTTaudiobooks.jsonl",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val\manifestValidationSTTyoutube.jsonl"
]

base_dirs = [
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val",
    r"C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val",
]

In [22]:
asr_model.eval()
asr_model.encoder.freeze()
asr_model.decoder.freeze()

num_of_audio_all = 0

hypotheses = []
y_target = []

total_time = 0
start_time = time.time()

for manifest_path, base_dir in zip(manifest_paths, base_dirs):
    print(manifest_path)
    with open(manifest_path, 'r', encoding='utf-8') as f:
        manifest_data = [json.loads(line) for line in f]
    
    num_of_audio = len(manifest_data)
    num_of_audio_all += num_of_audio

    for audio_index in range(num_of_audio):
        current_audio_entry = manifest_data[audio_index]
        y_target.append(current_audio_entry.get('text', ''))

        current_audio_filename = current_audio_entry.get('audio_filepath', '')
        file_path = os.path.join(base_dir, current_audio_filename)
        #print(file_path)

        waveform, sample_rate = torchaudio.load(file_path)
        processed_signal, processed_signal_len = preprocessor(input_signal=waveform, length=torch.tensor([len(waveform[0])]))

        processed_signal = processed_signal.to('cuda')
        processed_signal_len = processed_signal_len.to('cuda')

        encoder_output = asr_model.encoder(audio_signal=processed_signal, length=processed_signal_len)
        logits = asr_model.decoder(encoder_output=encoder_output[0])
        current_hypotheses, _ = decoder.ctc_decoder_predictions_tensor(logits, decoder_lengths=processed_signal_len)
        hypotheses.extend(current_hypotheses)

total_time = time.time() - start_time

C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\RuDevicesDataset\manifestValidationRuDevices.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\asr_calls_2_val\manifestValidationSTTcalls.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\buriy_audiobooks_2_val\manifestValidationSTTaudiobooks.jsonl
C:\Users\vdovichev\Documents\proj1-STT\DatasetForFineTune\OpenSTTruDatasets\public_youtube700_val\manifestValidationSTTyoutube.jsonl


In [23]:
print(f"Число аудио: {num_of_audio_all}")
print(f"Общее время: {total_time:.2f} секунд")
print(f"Среднее время на одно аудио: {total_time/num_of_audio_all:.2f} секунд")

Число аудио: 2050
Общее время: 61.48 секунд
Среднее время на одно аудио: 0.03 секунд


In [24]:
# from jiwer import wer не подходит, т.к. он не работает, если в гипотезе есть пустые ("") аудио
print(f"Word Error Rate (WER): {nemo_asr.metrics.wer.word_error_rate(hypotheses, y_target) * 100:.2f}%")

Word Error Rate (WER): 50.47%
