In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pyaudio
import wave
import torch
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

In [2]:
class VoiseAssistentPipeline:
    """Speech-to-text and sentence-embedding helper for a voice assistant.

    ``__init__`` loads the ``openai/whisper-large-v3`` checkpoint, wraps it in
    a Hugging Face ``automatic-speech-recognition`` pipeline, and loads the
    ``sentence-transformers/LaBSE`` model for sentence embeddings. The methods
    then cover the three steps used by the notebook: record a WAV file,
    transcribe audio, and embed a sentence.
    """

    def __init__(self, device="cpu"):
        """Load the Whisper model, its processor, the ASR pipeline and LaBSE.

        Args:
            device: Device identifier given to the Whisper model (``.to``),
                to the ASR pipeline and to the SentenceTransformer.
                Defaults to ``"cpu"``.
        """
        self.device = device
        self.model_id = "openai/whisper-large-v3"
        self.torch_dtype = torch.float32

        self.transcribe_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            self.model_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True).to(self.device)

        # The processor supplies both the tokenizer and the feature extractor
        # that the pipeline below is built from.
        self.transcribe_model_processor = AutoProcessor.from_pretrained(self.model_id)

        self.pipeline = pipeline(
            "automatic-speech-recognition",
            model=self.transcribe_model,
            tokenizer=self.transcribe_model_processor.tokenizer,
            feature_extractor=self.transcribe_model_processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )

        self.similarity_model = SentenceTransformer(
            'sentence-transformers/LaBSE', device=self.device)

    def voise_record(self, chank=1024,
                     audio_format=pyaudio.paInt16,
                     channels=1,
                     rate=16000,
                     duration=15,
                     output_filename="output.wav"):
        """Record audio from a PyAudio input stream and save it as a WAV file.

        Args:
            chank: Number of frames requested per read; also passed to the
                stream as ``frames_per_buffer``. Must be positive.
            audio_format: PyAudio sample-format constant for the stream.
            channels: Number of channels to record and to write to the file.
            rate: Sampling rate, used for the stream and the WAV header.
            duration: Length of the recording in seconds.
            output_filename: Path of the WAV file to write.

        Raises:
            ValueError: If ``chank`` is not positive.
        """
        if chank <= 0:
            raise ValueError("chank must be a positive number of frames")

        p = pyaudio.PyAudio()
        try:
            # Ask for the sample width before terminate() releases PortAudio.
            sample_width = p.get_sample_size(audio_format)

            stream = p.open(format=audio_format,
                            channels=channels,
                            rate=rate,
                            input=True,
                            frames_per_buffer=chank)
            try:
                frames = []
                # Count frames rather than whole chunks so the trailing
                # partial chunk is recorded too and the clip really lasts
                # `duration` seconds.
                remaining = int(rate * duration)
                while remaining > 0:
                    to_read = min(chank, remaining)
                    frames.append(stream.read(to_read))
                    remaining -= to_read
            finally:
                # Release the input stream even if a read fails.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()

        # The context manager closes the file even if writing raises.
        with wave.open(output_filename, 'wb') as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(b''.join(frames))

    def transcribe_audio(self, audio, language="russian"):
        """Transcribe ``audio`` with the Whisper pipeline and return the text.

        Args:
            audio: Input forwarded unchanged to the ASR pipeline.
                NOTE(review): a bare NumPy waveform is presumably expected to
                already be at the feature extractor's sampling rate; confirm
                against the Transformers pipeline documentation.
            language: Language passed to generation through
                ``generate_kwargs``. Defaults to ``"russian"``.

        Returns:
            The ``"text"`` entry of the pipeline result.
        """
        result = self.pipeline(audio, generate_kwargs={"language": language})
        return result["text"]

    def create_embedding(self, origin_sentence):
        """Encode ``origin_sentence`` with the LaBSE model.

        Args:
            origin_sentence: Text handed to ``SentenceTransformer.encode``.

        Returns:
            The embedding produced with ``convert_to_tensor=True``.
        """
        embedding = self.similarity_model.encode(origin_sentence, convert_to_tensor=True)
        return embedding
    

In [3]:
# File the recording cell writes to and the loading cell reads back.
AUDIO_PATH = "./test_output.wav"
# Device string handed to the pipeline constructor.
DEVICE = "cpu"


# Reference phrase; it is also the first candidate in the list below.
origin_sentence = "Не включилось РУ6"

# Candidate phrases that the transcript is compared against.
sentences = [
    origin_sentence,
    'РУ6 не включилось',
    'не включилось шестое реле управления',
    'Реле РУ6 срабатывает, но не включается реле времени РВ1, РВ2',
    'При нажатии кнопки "Пуск дизеля" (все нужные автоматы включены) КМН не включается.',
    'При нажатии кнопки "Пуск дизеля" контактор КМН включается, но маслопрокачивающий насос не работает',
    'При пуске прокачка масла есть (60-90 сек), но после отключения КМН пусковые контакторы не включаются',
    'При нажатии кнопки "ПД" включаются пусковые контакторы без предварительной прокачки масла',
]

In [4]:
# Build the pipeline once; the constructor loads the Whisper and LaBSE models.
assistent = VoiseAssistentPipeline(device=DEVICE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Record a short clip (duration=4) and store it at AUDIO_PATH.
assistent.voise_record(
    output_filename=AUDIO_PATH,
    duration=4,
)

In [6]:
# The ASR pipeline receives a bare waveform with no sampling rate attached, so
# resample on load to the rate the Whisper feature extractor is configured for,
# instead of keeping the file's native rate (sr=None) and throwing it away.
# Binding the rate to a real name also avoids clobbering IPython's `_`.
target_sr = assistent.transcribe_model_processor.feature_extractor.sampling_rate
audio, sample_rate = librosa.load(AUDIO_PATH, sr=target_sr)

In [7]:
# Transcribe the loaded waveform; the bare name on the last line makes the
# notebook display the transcript.
text = assistent.transcribe_audio(audio=audio)
text

' не включил шестой реле управления.'

In [8]:
# Embed the transcript once, then each candidate phrase on its own.
origin_embedding = assistent.create_embedding(text)
embeddings = list(map(assistent.create_embedding, sentences))

# Print one cosine-similarity tensor per candidate, in the order of `sentences`.
for candidate_embedding in embeddings:
    similarity = util.pytorch_cos_sim(origin_embedding, candidate_embedding)
    print(similarity)

tensor([[0.6121]])
tensor([[0.6537]])
tensor([[0.9444]])
tensor([[0.5294]])
tensor([[0.4407]])
tensor([[0.3804]])
tensor([[0.3741]])
tensor([[0.3401]])


In [9]:
# torch is already imported in the first cell, so it is not re-imported here.
# Shows whether a CUDA device is visible to PyTorch.
torch.cuda.is_available()

True