In [18]:
# Cascaded Speech-to-Speech System

# -- Mukund K Roy, PhD scholar, SVNIT, Surat
# -- Dr Pruthwik Mishra, Asst. Professor, SVNIT, Surat
# -- Dr Rohit Kumar, Asst. Professor, SVNIT, Surat

## Step 1: Install Required Libraries
# Install necessary libraries for ASR, translation, and TTS
!pip install torch torchaudio transformers







In [19]:
# @title

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import torch
import IPython
def load_asr_model():
    # Load Wav2Vec2 processor and model for speech-to-text conversion
    model_name = "facebook/wav2vec2-large-960h"
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return processor, model



In [20]:
# @title

## Load NLLB Model for Translation

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def load_translation_model():
    # Load NLLB tokenizer and model for English-to-Hindi translation
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model



In [21]:
# @title
from transformers import VitsModel, AutoTokenizer

def load_tts_model():

  model = VitsModel.from_pretrained("facebook/mms-tts-hin")
  tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")

  return model, tokenizer



In [22]:
# @title
## Perform ASR on Input Speech

def transcribe_speech(audio_path, processor, model):
    # Load audio file and get waveform and sample rate
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample waveform to 16kHz (required for Wav2Vec2 model)
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

    # Ensure waveform is 1D (remove extra dimensions)
    if len(waveform.shape) > 1:  # Check if waveform has more than one channel
        waveform = waveform.mean(dim=0)  # Convert to mono by averaging channels

    # Process waveform for model input
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Perform inference and get logits (raw prediction before being converted to text)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode predicted token IDs to text
    predicted_ids = torch.argmax(logits, dim=-1) #find the most likely token for each position in the audio
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription


In [23]:
# @title
def translate_text(text, tokenizer, model, src_lang="eng_Latn", tgt_lang="hin_Deva"):
    tokenizer.src_lang = src_lang
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Get the target language token ID correctly
    tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    # Generate translation output
    with torch.no_grad():
        outputs = model.generate(**inputs, forced_bos_token_id=tgt_lang_id)

    # Decode translation tokens to string
    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return translation


In [24]:
# @title


def text_to_speech(text,tokenizer,model):

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
      output = model(**inputs).waveform
    return output


In [25]:
# @title

def speech_to_speech_pipeline(audio):
    # Complete pipeline: speech-to-text, translation, and text-to-speech
    text = transcribe_speech(audio, processor, asr_model)
    print("Transcription:", text)

    translated_text = translate_text(text, translator_tokenizer, translator_model)
    print("Translation:", translated_text)
    speech_output = text_to_speech(translated_text,tts_tokenizer,tts_model)

    return speech_output


In [26]:
# @title
# Load the ASR model and processor
processor, asr_model = load_asr_model()
# Load the translation model and tokenizer
translator_tokenizer, translator_model = load_translation_model()
# Load the TTS model and tokenizer
tts_model, tts_tokenizer = load_tts_model()



Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# @title
# Now let's take an English audio file
audio_file = "/content/sample_data/Recording_4.wav"  # Replace with your input audio file
IPython.display.Audio(audio_file,autoplay=True)

In [28]:
# @title
translated_speech=speech_to_speech_pipeline(audio_file)

IPython.display.Audio(translated_speech.numpy(), rate=tts_model.config.sampling_rate,autoplay=True) # That's it :)

Transcription: THIS IS THE FIRST STEP THE SYSTEM LISTENS TO THE SPEECH AND CONVERTS IT INTO FEXT THINK OF HOW BUGALIS IS TRETENT TRANSCRIBES YOUR WORD WHEN YOU SPEAK TO THEM
Translation: In addition to listing his achievements while in office he made a teenager a US Secret Service agent and reaffirmed his desire for Greenland to become part of the US.


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)