In [8]:
!pip install torchaudio bigvgan sounddevice joblib


Defaulting to user installation because normal site-packages is not writeable
Collecting torchaudio
  Using cached torchaudio-2.7.1-cp312-cp312-win_amd64.whl.metadata (6.6 kB)
Collecting bigvgan
  Using cached bigvgan-2.4.1-py3-none-any.whl.metadata (430 bytes)
Collecting auraloss (from bigvgan)
  Using cached auraloss-0.4.0-py3-none-any.whl.metadata (8.0 kB)
Collecting librosa>=0.8.1 (from bigvgan)
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting ninja (from bigvgan)
  Using cached ninja-1.11.1.4-py3-none-win_amd64.whl.metadata (5.0 kB)
Collecting nnaudio (from bigvgan)
  Using cached nnAudio-0.3.3-py3-none-any.whl.metadata (771 bytes)
Collecting pesq (from bigvgan)
  Using cached pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting soundfile (from bigvgan)
  Using cached soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting audioread>=2.1.9 (from librosa>

  DEPRECATION: Building 'pesq' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'pesq'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [10]:
! pip install llama-cpp-python

Defaulting to user installation because normal site-packages is not writeable
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.14.tar.gz (51.0 MB)
     ---------------------------------------- 0.0/51.0 MB ? eta -:--:--
     -- ------------------------------------- 3.4/51.0 MB 28.6 MB/s eta 0:00:02
     ------------ -------------------------- 16.3/51.0 MB 44.4 MB/s eta 0:00:01
     ---------------------------- ---------- 37.2/51.0 MB 65.7 MB/s eta 0:00:01
     --------------------------------------  50.9/51.0 MB 72.0 MB/s eta 0:00:01
     --------------------------------------- 51.0/51.0 MB 64.9 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): starte

In [1]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), 'MP-SENet-main', 'MP-SENet-main'))
import torch
import torchaudio
import soundfile as sf
import numpy as np
from transformers import (
    SpeechT5Processor, 
    SpeechT5ForSpeechToText,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    VitsModel, 
    VitsTokenizer
)
import bigvgan
import librosa
from models.model import MPNet
import json


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

class SpeechPipeline:
    """End-to-end voice pipeline: denoise -> speech-to-text -> LLM reply -> text-to-speech.

    Stages and the models backing them:
      * denoising      - MP-SENet (``MPNet``), weights loaded from a local checkpoint
      * speech-to-text - SpeechT5 (``microsoft/speecht5_asr``)
      * reply          - DeepSeek-R1 distill GGUF model served through ``llama_cpp``
      * text-to-speech - VITS (``facebook/mms-tts-eng``) re-vocoded with BigVGAN

    All models are loaded eagerly in ``__init__`` via ``setup_models``.
    """

    # Sample rate (Hz) used by the denoising and ASR stages of this pipeline.
    PIPELINE_SAMPLE_RATE = 16000

    # Default location of the local GGUF weights. Override it with the
    # ``deepseek_model_path`` constructor argument or the DEEPSEEK_GGUF_PATH
    # environment variable instead of editing this file.
    DEFAULT_DEEPSEEK_MODEL_PATH = r"C:/Users/udayr/.lmstudio/models/lmstudio-community/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"

    # Spoken when the transcription failed and there is nothing to answer.
    APOLOGY_RESPONSE = "I'm sorry, I couldn't understand what you said. Could you please try again?"

    # Spoken when the language model itself failed or produced no text.
    LLM_FAILURE_RESPONSE = "I'm sorry, I ran into a problem while generating a response."

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu', deepseek_model_path=None):
        """Create the pipeline and load every model onto ``device``.

        Args:
            device: torch device string for all torch models. Defaults to CUDA
                when available, otherwise CPU.
            deepseek_model_path: optional path to the GGUF file. When omitted,
                the DEEPSEEK_GGUF_PATH environment variable is used, then
                ``DEFAULT_DEEPSEEK_MODEL_PATH``.
        """
        # Honour the caller's choice; a hard-coded 'cuda:0' breaks CPU-only machines.
        self.device = device
        self.deepseek_model_path = (
            deepseek_model_path
            or os.environ.get("DEEPSEEK_GGUF_PATH")
            or self.DEFAULT_DEEPSEEK_MODEL_PATH
        )
        self.setup_models()

    def setup_models(self):
        """Initialize all models for the pipeline"""
        print(" Setting up models...")

        # 1. Load MP-SENet for denoising
        print("Loading MP-SENet for denoising...")
        self.load_mpsenet()

        # 2. Load SpeechT5 for ASR
        print("Loading SpeechT5 for ASR...")
        self.asr_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        self.asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(self.device)

        # 3. Load BigVGAN + VITS for TTS
        print("Loading TTS models (VITS + BigVGAN)...")
        self.tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        self.tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng").to(self.device).eval()

        # BigVGAN vocoder
        self.vocoder = bigvgan.BigVGAN.from_pretrained(
            "nvidia/bigvgan_v2_44khz_128band_512x",
            use_cuda_kernel=False
        )
        self.vocoder.remove_weight_norm()
        self.vocoder = self.vocoder.to(self.device).eval()

        # 4. Load DeepSeek R1 (local GGUF model via llama.cpp bindings)
        print("Loading DeepSeek R1 from local path...")
        # Imported here so the class can be defined without llama-cpp-python installed.
        from llama_cpp import Llama

        self.deepseek = Llama(
            model_path=self.deepseek_model_path,
            n_ctx=4096,
            n_threads=8,  # adjust for your CPU
            use_mlock=True
        )
        print(" DeepSeek R1 loaded!")

        print(" All models loaded successfully!")

    def load_mpsenet(self):
        """Build the MP-SENet generator and load pretrained weights when possible.

        Loading is best-effort: if the checkpoint is missing or unreadable the
        model keeps its randomly initialised weights and the reason is printed.
        """
        # Minimal stand-in for the hyperparameter object passed to MPNet.
        class MPSENetConfig:
            def __init__(self):
                self.dense_channel = 64
                self.n_fft = 400
                self.beta = 1.0
                self.sampling_rate = 16000

        config = MPSENetConfig()
        self.mpsenet = MPNet(config, num_tsblocks=4).to(self.device)

        checkpoint_path = "MP-SENet-main/MP-SENet-main/best_ckpt/g_best_vb"
        try:
            # NOTE(security): torch.load may unpickle arbitrary objects; only
            # point this at checkpoints from a trusted source.
            checkpoint = torch.load(checkpoint_path, map_location=self.device)
            self.mpsenet.load_state_dict(checkpoint['generator'])
            print(" Loaded pretrained MP-SENet weights")
        except FileNotFoundError:
            print(" Using random MP-SENet weights (pretrained not found)")
        except Exception as exc:
            # Stay best-effort, but say why instead of hiding the failure.
            print(f" Using random MP-SENet weights (failed to load '{checkpoint_path}': {exc})")

        self.mpsenet.eval()

    def denoise_audio(self, audio_waveform, sample_rate=16000):
        """Denoise audio using MP-SENet.

        Args:
            audio_waveform: numpy array of samples. 2-D input is averaged over
                axis 0, i.e. it is assumed to be (channels, samples).
            sample_rate: sample rate of ``audio_waveform`` in Hz.

        Returns:
            1-D numpy array at ``PIPELINE_SAMPLE_RATE``. Empty input yields an
            empty float32 array.
        """
        print(" Denoising audio...")
        audio_waveform = np.asarray(audio_waveform)
        print(f" Input audio shape: {audio_waveform.shape}")
        print(f" Input sample rate: {sample_rate}")

        # Guard first: np.min/np.max below raise on empty arrays.
        if audio_waveform.size == 0:
            print(" Error: Audio waveform is empty!")
            return np.zeros(0, dtype=np.float32)

        print(f" Input audio range: [{np.min(audio_waveform):.4f}, {np.max(audio_waveform):.4f}]")

        # Ensure mono
        if audio_waveform.ndim > 1:
            audio_waveform = audio_waveform.mean(axis=0)

        # Check if audio is too quiet
        if np.max(np.abs(audio_waveform)) < 1e-6:
            print("⚠️ Warning: Audio is very quiet or silent!")

        # Resample exactly once. Resampling a second time with the original
        # `orig_sr` would change the duration and pitch of the signal.
        target_sr = self.PIPELINE_SAMPLE_RATE
        if sample_rate != target_sr:
            audio_waveform = librosa.resample(audio_waveform, orig_sr=sample_rate, target_sr=target_sr)
            print(f"📊 Resampled audio shape: {audio_waveform.shape}")

        # Convert to a float32 batch of one
        audio_tensor = torch.as_tensor(audio_waveform, dtype=torch.float32).unsqueeze(0).to(self.device)

        # NOTE(review): no analysis window is passed to stft/istft (torch then
        # treats the window as all ones) and magnitudes are fed to the model
        # uncompressed. Confirm this matches the preprocessing the MP-SENet
        # checkpoint was trained with.
        n_fft, hop_length, win_length = 400, 100, 400
        stft = torch.stft(
            audio_tensor,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            return_complex=True
        )

        magnitude = torch.abs(stft)
        phase = torch.angle(stft)
        print(f" STFT magnitude shape: {magnitude.shape}")
        print(f" STFT phase shape: {phase.shape}")

        # Apply MP-SENet
        with torch.no_grad():
            denoised_magnitude, denoised_phase, _ = self.mpsenet(magnitude, phase)

        # Rebuild the complex spectrum and invert it
        denoised_stft = denoised_magnitude * torch.exp(1j * denoised_phase)
        denoised_audio = torch.istft(
            denoised_stft,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length
        )

        return denoised_audio.squeeze(0).cpu().numpy()

    def speech_to_text(self, audio_waveform, sample_rate=16000):
        """Convert speech to text using SpeechT5 ASR.

        Args:
            audio_waveform: 1-D numpy array of samples.
            sample_rate: sample rate of ``audio_waveform`` in Hz.

        Returns:
            The transcription, or a string starting with ``"Error:"`` when the
            input is empty or transcription failed (errors are reported in-band
            so the rest of the pipeline can keep running).
        """
        print(" Converting speech to text...")
        audio_waveform = np.asarray(audio_waveform)
        print(f" ASR input audio shape: {audio_waveform.shape}")
        print(f" ASR input sample rate: {sample_rate}")
        print(f" Audio dtype: {audio_waveform.dtype}")

        # Must run before any np.min/np.max call: those raise on empty arrays.
        if audio_waveform.size == 0:
            print(" Error: Audio waveform is empty!")
            return "Error: No audio data"

        print(f" ASR input audio range: [{np.min(audio_waveform):.4f}, {np.max(audio_waveform):.4f}]")
        print(f" Audio length: {len(audio_waveform)}")

        peak = np.max(np.abs(audio_waveform))
        if peak < 1e-6:
            print(" Warning: Audio is very quiet, may not transcribe well")

        # Ensure audio is float32
        if audio_waveform.dtype != np.float32:
            audio_waveform = audio_waveform.astype(np.float32)

        # Peak-normalise (skipped for all-zero input to avoid dividing by zero)
        if peak > 0:
            audio_waveform = audio_waveform / peak
            print(f" Normalized audio range: [{np.min(audio_waveform):.4f}, {np.max(audio_waveform):.4f}]")

        try:
            print(" Processing audio with ASR processor...")
            inputs = self.asr_processor(
                audio=audio_waveform,
                sampling_rate=sample_rate,
                return_tensors="pt"
            )

            # Move tensor inputs to the model's device
            inputs = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in inputs.items()}

            print(f" ASR processor inputs keys: {inputs.keys()}")
            for key, value in inputs.items():
                if torch.is_tensor(value):
                    print(f" {key} shape: {value.shape}")
                    print(f" {key} dtype: {value.dtype}")
                    print(f" {key} device: {value.device}")

            # Generate transcription
            with torch.no_grad():
                predicted_ids = self.asr_model.generate(**inputs)
                transcription = self.asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

            print(f" Transcription result: '{transcription}'")
            return transcription

        except Exception as e:
            print(f" Error in speech_to_text: {e}")
            print(f" Error type: {type(e)}")
            import traceback
            traceback.print_exc()
            return f"Error: {str(e)}"

    def generate_response(self, prompt: str) -> str:
        """Use DeepSeek R1 to generate a response.

        Args:
            prompt: text to answer, normally the ASR transcription.

        Returns:
            Always a non-empty string: the model's reply, or a canned apology
            when ``prompt`` is empty, is an in-band ``"Error:"`` result from
            ``speech_to_text``, or the model call fails.
        """
        print(" Generating response with DeepSeek R1...")
        # speech_to_text reports failures as strings that start with "Error:".
        if not prompt or prompt.startswith("Error:"):
            return self.APOLOGY_RESPONSE

        try:
            output = self.deepseek(
                prompt,
                max_tokens=150,
                stop=["</s>"],
                temperature=0.7
            )
            response = output["choices"][0]["text"].strip()
            print(f" DeepSeek Response: {response}")
        except Exception as e:
            print(f" Error generating response: {e}")
            # Never return None: the TTS stage needs text to speak.
            return self.LLM_FAILURE_RESPONSE

        return response or self.LLM_FAILURE_RESPONSE

    def text_to_speech(self, text, speed_factor=1.0):
        """Convert text to speech using VITS, then re-vocode the result with BigVGAN.

        Args:
            text: non-empty text to synthesise.
            speed_factor: rate passed to ``librosa.effects.time_stretch``
                (1.0 leaves the audio unchanged).

        Returns:
            ``(audio, sample_rate)``. When the BigVGAN stage fails, the plain
            VITS audio and its own sample rate are returned instead.

        Raises:
            ValueError: if ``text`` is empty or not a string.
            Exception: errors from the VITS stage propagate, because there is
                no audio to fall back to at that point.
        """
        if not isinstance(text, str) or not text.strip():
            raise ValueError("text_to_speech requires non-empty text")
        print(f" Converting text to speech: '{text}'")

        # --- Stage 1: VITS. Kept outside the try block so the fallback below
        # can never reference audio that was not produced.
        inputs = self.tts_tokenizer(text, return_tensors="pt").to(self.device)
        with torch.inference_mode():
            vits_waveform = self.tts_model(**inputs).waveform
        vits_audio = vits_waveform.squeeze().cpu().numpy()
        # Read the rate from the checkpoint config instead of assuming a value.
        vits_sr = int(self.tts_model.config.sampling_rate)
        print(f" VITS audio shape: {vits_audio.shape}")

        if speed_factor != 1.0:
            print(f" Time-stretching audio with rate {speed_factor}")
            vits_audio = librosa.effects.time_stretch(vits_audio, rate=speed_factor)
            print(f"📊 Stretched audio shape: {vits_audio.shape}")

        # --- Stage 2: BigVGAN (best-effort enhancement).
        try:
            # Take the mel front-end settings from the loaded checkpoint so the
            # band count, hop size and sample rate always match the vocoder.
            hparams = self.vocoder.h
            vocoder_sr = int(hparams.sampling_rate)

            if vits_sr != vocoder_sr:
                vocoder_input = librosa.resample(vits_audio, orig_sr=vits_sr, target_sr=vocoder_sr)
            else:
                vocoder_input = vits_audio

            # Magnitude mel + clamped natural log, following BigVGAN's reference
            # mel extraction. NOTE(review): librosa's framing/padding differs
            # slightly from the reference get_mel_spectrogram - confirm quality.
            mel_spectrogram = librosa.feature.melspectrogram(
                y=vocoder_input,
                sr=vocoder_sr,
                n_fft=hparams.n_fft,
                hop_length=hparams.hop_size,
                win_length=hparams.win_size,
                n_mels=hparams.num_mels,
                fmin=hparams.fmin,
                fmax=getattr(hparams, "fmax", None),
                power=1.0
            )
            mel_spectrogram = np.log(np.clip(mel_spectrogram, 1e-5, None))

            # Add batch dimension
            mel_tensor = torch.from_numpy(mel_spectrogram).float().unsqueeze(0).to(self.device)
            print(f" Mel spectrogram shape: {mel_tensor.shape}")

            with torch.inference_mode():
                enhanced_audio = self.vocoder(mel_tensor)
            enhanced_audio = enhanced_audio.squeeze().cpu().numpy()

            print(f" Final audio shape: {enhanced_audio.shape}")
            return enhanced_audio, vocoder_sr

        except Exception as e:
            print(f" Error in text_to_speech: {e}")
            # Fall back to the VITS audio at its own sample rate
            return vits_audio, vits_sr

    def full_pipeline(self, audio_input, input_sample_rate=16000, response_text=None):
        """Complete pipeline: denoise → ASR → response generation → TTS.

        Args:
            audio_input: numpy array with the recorded audio.
            input_sample_rate: sample rate of ``audio_input`` in Hz.
            response_text: optional fixed reply; when given, the language
                model is skipped and this text is spoken instead.

        Returns:
            dict with keys ``transcription``, ``response_text``,
            ``response_audio``, ``sample_rate`` (of ``response_audio``) and
            ``clean_audio`` (the denoised input).
        """
        print(" Starting full speech pipeline...")
        print(f" Pipeline input audio shape: {audio_input.shape}")
        print(f" Pipeline input sample rate: {input_sample_rate}")

        # Step 1: Denoise (output is at PIPELINE_SAMPLE_RATE)
        clean_audio = self.denoise_audio(audio_input, input_sample_rate)

        # Step 2: Speech to Text
        transcription = self.speech_to_text(clean_audio, self.PIPELINE_SAMPLE_RATE)
        print(f" Final transcription: {transcription}")

        # Step 3: Generate response
        if response_text is None:
            response_text = self.generate_response(transcription)

        # Step 4: Text to Speech
        response_audio, sr = self.text_to_speech(response_text)

        return {
            'transcription': transcription,
            'response_text': response_text,
            'response_audio': response_audio,
            'sample_rate': sr,
            'clean_audio': clean_audio
        }


In [3]:

# Example usage
def main():
    """Demo: record from the microphone, run the full pipeline, save and play the reply.

    When ``sounddevice`` cannot be imported, random noise is pushed through the
    pipeline with a fixed response text instead, and the result is only saved.
    Errors are printed rather than raised so the notebook cell always completes.
    """
    # Initialize pipeline
    try:
        pipeline = SpeechPipeline()
    except Exception as e:
        print(f" Error initializing pipeline: {e}")
        return

    # Import the optional audio backend on its own, so that an ImportError raised
    # somewhere inside the pipeline is not mistaken for "sounddevice is missing".
    try:
        import sounddevice as sd
    except (ImportError, OSError):
        # NOTE(review): OSError is assumed to be what sounddevice raises when the
        # PortAudio library is absent - confirm for the installed version.
        sd = None

    try:
        if sd is None:
            print(" sounddevice not available, using dummy audio for testing")

            # Use dummy audio for testing
            dummy_audio = np.random.randn(16000 * 3) * 0.1  # 3 seconds of quiet noise
            result = pipeline.full_pipeline(dummy_audio, 16000, "Hello! This is a test response.")
            sf.write("test_output.wav", result['response_audio'], result['sample_rate'])
            print(" Test output saved to test_output.wav")
            print(f" Transcription: {result['transcription']}")
            print(f" Response: {result['response_text']}")
            return

        duration = 10        # seconds
        sample_rate = 16000  # Hz
        # Report the real duration; the message used to say 5 while recording 10.
        print(f" Recording for {duration} seconds...")

        # Record audio
        audio_input = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype=np.float32)
        sd.wait()
        audio_input = audio_input.flatten()

        print(f" Recorded audio shape: {audio_input.shape}")
        print(f" Recorded audio range: [{np.min(audio_input):.4f}, {np.max(audio_input):.4f}]")

        # Check if recording worked
        if np.max(np.abs(audio_input)) < 1e-6:
            print(" Warning: Very quiet recording, using synthetic audio for testing")
            # Generate some test audio
            t = np.linspace(0, duration, int(duration * sample_rate))
            audio_input = 0.1 * np.sin(2 * np.pi * 440 * t)  # 440Hz tone

        # Process through pipeline
        result = pipeline.full_pipeline(audio_input, sample_rate)

        # Save output
        sf.write("output_speech.wav", result['response_audio'], result['sample_rate'])
        print("Output saved to output_speech.wav")
        print(f" Transcription: {result['transcription']}")
        print(f" Response: {result['response_text']}")

        # Play result
        sd.play(result['response_audio'], result['sample_rate'])
        sd.wait()

    except Exception as e:
        print(f" Error in main: {e}")

if __name__ == "__main__":
    main()

 Setting up models...
Loading MP-SENet for denoising...
 Loaded pretrained MP-SENet weights
Loading SpeechT5 for ASR...
 Error initializing pipeline: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /api/models/microsoft/speecht5_asr/tree/main/additional_chat_templates?recursive=False&expand=False (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000231FC230E30>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 88d650a7-537e-459c-8a24-da47ed40fce2)')
