In [11]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import torch
import torchaudio
import tensorflow as tf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from sklearn.preprocessing import StandardScaler
import soundfile as sf
from IPython.display import Audio, display
import moviepy.editor as mp
import yt_dlp

In [13]:
# Step-by-step Notebook for Voice Cloning

# Step 1: Load the audio file
# Load an audio file and set the sample rate
def load_audio(file_path, sample_rate=16000):
    audio, sr = librosa.load(file_path, sr=sample_rate)
    return audio, sr

In [14]:
# Step 2: Extract Features
# Extract features such as MFCCs, spectral contrast, and chroma from the audio
def extract_features(audio, sr):
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    return mfccs, spectral_contrast, chroma

In [15]:
# Step 3: Preprocess the Features
# Standardize the extracted features to make them comparable
def preprocess_features(features):
    scaler = StandardScaler()
    features_scaled = [scaler.fit_transform(feature.T).T for feature in features]
    return features_scaled

In [16]:
# Step 4: Load Pre-trained Model (e.g., Wav2Vec2 for Voice Representation)
# Load the pre-trained Wav2Vec2 model and tokenizer
def load_pretrained_model():
    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
    return tokenizer, model

In [17]:
# Step 5: Convert Audio to Text Representation
# Convert the audio input into text representation using the pre-trained model
def audio_to_text_representation(audio_path, tokenizer, model):
    audio_input, _ = torchaudio.load(audio_path)
    input_values = tokenizer(audio_input.squeeze().numpy(), return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])
    return transcription

In [18]:
# Step 6: Voice Synthesis (using a TTS model like Tacotron or Tensorspeech)
# Synthesize voice from the given text using a Text-to-Speech model
def synthesize_voice(text, tts_model):
    audio = tts_model.tts(text)
    return audio

In [19]:
# Step 7: Voice Cloning
# Combine the synthesized voice features and the extracted features to make the cloned voice
def clone_voice(features, synthesized_audio):
    cloned_audio = synthesized_audio * 0.6 + features[0].mean() * 0.4  # Combining the original features with the new synthesized ones
    return cloned_audio

In [20]:
# Main function to execute the pipeline in a step-by-step notebook format
def main():
    # Step 1: Load Audio
    file_path = r'C:\Users\syrym\Downloads\research_2\audio.wav'
    audio, sr = load_audio(file_path)
    display(Audio(data=audio, rate=sr))
    print("Loaded Audio File: ", file_path)

    # Step 2: Extract Features
    mfccs, spectral_contrast, chroma = extract_features(audio, sr)
    print("MFCCs Shape: ", mfccs.shape)
    print("Spectral Contrast Shape: ", spectral_contrast.shape)
    print("Chroma Shape: ", chroma.shape)

    # Step 3: Preprocess Features
    features = preprocess_features([mfccs, spectral_contrast, chroma])
    print("Features Preprocessed.")

    # Step 4: Load Pre-trained Model
    tokenizer, model = load_pretrained_model()
    print("Loaded Pre-trained Wav2Vec2 Model.")

    # Step 5: Convert Audio to Text Representation
    transcription = audio_to_text_representation(file_path, tokenizer, model)
    print("Transcription: ", transcription)

    # Step 6: Voice Synthesis (commented out as it requires a TTS model)
    tts_model = load_tts_model()  # Assuming you have a TTS model available
    synthesized_audio = synthesize_voice(transcription, tts_model)
    display(Audio(data=synthesized_audio, rate=sr))

    # Step 7: Voice Cloning (commented out as it requires synthesized audio)
    cloned_voice = clone_voice(features, synthesized_audio)
    sf.write('cloned_voice.wav', cloned_voice, sr)
    display(Audio(data=cloned_voice, rate=sr))

if __name__ == "__main__":
    main()

Loaded Audio File:  C:\Users\syrym\Downloads\research_2\audio.wav
MFCCs Shape:  (40, 2879)
Spectral Contrast Shape:  (7, 2879)
Chroma Shape:  (12, 2879)
Features Preprocessed.


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded Pre-trained Wav2Vec2 Model.
Transcription:  TASFAS TS TATO FAPFA TO TA PA TOO AA SOFA PO FA PASO T TO TOTFO O TOF TO OO  TO TOTO TO  FO  TOTFOO T TFOFO   PA PA PAAAFUO AFAO O O  A FA OOO F TO A T PAOO TA PP TO   TO  O OO TO  PA O F P A PSF AAS FA AFAPAAS FA FASPA PASFA O FO O FO O O FA OTO O F SO OOO O  TOASU  O TO TOTOOSOOP PAASOO TPAA PASO O   TA   PAFAAPAAPA A TO ATPASFA A P TO TO FAO F TO I PA TOTO OA FTO SOFOSO O   TO O SO FOO TO  SO O FO TO F F  TO TO O TO FO  OATO F FO FO FO PATO FO O O  F T OTO FA FO AFA TOSO S PA TOSTO SFA MSFAOPAS SO TO O A PA   F O TO FO OO O FA A  FAO O FO  F O PA FO FA PA FO TOO FO  PA PSP O FAF TO FO O OT SO FTI TO FO OO  FOSO MAO FO FOA PO FASOOFA TOFAS TO FAFATUOFAFO T PASOOSPO TISTO TOSO O SA PAA O OO TO FAFA A TO  O PAF TO PAS FA FO PAFA FAFA SFA FA PAASU  PA A PASPAU O FOO T A FA TO  TO O OO TO PAFO APA O FOO FOOO TO O TO FOFOO O O P TF PMASAAFOO FATO PO  OPO   A FA TOO O A TO TO TOA PA FO A SO TO O O FA A PA PA PAA PO  FAPA FPA FO  TO O O FA 