In [2]:
!pip install transformers torch torchaudio gtts pydub

import os
import torch
import torchaudio
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSequenceClassification
from gtts import gTTS
import pydub

# Dictionary of emotion labels
emotion_labels = {
    0: "anger",
    1: "joy",
    2: "surprise",
    3: "sadness",
    4: "others",
    5: "fear",
    6: "disgust"
}

def convert_to_flac(audio_file):
    filename, file_extension = os.path.splitext(audio_file)
    if file_extension.lower() != ".flac":
        print("Converting to .flac...")
        flac_file = filename + ".flac"
        sound = pydub.AudioSegment.from_file(audio_file)
        sound.export(flac_file, format="flac")
        return flac_file
    return audio_file

def speech_to_text(audio_file, model_name):
    # Load the tokenizer and model
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)

    # Convert MP3 file to waveform
    waveform, sample_rate = torchaudio.load(audio_file)
    target_sample_rate = 16000  # Sample rate expected by Wav2Vec2 model

    # Resample the waveform if the sample rate is not 16000 Hz
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
        waveform = resampler(waveform)

    # Preprocess the audio file
    inputs = tokenizer(waveform.squeeze().numpy(), return_tensors="pt", padding=True)

    # Perform the speech-to-text inference
    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode the CTC output to get the text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]

    return transcription

def detect_emotion(text, emotion_model):

    # Validate input text
    if not text or not isinstance(text, str):
        raise ValueError("Input text should be a non-empty string.")

    tokenizer = AutoTokenizer.from_pretrained(emotion_model)
    model = AutoModelForSequenceClassification.from_pretrained(emotion_model)

    # Encode the text and convert it into a tensor
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)

    # Get the prediction and convert it to an emotion label
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()

    emotion = emotion_labels[prediction]

    return emotion

def generate_speech(text, emotion):
    # Define emotion-specific voice options
    voice_options = {
        "anger": "com/en-us/stu/stu/",
        "joy": "com/en-us/stw/stw/",
        "surprise": "com/en-us/sts/sts/",
        "sadness": "com/en-us/sts/sts/",
        "others": "com/en-us/sts/sts/",
        "fear": "com/en-us/stf/stf/",
        "disgust": "com/en-us/std/std/"
    }

    if emotion in voice_options:
        voice = voice_options[emotion] + "default"
    else:
        voice = "com/en-us/sts/sts/default"

    tts = gTTS(text=text, lang='en', slow=True, tld='com', lang_check=False)
    filename = f"emotion_{emotion}.mp3"
    tts.save(filename)

    return filename

def main(audio_file):
    model_transcribe = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

    transcription = speech_to_text(audio_file, "facebook/wav2vec2-large-960h-lv60-self")
    print("--- Transcription ---")
    print(transcription)

    detected_emotion = detect_emotion(transcription, "finiteautomata/bertweet-base-emotion-analysis")
    print("Detected Emotion:", detected_emotion)

    speech_text = transcription  # Use the transcription as the speech text
    speech_file = generate_speech(speech_text, detected_emotion)
    print("Generated Speech File:", speech_file)

    return speech_file

# Example usage
audio_file_path = "/content/ElevenLabs_sample.mp3"
final_audio_file = main(audio_file_path)
print("Final Audio File:", final_audio_file)


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting gtts
  Downloading gTTS-2.3.2-py3-none-any.whl (28 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinu

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Transcription ---
AS THE GOLDEN SUN DIPPED BELOW THE HORIZON CASTING A MESMERIZING TAPESTRY OF COLORS ACROSS THE SKY MY HEART SWELLED WITH AN OVERWHELMING MIX OF NOSTALGIA GRATITUDE AND HOPE FOR IT WAS IN THAT FLEETING MOMENT THAT I REALIZED THE BEAUTY OF LIFE'S IMPERMANENCE


Downloading (…)okenizer_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Downloading model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

Detected Emotion: joy
Generated Speech File: emotion_joy.mp3
Final Audio File: emotion_joy.mp3
