In [1]:
!pip install torch torchaudio transformers jiwer
!pip install git+https://github.com/openai/whisper.git

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.6
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-x2zi2b_7
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-x2zi2b_7
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [

In [2]:
import whisper
import torchaudio
import os
import torch

class AudioTranscriber:
    def __init__(self, model_name="base.en"):
        self.model = whisper.load_model(model_name)
    
    def transcribe(self, audio_path):
        audio, sample_rate = torchaudio.load(audio_path)

        if sample_rate != 16000:
            resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            audio = resample(audio)

        audio = whisper.pad_or_trim(audio.flatten()).to("cuda" if torch.cuda.is_available() else "cpu")

        mel = whisper.log_mel_spectrogram(audio)

        options = whisper.DecodingOptions(language="en", without_timestamps=True)
        result = self.model.decode(mel, options)

        return result.text


In [3]:
audio_file = '/kaggle/input/voices/84-121123-0010.wav'
transcriber = AudioTranscriber()

transcription = transcriber.transcribe(audio_file)
print(f"Transcription:\n{transcription}")

output_dir = "/kaggle/working/output_text"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "transcription.txt")

with open(output_file, "w") as file:
    file.write(transcription)

print(f"Transcription saved to {output_file}")

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 142MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcription:
Nautier looked upon morale with one of those melancholy smiles which had so often made Valentine happy and thus fixed his attention. Nautier looked upon morale with one of those melancholy smiles which had so often made Valentine happy and thus fixed his attention.
Transcription saved to /kaggle/working/output_text/transcription.txt


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")

generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)

with open(output_file, "r") as file:
    transcription = file.read()

response = generator(transcription, max_length=150, num_return_sequences=1)
response_text = response[0]['generated_text']
print(f"LLM Response:\n{response_text}")

response_dir = "/kaggle/working/response"
os.makedirs(response_dir, exist_ok=True)
response_file = os.path.join(response_dir, "llm_response.txt")

with open(response_file, "w") as file:
    file.write(response_text)

print(f"LLM Response saved to {response_file}")


tokenizer_config.json:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


LLM Response:
Nautier looked upon morale with one of those melancholy smiles which had so often made Valentine happy and thus fixed his attention. Nautier looked upon morale with one of those melancholy smiles which had so often made Valentine happy and thus fixed his attention.
"I am not a man of the world," he said, "and I have never been in love. I have never been in love. I have never been in love.
"I have never been in love," he said, "and I have never been in love.
"I have never been in love," he said, "and I have never been in love.
"I have never been in love," he
LLM Response saved to /kaggle/working/response/llm_response.txt


In [5]:
!pip install gtts

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gtts
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.3-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.3


In [7]:
from gtts import gTTS
from IPython.display import Audio, display
import os

def text_to_audio(text, output_file):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    tts = gTTS(text=text, lang='en', slow=False)
    tts.save(output_file)
    print(f"Audio saved to {output_file}")

response_file = "response/llm_response.txt"
audio_file = "response/llm_response.mp3"

with open(response_file, "r") as file:
    text = file.read()

text_to_audio(text, audio_file)

display(Audio(audio_file, autoplay=True))


Audio saved to response/llm_response.mp3


In [12]:
!sudo apt-get install espeak-ng -y

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 30 not upgraded.
Need to get 4215 kB of archives.
After this operation, 12.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/main amd64 libpcaudio0 amd64 1.1-4 [7908 B]
Get:2 http://archive.ubuntu.com/ubuntu focal/main amd64 libsonic0 amd64 0.2.0-8 [13.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-6ubuntu0.1 [3682 kB]
Get:4 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-6ubuntu0.1 [190 kB]
Get:5 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 espeak-ng amd64 1.50+dfsg-6ubuntu0.1 [322 kB]


In [29]:
import os
import subprocess

def text_to_audio_espeak(text_file, output_file, pitch=50, speed=175, voice='en-us+f2'):
    # Ensure the directory for the output file exists
    directory = os.path.dirname(output_file)
    if directory:
        os.makedirs(directory, exist_ok=True)
    
    # Read the text from the file
    with open(text_file, 'r') as file:
        text = file.read().strip()
    
    # Construct the espeak-ng command
    command = [
        'espeak-ng',
        f'-p {pitch}',   # Set pitch (0 to 99)
        f'-s {speed}',   # Set speed (words per minute)
        f'-v {voice}',   # Set voice, e.g., 'en-us+f2' for a female voice
        text,            # The text to synthesize
        '--stdout'       # Output to stdout for piping to a file
    ]
    
    # Run the command and write the output to the file
    with open(output_file, 'wb') as f:
        subprocess.run(command, stdout=f, stderr=subprocess.PIPE)
    
    print(f"Audio saved to {output_file}")

# Example usage
text_file = '/kaggle/working/response/llm_response.txt'
output_file = "espeak_response.mp3"
text_to_audio_espeak(text_file, output_file, pitch=70, speed=150, voice='en-us+f2')


Audio saved to espeak_response.mp3


In [31]:
k = str(input("jane or jake"))

if(k=='jane'):
    m = 'en'
else:
    m = 'en-us+f2'

text_file = '/kaggle/working/response/llm_response.txt'
output_file = "espeak_response.mp3"
text_to_audio_espeak(text_file, output_file, pitch=70, speed=150, voice=m)

jane or jake jane


Audio saved to espeak_response.mp3
