Add a function for fixed-interval chunking of audio files to the speech-to-text tutorial

x4nth055 · x4nth055 · commit 1b4dc98fc25c · 2023-05-08T06:06:13.000+01:00
diff --git a/machine-learning/speech-recognition/long_audio_recognizer.py b/machine-learning/speech-recognition/long_audio_recognizer.py
@@ -7,16 +7,24 @@
 # create a speech recognition object
 r = sr.Recognizer()
 
-# a function that splits the audio file into chunks
+# a function to recognize speech in the audio file
+# so that we don't repeat ourselves in other functions
+def transcribe_audio(path):
+    # use the audio file as the audio source
+    with sr.AudioFile(path) as source:
+        audio_listened = r.record(source)
+        # try converting it to text
+        text = r.recognize_google(audio_listened)
+    return text
+
+# a function that splits the audio file into chunks on silence
 # and applies speech recognition
-def get_large_audio_transcription(path):
-    """
-    Splitting the large audio file into chunks
-    and apply speech recognition on each of these chunks
-    """
+def get_large_audio_transcription_on_silence(path):
+    """Split the large audio file into chunks
+    and apply speech recognition to each of these chunks"""
     # open the audio file using pydub
-    sound = AudioSegment.from_wav(path)  
-    # split audio sound where silence is 700 miliseconds or more and get chunks
+    sound = AudioSegment.from_file(path)  
+    # split audio sound where silence is 500 milliseconds or more and get chunks
     chunks = split_on_silence(sound,
         # experiment with this value for your target audio file
         min_silence_len = 500,
@@ -37,24 +45,59 @@ def get_large_audio_transcription(path):
         chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
         audio_chunk.export(chunk_filename, format="wav")
         # recognize the chunk
-        with sr.AudioFile(chunk_filename) as source:
-            audio_listened = r.record(source)
-            # try converting it to text
-            try:
-                text = r.recognize_google(audio_listened)
-            except sr.UnknownValueError as e:
-                print("Error:", str(e))
-            else:
-                text = f"{text.capitalize()}. "
-                print(chunk_filename, ":", text)
-                whole_text += text
+        try:
+            text = transcribe_audio(chunk_filename)
+        except sr.UnknownValueError as e:
+            print("Error:", str(e))
+        else:
+            text = f"{text.capitalize()}. "
+            print(chunk_filename, ":", text)
+            whole_text += text
     # return the text for all chunks detected
     return whole_text
 
 
+# a function that splits the audio file into fixed interval chunks
+# and applies speech recognition
+def get_large_audio_transcription_fixed_interval(path, minutes=5):
+    """Split the large audio file into fixed interval chunks
+    and apply speech recognition to each of these chunks"""
+    # open the audio file using pydub
+    sound = AudioSegment.from_file(path)  
+    # split the audio file into chunks
+    chunk_length_ms = int(1000 * 60 * minutes) # convert to milliseconds
+    chunks = [sound[i:i + chunk_length_ms] for i in range(0, len(sound), chunk_length_ms)]
+    folder_name = "audio-fixed-chunks"
+    # create a directory to store the audio chunks
+    if not os.path.isdir(folder_name):
+        os.mkdir(folder_name)
+    whole_text = ""
+    # process each chunk 
+    for i, audio_chunk in enumerate(chunks, start=1):
+        # export audio chunk and save it in
+        # the `folder_name` directory.
+        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
+        audio_chunk.export(chunk_filename, format="wav")
+        # recognize the chunk
+        try:
+            text = transcribe_audio(chunk_filename)
+        except sr.UnknownValueError as e:
+            print("Error:", str(e))
+        else:
+            text = f"{text.capitalize()}. "
+            print(chunk_filename, ":", text)
+            whole_text += text
+    # return the text for all chunks detected
+    return whole_text
+
+
+
 if __name__ == '__main__':
     import sys
     # path = "30-4447-0004.wav"
     # path = "7601-291468-0006.wav"
     path = sys.argv[1]
-    print("\nFull text:", get_large_audio_transcription(path))
+    print("\nFull text:", get_large_audio_transcription_on_silence(path))
+    print("="*50)
+    print("\nFull text:", get_large_audio_transcription_fixed_interval(path, minutes=1/6))
+