Skip to content

Commit 1b4dc98

Browse files
committed
add one more function for fixed interval chunking for audio files for speech-to-text tutorial
1 parent 8f89ebe commit 1b4dc98

File tree

1 file changed

+63
-20
lines changed

1 file changed

+63
-20
lines changed

machine-learning/speech-recognition/long_audio_recognizer.py

+63-20
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,24 @@
77
# create a speech recognition object
88
r = sr.Recognizer()
99

10-
# a function that splits the audio file into chunks
10+
# a function to recognize speech in the audio file
11+
# so that we don't repeat ourselves in other functions
12+
def transcribe_audio(path):
    """Return the text recognized in the audio file at ``path``.

    The file is opened as an ``sr.AudioFile`` source, recorded with the
    module-level recognizer ``r`` and handed to ``r.recognize_google``.
    No exception is caught here, so anything raised while opening,
    recording or recognizing propagates to the caller.
    """
    with sr.AudioFile(path) as audio_file:
        # capture the audio from the opened file
        recording = r.record(audio_file)
        # hand the recording to the recognizer and return its text
        return r.recognize_google(recording)
19+
20+
# a function that splits the audio file into chunks on silence
1121
# and applies speech recognition
12-
def get_large_audio_transcription(path):
13-
"""
14-
Splitting the large audio file into chunks
15-
and apply speech recognition on each of these chunks
16-
"""
22+
def get_large_audio_transcription_on_silence(path):
23+
"""Splitting the large audio file into chunks
24+
and apply speech recognition on each of these chunks"""
1725
# open the audio file using pydub
18-
sound = AudioSegment.from_wav(path)
19-
# split audio sound where silence is 700 miliseconds or more and get chunks
26+
sound = AudioSegment.from_file(path)
27+
# split audio sound where silence is 500 milliseconds or more and get chunks
2028
chunks = split_on_silence(sound,
2129
# experiment with this value for your target audio file
2230
min_silence_len = 500,
@@ -37,24 +45,59 @@ def get_large_audio_transcription(path):
3745
chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
3846
audio_chunk.export(chunk_filename, format="wav")
3947
# recognize the chunk
40-
with sr.AudioFile(chunk_filename) as source:
41-
audio_listened = r.record(source)
42-
# try converting it to text
43-
try:
44-
text = r.recognize_google(audio_listened)
45-
except sr.UnknownValueError as e:
46-
print("Error:", str(e))
47-
else:
48-
text = f"{text.capitalize()}. "
49-
print(chunk_filename, ":", text)
50-
whole_text += text
48+
try:
49+
text = transcribe_audio(chunk_filename)
50+
except sr.UnknownValueError as e:
51+
print("Error:", str(e))
52+
else:
53+
text = f"{text.capitalize()}. "
54+
print(chunk_filename, ":", text)
55+
whole_text += text
5156
# return the text for all chunks detected
5257
return whole_text
5358

5459

60+
# a function that splits the audio file into fixed interval chunks
61+
# and applies speech recognition
62+
def get_large_audio_transcription_fixed_interval(path, minutes=5):
    """Transcribe an audio file by cutting it into fixed-length chunks.

    The audio is opened with pydub and sliced into consecutive chunks of
    ``minutes`` minutes (the final chunk holds whatever remains). Each
    chunk is exported as ``chunk<N>.wav`` into the ``audio-fixed-chunks``
    directory and passed to ``transcribe_audio``.

    Args:
        path: Path of the audio file to transcribe.
        minutes: Length of each chunk in minutes. Fractional values are
            accepted (e.g. ``1/6`` for 10-second chunks).

    Returns:
        The recognized text of all chunks joined together; each chunk's
        text is capitalized and followed by ". ". Chunks for which
        ``sr.UnknownValueError`` is raised are reported on stdout and
        contribute nothing to the result.

    Raises:
        ValueError: If ``minutes`` corresponds to less than one
            millisecond (zero, negative, or too small a fraction).
    """
    # convert to milliseconds and validate *before* loading the audio, so a
    # bad interval fails fast with a clear message instead of an opaque
    # "range() arg 3 must not be zero" (or a silent empty result)
    chunk_length_ms = int(1000 * 60 * minutes)
    if chunk_length_ms <= 0:
        raise ValueError(
            f"minutes must correspond to at least one millisecond, got {minutes!r}"
        )
    # open the audio file using pydub
    sound = AudioSegment.from_file(path)
    folder_name = "audio-fixed-chunks"
    # create a directory to store the audio chunks (no-op if it exists)
    os.makedirs(folder_name, exist_ok=True)
    transcripts = []
    # slice one chunk at a time rather than materializing every chunk up front
    chunk_starts = range(0, len(sound), chunk_length_ms)
    for i, start_ms in enumerate(chunk_starts, start=1):
        audio_chunk = sound[start_ms:start_ms + chunk_length_ms]
        # export audio chunk and save it in
        # the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        try:
            text = transcribe_audio(chunk_filename)
        except sr.UnknownValueError as e:
            # skip this chunk but keep going with the rest
            print("Error:", str(e))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            transcripts.append(text)
    # return the text for all chunks detected
    return "".join(transcripts)
92+
93+
94+
5595
if __name__ == '__main__':
5696
import sys
5797
# path = "30-4447-0004.wav"
5898
# path = "7601-291468-0006.wav"
5999
path = sys.argv[1]
60-
print("\nFull text:", get_large_audio_transcription(path))
100+
print("\nFull text:", get_large_audio_transcription_on_silence(path))
101+
print("="*50)
102+
print("\nFull text:", get_large_audio_transcription_fixed_interval(path, minutes=1/6))
103+

0 commit comments

Comments
 (0)