In [1]:
import os
import re

import speech_recognition as sr 

from sys import argv  
from pydub import AudioSegment, effects
from pydub.silence import split_on_silence 



In [2]:

def normalization(file=None, output="normalized.wav"):
    '''
    to normalize the volume of a recording with pydub's effects.normalize
    (this adjusts loudness, not pitch)

    input:
        file: path of the audio file in .wav form. When left as None the
              path is asked for interactively, exactly as before.
        output: path the normalized audio is exported to
                (default: normalized.wav)
    output: will normalize the audio and save it as `output` in wav format
    raises: FileNotFoundError when the given path is not an existing file
    '''
    if file is None:
        file = input('Enter the path to recorded file')

    # Paths that are pasted or dragged into the prompt often carry trailing
    # whitespace or surrounding quotes; remove them before touching the disk.
    file = file.strip().strip('"\'')

    # Fail early with a readable message instead of an error raised from
    # somewhere inside the audio loading code.
    if not os.path.isfile(file):
        raise FileNotFoundError("audio file not found: {0!r}".format(file))

    rawsound = AudioSegment.from_file(file, "wav")
    normalizedsound = effects.normalize(rawsound)
    normalizedsound.export(output, format="wav")
    print("normalized recording saved as {0} \n".format(output))

def speechToTextModule(lang="en-in"):
    '''
    to convert audio file to text
    brief: It will firstly normalize the audio (normalization() asks for the
    path of the .wav file), after that it will cut the normalized audio into
    segments wherever it stays silent for at least 1.2 seconds, and then it
    will process each chunk of audio with recognize_google and convert it
    to text.

    input: lang - language tag handed to recognize_google (default "en-in")
    output: recognized.txt file (UTF-8, in the current directory) with all
            text converted; the chunks are stored as
            audio_chunks/chunk<i>.wav
    '''
    chunk_dir = 'audio_chunks'

    # calling normalization function
    normalization()

    # opening the normalized audio file
    song = AudioSegment.from_wav("normalized.wav")

    # spliting audio into chunks with parameter as silence of 1.2 seconds
    chunks = split_on_silence(song,
        # must be silent for at least 1.2 seconds
        min_silence_len = 1200,
        # consider it silent if quieter than -50 dBFS
        silence_thresh = -50
    )

    # creating a directory to store the audio chunks (fine if it exists).
    os.makedirs(chunk_dir, exist_ok=True)

    print("folder created for storing the chunks of audio file \n")

    # 10 ms of silence used as padding on both sides of every chunk;
    # it is identical for all chunks, so it is built once.
    chunk_silent = AudioSegment.silent(duration = 10)

    # The chunks are addressed through an explicit path instead of
    # os.chdir(), so a failure half way through cannot leave the notebook
    # kernel stuck inside audio_chunks. The `with` block guarantees that
    # recognized.txt is flushed and closed even when a chunk raises.
    with open("recognized.txt", "w+", encoding="utf-8") as fh:
        # processing each chunk
        for i, chunk in enumerate(chunks):
            audio_chunk = chunk_silent + chunk + chunk_silent

            # export audio chunk and save it in the chunk directory.
            print("saving chunk{0}.wav".format(i))
            chunk_path = os.path.join(chunk_dir, "chunk{0}.wav".format(i))

            # specify the bitrate to be 192 k
            audio_chunk.export(chunk_path, bitrate ='192k', format ="wav")

            print("Processing chunk "+str(i))

            # A fresh recognizer per chunk, so every chunk starts from the
            # same thresholds regardless of what happened on earlier ones.
            r = sr.Recognizer()
            # was misspelled `pause_threshhold`, which only created an
            # unused attribute and left the real setting at its default
            r.pause_threshold = 1
            r.energy_threshold = 7000

            # recognize the chunk
            with sr.AudioFile(chunk_path) as source:
                audio_listened = r.listen(source)
                # below could be used in case above is not giving good results
                # r.adjust_for_ambient_noise(source)
                # audio_listened = r.listen(source)

            try:
                # try converting it to text by specifying the language
                rec = r.recognize_google(audio_listened, language=lang)
                # write the output to the file.
                fh.write(rec+". ")

            # a chunk that could not be understood is skipped, not fatal
            except sr.UnknownValueError:
                print("Could not understand audio")

            # the recognition request itself failed; keep going
            except sr.RequestError:
                print("no internet connection or access")

In [4]:
# Run the whole pipeline with the default language ("en-in"): this prompts
# for the path of the .wav recording and writes the text to recognized.txt.
speechToTextModule()