In [None]:
import os
import torch

import numpy as np
import matplotlib.pyplot as plt

import librosa
from librosa import display
from pydub import AudioSegment
from pydub.silence import split_on_silence

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [None]:
# Sampling rate (Hz) requested from librosa for every file this notebook loads.
sample_rate = 16000

# Working directory of the kernel; the chunk-transcription cell reads `dir` too.
dir = os.path.abspath(os.getcwd())

# The sample recording is expected to sit directly in that directory.
path = os.path.join(dir, "sample_audio.wav")
audio, _ = librosa.load(path, sr=sample_rate)

In [None]:
# Plot the waveform of the loaded recording.
# `display.waveplot` was deprecated in librosa 0.9 and removed in 0.10 in
# favour of `display.waveshow`; prefer the new function and fall back to the
# old one only on installs that do not provide it.
plot_wave = getattr(display, "waveshow", None) or display.waveplot

plt.figure()
plot_wave(y=audio, sr=sample_rate)
plt.xlabel("Time (seconds) ==>")
plt.ylabel("Amplitude")
plt.show()

In [None]:
# Function to normalize given audio chunk
def match_target_amplitude(aChunk, target_dBFS):
  """Apply the gain that moves ``aChunk`` from its current dBFS to ``target_dBFS``.

  Args:
    aChunk: object exposing a ``dBFS`` attribute and an ``apply_gain(dB)``
      method (a pydub ``AudioSegment`` in this notebook).
    target_dBFS: desired level in dBFS.

  Returns:
    The result of ``aChunk.apply_gain(target_dBFS - aChunk.dBFS)``, or
    ``aChunk`` itself when its level is ``-inf``.
  """
  current_dBFS = aChunk.dBFS

  # A chunk whose level is -inf would need an infinite gain to reach the
  # target; there is nothing meaningful to scale, so hand it back unchanged.
  if current_dBFS == float("-inf"):
    return aChunk

  return aChunk.apply_gain(target_dBFS - current_dBFS)


# Function that splits the audio file into chunks
def silence_based_conversion(path = "sample_audio.wav", out_dir = "audio_chunks",
                             min_silence_len = 800, silence_margin_dB = 16,
                             padding_ms = 500, target_dBFS = -20.0):
  """Split a WAV file on silence and save each piece as ``chunk<i>.wav``.

  Args:
    path: WAV file to split.
    out_dir: directory that receives the chunk files; created if missing.
      A relative value is resolved against the current working directory.
    min_silence_len: minimum length of silence, in milliseconds, that
      separates two chunks.
    silence_margin_dB: audio quieter than the recording's own dBFS level
      minus this many dB is treated as silence.
    padding_ms: milliseconds of silence added before and after every chunk.
    target_dBFS: level each padded chunk is normalized to.

  Returns:
    The number of chunk files written (chunks are numbered from 0).
  """
  audio = AudioSegment.from_wav(path)

  # Split the track wherever the signal stays below the silence threshold
  # for at least `min_silence_len` ms. The threshold is relative to this
  # recording's level (audio.dBFS), not an absolute dBFS value.
  chunks = split_on_silence(audio,
      min_silence_len = min_silence_len,
      silence_thresh = audio.dBFS - silence_margin_dB
  )

  # Create the directory that stores the chunks. Files are written through a
  # joined path instead of os.chdir(), so an error while exporting cannot
  # leave the process in a different working directory.
  os.makedirs(out_dir, exist_ok = True)

  # Silence placed on both sides of a chunk so that it doesn't seem
  # abruptly sliced.
  chunk_silent = AudioSegment.silent(duration = padding_ms)

  # process each chunk
  for i, chunk in enumerate(chunks):
    audio_chunk = chunk_silent + chunk + chunk_silent

    # Normalize the entire (padded) chunk.
    normalized_chunk = match_target_amplitude(audio_chunk, target_dBFS)

    print("saving chunk{0}.wav".format(i))
    chunk_path = os.path.join(out_dir, "chunk{0}.wav".format(i))
    # export() hands back the output file handle; close it explicitly
    # rather than relying on garbage collection.
    normalized_chunk.export(chunk_path, bitrate ='192k', format ="wav").close()

  return len(chunks)

# Split the sample recording into chunk files; the return value is the number
# of chunks written, which the transcription loop uses as its range.
total_chunks = silence_based_conversion(path)

In [None]:
# load pretrained model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [None]:
# Transcribe every saved chunk and collect one line of text per chunk.
# Chunk files are addressed by full path instead of os.chdir(), so a failure
# in the middle of the loop cannot leave the kernel in another directory.
chunk_dir = os.path.join(dir, 'audio_chunks')

transcriptions = []

for i in range(total_chunks):
  chunk_path = os.path.join(chunk_dir, "chunk{0}.wav".format(i))

  # Dedicated name: the full recording held in `audio` is used by the
  # waveform plot and must not be overwritten by the last chunk.
  chunk_audio, _ = librosa.load(chunk_path, sr = sample_rate)
  input_values = processor(chunk_audio, sampling_rate=sample_rate, return_tensors = "pt").input_values

  # Storing logits (non-normalized prediction values). This is inference
  # only, so disable gradient tracking to avoid building an autograd graph.
  with torch.no_grad():
    logits = model(input_values).logits

  # Storing predicted ids
  prediction = torch.argmax(logits, dim = -1)

  # Passing the prediction to the tokenizer decode to get the transcription
  transcription = processor.batch_decode(prediction)[0]

  transcriptions.append(transcription)
  print('chunk{0} processed'.format(i))

# Every chunk's transcription ends with a newline.
text = ''.join(line + '\n' for line in transcriptions)

In [None]:
# Show the assembled transcript (one line per processed chunk).
print(text)

In [None]:
# Save the transcript to text.txt in the current working directory.
# The `with` block closes the file even if the write raises; plain "w" is
# enough because nothing is read back, and the encoding is made explicit so
# the output does not depend on the platform default.
with open(r"text.txt", "w", encoding="utf-8") as file:
  file.write(text)