Setting up the GPU

In [1]:
import tensorflow as tf
tf.config.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')


Checking if GPU is working

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))


Num GPUs: 1


General Dependencies

In [3]:
import pandas as pd
import numpy as np


In [4]:
FRAME_RATE=16000
CHANNELS = 1

Importing the Vosk library, the model to build the model and the recognizer to use this model

In [5]:
from vosk import Model, KaldiRecognizer

Loading the pretrained model

In [6]:
model = Model(model_name='vosk-model-en-us-0.22')

Initializing the recognizer

In [7]:
recognizer = KaldiRecognizer(model, FRAME_RATE)
recognizer.SetWords(True)

Load and preprocess the audio files using pydub/soundfile

In [8]:
# import soundfile as sf
# from scipy import signal

# file_path = "sample-0.mp3"  # Replace with the correct file path

# # Manually set the number of channels and frame rate


# # Open the audio file with the desired channels
# with sf.SoundFile(file_path, "r") as audio_file:
#     audio = audio_file.read(frames=-1, dtype="float32", always_2d=True)

# # Adjust the number of channels if necessary
# if audio.shape[1] != CHANNELS:
#     audio = audio[:, :CHANNELS]

# # Resample the audio to the desired frame rate
# resampled_audio = signal.resample_poly(audio, FRAME_RATE * audio_file.samplerate, audio_file.samplerate)

# # Print the updated audio properties
# print("Audio shape:", resampled_audio.shape)
# print("Number of channels:", CHANNELS)
# print("Sample rate:", FRAME_RATE)



# Trying Pydub

from pydub import AudioSegment

audio =AudioSegment.from_mp3('./marketplace.mp3')

Pass the audio to the model for conversion/transcription

In [9]:

audio = audio.set_channels(CHANNELS)
audio = audio.set_frame_rate(FRAME_RATE)


Pass the audio to model now

In [10]:
recognizer.AcceptWaveform(audio.raw_data)

result = recognizer.Result()
result

'{\n  "result" : [{\n      "conf" : 1.000000,\n      "end" : 0.180000,\n      "start" : 0.090000,\n      "word" : "the"\n    }, {\n      "conf" : 1.000000,\n      "end" : 0.570000,\n      "start" : 0.180000,\n      "word" : "funny"\n    }, {\n      "conf" : 1.000000,\n      "end" : 0.960000,\n      "start" : 0.600000,\n      "word" : "thing"\n    }, {\n      "conf" : 1.000000,\n      "end" : 1.230000,\n      "start" : 0.960000,\n      "word" : "about"\n    }, {\n      "conf" : 1.000000,\n      "end" : 1.320000,\n      "start" : 1.230000,\n      "word" : "the"\n    }, {\n      "conf" : 1.000000,\n      "end" : 1.680000,\n      "start" : 1.320000,\n      "word" : "big"\n    }, {\n      "conf" : 1.000000,\n      "end" : 2.220000,\n      "start" : 1.710000,\n      "word" : "economic"\n    }, {\n      "conf" : 1.000000,\n      "end" : 2.490000,\n      "start" : 2.220000,\n      "word" : "news"\n    }, {\n      "conf" : 1.000000,\n      "end" : 2.580000,\n      "start" : 2.490000,\n      "wo

The result is in the json format, we need to reconfigure it in a readable format

In [11]:
import json
text = json.loads(result)['text']

text

"the funny thing about the big economic news of the day the fed raising interest rates half a percentage point was that there was only really one tidbit of actual news in the news and the interest rate increase wasn't it you knew it was coming i knew it was common wall street news come and businesses knew it was common so on this fed day on this program something a little bit different jay powell in his own words five of 'em his most used economic words from today's press conference were number one of course it's the biggie two percent inflation inflation inflation inflation inflation inflation lh dealing with inflation pals big worry the thing keeping him up at night price stability is the fed's whole ballgame right now pau basically said as much today we're"

Adding Punctuations to the text

In [12]:
import sys

python_path = sys.executable
print(python_path)


r:\YRANA\ML\PROJECTS\NLP\Speech_Recognition\nlpenv\python.exe


In [17]:
import subprocess


cased = subprocess.check_output("r:/YRANA/ML/PROJECTS/NLP/Speech_Recognition/nlpenv/python.exe recasepunc/recasepunc.py predict recasepunc/checkpoint", shell=True, text=True, input=text)

cased


"The funny thing about the big economic news of the day, the Fed raising interest rates half a percentage point, was that there was only really one tidbit of actual news in the news. And the interest rate increase, wasn ' t it. You knew it was coming. I knew it was common. Wall Street news come and businesses knew it was common. So on this Fed day, on this program, something a little bit different. Jay Powell, in his own words, five of ' em. His most used economic words from today ' s press conference were number one, Of course, it ' s the biggie Two percent inflation, inflation, inflation, inflation, inflation, inflation. Lh dealing with inflation pals Big worry. The thing keeping him up at night, price stability, is the Fed ' s whole ballgame right now. Pau basically said as much today. We ' re.\n"

Define a function to apply the model and the punctuation on an audio file >45s

In [18]:
def Voice_Recognition(file_name):
    #Define and load model
    model = Model(model_name='vosk-model-en-us-0.22')
    recognizer = KaldiRecognizer(model, FRAME_RATE)
    recognizer.SetWords(True)

    audio = AudioSegment.from_mp3('./marketplace_full.mp3')
    audio = audio.set_frame_rate(FRAME_RATE)
    audio =  audio.set_channels(CHANNELS)

    
    step = 45000
    transcript = ""  #define an empty string

    for i in range(0, len(audio), step):
        print(f"progress : {i/len(audio)}")

        #Break the audio into segments
        segment = audio[i:(i+step)]

        #run the recognizer now on this small segment
        recognizer.AcceptWaveform(segment.raw_data)
        result = recognizer.Result()
        text =  json.loads(result)['text']

        #Add this text to transcript
        transcript +=text

    #Apply Punctuation to the transcripted text
    cased = subprocess.check_output("r:/YRANA/ML/PROJECTS/NLP/Speech_Recognition/nlpenv/python.exe recasepunc/recasepunc.py predict recasepunc/checkpoint", shell=True, text=True, input=text)
    return cased

In [19]:
transcript = Voice_Recognition('Notebooks\marketplace_full.mp3')
transcript

progress : 0.0
progress : 0.02666815218151411
progress : 0.05333630436302822
progress : 0.08000445654454233
progress : 0.10667260872605644
progress : 0.13334076090757055
progress : 0.16000891308908466
progress : 0.18667706527059877
progress : 0.21334521745211288
progress : 0.240013369633627
progress : 0.2666815218151411
progress : 0.29334967399665524
progress : 0.3200178261781693
progress : 0.34668597835968346
progress : 0.37335413054119754
progress : 0.4000222827227117
progress : 0.42669043490422576
progress : 0.4533585870857399
progress : 0.480026739267254
progress : 0.5066948914487681
progress : 0.5333630436302822
progress : 0.5600311958117963
progress : 0.5866993479933105
progress : 0.6133675001748246
progress : 0.6400356523563386
progress : 0.6667038045378528
progress : 0.6933719567193669
progress : 0.720040108900881
progress : 0.7467082610823951
progress : 0.7733764132639093
progress : 0.8000445654454234
progress : 0.8267127176269374
progress : 0.8533808698084515
progress : 0.880

"Musk ' s new company, Twitter, of course, reports earnings on Thursday. Our daily production team is Unease, I ' m in and Corbyn Richard Cunningham, Real horse, John McHenry and Daisy Palacios on Car. Resolved, We will see tomorrow the APIa\n"

Text Summarization

In [20]:
from transformers import pipeline

In [21]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 1.80k/1.80k [00:00<00:00, 259kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 1.22G/1.22G [06:25<00:00, 3.17MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 2.37kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.04MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 810kB/s]


Split the text into small sets :-  Since hugging face processes only 1024 tokens at a time

So, we need to tokenize our text/transcript

In [22]:
split_tokens = transcript.split(" ")
docs = []

for i in range(0, len(split_tokens), 850):
    selection =  "".join(split_tokens[i:(i+850)])
    docs.append(selection)

ValueError: empty separator