# Speech regonition using Vosk
%pip install vosk

In [15]:
from vosk import Model, KaldiRecognizer

In [47]:
FRAME_RATE = 16000
CHANNELS = 1

model = Model(model_name = "vosk-model-en-us-0.22")

rec = KaldiRecognizer(model, FRAME_RATE)
rec.SetWords(True)

%pip install pydub

In [18]:
from pydub import AudioSegment

In [19]:
mp3 = AudioSegment.from_mp3("marketplace.MP3")
mp3 = mp3.set_channels(CHANNELS)
mp3 = mp3.set_frame_rate(FRAME_RATE)

In [20]:
rec.AcceptWaveform(mp3.raw_data)
result = rec.Result()

In [21]:
import json

text = json.loads(result)["text"]

In [22]:
text

"the funny thing about the big economic news of the day the fed raising interest rates half a percentage point was that there was only really one tidbit of actual news in the news and the interest rate increase wasn't it you knew it was coming i knew it was common wall street news come and businesses knew it was common so on this fed day on this program something a little bit different jay powell in his own words five of 'em his most used economic words from today's press conference were number one of course it's the biggie two percent inflation inflation inflation inflation inflation inflation lh dealing with inflation pals big worry the thing keeping him up at night price stability is the fed's whole ballgame right now pau basically said as much today we're"

%pip install transformers

In [24]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.


In [25]:
# Adding punctuation to our transcript with recasepunc file

import subprocess

cased = subprocess.check_output("/Users/sangw/miniconda3/python recasepunc/recasepunc.py predict recasepunc/checkpoint", shell=True, text=True, input=text)

In [26]:
cased

"The funny thing about the big economic news of the day, the Fed raising interest rates half a percentage point, was that there was only really one tidbit of actual news in the news. And the interest rate increase, wasn ' t it. You knew it was coming. I knew it was common. Wall Street news come and businesses knew it was common. So on this Fed day, on this program, something a little bit different. Jay Powell, in his own words, five of ' em. His most used economic words from today ' s press conference were number one, Of course, it ' s the biggie Two percent inflation, inflation, inflation, inflation, inflation, inflation. Lh dealing with inflation pals Big worry. The thing keeping him up at night, price stability, is the Fed ' s whole ballgame right now. Pau basically said as much today. We ' re.\n"

In [27]:
# A function to transcribe longer audio files

def voice_recognition(filename):
    model = Model(model_name = "vosk-model-en-us-0.22")

    rec = KaldiRecognizer(model, FRAME_RATE)
    rec.SetWords(True)
    
    mp3 = AudioSegment.from_mp3(filename)
    mp3 = mp3.set_channels(CHANNELS)
    mp3 = mp3.set_frame_rate(FRAME_RATE)
    
    step = 45000
    transcript = ""
    for i in range(0, len(mp3), step):
        print(f"Progress: {i/len(mp3)}")
        segment = mp3[i:i+step]
        
        rec.AcceptWaveform(segment.raw_data)
        result = rec.Result()
        
        text = json.loads(result)["text"]
        transcript += text
    
    cased = subprocess.check_output("/Users/sangw/miniconda3/python recasepunc/recasepunc.py predict recasepunc/checkpoint", 
                                    shell=True, text=True, input=transcript)
    return cased

In [28]:
voice_recognition("marketplace.mp3")

Progress: 0.0
Progress: 0.9782608695652174


"The funny thing about the big economic news of the day, the Fed raising interest rates half a percentage point was that there was only really one tidbit of actual news in the news. And the interest rate increase, wasn ' t it. You knew it was coming. I knew it was common Wall Street news come and businesses knew it was common. So on this Fed day on this program, something a little bit different. Jay Powell, in his own words, five of ' em. His most used economic words from today ' s press conference were Number one, Of course, it ' s the biggie Two percent inflation, inflation, inflation, inflation, inflation, inflation. Lh Dealing with inflation pals Big worry. The thing keeping him up at night, price stability, is the Fed ' s whole ballgame right now. Pau basically said as muchmuch to day award.\n"

In [29]:
# Summarizing the transcripts using huggingface transformers

from transformers import pipeline

summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [30]:
with open("transcript.txt") as f:
    transcript = f.read()

In [31]:
split_tokens = transcript.split(" ")
docs = []

for i in range(0, len(split_tokens), 850):
    selection = " ".join(split_tokens[i:(i+850)])
    docs.append(selection)

In [32]:
docs

["Turns out fifty four dollars and twenty cents was not a joke from American Public media. This is Marketplace, The in Los Angeles, CA raised on Monday today. I do believe the twenty fifth of April. Good as always to have you along. Everybody. Alright ? Just for fun, I am going to see if I can do this in two hundred and eighty characters. Which is of course Twitter ' s limit starting right now. After making a not very veiled marijuana reference in offering fifty four dollarstwenty cents a share to buy Twitter, Elon Musk has sealed the deal. As of today, Lauren Hirsch has been covering the story for the New York Times. Thanks for coming on. Thanks for having me setting aside all marijuana jokes that many people made with this price that Musk offered. And clearly he was serious. And now this has happened in an unbelievably fast timeline, right ? And we will be fast. I tell you. I was at a shower yesterday, communicating like us to where it ' s kind of casually tracking and then my source

In [33]:
summaries = summarizer(docs)

In [34]:
summaries

[{'summary_text': " This is Marketplace, The in Los Angeles, CA raised on Monday today . After making a not very veiled marijuana reference in offering fifty four dollarstwenty cents a share to buy Twitter, Elon Musk has sealed the deal . Marketplace's Lauren Hirsch has been covering the story for the New York Times . She says the overwhelming majority of the American public isn't interested in Twitter ."},
 {'summary_text': ' Twitter shares on this Monday up almost six percent still, though a couple of bucks shy of must offer of one hundred and fifty four dollars, twenty cents a piece . Jacob Orchard, a PhD candidate at U.C. San Diego, explains how the price index forlow income and high income households can systematically differ over the course of the business cycle . The two biggest sources of inflation are energy prices and food prices .'},
 {'summary_text': " The National Association for Business Economics is out with a new survey of economists to work at big companies . A record 

In [35]:
summary = "\n\n".join(d["summary_text"] for d in summaries)

In [36]:
print(summary)

 This is Marketplace, The in Los Angeles, CA raised on Monday today . After making a not very veiled marijuana reference in offering fifty four dollarstwenty cents a share to buy Twitter, Elon Musk has sealed the deal . Marketplace's Lauren Hirsch has been covering the story for the New York Times . She says the overwhelming majority of the American public isn't interested in Twitter .

 Twitter shares on this Monday up almost six percent still, though a couple of bucks shy of must offer of one hundred and fifty four dollars, twenty cents a piece . Jacob Orchard, a PhD candidate at U.C. San Diego, explains how the price index forlow income and high income households can systematically differ over the course of the business cycle . The two biggest sources of inflation are energy prices and food prices .

 The National Association for Business Economics is out with a new survey of economists to work at big companies . A record number of economists say their businesses have been book and 

%pip install pyaudio

In [10]:
import pyaudio

p = pyaudio.PyAudio()
p.get_device_count()

for i in range(p.get_device_count()):
    print(p.get_device_info_by_index(i).get('name'))

Microsoft Sound Mapper - Input
마이크 (Realtek(R) Audio)
Microsoft Sound Mapper - Output
Speakers/Headphones (Realtek(R)
Primary Sound Capture Driver
마이크 (Realtek(R) Audio)
Primary Sound Driver
Speakers/Headphones (Realtek(R) Audio)
Speakers/Headphones (Realtek(R) Audio)
마이크 (Realtek(R) Audio)
Speakers 1 (Realtek HD Audio output with SST)
Speakers 2 (Realtek HD Audio output with SST)
PC 스피커 (Realtek HD Audio output with SST)
스테레오 믹스 (Realtek HD Audio Stereo input)
마이크 1 (Realtek HD Audio Mic input with SST)
마이크 2 (Realtek HD Audio Mic input with SST)
마이크 3 (Realtek HD Audio Mic input with SST)


In [49]:
def record_microphone(seconds=10, chunk=1024, audio_format=pyaudio.paInt16):
    p = pyaudio.PyAudio()

    stream = p.open(format=audio_format,
                    channels=CHANNELS,
                    rate=FRAME_RATE,
                    input=True,
                    input_device_index=2,
                    frames_per_buffer=chunk)

    frames = []

    for i in range(0, int(FRAME_RATE / chunk * seconds)):
        data = stream.read(chunk)
        frames.append(data)

    stream.stop_stream()
    stream.close()
    p.terminate()

    sound = AudioSegment(
        data=b''.join(frames),
        sample_width=p.get_sample_size(audio_format),
        frame_rate=FRAME_RATE,
        channels=CHANNELS
    )
    sound.export("temp.mp3", "mp3")

%pip install ipywidgets

In [52]:
import ipywidgets as widgets
from IPython.display import display

record_button = widgets.Button(
    description='Record',
    disabled=False,
    button_style='success',
    tooltip='Record',
    icon='microphone'
)

summary = widgets.Output()

def start_recording(data):
    with summary:
        display("Starting the recording.")
        record_microphone()
        display("Finished recording.")
        transcript = voice_recognition("temp.mp3")
        display(f"Transcript: {transcript}")

record_button.on_click(start_recording)

display(record_button, summary)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle(), tooltip='Record')

Output()