Incorporate GPU optimizations, noise threshold, and window size in fasterWhisper
This commit tunes the fasterWhisper script for transcribing noisy radio traffic.
- The model now runs on the GPU with FP32 precision instead of FP16.
- A VAD noise threshold of 0.9 is added; radio traffic is usually noisy, so the threshold is set high.
- The VAD window size is set to 1536 samples, one of the supported values for a 16000 Hz sampling rate.
Together these changes improve transcription accuracy on noisy radio audio; a condensed configuration sketch follows. No issues referenced.
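
A minimal sketch of that configuration, condensed from the diff below ("audio.mp3" is a placeholder path):

from faster_whisper import WhisperModel

# GPU with FP32 precision (the previous compute_type was float16)
model = WhisperModel("large-v2", device="cuda", compute_type="float32")

segments, info = model.transcribe(
    "audio.mp3",                       # placeholder path
    beam_size=1,
    language="en",
    vad_filter=True,
    vad_parameters=dict(
        threshold=0.9,                 # high threshold for noisy radio traffic
        min_silence_duration_ms=1000,  # lowered from 1500
        window_size_samples=1536,      # supported at 16 kHz: 512, 1024, 1536
    ),
)
print(" ".join(segment.text.strip() for segment in segments))
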
swiftraccoon committed Nov 26, 2023
1 parent 5c2b00e commit 6143cb3
Showing 2 changed files with 42 additions and 22 deletions.
20 changes: 14 additions & 6 deletions scripts/fasterWhisper.py
@@ -1,7 +1,7 @@
import sys
import json
from faster_whisper import WhisperModel
import logging
import json

# Check if a filename is provided
if len(sys.argv) < 2:
@@ -17,10 +17,14 @@
MODEL_SIZE = "large-v2"
BEAM_SIZE = 1
LANGUAGE = "en"
MIN_SILENCE_DURATION_MS = 1500
MIN_SILENCE_DURATION_MS = 1000
# Radio traffic is usually noisy, so we set the threshold to a high value
THRESHOLD = 0.9
# Supported window_size_samples: [512, 1024, 1536] for 16000 sampling_rate
WINDOW_SIZE_SAMPLES = 1536

# Run on GPU with FP16
model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float16")
# Run on GPU with FP32
model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="float32")

# or run on GPU with INT8
# model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="int8_float16")
@@ -29,15 +33,19 @@

segments, info = model.transcribe(
MP3_FILEPATH, beam_size=BEAM_SIZE, vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=MIN_SILENCE_DURATION_MS),
vad_parameters=dict(threshold=THRESHOLD,
min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
window_size_samples=WINDOW_SIZE_SAMPLES),
language=LANGUAGE)

segments = list(segments)
# Create a new list with the desired format
FORMATTED_SEGMENTS = [{"text": segment.text} for segment in segments]

# Extract the 'text' values and concatenate them into a single string
FORMATTED_TEXT = " ".join([segment['text'] for segment in FORMATTED_SEGMENTS])
FORMATTED_TEXT = " ".join(
[segment['text'].strip() for segment in FORMATTED_SEGMENTS]
)

# Create a dictionary with the concatenated text
FORMATTED_RESULT = {"text": FORMATTED_TEXT}
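
For clarity, a tiny standalone illustration of the formatting step above, using made-up segment texts:

segments = [{"text": " Unit 12 responding. "}, {"text": " Copy that. "}]
formatted_text = " ".join(seg["text"].strip() for seg in segments)
formatted_result = {"text": formatted_text}
print(formatted_result)  # {'text': 'Unit 12 responding. Copy that.'}
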
44 changes: 28 additions & 16 deletions src/fasterWhisper.cpp
@@ -1,42 +1,54 @@
#include <iostream>
#include <string>
#include <stdio.h>
#include <memory>
#include <array>
#include <memory>
// Include necessary headers for Windows
#ifdef _WIN32
#include <stdio.h>
#else
// POSIX headers for other platforms
#include <cstdio>
#endif

std::string trim(const std::string& str) {
const char* whitespace = " \t\n\r\f\v";
std::string trim(const std::string &str)
{
const char *whitespace = " \t\n\r\f\v";

size_t start = str.find_first_not_of(whitespace);
size_t end = str.find_last_not_of(whitespace);

return (start == std::string::npos || end == std::string::npos) ? "" : str.substr(start, end - start + 1);
}

std::string local_transcribe_audio(const std::string& mp3FilePath) {
// Command to execute the Python script
std::string local_transcribe_audio(const std::string &mp3FilePath)
{
std::string command = "python fasterWhisper.py " + mp3FilePath;

// Create a pipe to read the output of the executed command
std::array<char, 128> buffer;
std::string result;

// Use the appropriate popen and pclose functions based on the platform
#ifdef _WIN32
std::unique_ptr<FILE, decltype(&_pclose)> pipe(_popen(command.c_str(), "r"), _pclose);
#else
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(command.c_str(), "r"), pclose);
if (!pipe) {
#endif

if (!pipe)
{
throw std::runtime_error("popen() failed!");
}

// Read the output a line at a time
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
{
result += buffer.data();
}

// Find the start of the JSON object and return everything from this point
size_t jsonStartPos = result.find('{');
if (jsonStartPos != std::string::npos) {
std::string jsonResult = result.substr(jsonStartPos);
// Trim whitespace and newline characters
return trim(jsonResult);
if (jsonStartPos != std::string::npos)
{
return trim(result.substr(jsonStartPos));
}

return "";
return "MUCH_BROKEN_very_wow";
}
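
The stdout contract expected by this wrapper can also be checked from Python; a hedged sketch that mirrors local_transcribe_audio(), assuming the script prints its JSON result to stdout ("sample.mp3" is a placeholder):

import json
import subprocess

# Run the script and, like the C++ wrapper, keep everything from the first '{'
out = subprocess.run(
    ["python", "fasterWhisper.py", "sample.mp3"],
    capture_output=True, text=True, check=True,
).stdout
start = out.find("{")
payload = out[start:].strip() if start != -1 else ""
print(json.loads(payload)["text"] if payload else "no JSON found")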
