# Read Audio

In [13]:
import whisper
import os
from io import BytesIO
import jose.utils
import jose.jws
import pandas as pd
import jose.jwe

In [14]:
file_path = './audio/a.mp3'

#### Read and encode audio

In [15]:
def read_audio(recordingName):
    with open(recordingName, 'rb') as file_handle:
        recording_bytes = file_handle.read()
        encoded_bytes = jose.utils.base64url_encode(recording_bytes).decode('utf-8')
    # return byte form
    return bytes(encoded_bytes,'utf-8')

In [16]:
enc_audio = read_audio(file_path)
enc_audio[:5]

b'SUQzB'

#### Transcribe audio

#### Load model

In [17]:
model = whisper.load_model("base")

In [18]:
decoded_body = jose.utils.base64url_decode(enc_audio)

#### Save a temporary file

In [19]:
tmp_file = open("./_temp_file", 'wb')
tmp_file.write(decoded_body)
tmp_file.close()

In [20]:
%%time
transcription = model.transcribe('./_temp_file', fp16=False,word_timestamps=True,language='English')
transcribed_text = transcription['text']
transcribed_word_timestamps = transcription['segments'][0]['words']

CPU times: user 8.3 s, sys: 4.59 s, total: 12.9 s
Wall time: 3.05 s


In [21]:
transcribed_word_timestamps[:10]

[{'word': ' Hey', 'start': 0.0, 'end': 1.14, 'probability': 0.694226086139679},
 {'word': ' my',
  'start': 1.14,
  'end': 2.04,
  'probability': 0.29764410853385925},
 {'word': ' name',
  'start': 2.04,
  'end': 2.28,
  'probability': 0.996715784072876},
 {'word': ' is',
  'start': 2.28,
  'end': 2.46,
  'probability': 0.9935603141784668},
 {'word': ' John',
  'start': 2.46,
  'end': 2.74,
  'probability': 0.7001069784164429},
 {'word': ' my',
  'start': 2.74,
  'end': 3.96,
  'probability': 0.44776391983032227},
 {'word': ' phone',
  'start': 3.96,
  'end': 4.26,
  'probability': 0.9741031527519226},
 {'word': ' number',
  'start': 4.26,
  'end': 4.56,
  'probability': 0.9960719347000122},
 {'word': ' is',
  'start': 4.56,
  'end': 4.88,
  'probability': 0.9915987253189087},
 {'word': ' 813',
  'start': 4.88,
  'end': 5.78,
  'probability': 0.8989248275756836}]

#### Redaction

In [55]:
from transformers import pipeline

In [56]:
gen = pipeline("token-classification", "lakshyakh93/deberta_finetuned_pii", device=-1)

In [57]:
output = gen(transcribed_text, aggregation_strategy="first")



In [61]:
words_to_silence = [i['word'] for i in output]
words_to_silence

[' John', ' 813-567-980', ' Lake Viox']

In [109]:
timetamps_silence = [((j['start']-0.05)*1000,(j['end']+0.2)*1000) for j in transcribed_word_timestamps if j['word'] in words_to_silence]

In [110]:
timetamps_silence

[(2410.0, 2940.0000000000005)]

In [111]:
from pydub import AudioSegment

In [112]:
audio = AudioSegment.from_file(file_path)

In [119]:
timetamps_silence

[(2410.0, 2940.0000000000005)]

In [129]:
import re
def replace_special_characters_with_space(text):
    # This will replace all non-alphanumeric characters with spaces
    # If you want to keep underscores, remove the underscore from the regex pattern
    return re.sub(r'\W|_', ' ', text)
words_to_silence = [replace_special_characters_with_space(i).split() for i in words_to_silence]

TypeError: expected string or bytes-like object

In [140]:
words_to_silence = [j for i in words_to_silence for j in i]

In [143]:
silence_segments = []

for ts in transcribed_word_timestamps:
    # Normalize the word for matching (strip leading/trailing spaces and convert to lowercase)
    normalized_word = ts['word'].strip().lower()
    if any(word[0].strip().lower() in normalized_word for word in words_to_silence):
        silence_segments.append((ts['start']*1000, ts['end']*1000))

silence_segments

[(2460.0, 2740.0),
 (4880.0, 5780.0),
 (5780.0, 7300.0),
 (7300.0, 8600.0),
 (10100.0, 10340.0),
 (10520.0, 10700.0),
 (10700.0, 11240.0)]

In [146]:
delta_start = 0.0
delta_end = 0.05
for start, end in silence_segments:
    start += delta_start
    end += delta_end
    silence = AudioSegment.silent(duration=end - start)
    audio = audio[:start] + silence + audio[end:]

In [147]:
audio.export("./audio/redacted_a.mp3", format="mp3")


<_io.BufferedRandom name='./audio/redacted_a.mp3'>

In [115]:
words_to_silence

[' John', ' 813-567-980', ' Lake Viox']

In [116]:
transcribed_word_timestamps

[{'word': ' Hey', 'start': 0.0, 'end': 1.14, 'probability': 0.694226086139679},
 {'word': ' my',
  'start': 1.14,
  'end': 2.04,
  'probability': 0.29764410853385925},
 {'word': ' name',
  'start': 2.04,
  'end': 2.28,
  'probability': 0.996715784072876},
 {'word': ' is',
  'start': 2.28,
  'end': 2.46,
  'probability': 0.9935603141784668},
 {'word': ' John',
  'start': 2.46,
  'end': 2.74,
  'probability': 0.7001069784164429},
 {'word': ' my',
  'start': 2.74,
  'end': 3.96,
  'probability': 0.44776391983032227},
 {'word': ' phone',
  'start': 3.96,
  'end': 4.26,
  'probability': 0.9741031527519226},
 {'word': ' number',
  'start': 4.26,
  'end': 4.56,
  'probability': 0.9960719347000122},
 {'word': ' is',
  'start': 4.56,
  'end': 4.88,
  'probability': 0.9915987253189087},
 {'word': ' 813',
  'start': 4.88,
  'end': 5.78,
  'probability': 0.8989248275756836},
 {'word': '-567',
  'start': 5.78,
  'end': 7.3,
  'probability': 0.7167696058750153},
 {'word': '-980', 'start': 7.3, 'end'