# Read Audio

In [48]:
import whisper
import os
from io import BytesIO
import jose.utils
import jose.jws
import pandas as pd
import jose.jwe

In [16]:
file_path = './audio/a.mp3'

#### Read and encode audio

In [21]:
def read_audio(recordingName):
    with open(recordingName, 'rb') as file_handle:
        recording_bytes = file_handle.read()
        encoded_bytes = jose.utils.base64url_encode(recording_bytes).decode('utf-8')
    # return byte form
    return bytes(encoded_bytes,'utf-8')

In [20]:
enc_audio = read_audio(file_path)
enc_audio[:5]

b'SUQzB'

#### Transcribe audio

#### Load model

In [43]:
model = whisper.load_model("base")

In [44]:
decoded_body = jose.utils.base64url_decode(enc_audio)

#### Save a temporary file

In [45]:
tmp_file = open("./_temp_file", 'wb')
tmp_file.write(decoded_body)
tmp_file.close()

In [46]:
%%time
transcription = model.transcribe('./_temp_file', fp16=False,word_timestamps=True,language='English')
transcribed_text = transcription['text']
transcribed_word_timestamps = transcription['segments'][0]['words']

CPU times: user 8.09 s, sys: 4.59 s, total: 12.7 s
Wall time: 2.62 s


In [64]:
transcribed_word_timestamps[:10]

[{'word': ' Hey', 'start': 0.0, 'end': 1.14, 'probability': 0.694226086139679},
 {'word': ' my',
  'start': 1.14,
  'end': 2.04,
  'probability': 0.29764410853385925},
 {'word': ' name',
  'start': 2.04,
  'end': 2.28,
  'probability': 0.996715784072876},
 {'word': ' is',
  'start': 2.28,
  'end': 2.46,
  'probability': 0.9935603141784668},
 {'word': ' John',
  'start': 2.46,
  'end': 2.74,
  'probability': 0.7001069784164429},
 {'word': ' my',
  'start': 2.74,
  'end': 3.96,
  'probability': 0.44776391983032227},
 {'word': ' phone',
  'start': 3.96,
  'end': 4.26,
  'probability': 0.9741031527519226},
 {'word': ' number',
  'start': 4.26,
  'end': 4.56,
  'probability': 0.9960719347000122},
 {'word': ' is',
  'start': 4.56,
  'end': 4.88,
  'probability': 0.9915987253189087},
 {'word': ' 813',
  'start': 4.88,
  'end': 5.78,
  'probability': 0.8989248275756836}]

#### Redaction

In [55]:
from transformers import pipeline

In [56]:
gen = pipeline("token-classification", "lakshyakh93/deberta_finetuned_pii", device=-1)

In [57]:
output = gen(transcribed_text, aggregation_strategy="first")



In [61]:
words_to_silence = [i['word'] for i in output]
words_to_silence

[' John', ' 813-567-980', ' Lake Viox']

In [103]:
timetamps_silence = [((j['start']-0.05)*1000,(j['end']+0.2)*1000) for j in transcribed_word_timestamps if j['word'] in words_to_silence]

In [104]:
timetamps_silence

[(2410.0, 2840.0000000000005)]

In [105]:
from pydub import AudioSegment

In [106]:
audio = AudioSegment.from_file(file_path)

In [107]:
for start, end in timetamps_silence:
    silence = AudioSegment.silent(duration=end - start)
    audio = audio[:start] + silence + audio[end:]

In [108]:
audio.export("./audio/redacted_a.mp3", format="mp3")


<_io.BufferedRandom name='./audio/redacted_a.mp3'>