# Transcribe audio files (with speaker IDs)

### Overview

* Audio files (mp3) of interviews and podcasts in `data/audio`
* Diarization (assigning speaker IDs to segments of speech) using the Picovoice Falcon model and API
* Transcription of speech using OpenAI Whisper model
* **IMPORTANT - the OpenAI Whisper model accessed through the API has a 25 MB size limit on each audio file it will transcribe**
---

### Setup

In [64]:
import pvfalcon
from openai import OpenAI
from dotenv import load_dotenv

import re
import os
import shutil
import json
import glob

_ = load_dotenv()

#### Set up model access

In [65]:
# The Picovoice access key is a secret and must never be written into the
# notebook source. Supply it through the environment, e.g. an ACCESS_KEY=...
# line in the .env file read by load_dotenv() above.
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked/rotated in the Picovoice console.
if not os.environ.get('ACCESS_KEY'):
    raise RuntimeError(
        'ACCESS_KEY is not set: add your Picovoice access key to the '
        'environment or to the .env file before running this notebook'
    )

In [66]:
# Falcon diarization engine, authenticated with the ACCESS_KEY environment variable
falcon = pvfalcon.create(access_key=os.environ['ACCESS_KEY'])
# OpenAI client built with no explicit credentials
# NOTE(review): presumably it picks up OPENAI_API_KEY from the environment populated by load_dotenv() — confirm
client = OpenAI()

In [67]:
# parameters

# Root directory of the audio files; the mp3s are expected one directory
# level below it (the glob that follows matches `AUDIO_DIR/*/*.mp3`).
# NOTE: the functions in this notebook derive their output paths by replacing
# the `/audio/` component of each file path, so this directory must be named
# `audio`.
AUDIO_DIR = '../data/audio'

#### Get list of mp3 files in `AUDIO_DIR`

In [68]:
# glob returns matches in arbitrary (filesystem) order; sorting gives a
# stable processing order so logs are comparable between runs
mp3_files = sorted(glob.glob(f'{AUDIO_DIR}/*/*.mp3'))
len(mp3_files)

67

#### Fix filenames

* **NOTE**: file names should not contain spaces, commas, quotes, or parentheses
* Some of the mp3 files had long filenames with these characters
* The following code cell will clean up mp3 filenames

In [69]:
# Preprocessing to rename audio files to remove spaces etc.

# characters that are stripped outright from a file name
replace_RE = re.compile('[,()\'‘’]')

renamed_files = []
for m3 in mp3_files:
    # only the file name is cleaned; the directory part is left untouched
    dir_name, fname = os.path.split(m3)
    # drop unwanted characters, turn whitespace runs into '_', collapse '__'
    new_fname = re.sub(r'_+', '_', re.sub(r'\s+', '_', replace_RE.sub('', fname)))
    if fname != new_fname:
        new_path = os.path.join(dir_name, new_fname)
        shutil.move(m3, new_path)
        m3 = new_path
    renamed_files.append(m3)

# keep the working list in sync with what is now on disk, so later cells do
# not try to open paths that were just renamed away
mp3_files = renamed_files

In [70]:
mp3_files

['../data/audio/harris_interviews/Harris_60_Minutes.mp3',
 '../data/audio/harris_interviews/Harris_CNN_Part_1.mp3',
 '../data/audio/harris_interviews/Harris_CNN_Part_2.mp3',
 '../data/audio/harris_interviews/Harris_CNN_Part_3.mp3',
 '../data/audio/harris_interviews/Harris_Fox_News.mp3',
 '../data/audio/harris_interviews/Harris_NBC.mp3',
 '../data/audio/harris_podcasts/Harris_All_The_Smoke_Part_1.mp3',
 '../data/audio/harris_podcasts/Harris_All_The_Smoke_Part_2.mp3',
 '../data/audio/harris_podcasts/Harris_All_The_Smoke_Part_3.mp3',
 '../data/audio/harris_podcasts/Harris_Club_Shay_Shay_Part_1.mp3',
 '../data/audio/harris_podcasts/Harris_Club_Shay_Shay_Part_2.mp3',
 '../data/audio/harris_podcasts/Harris_Club_Shay_Shay_Part_3.mp3',
 '../data/audio/harris_podcasts/Harris_Howard_Stern_Part_1.mp3',
 '../data/audio/harris_podcasts/Harris_Howard_Stern_Part_2.mp3',
 '../data/audio/harris_podcasts/Harris_Howard_Stern_Part_3.mp3',
 '../data/audio/harris_podcasts/Harris_The_Breakfast_Club_Part_1.mp

## Key Functions

1. `get_speaker_ids(file_path)`
2. `get_transcript(file_path)`
3. `create_transcript(file_path, output='text')`

In [71]:
def get_speaker_ids(file_path):
    '''
    Get speaker IDs (diarization) for an audio (mp3) file using a picovoice model API call

    1. Derive the output path from `file_path` by replacing `/audio/` with
       `/raw_transcripts/` and `.mp3` with `_SPEAKERS.json`; if that file
       already exists the audio has been processed and nothing is done

    2. Otherwise call the API; on `pvfalcon.FalconError` the error is printed
       and the function gives up without writing anything

    3. Convert the returned segments into a list of dictionaries with the keys
       `speaker_tag`, `start_sec` and `end_sec`

    4. Save that list as a JSON file (the output directory is created if needed)

    Returns None in every case; the result is the JSON file on disk.
    '''

    filename = file_path.split('/')[-1]

    # 1. check if the diarization has already been run and saved
    diary_json = file_path.replace('/audio/', '/raw_transcripts/').replace('.mp3', '_SPEAKERS.json')
    if os.path.exists(diary_json):
        print(f"SPEAKER IDS ALREADY OBTAINED FOR {filename}")
        return

    # Create the destination *before* the API call, so that a missing
    # directory cannot throw away a completed diarization.
    out_dir = os.path.dirname(diary_json)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # 2. no JSON file yet, so call the picovoice model
    try:
        print(f'Calling picovoice model for {filename}')
        segments = falcon.process_file(file_path)
    except pvfalcon.FalconError as e:
        print('Error trying to call picovoice model', e)
        return

    # 3. convert the segment objects into plain dictionaries
    segment_list = [
        {
            'speaker_tag': segment.speaker_tag,
            'start_sec': segment.start_sec,
            'end_sec': segment.end_sec,
        }
        for segment in segments
    ]

    # 4. write segment data structure out to JSON file in `data/raw_transcripts`
    with open(diary_json, 'w') as out:
        out.write(json.dumps(segment_list, indent=4))
        

In [72]:
def get_transcript(file_path):
    '''
    Get a segment-level transcript of an audio (mp3) file from the OpenAI Whisper API

    1. Derive the output path from `file_path` by replacing `/audio/` with
       `/raw_transcripts/` and `.mp3` with `_TRANSCRIPT.json`; if that file
       already exists nothing is done

    2. Otherwise send the audio to the `whisper-1` model, asking for
       `verbose_json` output with segment timestamps; any exception raised by
       the call is printed and the function gives up without writing anything

    3. Save the response as a JSON file (the output directory is created if needed)

    Returns None in every case; the result is the JSON file on disk.
    '''
    filename = file_path.split('/')[-1]

    # 1. check if the whisper transcript has already been run and saved
    transcript_json = file_path.replace('/audio/', '/raw_transcripts/').replace('.mp3', '_TRANSCRIPT.json')
    if os.path.exists(transcript_json):
        print(f"WHISPER API TRANSCRIPT ALREADY OBTAINED FOR {filename}")
        return

    # 2. if no JSON file make call to Whisper API
    # `with` guarantees the audio handle is closed on every exit path
    with open(file_path, "rb") as audio_file:
        # Create the destination *before* the API call, so that a missing
        # directory cannot throw away a completed transcription.
        out_dir = os.path.dirname(transcript_json)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        try:
            print(f'Calling OpenAI Whisper API for {filename}')

            transcript = client.audio.transcriptions.create(
                file=audio_file,
                model="whisper-1",
                response_format="verbose_json",
                timestamp_granularities=["segment"]
            )
        except Exception as e:
            print('ERROR trying to get transcript from OpenAI Whisper API call', e)
            return

    # 3. save the response as JSON
    with open(transcript_json, 'w') as out:
        out.write(transcript.json())
        

In [73]:
def segment_score(transcript_segment, speaker_segment):
    '''
    Score how much of a transcript segment is covered by a speaker segment

    `transcript_segment` is a dictionary with `start` and `end` times and
    `speaker_segment` is a dictionary with `start_sec` and `end_sec` times.

    Returns the overlap between the two time spans divided by the length of
    the transcript segment. The value is negative when the spans do not
    overlap, and -1 when the transcript segment has zero length.
    '''
    t_start = transcript_segment["start"]
    t_end = transcript_segment["end"]
    s_start = speaker_segment["start_sec"]
    s_end = speaker_segment["end_sec"]

    # signed length of the intersection; negative means the spans are disjoint
    overlap = min(t_end, s_end) - max(t_start, s_start)
    try:
        return overlap / (t_end - t_start)
    except ZeroDivisionError:
        # zero-length transcript segment: score it below any real overlap
        return -1

In [74]:
def create_transcript(file_path, output='text'):
    '''
    Combine the saved speaker IDs and Whisper transcript of an audio file
    into a speaker-labelled transcript

    1. Locate the `_SPEAKERS.json` and `_TRANSCRIPT.json` files that belong to
       `file_path` (under `/raw_transcripts/`); if either is missing a message
       is printed and nothing is written

    2. Load both JSON files

    3. Give each transcript segment the speaker tag of the speaker segment
       with the highest `segment_score`; a segment that no speaker segment
       scores above 0 is labelled "UNKNOWN"

    4. Write the result under `/transcriptions/`: a `.json` file holding a
       list of `{"speaker", "text"}` dictionaries when `output` is 'json'
       (case-insensitive), otherwise a `.txt` file with one
       `SPEAKER <tag>:<TAB><text>` line per segment

    Returns None in every case; the result is the file on disk.
    '''

    # 1. Check for JSON files for speakers and transcript
    raw_path = file_path.replace('/audio/', '/raw_transcripts/')
    diary_json = raw_path.replace('.mp3', '_SPEAKERS.json')
    transcript_json = raw_path.replace('.mp3', '_TRANSCRIPT.json')

    if not os.path.exists(diary_json) or not os.path.exists(transcript_json):
        print('Cannot create transcript because TRANSCRIPT and/or SPEAKERS json file(s) missing')
        return

    # 2. Load JSON files for speaker ids and transcript segments
    with open(diary_json) as speakers_in:
        speaker_segments = json.load(speakers_in)
    with open(transcript_json) as transcript_in:
        transcript_segments = json.load(transcript_in)

    transcript = []

    # 3. combine speaker ids and transcript segments
    for t_segment in transcript_segments['segments']:
        # strict '>' against 0 means only a real (positive) overlap can win,
        # and the earliest speaker segment wins a tie
        max_score = 0
        best_s_segment = None
        for s_segment in speaker_segments:
            score = segment_score(t_segment, s_segment)
            if score > max_score:
                max_score = score
                best_s_segment = s_segment

        speaker = "UNKNOWN" if not best_s_segment else best_s_segment['speaker_tag']

        transcript.append({"speaker": speaker, "text": t_segment['text']})

    # 4. write the output
    # decide the format once, so the suffix and the content cannot disagree
    as_json = output.lower() == 'json'
    suffix = ".json" if as_json else ".txt"

    output_filename = file_path.replace('/audio/', '/transcriptions/').replace('.mp3', suffix)

    out_dir = os.path.dirname(output_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_filename, 'w') as out:
        if as_json:
            out.write(json.dumps(transcript, indent=4))
        else:
            transcript_txt = [f"SPEAKER {line['speaker']}:\t{line['text']}" for line in transcript]
            out.write('\n'.join(transcript_txt))

### Get speaker ids

In [75]:
# Run diarization for every mp3 in the list; get_speaker_ids returns early for
# files whose _SPEAKERS.json already exists, so re-running this cell only
# calls the API for files that are still missing.
for mp3_file in mp3_files:
    get_speaker_ids(mp3_file)

SPEAKER IDS ALREADY OBTAINED FOR Harris_60_Minutes.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_CNN_Part_1.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_CNN_Part_2.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_CNN_Part_3.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_Fox_News.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_NBC.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_1.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_2.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_3.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_Club_Shay_Shay_Part_1.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_Club_Shay_Shay_Part_2.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_Club_Shay_Shay_Part_3.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_Howard_Stern_Part_1.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_Howard_Stern_Part_2.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_Howard_Stern_Part_3.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_The_Breakfast_Club_Part_1.mp3
SPEAKER IDS ALREADY OBTA

## Get transcriptions using Whisper API

In [18]:
# Fetch a Whisper transcript for every mp3 in the list; get_transcript returns
# early for files whose _TRANSCRIPT.json already exists, so re-running this
# cell only calls the API for files that are still missing.
for mp3_file in mp3_files:
    get_transcript(mp3_file)

WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_60_Minutes.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_CNN_Part_1.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_CNN_Part_2.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_CNN_Part_3.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_Fox_News.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_NBC.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_1.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_2.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_3.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_Breakfast_Club.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_Club_Shay_Shay_Part_1.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_Club_Shay_Shay_Part_2.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_Club_Shay_Shay_Part_3.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_Howard_Stern.mp3
WHISPER API TRAN

In [21]:
# Build the speaker-labelled transcript for every mp3 in the list (default
# text output). create_transcript prints a message and skips a file when its
# SPEAKERS or TRANSCRIPT json is missing.
for mp3_file in mp3_files:
    create_transcript(mp3_file)

In [14]:
# Diarize the three parts of the Howard Stern podcast, in order.
for part in (1, 2, 3):
    get_speaker_ids(f'../data/audio/harris_podcasts/Harris_Howard_Stern_Part_{part}.mp3')

Calling picovoice model for Harris_Howard_Stern_Part_1.mp3
Calling picovoice model for Harris_Howard_Stern_Part_2.mp3
Calling picovoice model for Harris_Howard_Stern_Part_3.mp3


In [15]:
# Fetch Whisper transcripts for the three parts of the Howard Stern podcast, in order.
for part in (1, 2, 3):
    get_transcript(f'../data/audio/harris_podcasts/Harris_Howard_Stern_Part_{part}.mp3')

Calling OpenAI Whisper API for Harris_Howard_Stern_Part_1.mp3
Calling OpenAI Whisper API for Harris_Howard_Stern_Part_2.mp3
Calling OpenAI Whisper API for Harris_Howard_Stern_Part_3.mp3


In [16]:
# Build the labelled transcripts for the three parts of the Howard Stern podcast, in order.
for part in (1, 2, 3):
    create_transcript(f'../data/audio/harris_podcasts/Harris_Howard_Stern_Part_{part}.mp3')

In [17]:
# Full pipeline for the three parts of The Breakfast Club podcast.
# Each stage is run over all three parts before the next stage starts.
breakfast_club_parts = [
    f'../data/audio/harris_podcasts/Harris_The_Breakfast_Club_Part_{part}.mp3'
    for part in (1, 2, 3)
]

for stage in (get_speaker_ids, get_transcript, create_transcript):
    for part_path in breakfast_club_parts:
        stage(part_path)

Calling picovoice model for Harris_The_Breakfast_Club_Part_1.mp3
Calling picovoice model for Harris_The_Breakfast_Club_Part_2.mp3
Calling picovoice model for Harris_The_Breakfast_Club_Part_3.mp3
Calling OpenAI Whisper API for Harris_The_Breakfast_Club_Part_1.mp3
Calling OpenAI Whisper API for Harris_The_Breakfast_Club_Part_2.mp3
Calling OpenAI Whisper API for Harris_The_Breakfast_Club_Part_3.mp3


In [76]:
# Full pipeline for the three parts of the All The Smoke podcast.
# Each stage is run over all three parts before the next stage starts.
all_the_smoke_parts = [
    f'../data/audio/harris_podcasts/Harris_All_The_Smoke_Part_{part}.mp3'
    for part in (1, 2, 3)
]

for stage in (get_speaker_ids, get_transcript, create_transcript):
    for part_path in all_the_smoke_parts:
        stage(part_path)

SPEAKER IDS ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_1.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_2.mp3
SPEAKER IDS ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_3.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_1.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_2.mp3
WHISPER API TRANSCRIPT ALREADY OBTAINED FOR Harris_All_The_Smoke_Part_3.mp3
