In [None]:
# @title Run this cell first - it will load every required package/model { display-mode: "form" }

import subprocess

try:
  import whisper as wp
except ImportError:
  ! pip install -U openai-whisper

try:
  from simple_diarizer.diarizer import Diarizer
except ImportError:
  ! pip install simple-diarizer

try:
  subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
except subprocess.CalledProcessError:
  ! apt install ffmpeg
  ! pip install ffmpeg

# ----------------------------------------------------------------
# ----------------------MAIN SETUP--------------------------------
# ----------------------------------------------------------------

import whisper as wp
import pandas as pd
from simple_diarizer.diarizer import Diarizer
import os

# Whisper checkpoint shared by convert_audio_to_dataframe(); loaded once here.
WHISPER_MODEL_NAME = 'large-v2'
model = wp.load_model(WHISPER_MODEL_NAME)

def perform_diarization(audio_path, num_speakers):
    """Run speaker diarization on ``audio_path`` and number the speakers.

    Args:
        audio_path: Path of the audio file handed to ``Diarizer.diarize``.
        num_speakers: Number of speakers passed through to the diarizer.

    Returns:
        A DataFrame built from the diarizer's segments, with an extra
        ``speaker`` column: each distinct ``label`` is replaced by a 1-based
        number assigned in order of first appearance.
    """
    segments_df = pd.DataFrame(
        Diarizer(embed_model='xvec', cluster_method='sc').diarize(
            audio_path, num_speakers=num_speakers
        )
    )

    # First label seen becomes speaker 1, the next new label speaker 2, ...
    labels_in_order = segments_df['label'].drop_duplicates()
    speaker_dict = {
        label: position + 1
        for position, label in enumerate(labels_in_order)
    }

    segments_df['speaker'] = segments_df['label'].replace(speaker_dict)
    return segments_df

def convert_audio_to_dataframe(audio_path):
    """Transcribe ``audio_path`` with the module-level Whisper ``model``.

    The input is first converted by ``preprocess_audio`` (which writes
    'mono.wav' in the working directory); that file is then transcribed as
    Russian speech with timestamps enabled.

    Returns:
        A DataFrame built from the ``'segments'`` entry of the transcription
        result (one row per segment).
    """
    preprocess_audio(audio_path)

    transcribe_options = dict(
        fp16=False,
        language='Russian',
        verbose=True,
        without_timestamps=False,
        initial_prompt="Здравствуйте, добро пожаловать!",
    )
    whisper_output = model.transcribe('mono.wav', **transcribe_options)

    return pd.DataFrame(whisper_output['segments'])

def preprocess_audio(input_path):
    """Convert ``input_path`` to 16 kHz mono 16-bit PCM and write it to 'mono.wav'.

    The command is passed to ffmpeg as an argument list (no shell), so paths
    containing spaces or shell metacharacters are handled literally.

    Args:
        input_path: Path of the audio file to convert.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            (e.g. unreadable input). Failing here prevents a stale 'mono.wav'
            from an earlier run being used silently.
    """
    command = [
        'ffmpeg', '-y',
        '-i', str(input_path),
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        'mono.wav',
    ]
    subprocess.run(command, check=True)

def format_transcript(row):
    """Render one row as ``'Speaker <speaker>: <text>'``.

    Args:
        row: Any mapping-like object supporting ``row['text']`` (a string)
            and ``row['speaker']``.

    Returns:
        The formatted line; newline characters inside the text are removed.
    """
    # Splitting on '\n' and re-joining with '' strips every line break.
    text = ''.join(row['text'].split('\n'))
    speaker = row['speaker']
    return f'Speaker {speaker}: {text}'

def transcribe_audio(audio_path, num_speakers):
    """Transcribe an audio file and prefix each speech turn with its speaker.

    Steps: transcribe the file, diarize the converted 'mono.wav', attach each
    diarization segment's speaker to the transcript segment whose start time
    is closest, propagate speakers to unlabeled transcript segments, then
    merge consecutive segments of the same speaker into one line.

    Args:
        audio_path: Path of the audio file to transcribe.
        num_speakers: Number of speakers passed to ``perform_diarization``.

    Returns:
        One ``'Speaker N: text'`` line per speech turn, joined with newlines.
        An empty string is returned when the transcript has no segments.
    """
    try:
        # convert_audio_to_dataframe() runs preprocess_audio() itself, so the
        # conversion to 'mono.wav' is not repeated here.
        transcript_df = convert_audio_to_dataframe(audio_path)
        if transcript_df.empty:
            return ''
        segments_df = perform_diarization('mono.wav', num_speakers)
    finally:
        # Always clean up the temporary file, even when a step above failed.
        if os.path.exists('mono.wav'):
            os.remove('mono.wav')

    # Guarantee a 0..n-1 index so idxmin() labels can be used with .loc.
    transcript_df = transcript_df.reset_index(drop=True)
    transcript_df['speaker'] = float('nan')

    for segment in segments_df[['start', 'speaker']].to_dict(orient='records'):
        nearest_row = (transcript_df['start'] - segment['start']).abs().idxmin()
        # A later diarization segment overrides an earlier one on the same row.
        transcript_df.loc[nearest_row, 'speaker'] = segment['speaker']

    # Forward-fill carries a speaker to the following unlabeled rows; the
    # backward-fill covers rows before the first labeled one, which would
    # otherwise stay NaN and be dropped by the groupby below.
    transcript_df['speaker'] = transcript_df['speaker'].ffill().bfill()

    # A new "speech" (turn) starts whenever the speaker differs from the previous row.
    speaker_changed = transcript_df['speaker'] != transcript_df['speaker'].shift(1)
    transcript_df['speech'] = speaker_changed.cumsum()

    grouped_df = (
        transcript_df.groupby(['speech', 'speaker'])['text']
        .apply('\n'.join)
        .reset_index()
    )
    grouped_df['speaker'] = grouped_df['speaker'].astype(int)

    lines = [
        format_transcript({'speaker': speaker, 'text': text})
        for speaker, text in zip(grouped_df['speaker'], grouped_df['text'])
    ]

    return '\n'.join(lines)

def audio_folder_organize(folder_path):
    """List the audio files in a folder together with their transcript paths.

    Files are matched by extension ('.mp3' or '.wav', case-insensitively) and
    returned in sorted name order so repeated runs process them identically.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A tuple ``(audio_dirs, text_dirs)`` of equal-length lists: the full
        path of each audio file, and the same path with its extension replaced
        by '.txt'.
    """
    audio_extensions = ('.mp3', '.wav')

    audio_dirs = [
        os.path.join(folder_path, file_name)
        for file_name in sorted(os.listdir(folder_path))
        if file_name.lower().endswith(audio_extensions)
    ]

    # splitext drops the real extension instead of assuming it is 3 characters.
    text_dirs = [os.path.splitext(audio_path)[0] + '.txt' for audio_path in audio_dirs]

    return audio_dirs, text_dirs

In [None]:
# @title Audio Transcription with Speaker Diarization { display-mode: "form" }

#@markdown *Specify the number of speakers present in your audio file*

num_speakers = 2 #@param [2,3,4,5,6,7,8,9]

#@markdown *Choose whether to transcribe a single file or ALL audio files IN A FOLDER*

from_file_or_folder = "Audio File" #@param ["Audio File","Folder with Audio Files"]

#@markdown *Copy the path to the Audio File or to the Folder (according to the previous choice)*

path = "/content/untitled.wav" #@param {type:"string"}

#@markdown **ATTENTION!** RULES FOR YOUR AUDIO FILES
#@markdown *   Upload your files either into Colab Session Storage or (recommended) mount your Google Drive to use files from your own storage
#@markdown *   Google Colab can sometimes disconnect you, so it is better to use your own Drive folder so that you do not lose files that were already transcribed
#@markdown *   Only WAV and MP3 formats are acceptable, use online converters if you have something else
#@markdown *   Do not use the same names for the audio files in the folder
#@markdown *   Remove all the spaces and use only English letters in the names of the audio files e.g. <s>"1 - Иван Иванов - Интервью.mp3"</s> --> "1-Ivan_Ivanov-Interview.mp3"


# ----------------------------------------------------------------
# ----------------------RUN TRANSCRIPTION-------------------------
# ----------------------------------------------------------------

# Build the list of (audio file, transcript file) pairs to process.
if from_file_or_folder == "Audio File":
  # Single-file mode: the transcript is written next to the audio file.
  # splitext replaces the real extension rather than assuming 3 characters.
  audio_dirs = [path]
  text_dirs = [os.path.splitext(path)[0] + '.txt']
else:
  audio_dirs, text_dirs = audio_folder_organize(path)

# Each transcript is written as soon as its file is done, so earlier results
# are already on disk if a later file fails or the session ends.
for audio_file, text_file in zip(audio_dirs, text_dirs):
  result = transcribe_audio(audio_file, num_speakers)
  with open(text_file, 'w', encoding='utf-8') as f:
    f.write(result)