###### Prerequisite: install ffmpeg with conda (`conda install ffmpeg`)

In [None]:
!pip install pydantic==1.10.8 #need to upgrade to successfully import pyannote audio

In [None]:
!pip install -q git+https://github.com/pyannote/pyannote-audio.git #for diarization
!pip install -q git+https://github.com/openai/whisper.git #for transcription
!pip install protobuf==3.20.* #need to downgrade protobuf for pyannote to work

In [None]:
import whisper
import subprocess
import torch
import wave
import contextlib

In [None]:
# Run this cell a second time if the first run raises an error
import pyannote.audio

from pyannote.audio import Audio
from pyannote.core import Segment

In [None]:
import ffmpeg
import os
import pandas as pd
from pydub import AudioSegment

In [None]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import datetime as dt
from datetime import timedelta

### Convert mp3,m4a,ogg,flac to wav
##### Input: audio file path
##### Output: wav file path

In [None]:
def prepare_voice_file(path: str) -> str:
    """
    Return the path of a WAV version of the given audio file.

    A file that already has a ``.wav`` extension is returned as-is (it is not
    re-encoded or downmixed). ``.mp3``, ``.m4a``, ``.ogg`` and ``.flac`` files
    are decoded with pydub, reduced to a single channel and exported next to
    the source file with the extension replaced by ``.wav``.

    The extension is matched case-insensitively, so ``CALL.MP3`` is handled the
    same way as ``call.mp3``.

    Parameters
    ----------
    path : str
        Path of the input audio file.

    Returns
    -------
    str
        Path of the WAV file.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported formats.
    """
    root, ext = os.path.splitext(path)
    ext_lower = ext.lower()

    if ext_lower == '.wav':
        return path

    if ext_lower in ('.mp3', '.m4a', '.ogg', '.flac'):
        # pydub expects the container format without the leading dot.
        audio_file = AudioSegment.from_file(path, format=ext_lower[1:])
        audio_file = audio_file.set_channels(1)
        wav_file = root + '.wav'
        audio_file.export(wav_file, format='wav')
        return wav_file

    raise ValueError(f'Unsupported audio format: {ext}')

In [None]:
# wav_file=prepare_voice_file(r"D:\Downloads\mara_PSMDS\Call_Center_Conversation_(chargeback_refund)_(192_kbps).mp3")

### Transcription
##### Input: wav file path
##### Output: model transcription

In [None]:
# Transcriber settings: by default the small, English-only Whisper model is used.

#num_speakers = 2 #@param {type:"integer"}

language = 'English' #@param ['any', 'English']

#model_size = 'large' #@param ['tiny', 'base', 'small', 'medium', 'large']
model_size = 'small' #@param ['tiny', 'base', 'small', 'medium', 'large']

# English-only checkpoints are selected with a '.en' suffix; 'large' is left
# without the suffix.
model_name = model_size
if language == 'English' and model_size != 'large':
  model_name += '.en'

# Load the checkpoint chosen above (pass the variable, not the literal string 'model_size').
model = whisper.load_model(model_name)

In [None]:
def transcribe(wav_file: str):
    """
    Transcribe a WAV file with the module-level ``model``.

    Parameters
    ----------
    wav_file : str
        Path of the audio file handed to ``model.transcribe``.

    Returns
    -------
    Whatever ``model.transcribe`` returns for that file.
    """
    return model.transcribe(wav_file)

In [None]:
#result=transcribe(wav_file)
#result['text']

### Speaker diarization
##### Input: wav file path, result from transcription
##### Output: dataframe with speaker, text and time stamp

In [None]:
# Speaker diarization is done with a pretrained speaker-embedding model plus clustering.
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# The Hugging Face access token is read from the HF_TOKEN environment variable
# rather than being hard-coded in the notebook. If the variable is unset, None
# is passed (presumably falling back to locally cached credentials — TODO confirm).
# NOTE(review): the token that used to be embedded in this cell has been
# exposed and should be revoked.
embedding_model = PretrainedSpeakerEmbedding(
    "pyannote/embedding", use_auth_token=os.environ.get("HF_TOKEN"))

Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.2.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\kim\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.


In [None]:
def _format_hms(secs):
    """Format a number of seconds (rounded to the nearest second) as ``HH:MM:SS``."""
    total_seconds = int(round(secs))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'


def _merge_consecutive_speakers(segments):
    """Collapse runs of consecutive segments with the same speaker into single rows."""
    turns = []
    for segment in segments:
        time_end = _format_hms(segment['end'])
        if turns and turns[-1]['Speaker'] == segment['speaker']:
            # Same speaker keeps talking: extend the current turn.
            turns[-1]['Text'] += segment['text']
            turns[-1]['Time End'] = time_end
        else:
            turns.append({
                'Speaker': segment['speaker'],
                'Text': segment['text'],
                'Time Start': _format_hms(segment['start']),
                'Time End': time_end,
            })
    return turns


def diarize(wav_file, result, num_speakers=2):
    """
    Attribute each transcribed segment to a speaker and merge consecutive turns.

    Every segment is cropped from the audio, embedded with the module-level
    ``embedding_model`` and the embeddings are grouped with agglomerative
    clustering. Consecutive segments of the same speaker are then merged.

    Parameters
    ----------
    wav_file : str
        Path of the WAV file (opened with the ``wave`` module to get its duration).
    result : dict
        Transcription result; ``result["segments"]`` must be a list of dicts
        with ``"start"``, ``"end"`` (seconds) and ``"text"`` keys. Each segment
        dict gets a ``"speaker"`` key added in place.
    num_speakers : int, default 2
        Number of speaker clusters to form (capped at the number of segments).

    Returns
    -------
    pandas.DataFrame
        One row per speaker turn with the columns ``Speaker``, ``Text``,
        ``Time Start`` and ``Time End`` (times formatted as ``HH:MM:SS``).
        Empty when there are no segments.
    """
    segments = result["segments"]
    min_clip_seconds = 0.1  # the embedding step errors out on shorter clips

    if len(segments) >= 2 and num_speakers >= 2:
        audio = Audio()

        # Audio duration, needed to clamp segment boundaries to the file.
        with contextlib.closing(wave.open(wav_file, 'r')) as f:
            duration = f.getnframes() / float(f.getframerate())

        # embed passed audio segment ---> revisit for real-time use case
        def segment_embedding(segment):
            # Whisper overshoots the end timestamp in the last segment.
            end = min(duration, segment["end"])
            start = min(segment["start"], end)
            if end - start < min_clip_seconds:
                # Widen too-short clips, staying inside the file bounds.
                end = min(duration, start + min_clip_seconds)
                start = max(0.0, end - min_clip_seconds)
            waveform, _sample_rate = audio.crop(wav_file, Segment(start, end))
            return embedding_model(waveform[None])

        # One embedding row per segment (each segment is embedded exactly once).
        embeddings = np.asarray(
            np.vstack([segment_embedding(segment) for segment in segments]),
            dtype=float)
        embeddings = np.nan_to_num(embeddings)

        # Clustering for speaker identification.
        n_clusters = min(num_speakers, len(segments))
        labels = AgglomerativeClustering(n_clusters=n_clusters).fit(embeddings).labels_
    else:
        # Nothing to cluster: zero or one segment, or a single speaker requested.
        labels = [0] * len(segments)

    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)

    return pd.DataFrame(
        _merge_consecutive_speakers(segments),
        columns=['Speaker', 'Text', 'Time Start', 'Time End'])

In [None]:
#transcript=diarize(wav_file, result)
#transcript

In [None]:
def makecsv(wav_file, transcript):
    """
    Save a transcript as ``<audio file name>.csv`` in the current working directory.

    Parameters
    ----------
    wav_file : str
        Path of the audio file; only its base name without extension is used.
    transcript : pandas.DataFrame
        Table to write (the index is written as well).
    """
    stem, _extension = os.path.splitext(os.path.basename(wav_file))
    transcript.to_csv(f'{stem}.csv')

In [None]:
#makecsv(wav_file, transcript)

### Next: define "transcribe" function, write to csv using audio filename, loop for all mp3 files in folder, apply NER

In [None]:
# Transcription pipeline for a single recording.
# NOTE(review): the input path below is an absolute, machine-specific location.
# The globals wav_file, result and transcript are reused by later cells.
# 1. Prepare the WAV file
wav_file=prepare_voice_file(r"D:\Downloads\mara_PSMDS\Call_Center_Conversation_(chargeback_refund)_(192_kbps).mp3")
# 2. Transcribe the WAV file
result=transcribe(wav_file)
# 3. Diarization: build the speaker / text / time-stamp table
transcript=diarize(wav_file, result)
# 4. Write the table to a CSV named after the audio file
makecsv(wav_file, transcript)

### Lei's code

In [None]:
# Transcribe every WAV recording in a folder with Google speech recognition
# and extract the named entities from each transcript.
import os
import speech_recognition as sr
import pandas as pd
import spacy

# Load English NER model
nlp = spacy.load("en_core_web_sm")

# Initialize the recognizer
r = sr.Recognizer()

# List to store data for all call recordings
data = []

# Directory containing all audio files. A raw string is used so the
# backslashes of the Windows path are never read as escape sequences.
directory = r"D:\Downloads\mara_PSMDS\Recordings"

# Iterate over each audio file in the directory (sorted for a reproducible order)
for filename in sorted(os.listdir(directory)):
    # Match the extension case-insensitively so '.WAV' files are not skipped
    if not filename.lower().endswith(".wav"):
        continue

    filepath = os.path.join(directory, filename)
    print("Processing:", filename)

    # Load audio file
    with sr.AudioFile(filepath) as source:
        audio = r.record(source)

    # Recognize speech; a failure on one file is reported and the loop moves on
    try:
        s = r.recognize_google(audio)

        # Perform Named Entity Recognition
        doc = nlp(s)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Append data to the list
        data.append({'Filename': filename, 'Text': s, 'Entities': entities})

    except Exception as e:
        print("Exception:", str(e))

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Print the DataFrame
print(df.head())

### Code with Loop for all recordings

In [None]:
# Run the whole pipeline (convert -> transcribe -> diarize -> CSV) on every
# mp3/m4a/ogg/flac recording in the folder. A raw string is used so the
# backslashes of the Windows path are never read as escape sequences.
directory = r"D:\Downloads\mara_PSMDS\Recordings"

# Sorted so the files are processed in a reproducible order
for filename in sorted(os.listdir(directory)):
    if os.path.splitext(filename)[1] in ('.mp3', '.m4a', '.ogg', '.flac'):
        # convert to wav
        wav_file = prepare_voice_file(os.path.join(directory, filename))
        # transcribe wav file
        result = transcribe(wav_file)
        # diarization
        transcript = diarize(wav_file, result)
        # write to csv
        makecsv(wav_file, transcript)

In [None]:
# Inspect the segments of the most recent transcription result
result['segments']

[{'id': 0,
  'seek': 0,
  'start': 0.0,
  'end': 10.0,
  'text': ' Thank you for calling Coats and Gounds, my name is Sam, how can I help you?',
  'tokens': [50364,
   1044,
   291,
   337,
   5141,
   3066,
   1720,
   293,
   460,
   4432,
   11,
   452,
   1315,
   307,
   4832,
   11,
   577,
   393,
   286,
   854,
   291,
   30,
   50864],
  'temperature': 0.0,
  'avg_logprob': -0.3494122603843952,
  'compression_ratio': 1.3133333333333332,
  'no_speech_prob': 0.11384373903274536},
 {'id': 1,
  'seek': 0,
  'start': 10.0,
  'end': 20.240000000000002,
  'text': ' Oh yes, I bought a coat from you guys but I need to return it because it was the wrong',
  'tokens': [50864,
   876,
   2086,
   11,
   286,
   4243,
   257,
   10690,
   490,
   291,
   1074,
   457,
   286,
   643,
   281,
   2736,
   309,
   570,
   309,
   390,
   264,
   2085,
   51376],
  'temperature': 0.0,
  'avg_logprob': -0.3494122603843952,
  'compression_ratio': 1.3133333333333332,
  'no_speech_prob': 0.113843

In [None]:
# Re-run diarization on the most recent wav_file / result pair
transcript=diarize(wav_file, result)

In [None]:
# Display the diarized transcript
transcript

Unnamed: 0,Speaker,Text,Time Start,Time End
0,SPEAKER 1,"Thank you for calling Coats and Gounds, my na...",00:00:00,00:00:24
1,SPEAKER 2,Seems like it does not apply to my account ye...,00:00:31,00:00:43
2,SPEAKER 1,"Alright, please wait for a moment. Okay, hold...",00:00:44,00:00:56
3,SPEAKER 2,"Just to verify that 017 1425 793, correct? Ye...",00:00:57,00:01:21
4,SPEAKER 1,"Yeah, I'm not sure about a 07 or 3.0 something.",00:01:23,00:01:23
5,SPEAKER 2,"Anyway, it's Adam Wilson. Alright, thank you....",00:01:28,00:01:51
6,SPEAKER 1,Not sure. Is it the 74391?,00:01:58,00:01:59
7,SPEAKER 2,That's actually the return authorization from...,00:02:04,00:02:16
8,SPEAKER 1,"Okay, I probably have to dig it up. Okay, I i...",00:02:19,00:02:31
9,SPEAKER 2,"Anyway, I tried to search for your name and n...",00:02:33,00:02:36


In [None]:
# Write the transcript to a CSV named after the audio file
makecsv(wav_file, transcript)