In [25]:
import os
import pandas as pd
import numpy as np
import wave
import whisper
import contextlib
from sklearn.cluster import AgglomerativeClustering
from pydub import AudioSegment
from pyannote.audio import Audio
from pyannote.core import Segment
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from collections import Counter

torchvision is not available - cannot save figures


In [26]:
def prepare_voice_file(path: str) -> str:
    file_name = os.path.basename(path)
    processed_dir = "Recordings/processed/"
    os.makedirs(processed_dir, exist_ok=True)
    
    if os.path.splitext(path)[1] == '.wav':
        print(f'File {file_name} already in wav format')
        return path
    elif os.path.splitext(path)[1] in ('.mp3', '.m4a', '.ogg', '.flac'):
        audio_file = AudioSegment.from_file(path, format=os.path.splitext(path)[1][1:])
        audio_file=audio_file.set_channels(1)
        wav_file = processed_dir + os.path.splitext(file_name)[0] + '.wav'
        audio_file.export(wav_file, format='wav')
        print(f'Converted {path} to {wav_file}')
        return wav_file
    else:
        raise ValueError(
            f'Unsupported audio format: {format(os.path.splitext(path)[1])}')
    
def transcribe_wav_file(model_size, wav_file):
    language = 'English'
    model_name = model_size
    if language == 'English' and model_size != 'large':
        model_name += '.en'

        model = whisper.load_model(model_name)
        result = model.transcribe(wav_file)

    else:
        pass

    return result

def diarize(wav_file, result):
    audio=Audio()
    segments=result["segments"]
    #get audio framerate and duration
    with contextlib.closing(wave.open(wav_file,'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)

    embedding_model = PretrainedSpeakerEmbedding("pyannote/embedding", use_auth_token='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')

    #embed passed audio segment ---> revisit for real-time use case
    def segment_embedding(segment):
        start = segment["start"]
        # Whisper overshoots the end timestamp in the last segment
        end = min(duration, segment["end"])
        # diarization results to error when input segment is too short
        if(end-start<0.1): end=end+(0.1-(end-start))
        clip = Segment(start, end)
        waveform, sample_rate = audio.crop(wav_file, clip)
        return embedding_model(waveform[None])

    embeddings = np.zeros(shape=(len(segments), len(segment_embedding(segments[0])[0])))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment)

    embeddings = np.nan_to_num(embeddings)

    #clustering for speaker identification
    clustering = AgglomerativeClustering(2).fit(embeddings)
    labels = clustering.labels_
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

    #create dataframe with speaker, text and time stamp
    def time(secs):
        return pd.to_datetime(pd.to_datetime(round(segment['start']),unit='s'),unit='m').strftime('%H:%M:%S')

    transcription_=[]
    for (i, segment) in enumerate(segments):
        transcription_.append(dict(zip(['Speaker','Text','Time start','Time end'],[segment['speaker'],segment['text'],time(segment['start']),time(segment['end'])])))
    df_transcript=pd.DataFrame(transcription_)

    j=0
    i=0
    f_transcript=[]
#merge data of same speaker in consecutive rows
    while i<df_transcript.shape[0]-1:
        j=i+1
        d=df_transcript['Text'][i]
        while(df_transcript['Speaker'][i]==df_transcript['Speaker'][j])&(j<df_transcript.shape[0]-1):
            a=df_transcript['Speaker'][i]
            b=df_transcript['Time start'][i]
            c=df_transcript['Time end'][j]
            d=d+df_transcript['Text'][j]
            j=j+1
        if(j-i==1):
            a=df_transcript['Speaker'][i]
            b=df_transcript['Time start'][i]
            c=df_transcript['Time end'][i]
            d=df_transcript['Text'][i]
        f_transcript.append(dict(zip(['Speaker','Text','Time Start','Time End'],[a,d,b,c])))
        i=j
#for last index
    a=df_transcript['Speaker'][i]
    b=df_transcript['Time start'][i]
    c=df_transcript['Time end'][j]
    d=df_transcript['Text'][i]
    f_transcript.append(dict(zip(['Speaker','Text','Time Start','Time End'],[a,d,b,c])))

    if(f_transcript[-1]['Speaker']==f_transcript[-2]['Speaker']):
        f_transcript[-2]['Text']=f_transcript[-2]['Text']+(f_transcript[-1]['Text'])
        f_transcript[-2]['Time End']=f_transcript[-1]['Time End']
        f_transcript.pop()
    return pd.DataFrame(f_transcript)

def makecsv(wav_file, transcript):
    filename=os.path.splitext(os.path.basename(wav_file))[0]
    transcript.to_csv(filename+'.csv')

In [27]:
file_path = r"Recordings/raw/Customer Service Sample Call - Product Refund (192 kbps).mp3"
wav_file = prepare_voice_file(file_path)

Converted Recordings/raw/Customer Service Sample Call - Product Refund (192 kbps).mp3 to Recordings/processed/Customer Service Sample Call - Product Refund (192 kbps).wav


In [28]:
result = transcribe_wav_file("small", wav_file)

In [29]:
transcript = diarize(wav_file, result)

Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\Jeb\.cache\torch\pyannote\models--pyannote--embedding\snapshots\c6335d8f1cd77b30084387468a6cf26fea90009b\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.2.2+cpu. Bad things might happen unless you revert torch to 1.x.


In [30]:
makecsv(wav_file, transcript)

In [31]:
df_original = pd.read_csv(os.path.basename(wav_file).replace('.wav','.csv'))
df = df_original
df.head()

Unnamed: 0.1,Unnamed: 0,Speaker,Text,Time Start,Time End
0,0,SPEAKER 1,Thank you for calling Cokes and Gouts.,00:00:00,00:00:00
1,1,SPEAKER 2,My name is Sam.,00:00:08,00:00:08
2,2,SPEAKER 1,"How can I help you? Oh yes, I bought a code f...",00:00:09,00:00:24
3,3,SPEAKER 2,Seems like it does not reflect on my account ...,00:00:31,00:00:39
4,4,SPEAKER 1,"Okay, let me see. Alright, please wait for a ...",00:00:43,00:00:49


In [32]:
import en_core_web_sm
from functools import reduce
nlp = en_core_web_sm.load()

df['Entities'] = df['Text'].apply(lambda x: [(ent.text) for ent in nlp(x).ents])

df['Type'] = df['Text'].apply(lambda x: [(ent.label_) for ent in nlp(x).ents])

df['Redacted_Text'] = df.apply(lambda row: reduce(lambda a, kv: a.replace(kv, '****'), row['Entities'], row['Text']), axis=1)

df

Unnamed: 0.1,Unnamed: 0,Speaker,Text,Time Start,Time End,Entities,Type,Redacted_Text
0,0,SPEAKER 1,Thank you for calling Cokes and Gouts.,00:00:00,00:00:00,"[Cokes, Gouts]","[PRODUCT, PERSON]",Thank you for calling **** and ****.
1,1,SPEAKER 2,My name is Sam.,00:00:08,00:00:08,[Sam],[PERSON],My name is ****.
2,2,SPEAKER 1,"How can I help you? Oh yes, I bought a code f...",00:00:09,00:00:24,[],[],"How can I help you? Oh yes, I bought a code f..."
3,3,SPEAKER 2,Seems like it does not reflect on my account ...,00:00:31,00:00:39,[],[],Seems like it does not reflect on my account ...
4,4,SPEAKER 1,"Okay, let me see. Alright, please wait for a ...",00:00:43,00:00:49,[017-1425-793],[DATE],"Okay, let me see. Alright, please wait for a ..."
5,5,SPEAKER 2,"Thank you. Just to verify that 017-1425-793, ...",00:00:56,00:00:57,[017-1425-793],[CARDINAL],"Thank you. Just to verify that ****, correct?"
6,6,SPEAKER 1,"Yes, that's right.",00:01:03,00:01:03,[],[],"Yes, that's right."
7,7,SPEAKER 2,I do apologize but the one that you provided ...,00:01:05,00:01:05,[],[],I do apologize but the one that you provided ...
8,8,SPEAKER 1,Do you actually have a number that starts wit...,00:01:12,00:01:12,[007],[CARDINAL],Do you actually have a number that starts wit...
9,9,SPEAKER 2,Can I ask for your first and last name so I c...,00:01:20,00:01:20,[first],[ORDINAL],Can I ask for your **** and last name so I ca...


In [33]:
def calculate_2A1_score(df):
    score = 0

    for index, row in df.iterrows():
        if 'PERSON' in row['Type'] or 'CARDINAL' in row['Type']:
            if index == 0:
                x = 0
            else:
                x = index - 1

            if 'name' in df['Text'][index] or 'name' in df['Text'][x]:
                score += 1

            if 'number' in df['Text'][index] or 'number' in df['Text'][x]:
                score += 1

    if score >= 2:
        score = 2
    else:
        score = 0

    return score


In [34]:
def calculate_2B2_score():
    df = pd.read_csv('call_bank.csv')
    file_name_upload = os.path.splitext(os.path.basename(file_path))[0]
    score = 0
    for index, row in df.iterrows():
        file_path_csv = df['filepath'][index]
        file_name_csv = os.path.basename(file_path_csv)
        if file_name_csv == file_name_upload:
            if df['resolution'][index] == 1:
                score = 1
    return score

In [35]:
def calculate_2C5_score():
    df = pd.read_csv('call_bank.csv')
    file_name_upload = os.path.splitext(os.path.basename(file_path))[0]
    score = 0
    for index, row in df.iterrows():
        file_path_csv = df['filepath'][index]
        file_name_csv = os.path.basename(file_path_csv)
        if file_name_csv == file_name_upload:
            if df['resolution'][index] == 1:
                score = 2
    return score

In [36]:
def calculate_2C2_score():
    df = pd.read_csv('call_bank.csv')
    file_name_upload = os.path.splitext(os.path.basename(file_path))[0]
    score = 0
    for index, row in df.iterrows():
        file_path_csv = df['filepath'][index]
        file_name_csv = os.path.basename(file_path_csv)
        if file_name_csv == file_name_upload:
            if df['resolution'][index] == 0:
                score = 0
            else:
                score = 3
    return score

In [37]:
def calculate_2C9_score():
    df = pd.read_csv('call_bank.csv')
    file_name_upload = os.path.splitext(os.path.basename(file_path))[0]
    score = 0
    for index, row in df.iterrows():
        file_path_csv = df['filepath'][index]
        file_name_csv = os.path.basename(file_path_csv)
        if file_name_csv == file_name_upload:
            if df['resolution'][index] == 2:
                score = 1
    return score

In [38]:
def calculate_2C7_score(df):
    names = []
    for index, row in df.iterrows():
        if 'PERSON' in row['Type']:
            names.append(df['Entities'][index])

    flat_list = [item for sublist in names for item in sublist]

    name_counts = Counter(flat_list)

    if any(count >= 5 for count in name_counts.values()):
        return 1
    else:
        return 0

In [43]:
# Most Downloaded, Like and 2nd Most Trending
from sentence_transformers import SentenceTransformer,util
model1 = SentenceTransformer('all-MiniLM-L6-v2')

emb1 = model1.encode("chuva company, what can i do for you?")

# Compute cosine similarity between all pairs
sentences = ['Hi this is chuva company, thank you for calling. What can i do for you?'
        ]

#Encode all sentences
embeddings = model1.encode(sentences)

#Compute cosine similarity between all pairs
cos_sim = util.cos_sim(emb1, embeddings)

cos_sim

tensor([[0.8746]])

In [59]:
df.loc[df['Speaker'] == 'SPEAKER 1', 'Text'].iloc[0]

' Thank you for calling Cokes and Gouts.'

In [58]:
df.loc[df['Speaker'] == 'SPEAKER 1', 'Text'].iloc[-1]

' Bye. Alright thank you.'

In [73]:
df.loc[df['Speaker'] == 'SPEAKER 1', 'Text'].iloc[-4]

' At first I thought that it would probably be like three to five weeks to process.'

In [70]:
df_ref = pd.read_csv('Scripts/scripts_closing.csv', usecols=['Script'])['Script']
df_ref

0     To summarize, we have successfully addressed [...
1     I’m glad we were able to resolve your concern ...
2     Thank you for bringing this matter to our atte...
3     We appreciate your patience and understanding ...
4     If you have any further questions or concerns,...
5     Should you require any additional support in t...
6     If you need any further assistance, you can re...
7     For any future inquiries, please don’t hesitat...
8     Once again, thank you for contacting us. We ho...
9     It was a pleasure assisting you today. We wish...
10    I’m glad we could resolve this for you. Is the...
11    Thank you for allowing us to assist you. Have ...
12    I’m happy we could sort this out. Would you li...
13          It was a pleasure assisting you. Take care!
14    I’m glad we could help. If you have any more q...
15    Could you please provide feedback on how we di...
16    We’re always looking to improve. Would you be ...
17    Your feedback helps us serve you better. C

In [67]:
# Most Downloaded, Like and 2nd Most Trending
from sentence_transformers import SentenceTransformer,util
model1 = SentenceTransformer('all-MiniLM-L6-v2')

emb1 = model1.encode(df.loc[df['Speaker'] == 'SPEAKER 1', 'Text'].iloc[0])

# Compute cosine similarity between all pairs
sentences = pd.read_csv('Scripts/scripts_opening.csv', usecols=['Script'])['Script']

#Encode all sentences
embeddings = model1.encode(sentences)

#Compute cosine similarity between all pairs
cos_sim = util.cos_sim(emb1, embeddings)

cos_sim

tensor([[0.1460, 0.2075, 0.1094, 0.1819, 0.1776, 0.1983, 0.2274, 0.2580, 0.1519,
         0.1039, 0.0845, 0.1549, 0.1162, 0.1935, 0.1612, 0.1808, 0.1810, 0.1790,
         0.2210, 0.2140, 0.1035, 0.0499, 0.0094, 0.2042, 0.1353]])

In [88]:
# Most Downloaded, Like and 2nd Most Trending
from sentence_transformers import SentenceTransformer,util
import torch

model1 = SentenceTransformer('all-MiniLM-L6-v2')

# emb1 = model1.encode(df.loc[df['Speaker'] == 'SPEAKER 1', 'Text'].iloc[-1])
emb1 = model1.encode("You're welcome and my pleasure. Have a nice day.")

# Compute cosine similarity between all pairs
sentences = pd.read_csv('Scripts/scripts_closing.csv', usecols=['Script'])['Script']

#Encode all sentences
embeddings = model1.encode(sentences)

#Compute cosine similarity between all pairs
cos_sim = util.cos_sim(emb1, embeddings)

# Convert the list of lists into a tensor
scores = torch.tensor(cos_sim)

# Check if any score is greater than 0.6
result = (scores > 0.6).any().item()

if result:
    score = 1
else:
    score = 0

score

tensor([[0.0928, 0.1820, 0.3966, 0.4134, 0.2219, 0.1621, 0.1232, 0.1400, 0.5388,
         0.5572, 0.1764, 0.6038, 0.1922, 0.5041, 0.3840, 0.1317, 0.1052, 0.1974,
         0.1126]])


0