In [2]:
# Imports and environment setup (the models themselves are instantiated in a later cell)
import os
import warnings

import numpy as np
from dotenv import load_dotenv
from pyannote.audio import Pipeline
from pydub import AudioSegment
from pydub.utils import mediainfo
from transformers import pipeline
load_dotenv()

True

In [5]:
# Hugging Face access token read from the environment; os.getenv yields None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")

In [3]:
def transcribe(audio):
    """Transcribe one chunk of audio with the module-level ``transcriber``.

    Args:
        audio: A ``(sampling_rate, samples)`` pair. ``samples`` is a numpy
            array; arrays with more than one dimension are down-mixed by
            averaging over ``axis=1``.

    Returns:
        The ``"text"`` field of the transcriber's result.

    Raises:
        ValueError: If ``samples`` contains no data.
    """
    sr, y = audio

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    if y.size == 0:
        # np.max on an empty array fails with an opaque reduction error;
        # raise the same exception type with a message that names the cause.
        raise ValueError("cannot transcribe an empty audio segment")

    # astype() returns a copy, so the in-place scaling below never touches
    # the caller's array.
    y = y.astype(np.float32)

    # Peak-normalise to [-1, 1]. A silent segment has a peak of 0; dividing by
    # it would fill the signal with NaN, so leave an all-zero signal as is.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]

def convert_mp3_to_wav(mp3_path, wav_path):
    """Decode the MP3 at ``mp3_path`` and write it out as WAV at ``wav_path``.

    Returns:
        ``wav_path``, so the call can be used inline where a path is expected.
    """
    AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
    return wav_path

In [7]:
# Speech-to-text pipeline backed by the English-only Whisper "base" checkpoint.
# NOTE(review): device=0 presumably selects the first GPU — confirm a GPU is available.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en",device=0)
# Pretrained speaker-diarization pipeline, authenticated with the token held in hf_token.
diarization_pipeline = Pipeline.from_pretrained(
  "pyannote/speaker-diarization-3.1",
  use_auth_token=hf_token)

  if ismodule(module) and hasattr(module, '__file__'):


In [15]:
# Recording to analyse, given as a path relative to the working directory.
file_path = "audio.wav"

In [19]:
def get_converation_from_audio(file_path):
    """Diarize a recording and transcribe every speaker turn.

    Args:
        file_path: Path to the recording. Files with an ``.mp3`` extension
            (any letter case) are first converted to WAV.

    Returns:
        A list of ``"Speaker <label>: <text>"`` strings, one per diarized
        segment, in the order the diarization output lists them. Segments
        that cannot be parsed, are empty, or fail to transcribe are skipped;
        transcription failures are reported with a warning.
    """
    ext = os.path.splitext(file_path)[1]
    if ext.lower() == ".mp3":
        # NOTE: the converted file is always written to "audio.wav" in the
        # working directory, overwriting any file already there.
        wav_path = convert_mp3_to_wav(file_path, "audio.wav")
    else:
        wav_path = file_path

    diarization = diarization_pipeline(wav_path)

    # Decode the recording once, from the same file that was diarized.
    # (Previously a hard-coded "audio.wav" was re-read for every segment,
    # which transcribed the wrong file whenever file_path was something else.)
    audio = AudioSegment.from_file(wav_path)
    conversation_text = []

    # Each non-empty line of the LAB export is "<start> <end> <label>".
    for meta in diarization.to_lab().splitlines():
        fields = meta.strip().split(maxsplit=2)
        if len(fields) != 3:
            continue  # blank trailing line or malformed record
        try:
            start_s, end_s = float(fields[0]), float(fields[1])
        except ValueError:
            continue
        speaker = fields[2]

        # pydub slices by milliseconds.
        segment = audio[start_s * 1000:end_s * 1000]
        samples = np.array(segment.get_array_of_samples())
        if samples.size == 0:
            continue
        if segment.channels > 1:
            # pydub interleaves channels (L, R, L, R, ...); restore the
            # (frames, channels) shape so transcribe() can down-mix it.
            samples = samples.reshape(-1, segment.channels)

        try:
            # use whisper to extract text.
            text = transcribe((segment.frame_rate, samples))
        except Exception as exc:
            # Best effort: one bad segment must not lose the whole
            # conversation, but it should not disappear silently either.
            warnings.warn(
                f"Skipping segment {start_s:.3f}-{end_s:.3f}s ({speaker}): {exc}",
                stacklevel=2,
            )
            continue
        conversation_text.append(f"Speaker {speaker}: {text}")
    return conversation_text




In [20]:
# Diarize and transcribe the recording; the result is a list of "Speaker <label>: <text>" strings.
conversation_text = get_converation_from_audio(file_path)



In [27]:
# openai chat completion

from openai import OpenAI
# os.environ[...] raises KeyError when OPENAI_API_KEY is not set, so a missing key fails here.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# System prompt listing the insights the model is asked to produce for each conversation.
system_message = """You are a NLP Agent. You analyze conversation between Agent and customer and provide insights on, 
- Sentiment (Positive, Negative, Neutral)
- Intent/Topic List
- Entity List
- Issue List
- Resolution Summary
- Customer Satisfaction
"""
def get_completion(conversation_text):
    """Ask the chat model to analyse a conversation transcript.

    Args:
        conversation_text: The transcript, sent as the user message.

    Returns:
        The ``message`` object of the first choice in the chat completion.
    """
    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": conversation_text},
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
    )
    first_choice = response.choices[0]
    return first_choice.message

In [28]:
# Join the per-turn lines into a single newline-separated transcript and request the analysis.
get_completion("\n".join(conversation_text))

ChatCompletionMessage(content='Here are the insights based on the conversation:\n\n- **Sentiment**: Neutral\n- **Intent/Topic List**: \n  - Travel (discussion about travel from Wales to London)\n  - Family (visiting family, mentioning cousins)\n  - Local knowledge (questions about London and transportation)\n- **Entity List**: \n  - Locations: Wales, London, Oxford, Marble Arch\n  - Family Members: Cousins, Mum, Dad\n  - Transportation: Bus, Train\n- **Issue List**: \n  - Travel concerns (how long it takes to travel, mode of transportation)\n- **Resolution Summary**: The conversation revolves around travel from Wales to London, discussing the time it takes and the means of transportation, along with family connections.\n- **Customer Satisfaction**: Indeterminate from this interaction, as there are no clear indications of satisfaction or dissatisfaction expressed by the customer. \n\nThe conversation appears casual and friendly, primarily focused on travel logistics and personal connect

In [35]:
class AudioProcessor:
    """Diarize a recording, transcribe each speaker turn, and analyse it with a chat model.

    Typical use::

        processor = AudioProcessor()
        lines = processor.get_conversation_from_audio("call.mp3")
        insights = processor.get_completion("\\n".join(lines))
    """

    def __init__(self):
        """Read credentials from the environment and build the model clients.

        ``HF_TOKEN`` and ``OPENAI_API_KEY`` are looked up after loading a
        ``.env`` file via ``load_dotenv()``.
        """
        load_dotenv()
        self.hf_token = os.getenv("HF_TOKEN")
        # NOTE(review): device=0 presumably selects the first GPU — confirm one is available.
        self.transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=0)
        self.diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=self.hf_token)
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.system_message = """You are a NLP Agent. You analyze conversation between Agent and customer and provide insights on, 
        - Sentiment (Positive, Negative, Neutral)
        - Intent/Topic List
        - Entity List
        - Issue List
        - Resolution Summary
        - Customer Satisfaction
        """

    def transcribe(self, audio):
        """Transcribe one chunk of audio with ``self.transcriber``.

        Args:
            audio: A ``(sampling_rate, samples)`` pair. ``samples`` is a numpy
                array; arrays with more than one dimension are down-mixed by
                averaging over ``axis=1``.

        Returns:
            The ``"text"`` field of the transcriber's result.

        Raises:
            ValueError: If ``samples`` contains no data.
        """
        sr, y = audio
        if y.ndim > 1:
            y = y.mean(axis=1)
        if y.size == 0:
            # np.max on an empty array fails with an opaque reduction error;
            # raise the same exception type with a message that names the cause.
            raise ValueError("cannot transcribe an empty audio segment")
        # astype() copies, so the in-place scaling never touches the caller's array.
        y = y.astype(np.float32)
        # Peak-normalise to [-1, 1]; a silent segment (peak 0) is left
        # untouched because dividing by zero would fill it with NaN.
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak
        return self.transcriber({"sampling_rate": sr, "raw": y})["text"]

    def convert_mp3_to_wav(self, mp3_path, wav_path):
        """Decode the MP3 at ``mp3_path``, write it as WAV to ``wav_path``, and return ``wav_path``."""
        audio = AudioSegment.from_mp3(mp3_path)
        audio.export(wav_path, format="wav")
        return wav_path

    @staticmethod
    def _parse_lab_line(line):
        """Parse one ``"<start> <end> <label>"`` line; return ``None`` if it is blank or malformed."""
        fields = line.strip().split(maxsplit=2)
        if len(fields) != 3:
            return None
        try:
            return float(fields[0]), float(fields[1]), fields[2]
        except ValueError:
            return None

    def get_conversation_from_audio(self, file_path):
        """Diarize a recording and transcribe every speaker turn.

        Args:
            file_path: Path to the recording. Files with an ``.mp3`` extension
                (any letter case) are first converted to a WAV file that sits
                next to the original and shares its base name.

        Returns:
            A list of ``"Speaker <label>: <text>"`` strings, one per diarized
            segment, in the order the diarization output lists them. Segments
            that cannot be parsed, are empty, or fail to transcribe are
            skipped; transcription failures are reported with a warning.
        """
        stem, ext = os.path.splitext(file_path)
        if ext.lower() == ".mp3":
            # Swap only the extension. str.replace(".mp3", ".wav") would also
            # rewrite ".mp3" occurring elsewhere in the path.
            wav_path = self.convert_mp3_to_wav(file_path, stem + ".wav")
        else:
            wav_path = file_path

        diarization = self.diarization_pipeline(wav_path)
        # Decode the recording once instead of once per segment.
        audio = AudioSegment.from_file(wav_path)
        conversation_text = []

        for meta in diarization.to_lab().splitlines():
            parsed = self._parse_lab_line(meta)
            if parsed is None:
                continue
            start_s, end_s, speaker = parsed

            # pydub slices by milliseconds.
            segment = audio[start_s * 1000:end_s * 1000]
            samples = np.array(segment.get_array_of_samples())
            if samples.size == 0:
                continue
            if segment.channels > 1:
                # pydub interleaves channels (L, R, L, R, ...); restore the
                # (frames, channels) shape so transcribe() can down-mix it.
                samples = samples.reshape(-1, segment.channels)

            try:
                text = self.transcribe((segment.frame_rate, samples))
            except Exception as exc:
                # Best effort: one bad segment must not lose the whole
                # conversation, but it should not disappear silently either.
                warnings.warn(
                    f"Skipping segment {start_s:.3f}-{end_s:.3f}s ({speaker}): {exc}",
                    stacklevel=2,
                )
                continue
            conversation_text.append(f"Speaker {speaker}: {text}")
        return conversation_text

    def get_completion(self, conversation_text):
        """Send the transcript to the chat model and return the first choice's ``message``.

        Args:
            conversation_text: The transcript, sent as the user message after
                ``self.system_message``.
        """
        completion = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": self.system_message},
                {"role": "user", "content": conversation_text}
            ]
        )
        return completion.choices[0].message

# Build the processor once; construction loads the ASR and diarization models and creates the OpenAI client.
processor = AudioProcessor()