<a href="https://colab.research.google.com/github/swaraj-coder/CallQualityAnalyzer/blob/main/call_quality_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Call Quality Analyzer

## Setup & Imports


In [36]:
!pip install -q yt-dlp pydub openai-whisper resemblyzer transformers


In [37]:
import os, re, sys, math, time
import numpy as np

def check_ssl_available():
    try:
        import ssl
        return True
    except ImportError:
        return False


In [38]:
from pathlib import Path

def download_youtube_audio(url, out_fname="call_raw.%(ext)s"):
    if not check_ssl_available():
        from google.colab import files
        uploaded = files.upload()
        for fn in uploaded.keys():
            return fn
        raise RuntimeError("No SSL and no local file uploaded.")

    import yt_dlp
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': out_fname,
        'postprocessors': [{'key': 'FFmpegExtractAudio','preferredcodec': 'wav','preferredquality': '192'}],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return "call_raw.wav"


In [39]:
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning, module="pydub.utils")


In [40]:
from pydub import AudioSegment
from resemblyzer import VoiceEncoder, preprocess_wav
from sklearn.cluster import KMeans
import numpy as np

def prepare_wav(in_wav="call_raw.wav", out_wav="call_16k.wav"):
    sound = AudioSegment.from_file(in_wav)
    sound = sound.set_channels(1).set_frame_rate(16000)
    sound.export(out_wav, format="wav")
    return out_wav

import numpy as np

def diarize_by_embeddings(wav_fpath, window_s=1.2, hop_s=0.6, n_speakers=2):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder()

    # Break wav into overlapping chunks and embed each
    frames = []
    i = 0
    while i + int(window_s * 16000) <= len(wav):
        chunk = wav[i:i + int(window_s * 16000)]
        emb = encoder.embed_utterance(chunk)
        frames.append(emb)
        i += int(hop_s * 16000)

    frames = np.vstack(frames)

    # Cluster embeddings
    kmeans = KMeans(n_clusters=n_speakers).fit(frames)
    labels = kmeans.labels_

    diarization = []
    t = 0
    for lab in labels:
        diarization.append((t, t + hop_s, int(lab)))
        t += hop_s

    return diarization



In [41]:
from pydub.silence import detect_nonsilent
from pydub import effects as pydub_effects

def trim_nonsilent_audio(in_wav, out_wav='call_trimmed.wav',
                         min_silence_len=400, silence_thresh=-40, keep_silence=200):
    """Remove long silent parts. Returns path to trimmed file."""
    if AudioSegment is None:
        raise RuntimeError("pydub not installed.")
    sound = AudioSegment.from_file(in_wav)
    nonsilent = detect_nonsilent(sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
    if not nonsilent:
        # nothing to trim
        sound.export(out_wav, format='wav')
        return out_wav
    pieces = []
    for (start, end) in nonsilent:
        s = max(0, start - keep_silence)
        e = min(len(sound), end + keep_silence)
        pieces.append(sound[s:e])
    out = pieces[0]
    for p in pieces[1:]:
        out += p
    out = pydub_effects.normalize(out)
    out.export(out_wav, format='wav')
    return out_wav


In [42]:
import whisper

def transcribe_whisper(wav_fpath, model_size="tiny"):
    model = whisper.load_model(model_size)
    result = model.transcribe(wav_fpath)
    return result["segments"], result["text"]


In [43]:
def align_and_analyze(segments, diar):
    speaker_times, speaker_questions, longest_monologue = {}, {}, {}
    transcript_by_speaker, combined = {}, []

    for start, end, sp in diar:
        speaker_times[sp] = speaker_times.get(sp, 0) + (end-start)
    for seg in segments:
        txt = seg["text"]
        sp = min(diar, key=lambda d: abs(d[0]-seg["start"]))[2]
        combined.append(f"Speaker {sp}: {txt}")
        transcript_by_speaker.setdefault(sp, []).append(txt)
        if "?" in txt: speaker_questions[sp] = speaker_questions.get(sp,0)+1
        dur = seg["end"]-seg["start"]
        if dur > longest_monologue.get(sp,0): longest_monologue[sp] = dur

    total = sum(speaker_times.values())
    ratios = {sp: (t/total*100) for sp,t in speaker_times.items()}
    return {
        "talk_ratios": ratios,
        "speaker_times": speaker_times,
        "speaker_questions": speaker_questions,
        "longest_monologue": longest_monologue,
        "combined_transcript": "\n".join(combined),
        "speaker_texts": transcript_by_speaker
    }


In [44]:
from transformers import pipeline

def analyze_sentiment_and_insight(transcript, speaker_times, speaker_questions):
    sent_pipeline = pipeline("sentiment-analysis")
    parts = transcript.split("\n")
    sentiments = [sent_pipeline(p[:200])[0]["label"] for p in parts if p.strip()]
    pos = sentiments.count("POSITIVE"); neg = sentiments.count("NEGATIVE")
    overall = "neutral"
    if pos > neg: overall = "positive"
    elif neg > pos: overall = "negative"

    rep = max(speaker_times, key=speaker_times.get)
    insight = f"Sales rep (speaker {rep}) dominates the call. Try listening more."
    return overall, insight, rep


In [45]:
def run_full_pipeline(youtube_url):
    # Step 1: Download YouTube audio
    raw_wav = download_youtube_audio(youtube_url, out_fname="call_raw.%(ext)s")

    # Step 2: Trim silences (if you’ve defined trim_nonsilent_audio)
    trimmed = trim_nonsilent_audio(raw_wav, out_wav="call_trimmed.wav")

    # Step 3: Prepare for 16kHz mono
    wav16 = prepare_wav(trimmed, out_wav="call_16k.wav")

    # Step 4: Diarization
    diar = diarize_by_embeddings(wav16, n_speakers=2)

    # Step 5: Transcription
    segs, full_text = transcribe_whisper(wav16)

    # Step 6: Align & analyze
    res = align_and_analyze(segs, diar)

    # Step 7: Sentiment + actionable insight
    sentiment, insight, rep = analyze_sentiment_and_insight(
        res["combined_transcript"], res["speaker_times"], res["speaker_questions"]
    )

    # Step 8: Print results
    print("=== Final Results ===")
    print("Talk-time ratios:", res["talk_ratios"])
    print("Questions:", res["speaker_questions"])
    print("Longest monologues:", res["longest_monologue"])
    print("Call sentiment:", sentiment)
    print("Actionable insight:", insight)
    print(f"(Heuristic: speaker {rep} is likely the sales rep)")
    print("Transcript excerpt:", res["combined_transcript"][:400])

    return res, sentiment, insight, rep


In [46]:
run_full_pipeline("https://www.youtube.com/watch?v=4ostqJD3Psc")


[youtube] Extracting URL: https://www.youtube.com/watch?v=4ostqJD3Psc
[youtube] 4ostqJD3Psc: Downloading webpage
[youtube] 4ostqJD3Psc: Downloading tv simply player API JSON
[youtube] 4ostqJD3Psc: Downloading tv client config
[youtube] 4ostqJD3Psc: Downloading tv player API JSON
[info] 4ostqJD3Psc: Downloading 1 format(s): 251
[download] Destination: call_raw.webm
[download] 100% of    1.99MiB in 00:00:00 at 5.11MiB/s   
[ExtractAudio] Destination: call_raw.wav
Deleting original file call_raw.webm (pass -k to keep)
Loaded the voice encoder model on cpu in 0.07 seconds.


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


=== Final Results ===
Talk-time ratios: {1: 58.24175824175819, 0: 41.75824175824181}
Questions: {1: 4, 0: 3}
Longest monologues: {1: 7.359999999999999, 0: 7.040000000000003}
Call sentiment: negative
Actionable insight: Sales rep (speaker 1) dominates the call. Try listening more.
(Heuristic: speaker 1 is likely the sales rep)
Transcript excerpt: Speaker 1:  Thank you for calling Nissan. My name is Lauren. Can I have your name?
Speaker 0:  Hamany, Miss. John Smith. Thank you, John. How can I help you?
Speaker 0:  I was just calling about to see how much it would cost to update the map in my car.
Speaker 1:  I'd be happy to help you with that today. Did you receive a mail-air from us?
Speaker 0:  I did. Do you need the customer number?
Spea


({'talk_ratios': {1: 58.24175824175819, 0: 41.75824175824181},
  'speaker_times': {1: 63.59999999999975, 0: 45.59999999999992},
  'speaker_questions': {1: 4, 0: 3},
  'longest_monologue': {1: 7.359999999999999, 0: 7.040000000000003},
  'combined_transcript': "Speaker 1:  Thank you for calling Nissan. My name is Lauren. Can I have your name?\nSpeaker 0:  Hamany, Miss. John Smith. Thank you, John. How can I help you?\nSpeaker 0:  I was just calling about to see how much it would cost to update the map in my car.\nSpeaker 1:  I'd be happy to help you with that today. Did you receive a mail-air from us?\nSpeaker 0:  I did. Do you need the customer number?\nSpeaker 0:  Yes, please. Okay. It's 15243. Thank you, and the year-making model of your vehicle.\nSpeaker 0:  Yeah, I have a 2009 Nissan Altaman.\nSpeaker 0:  So nice car.\nSpeaker 0:  Yes, thank you. We really enjoy it.\nSpeaker 1:  Okay. I think I found your profile here. Can I have you verify your address and phone number, please?\nSp

In [47]:
readme_content = """# Call Quality Analyzer

This project analyzes sales call recordings and returns:
1. Talk-time ratio (what % each person spoke)
2. Number of questions asked
3. Longest monologue duration
4. Call sentiment (positive/negative/neutral)
5. One actionable insight

**Bonus:** It also heuristically identifies the sales rep.

### How to run
- Open `call_quality_analyzer.ipynb` in Google Colab.
- Run all cells top to bottom.
- Provide a YouTube link (e.g. https://www.youtube.com/watch?v=4ostqJD3Psc).
- The notebook will download audio, process it, and print results within ~30s.

### Requirements
- Free Google Colab tier (CPU is fine).
- Python packages: `pydub`, `resemblyzer`, `scikit-learn`, `transformers`, `yt-dlp`, `openai-whisper`.
"""

with open("README.md", "w") as f:
    f.write(readme_content)
