In [1]:
from sentence_transformers import SentenceTransformer, util

def semantic_segment_transcript(transcript, threshold=0.6):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    segments = []
    current_segment = [transcript[0]]
    current_embedding = model.encode(transcript[0]['utterance'])

    for entry in transcript[1:]:
        next_embedding = model.encode(entry['utterance'])
        similarity = util.cos_sim(current_embedding, next_embedding)[0][0].item()
        if similarity < threshold:
            segments.append(current_segment)
            current_segment = [entry]
        else:
            current_segment.append(entry)
        current_embedding = next_embedding

    if current_segment:
        segments.append(current_segment)

    return segments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from modules.db.postgres import retrieve_transcript

transcript = retrieve_transcript(meeting_id="1f253dcb-2838-48b9-8ef1-73e70259f116")

semantic_segment_transcript(transcript)

INFO:modules.db.postgres:Retrieved 211 rows for meeting 1f253dcb-2838-48b9-8ef1-73e70259f116
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 68.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 54.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 61.26it/s]
Batches: 100%|█

[[{'speaker': 'Client Lead',
   'start': 0.0,
   'end': 1.375,
   'utterance': 'Hi, Chris Sauer.'}],
 [{'speaker': 'Sales Executive',
   'start': 1.375,
   'end': 3.125,
   'utterance': 'Hi, Kate Johnson. Nice to meet you.'}],
 [{'speaker': 'Client Lead',
   'start': 3.125,
   'end': 3.93,
   'utterance': 'Nice to meet you, okay?'}],
 [{'speaker': 'Sales Executive',
   'start': 4.94,
   'end': 6.815,
   'utterance': 'Um, please, tell me a little bit about yourself.'}],
 [{'speaker': 'Client Lead',
   'start': 6.815,
   'end': 10.33,
   'utterance': "I'm currently finishing my Masters of Education program at Lake Erie College."}],
 [{'speaker': 'Client Lead',
   'start': 10.54,
   'end': 18.33,
   'utterance': 'and working on transitioning from a Northeastern Ohioaner to being a member of the Jacksonville area community.'}],
 [{'speaker': 'Sales Executive',
   'start': 19.26,
   'end': 20.73,
   'utterance': 'Why the move?'}],
 [{'speaker': 'Client Lead',
   'start': 21.18,
   'end': 25