In [1]:
import json
from sentence_transformers import SentenceTransformer, util
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def file_name_builder(folder_path, file_prefix, file_extension, index):
    """Build the path of a numbered file.

    The result is ``<folder_path><file_prefix>_<index>.<file_extension>``
    where ``index`` is zero-padded to at least three digits. ``folder_path``
    is prepended verbatim, so it must already end with a path separator.
    """
    template = "{}{}_{:03d}.{}"
    return template.format(folder_path, file_prefix, index, file_extension)

In [3]:
# Root directory of the processed data. Paths are assembled by plain string
# concatenation elsewhere in this notebook, so the trailing slash is required.
folder_path = "../processed_data/"

In [4]:
def match_content_with_title(title_object_list,
                             content_object_list,
                             model_name="all-mpnet-base-v2",
                             window_size=20,
                             threshold=0.3
                            ):
    """Attach to every content object the index of its most similar title.

    Titles and contents are embedded with a SentenceTransformer model.
    Content items are visited in order; each one is compared (cosine
    similarity) only with the titles from the current title position up to
    ``window_size`` positions ahead, inclusive. When the best score reaches
    ``threshold`` the item is assigned that title and the current position
    moves to it, so the window never moves backwards. Otherwise the item is
    left unmatched and the position is unchanged.

    Args:
        title_object_list: Sequence of mappings with a ``"content"`` entry.
        content_object_list: Sequence of mappings with a ``"content"`` entry.
        model_name: Model identifier passed to ``SentenceTransformer``.
        window_size: How many titles past the current one are considered.
        threshold: Minimum cosine similarity for a match to be accepted.

    Returns:
        A new list with one dict per content object: a shallow copy of the
        original plus a ``"titleIndex"`` key holding the matched title index,
        or ``None`` when no title in the window reached ``threshold``.
    """
    # Nothing to match: return before paying for the model load.
    if not content_object_list:
        return []
    # Without titles the similarity window would be empty and argmax would
    # raise, so every content object is reported as unmatched instead.
    if not title_object_list:
        return [
            {**content_object, "titleIndex": None}
            for content_object in content_object_list
        ]

    model = SentenceTransformer(model_name)

    content_embedding = model.encode(
        [content_object["content"] for content_object in content_object_list],
        convert_to_tensor=True
    )
    title_embedding = model.encode(
        [title_object["content"] for title_object in title_object_list],
        convert_to_tensor=True
    )
    print(f"the content_embedding is of shape {content_embedding.shape}")
    print(f"the title_embedding is of shape {title_embedding.shape}")

    matched_content_object_list = []
    current_title_index = 0
    m = len(title_object_list)

    for i, embedding in enumerate(content_embedding):
        # Inclusive upper bound of the window, clamped to the last title.
        high_title_index = min(current_title_index + window_size, m - 1)
        similarity_vector = util.cos_sim(embedding, title_embedding[current_title_index: high_title_index+1])[0]

        # The embeddings may live on an accelerator; NumPy cannot read such
        # tensors directly, so copy the scores to host memory first.
        similarity_scores = similarity_vector.detach().cpu().numpy()

        best_title_rel_index = int(np.argmax(similarity_scores))
        best_similarity_score = float(similarity_scores[best_title_rel_index])
        # Convert the window-relative position to an index into the full list.
        best_title_index = current_title_index + best_title_rel_index

        if best_similarity_score >= threshold:
            current_title_index = best_title_index
        else:
            best_title_index = None
        matched_object = {**content_object_list[i], "titleIndex": best_title_index}
        matched_content_object_list.append(matched_object)
    return matched_content_object_list

In [8]:
import re
import copy
import pprint

def pair_lecture_content_final(titles, contents, threshold=0.45,
                               min_transcript_length=10, window_size=8):
    """Assign each transcript segment the index of the best matching title.

    Matching is based on character overlap: a title is stripped of the
    characters in a fixed noise pattern, and its score is the fraction of its
    remaining distinct characters that also occur in the transcript text.
    Segments are processed in order and only titles inside a forward window
    starting at the most recently matched title are considered. Among the
    titles that reach ``threshold``, the winner is the one with the highest
    ``overlap / (1 + distance)``, where ``distance`` is the offset from the
    window start; ties are broken by the larger raw overlap, then by the
    earlier title.

    Args:
        titles: Sequence of mappings with a ``"content"`` string.
        contents: Sequence of mappings with a ``"content"`` string. The input
            is deep-copied and never modified.
        threshold: Minimum overlap fraction for a title to be a candidate.
        min_transcript_length: Transcripts shorter than this many characters
            are not matched at all.
        window_size: Number of titles examined per segment, counted from the
            most recently matched title (which is itself included).

    Returns:
        A deep copy of ``contents`` in which every item has a ``"titleIndex"``
        key: the index of the matched title, or ``-1`` when the transcript is
        too short or no title in the window reached ``threshold``.
    """
    # Characters removed from a title before comparison. Note this is a
    # character class, so the letters V, A, c, t, s are stripped individually.
    # Compiled once here instead of on every inner-loop iteration.
    title_noise_pattern = re.compile(r'[（）\s():\-“”,.V徒Acts]')

    modified_contents = copy.deepcopy(contents)
    search_start_index = 0

    for content_item in modified_contents:
        transcript_text = content_item['content']
        if len(transcript_text) < min_transcript_length:
            content_item['titleIndex'] = -1
            continue
        transcript_chars = set(transcript_text)

        # The window never extends past the end of the title list.
        search_end_index = min(search_start_index + window_size, len(titles))

        candidates = []
        for j in range(search_start_index, search_end_index):
            title_chars = set(title_noise_pattern.sub('', titles[j]['content']))
            if not title_chars:
                # Title consisted only of noise characters; nothing to compare.
                continue

            overlap = len(transcript_chars.intersection(title_chars))
            score = overlap / len(title_chars)
            if score < threshold:
                continue

            # Contiguity bonus: 1.0 for the current title, shrinking as the
            # candidate lies further ahead in the window.
            distance = j - search_start_index
            bonus = 1.0 / (1.0 + distance)
            candidates.append((overlap * bonus, overlap, j))

        if not candidates:
            content_item['titleIndex'] = -1
            continue

        # max() keeps the first maximal element, which is the same winner a
        # stable descending sort would put in front.
        best_index = max(candidates, key=lambda c: (c[0], c[1]))[2]
        content_item['titleIndex'] = best_index
        # The next segment starts searching from the title just matched.
        search_start_index = best_index

    return modified_contents

In [9]:
def update_content_json_files_with_matched_titles(start_index, end_index):
    """Pair titles with contents for a range of parts and save the result.

    For every part number in ``range(start_index, end_index)`` the title JSON
    (under ``title_from_ppt/``) and the content JSON (under
    ``combined_text_block/``) are loaded, paired with
    ``pair_lecture_content_final``, and the paired list is written back over
    the content JSON file.
    """
    for part_number in range(start_index, end_index):
        title_path = file_name_builder(
            f"{folder_path}title_from_ppt/", "part", "json", part_number
        )
        content_path = file_name_builder(
            f"{folder_path}combined_text_block/", "part", "json", part_number
        )

        with open(title_path, "r", encoding="utf-8") as title_file:
            titles = json.load(title_file)
        with open(content_path, "r", encoding="utf-8") as content_file:
            contents = json.load(content_file)

        paired_contents = pair_lecture_content_final(titles, contents)

        # The content file is overwritten in place with the paired version.
        with open(content_path, "w", encoding="utf-8") as content_file:
            json.dump(paired_contents, content_file, ensure_ascii=False, indent=2)

In [10]:
# Process parts 0, 1 and 2 (the end index is exclusive); each content JSON
# file is rewritten in place.
update_content_json_files_with_matched_titles(start_index=0, end_index=3)