In [1]:
import json
from rapidfuzz.distance import LCSseq

In [2]:
# Path of the JSON file holding the text chunks that this notebook loads.
input_json = "../data/raw/merged_text_chunks.json"
# Path of the plain-text file that the merged text is written to.
output_text = "../data/raw/merged_text_chunks.txt"

In [3]:
def find_overlap_end(tail, head):
    """
    Return how many leading characters of 'head' repeat the end of 'tail'.

    The overlap is the longest suffix of 'tail' that is also a prefix of
    'head', compared exactly (character for character).

    Args:
        tail: Trailing window of the previous chunk.
        head: Leading window of the current chunk.

    Returns:
        Length of the overlap in characters; 0 when 'head' does not start
        with any suffix of 'tail' (including when either string is empty).
    """
    # A longest-common-subsequence length is not an overlap length: two
    # unrelated passages still share many scattered characters, so using it
    # as a cut point would drop genuine text from 'head'. Only a contiguous
    # suffix/prefix match marks duplicated content.
    longest_possible = min(len(tail), len(head))

    # Try the longest candidate first so the first hit is the maximal overlap.
    for size in range(longest_possible, 0, -1):
        if tail.endswith(head[:size]):
            return size
    return 0

def merge_chunks(chunks, overlap_size):
    """
    Join consecutive text chunks into one string, dropping repeated text.

    For every pair of neighbouring chunks, the last 'overlap_size' characters
    of the earlier chunk are compared with the first 'overlap_size' characters
    of the later one via find_overlap_end, and that many leading characters of
    the later chunk are skipped.

    Args:
        chunks: Sequence of strings in document order.
        overlap_size: Maximum number of characters inspected on each side of
            a chunk boundary. Values <= 0 disable overlap removal.

    Returns:
        The merged text, or "" when 'chunks' is empty.
    """
    if not chunks:
        return ""

    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = [chunks[0]]
    for prev, curr in zip(chunks, chunks[1:]):
        if overlap_size > 0:
            k = find_overlap_end(prev[-overlap_size:], curr[:overlap_size])
        else:
            # A non-positive window would make the slices above select the
            # wrong ends of the strings, so treat it as "no overlap".
            k = 0
        pieces.append(curr[k:])

    return "".join(pieces)

In [4]:
# JSON text is UTF-8 by specification; pin the encoding so the read does not
# depend on the platform's default locale.
with open(input_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# Maximum number of characters compared on each side of a chunk boundary.
# NOTE(review): the previous comment said 300 characters, but the value in
# effect is 250 -- confirm which one matches how the chunks were produced.
overlap_size = 250

In [5]:
# Strip surrounding whitespace from each record's 'content' string before merging.
chunks = [c['content'].strip() for c in data]
merged_text = merge_chunks(chunks, overlap_size)

# Write explicitly as UTF-8 so non-ASCII text is stored the same way on every
# platform instead of following the locale default.
with open(output_text, "w", encoding="utf-8") as f:
    f.write(merged_text)