# Delta in Base texts

In [1]:
import json
import unicodedata
import difflib
from difflib import SequenceMatcher
from itertools import zip_longest

def remove_all_punctuation(text):
    """Return *text* with every Unicode punctuation character dropped.

    A character is treated as punctuation when its Unicode general
    category begins with ``P`` (connector, dash, open/close, quote and
    "other" punctuation). Symbols, digits, letters and whitespace are kept.
    """
    kept_chars = [ch for ch in text if unicodedata.category(ch)[0] != "P"]
    return "".join(kept_chars)

def normalize_text(text):
    """Return *text* reduced to its words separated by single spaces.

    Punctuation is stripped first (via ``remove_all_punctuation``); the
    remainder is split on any run of whitespace and re-joined, which also
    removes leading and trailing whitespace.
    """
    words = remove_all_punctuation(text).split()
    return " ".join(words)

# Inline CSS used by highlight_diff for each kind of difference.
_INSERT_STYLE = "background-color: lightblue;"
_DELETE_STYLE = "background-color: lightcoral; text-decoration: line-through;"
_SIMILAR_STYLE = "background-color: lightyellow;"


def _escape_html(text):
    """Escape ``&``, ``<`` and ``>`` so *text* is safe as HTML text content."""
    import html  # local import keeps this block self-contained

    return html.escape(text, quote=False)


def _styled_span(style, text):
    """Wrap *text* (HTML-escaped) in a ``<span>`` carrying the inline *style*."""
    return f'<span style="{style}">{_escape_html(text)}</span>'


def highlight_diff(n1904_text, comp_text, similarity_threshold=0.5):
    """
    Compare the N1904 text (baseline) to another version's text on a
    word-by-word basis and return comp_text as HTML with differences marked:
      - Inserted words (present in comp_text but not in N1904): light blue background.
      - Deleted words (present in N1904 but missing in comp_text): light red background with strikethrough.
      - Replaced words that are similar (ratio >= similarity_threshold): light yellow background.
      - Replaced words that are not similar: treated as an insertion (light blue);
        the baseline word is not shown in that case.

    Both texts are split on whitespace; callers are expected to pass
    already-normalized text if punctuation should be ignored.

    Args:
        n1904_text: Baseline text.
        comp_text: Text of the version being compared against the baseline.
        similarity_threshold: Minimum character-level ``SequenceMatcher``
            ratio for two aligned words to count as variants of the same
            word. Defaults to 0.5.

    Returns:
        A single string of space-separated tokens. Every word is
        HTML-escaped so characters such as ``<`` or ``&`` in the source
        texts cannot break the surrounding markup.
    """
    base_words = n1904_text.split()
    comp_words = comp_text.split()
    matcher = SequenceMatcher(None, base_words, comp_words)
    result_tokens = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            result_tokens.extend(_escape_html(word) for word in comp_words[j1:j2])
        elif tag == "insert":
            # A run of words added in comp_text is wrapped in one span.
            result_tokens.append(
                _styled_span(_INSERT_STYLE, " ".join(comp_words[j1:j2]))
            )
        elif tag == "delete":
            # A run of words missing from comp_text is wrapped in one span.
            result_tokens.append(
                _styled_span(_DELETE_STYLE, " ".join(base_words[i1:i2]))
            )
        elif tag == "replace":
            # Pair the replaced words positionally; the longer side's
            # leftovers become plain insertions or deletions.
            for b_word, c_word in zip_longest(base_words[i1:i2], comp_words[j1:j2]):
                if b_word is None:
                    result_tokens.append(_styled_span(_INSERT_STYLE, c_word))
                elif c_word is None:
                    result_tokens.append(_styled_span(_DELETE_STYLE, b_word))
                else:
                    # Similarity is measured on the raw (unescaped) words.
                    ratio = SequenceMatcher(None, b_word, c_word).ratio()
                    style = (
                        _SIMILAR_STYLE
                        if ratio >= similarity_threshold
                        else _INSERT_STYLE
                    )
                    result_tokens.append(_styled_span(style, c_word))
    return " ".join(result_tokens)

# Read the combined verse data: {verse_id: {version_name: text}}.
with open("Combined-John.json", "r", encoding="utf-8") as f:
    combined_data = json.load(f)

# Normalize every version's text in every verse.
normalized_data = {
    verse_id: {
        version: normalize_text(text) for version, text in versions.items()
    }
    for verse_id, versions in combined_data.items()
}

# Keep only the verses in which at least one version departs from N1904.
# Each kept verse lists the differing versions first, then N1904 itself.
final_data = {}
for verse_id, versions in normalized_data.items():
    # A verse with no N1904 entry is compared against the empty string.
    n1904_text = versions.get("N1904", "")
    diff_versions = {
        version: text
        for version, text in versions.items()
        if version != "N1904" and text != n1904_text
    }
    if not diff_versions:
        continue
    diff_versions["N1904"] = n1904_text
    final_data[verse_id] = diff_versions

# Annotate each differing version with HTML highlighting; the N1904
# baseline is stored first and left as plain text.
annotated_final_data = {}
for verse_id, versions in final_data.items():
    n1904_text = versions["N1904"]
    annotated_versions = {"N1904": n1904_text}
    annotated_versions.update(
        (version, highlight_diff(n1904_text, text))
        for version, text in versions.items()
        if version != "N1904"
    )
    annotated_final_data[verse_id] = annotated_versions

# Persist the annotated data so it can be reused outside this notebook.
with open("John-diff-annotated.json", "w", encoding="utf-8") as f:
    json.dump(annotated_final_data, f, ensure_ascii=False, indent=2)

print("Final annotated data saved to John-diff-annotated.json")


Final annotated data saved to John-diff-annotated.json


# Now create HTML output from the annotated data built above

In [2]:
from IPython.display import display, HTML

# Assemble the HTML document piece by piece, then join once at the end.
html_parts = ["<html><head><meta charset='utf-8'></head><body>"]
for verse_id, versions in annotated_final_data.items():
    # Chapter and verse are sliced out of the verse id at fixed positions,
    # with leading zeros removed for display.
    chapter = verse_id[3:5].lstrip("0")
    verse = verse_id[6:].lstrip("0")
    html_parts.append(f"<h2>John {chapter}:{verse}</h2>")
    html_parts.extend(
        f"<b>{version}:</b> {annotated_text}<br>"
        for version, annotated_text in versions.items()
    )
html_parts.append("</body></html>")
html_content = "".join(html_parts)

# Render the assembled document inline in the notebook.
display(HTML(html_content))