In [2]:
import csv
import difflib
import html
import time

import lmstudio as lms
import pandas as pd
from IPython.display import display, HTML, Markdown

## Extract Table from Image

In [None]:
basepath = '.raw/'

# Prompt sent with every page image. The pieces are joined by implicit string
# concatenation, so each piece has to carry its own separating space
# (previously the model received "...display as a table.I only want...").
message = (
    "The attached image has three columns of text for Bible Reference, "
    "Changed From, and Changed To. Extract the text from the image and display as a table. "
    "I only want the table returned with no commentary"
)

# extract table from each image (pages 1.png .. 27.png under basepath)
for i in range(1, 27 + 1):
    start_time = time.perf_counter()
    filename = f'{i}.png'
    img_path = f'{basepath}{filename}'
    print(f'Extracting table from {img_path}...', end=' ')

    # create LM Studio client, model handle and a fresh chat for each page
    with lms.Client() as client:
        image_handle = client.files.prepare_image(img_path)
        model = client.llm.model("qwen/qwen3-vl-8b")
        chat = lms.Chat()
        chat.add_user_message(message, images=[image_handle])
        prediction = model.respond(chat)

        # print execution time (measured before the result is written to disk)
        finish_time = time.perf_counter()
        total_time = finish_time - start_time
        print(f'Finished in {total_time:.3f} seconds')

        # Page 1 truncates the output file and later pages append, so that
        # re-running this cell does not stack a second copy of every page
        # (and a second header row) onto the previous run's output.
        # NOTE(review): the cell that loads the table reads
        # '.tables/esv2011revisions.md' -- confirm whether this path is
        # intentional (e.g. a raw file that is cleaned up by hand and renamed).
        mode = 'w' if i == 1 else 'a'
        with open('_tables/_esv2011revisions.md', mode, encoding='utf-8') as f:
            if i == 1:      # include header
                print(prediction.content, file=f)
            else:           # remove header
                # Drops the first two lines of the response, which are assumed
                # to be the header row and the |---| separator row.
                table = prediction.content.split('\n')[2:]
                content = '\n'.join(table)
                print(content, file=f)

# finished in 17 min on MBP

## Highlight Differences Between Columns

In [4]:
# Parse the pipe-delimited Markdown table into a DataFrame.
# Column names are left exactly as they appear in the file; later cells index
# them with a trailing space (e.g. 'Bible Reference ').
revisions = pd.read_csv(
    '.tables/esv2011revisions.md',
    sep='|',                 # cells are delimited by pipes
    skipinitialspace=True,   # ignore the spaces that follow each pipe
    usecols=[1, 2, 3],       # keep fields 1-3 (field 0 is presumably the empty text before the leading pipe -- TODO confirm)
    skiprows=[1],            # skip the file's second line (presumably the |---| separator row)
    quoting=csv.QUOTE_NONE,  # quote characters are ordinary text, not field quoting
)

### Markdown

In [16]:
# Build a Markdown table in which the words that differ between the
# 'Change From ' column (2007 text) and the 'Change To ' column (2011 text)
# are wrapped in ** ** (bold), then save it to disk.
md_lines = ['|Bible Reference|2007 Text|2011 Text|', '|---|---|---|']

# one matcher is reused for every row; set_seqs() re-seeds it each time
matcher = difflib.SequenceMatcher()

# iterate through changes
for _, row in revisions.iterrows():
    ref = row['Bible Reference ']

    # Tokenise each side once per row. pandas reads an empty cell as NaN
    # (a float), which has no .split(); treat that as "no words".
    old_words = [] if pd.isna(row['Change From ']) else str(row['Change From ']).split()
    new_words = [] if pd.isna(row['Change To ']) else str(row['Change To ']).split()

    # Get opcodes for word-level comparison
    matcher.set_seqs(old_words, new_words)

    old_parts = []
    new_parts = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        old_chunk = ' '.join(old_words[i1:i2])
        new_chunk = ' '.join(new_words[j1:j2])
        if tag == 'equal':
            # add unchanged words
            old_parts.append(old_chunk)
            new_parts.append(new_chunk)
        else:
            # 'replace' highlights both sides; 'delete' has an empty range on
            # the new side and 'insert' an empty range on the old side, so the
            # emptiness checks leave the untouched column alone.
            if old_chunk:
                old_parts.append(f'**{old_chunk}**')
            if new_chunk:
                new_parts.append(f'**{new_chunk}**')

    md_lines.append(f'|{ref}|{" ".join(old_parts)}|{" ".join(new_parts)}|')

md_table = '\n'.join(md_lines) + '\n'

# save to file
with open('.tables/_esv2011revisions_formatted.md', 'w', encoding='utf-8') as f:
    print(md_table, file=f)

# Markdown(md_table)

### HTML

In [None]:
# Build an HTML table in which the words that differ between the
# 'Change From ' column (2007 text) and the 'Change To ' column (2011 text)
# are wrapped in <b> tags, then save it to disk.
html_lines = [
    '<table>',
    '<tr><th>Bible Reference</th><th>2007 Text</th><th>2011 Text</th></tr>',
]

# one matcher is reused for every row; set_seqs() re-seeds it each time
matcher = difflib.SequenceMatcher()

# iterate through changes
for _, row in revisions.iterrows():
    # Escape &, < and > so that text from the table cannot be read as markup.
    ref = html.escape(str(row['Bible Reference ']), quote=False)

    # Tokenise each side once per row. pandas reads an empty cell as NaN
    # (a float), which has no .split(); treat that as "no words".
    old_words = [] if pd.isna(row['Change From ']) else str(row['Change From ']).split()
    new_words = [] if pd.isna(row['Change To ']) else str(row['Change To ']).split()

    # Get opcodes for word-level comparison
    matcher.set_seqs(old_words, new_words)

    old_parts = []
    new_parts = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        old_chunk = html.escape(' '.join(old_words[i1:i2]), quote=False)
        new_chunk = html.escape(' '.join(new_words[j1:j2]), quote=False)
        if tag == 'equal':
            # add unchanged words
            old_parts.append(old_chunk)
            new_parts.append(new_chunk)
        else:
            # 'replace' highlights both sides; 'delete' has an empty range on
            # the new side and 'insert' an empty range on the old side, so the
            # emptiness checks leave the untouched column alone.
            if old_chunk:
                old_parts.append(f'<b>{old_chunk}</b>')
            if new_chunk:
                new_parts.append(f'<b>{new_chunk}</b>')

    html_lines.append(
        f'<tr><td>{ref}</td><td>{" ".join(old_parts)}</td><td>{" ".join(new_parts)}</td></tr>'
    )

html_lines.append('</table>')
html_table = '\n'.join(html_lines)

# save to file
with open('.tables/_esv2011revisions_formatted.html', 'w', encoding='utf-8') as f:
    print(html_table, file=f)

# HTML(html_table)