### Loading in libraries

In [11]:
import base64
import json
import os
import random
import re
from difflib import SequenceMatcher
from time import sleep

import boto3
import fitz
import PyPDF2
import vertexai
from IPython.display import Audio, display, clear_output, HTML
from PIL import Image, ImageDraw, ImageFont
from vertexai.generative_models import GenerativeModel, Part, SafetySetting

### Load in text for a page

In [56]:
# Zero-based index of the PDF page this notebook processes; output file names use PAGE_NUMBER + 1.
PAGE_NUMBER = 15

In [57]:
# Pull the raw text of the selected page with PyPDF2 and show it.
pdf_file_path = 'data/grade_3_english_book.pdf'
page_number = PAGE_NUMBER

with open(pdf_file_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    page = pdf_reader.pages[page_number]
    # extraction happens while the file is still open
    text = page.extract_text()

print(text)

For free distribution For free distribution6 7
Listen, count and say.
10
ten
6
six
7
seven
8
eight
9
nine



### Convert page to image

In [58]:
# Render the selected page to a PNG next to the source PDF.
pdf_file_path = 'data/grade_3_english_book.pdf'

# Open the document in a context manager so its file handle is released as soon as
# the page has been rendered and saved (the original left the document open).
with fitz.open(pdf_file_path) as pdf_document:
    page = pdf_document.load_page(PAGE_NUMBER)
    pix = page.get_pixmap()

    # file names are 1-based while PAGE_NUMBER is a 0-based index
    image_path = f"data/grade_3_english_book_page_{PAGE_NUMBER+1}.png"
    pix.save(image_path)

print(f"Page {PAGE_NUMBER + 1} saved as {image_path}")

Page 16 saved as data/grade_3_english_book_page_16.png


### Visualizing the blocks for the page

- Blocks separate text based on where it belongs
- This helps us group together the things that belong together
- Save image as blocks
- Group the words and their bounding boxes by their blocks

In [59]:
random.seed(42)  # fixed seed so the per-block colours are the same on every run

pdf_file_path = 'data/grade_3_english_book.pdf'
pdf_document = fitz.open(pdf_file_path)

#since we want the bounding boxes, block number, line number, & word numbers for each word in the page
page = pdf_document.load_page(PAGE_NUMBER)
words = page.get_text("words")

# each item in 'words' is a tuple:
# (x0, y0, x1, y1, word, block_no, line_no, word_no)

pix = page.get_pixmap()
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
draw = ImageDraw.Draw(image)

# one random RGB colour per block number found on the page
block_numbers = set(w[5] for w in words)
color_palette = {}
for block_no in block_numbers:
    color = tuple(random.randint(0, 255) for _ in range(3))
    color_palette[block_no] = color

# fall back to PIL's built-in font when arial.ttf cannot be loaded
try:
    font = ImageFont.truetype("arial.ttf", size=14)
except IOError:
    font = ImageFont.load_default()

for word_info in words:
    x0, y0, x1, y1, word, block_no, line_no, word_no = word_info
    rect = [(x0, y0), (x1, y1)]
    color = color_palette[block_no]
    draw.rectangle(rect, outline=color, width=2)
    # label each block once, just above its first word
    if word_no == 0 and line_no == 0:
        label_position = (x0, y0 - 15)
        draw.text(label_position, f"Block {block_no}", fill=color, font=font)

image_path = f"output/annotated_page_{PAGE_NUMBER+1}_blocks.png"
# Create the target folder before saving so this cell also works on a fresh checkout
# where 'output' does not exist yet (image.save does not create directories).
os.makedirs(os.path.dirname(image_path), exist_ok=True)
image.save(image_path)
print(f"Annotated page saved as {image_path}")

Annotated page saved as output/annotated_page_16_blocks.png


In [60]:
# Turn every word tuple into a dict keyed by field name, keeping the page order.
words_with_coords = [
    {
        'text': word,
        'bbox': (x0, y0, x1, y1),
        'block_no': block_no,
        'line_no': line_no,
        'word_no': word_no
    }
    for x0, y0, x1, y1, word, block_no, line_no, word_no in words
]

# Dump each word with its bounding box and block/line/word position.
for word_info in words_with_coords:
    print(f"Word: {word_info['text']}")
    print(f"Coordinates: {word_info['bbox']}")
    print(f"Block: {word_info['block_no']}, Line: {word_info['line_no']}, Word No: {word_info['word_no']}")
    print("---")

Word: For
Coordinates: (33.53379821777344, 738.6478881835938, 49.54179763793945, 755.0518798828125)
Block: 0, Line: 0, Word No: 0
---
Word: free
Coordinates: (52.265804290771484, 738.6478881835938, 69.52179718017578, 755.0518798828125)
Block: 0, Line: 0, Word No: 1
---
Word: distribution
Coordinates: (72.24579620361328, 738.6478881835938, 127.91380310058594, 755.0518798828125)
Block: 0, Line: 0, Word No: 2
---
Word: 6
Coordinates: (38.060001373291016, 18.79998779296875, 53.480003356933594, 60.019989013671875)
Block: 1, Line: 0, Word No: 0
---
Word: Listen,
Coordinates: (62.29439926147461, 51.608978271484375, 122.89439392089844, 75.4289779663086)
Block: 2, Line: 0, Word No: 0
---
Word: count
Coordinates: (126.89439392089844, 51.608978271484375, 181.2144012451172, 75.4289779663086)
Block: 2, Line: 0, Word No: 1
---
Word: and
Coordinates: (186.014404296875, 51.608978271484375, 222.93441772460938, 75.4289779663086)
Block: 2, Line: 0, Word No: 2
---
Word: say.
Coordinates: (227.934417724609

### Cleaning text and applying SSML tags

- Remove noisy text in the page
- Apply SSML tags (needs refinement)

In [61]:
def generate():
    """Send the module-level ``prompt`` to Gemini on Vertex AI and return the reply text.

    Reads the module-level ``prompt``, ``generation_config`` and ``safety_settings``
    at call time, so those must be defined before this function is called.

    Returns:
        The ``text`` attribute of the (non-streaming) model response.
    """
    vertexai.init(project="syy-eag-np-61cd", location="us-central1")
    gemini = GenerativeModel("gemini-1.5-flash-002")
    reply = gemini.generate_content(
        [prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )
    return reply.text

def remove_code_blocks(text):
    """Strip Markdown code-fence wrapping from a model reply, keeping the SSML itself.

    Lines are dropped only when they are wrapper noise:
      * a code fence (optionally indented, optionally followed by a language tag),
      * a line consisting solely of the language tag ``xml``,
      * an ``<?xml ...?>`` declaration.

    Content lines that merely contain the substring "xml" (for example
    ``<speak xml:lang="en-US">``) are kept; the previous substring test removed them
    and could leave the SSML without its opening tag.

    Args:
        text: Raw text returned by the model.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    cleaned_lines = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith('```'):
            continue  # opening/closing fence, e.g. ``` or ```xml
        lowered = stripped.lower()
        if lowered == 'xml' or lowered.startswith('<?xml'):
            continue  # stray language tag or XML declaration
        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

# Few-shot prompt for the model: the instructions, one worked plain-text -> SSML example,
# and finally the page text extracted earlier in the notebook.
# This is an f-string, so {text} is substituted when this cell runs; re-run the cell
# after `text` changes to rebuild the prompt.
prompt = f"""You are professional text to speech transcriber.
Your goal is to add SSML tags to make plain text more human-like whilst ensuring it is easy for students to understand.

Consider the following steps when transforming plain text.

STEP 1: Read the plain text and remove any unnecessary text that is not part of the story (page numbers, publication names, etc). Do not change the text that is part of the story.
STEP 2: Identify the best way to make the story natural & human-like.
STEP 3: Include the necessary SSML tags to make the story natural and have a nicer flow.
STEP 4: Ensure the output is ready to be loaded in with AWS Polly.

Plain text input:
Hello! Today, we are going to learn a new story. Are you ready?
Once upon a time, there was a little cat who loved to play. The cat’s name was Whiskers.
Whiskers had a best friend, a big, brown dog named Buddy.
One day, Whiskers and Buddy decided to go on an adventure. They wanted to find a hidden treasure!
First, they went to the forest. The forest was big and quiet, and Whiskers felt a little scared.
But Buddy said, “Don’t worry, Whiskers! I’m here with you.” And Whiskers felt brave.
Let’s stop here. Did you understand the story? Let’s say the words together: Whiskers and Buddy.
Great job! Let’s continue the story next time.

SSML Output:
<speak>
  <prosody rate=\"slow\">
   Hello! Today, we are going to <emphasis level=\"strong\">learn</emphasis> a new story. Are you <emphasis level=\"moderate\">ready</emphasis>?
  </prosody>
  <break time=\"1s\"/>
   Once upon a time, there was a little <emphasis level=\"moderate\">cat</emphasis> who loved to play. The cat’s name was Whiskers.
  <break time=\"500ms\"/>
   <emphasis level=\"strong\">Whiskers</emphasis> had a best friend, a big, brown <emphasis level=\"moderate\">dog</emphasis> named Buddy.
  <break time=\"1s\"/>
   One day, <emphasis level=\"moderate\">Whiskers</emphasis> and <emphasis level=\"moderate\">Buddy</emphasis> decided to go on an <emphasis level=\"strong\">adventure</emphasis>. They wanted to find a hidden treasure!
  <break time=\"1s\"/>
   First, they went to the <emphasis level=\"strong\">forest</emphasis>. The forest was <prosody rate=\"slow\">big and quiet</prosody>, and <emphasis level=\"moderate\">Whiskers</emphasis> felt a little <emphasis level=\"moderate\">scared</emphasis>.
  <break time=\"1s\"/>
   But Buddy said, “<prosody rate=\"slow\" volume=\"x-loud\">Don’t worry, Whiskers! I’m here with you.</prosody>” And <emphasis level=\"moderate\">Whiskers</emphasis> felt <emphasis level=\"strong\">brave</emphasis>.
  <break time=\"1.5s\"/>
   Let’s <emphasis level=\"strong\">stop</emphasis> here. Did you understand the story? Let’s say the words together: <break time=\"500ms\"/> <emphasis level=\"moderate\">Whiskers</emphasis> and <emphasis level=\"moderate\">Buddy</emphasis>.
  <break time=\"500ms\"/>
   Great job! Let’s continue the story next time.
</speak>

Plain text input: {text}

SSML Output:"""

# Generation settings handed to generate_content.
generation_config = dict(
    max_output_tokens=8192,
    temperature=0.1,
    top_p=0.95,
)

# One SafetySetting per harm category, each with its block threshold set to OFF.
safety_settings = [
    SafetySetting(
        category=harm_category,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    )
    for harm_category in (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
]

In [62]:
# Ask the model for SSML, strip any Markdown fencing from the reply, then show it.
ssml_output = remove_code_blocks(generate())
print(ssml_output)

<speak>
  <prosody rate="slow">Listen, count and say.</prosody>
  <break time="1s"/>
  <say-as interpret-as="cardinal">10</say-as>
  <break time="500ms"/>
  <say-as interpret-as="characters">ten</say-as>
  <break time="500ms"/>
  <say-as interpret-as="cardinal">6</say-as>
  <break time="500ms"/>
  <say-as interpret-as="characters">six</say-as>
  <break time="500ms"/>
  <say-as interpret-as="cardinal">7</say-as>
  <break time="500ms"/>
  <say-as interpret-as="characters">seven</say-as>
  <break time="500ms"/>
  <say-as interpret-as="cardinal">8</say-as>
  <break time="500ms"/>
  <say-as interpret-as="characters">eight</say-as>
  <break time="500ms"/>
  <say-as interpret-as="cardinal">9</say-as>
  <break time="500ms"/>
  <say-as interpret-as="characters">nine</say-as>
</speak>


### Creating speech marks JSON and audio MP3

In [71]:
# Create a boto3 session from a named AWS profile in us-east-1 and build the Polly client from it.
# NOTE(review): the profile name is hard-coded, so this cell presumably only runs on machines
# where that profile is configured — consider taking it from the environment instead.
session = boto3.Session(profile_name='123233845129_DevOpsUser', region_name='us-east-1')
polly_client = session.client('polly')

In [73]:
# Synthesize the SSML twice with Amazon Polly: once as MP3 audio and once as
# word-level speech marks, saving both under the output directory.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
ssml_text = ssml_output

# Arguments shared by both requests, so the audio and its speech marks always come
# from the same SSML, engine and voice.
polly_request = {
    'Engine': 'standard',
    'Text': ssml_text,
    'TextType': 'ssml',
    'VoiceId': 'Joanna',
}

# generate & save the mp3 for the ssml generated, swap to neural for prod
response = polly_client.synthesize_speech(OutputFormat='mp3', **polly_request)

audio_stream = response.get('AudioStream')
if audio_stream is None:
    raise RuntimeError("Polly response for the mp3 request contained no AudioStream")
audio_file_path = os.path.join(output_dir, f"page_{PAGE_NUMBER+1}_audio.mp3")
try:
    with open(audio_file_path, 'wb') as audio_file:
        audio_file.write(audio_stream.read())
finally:
    # close the response body explicitly instead of leaving it to garbage collection
    audio_stream.close()

# generate & save the json with the speech markers
speech_marks_response = polly_client.synthesize_speech(
    OutputFormat='json',
    SpeechMarkTypes=['word'],
    **polly_request
)

speech_marks_body = speech_marks_response.get('AudioStream')
if speech_marks_body is None:
    raise RuntimeError("Polly response for the speech-marks request contained no AudioStream")
try:
    speech_marks_stream = speech_marks_body.read().decode('utf-8')
finally:
    speech_marks_body.close()

# the payload is parsed as one JSON object per line
speech_marks = [json.loads(line) for line in speech_marks_stream.strip().split('\n') if line]

speech_marks_file_path = os.path.join(output_dir, f"page_{PAGE_NUMBER+1}_speech_marks.json")
with open(speech_marks_file_path, 'w') as json_file:
    json.dump(speech_marks, json_file, indent=2)

for mark in speech_marks:
    print(mark)

{'time': 6, 'type': 'word', 'start': 31, 'end': 37, 'value': 'Listen'}
{'time': 811, 'type': 'word', 'start': 39, 'end': 44, 'value': 'count'}
{'time': 1135, 'type': 'word', 'start': 45, 'end': 48, 'value': 'and'}
{'time': 1309, 'type': 'word', 'start': 49, 'end': 52, 'value': 'say'}
{'time': 1811, 'type': 'word', 'start': 66, 'end': 84, 'value': '<break time="1s"/>'}
{'time': 3198, 'type': 'word', 'start': 119, 'end': 121, 'value': '10'}
{'time': 3754, 'type': 'word', 'start': 133, 'end': 154, 'value': '<break time="500ms"/>'}
{'time': 4349, 'type': 'word', 'start': 191, 'end': 194, 'value': 'ten'}
{'time': 5102, 'type': 'word', 'start': 206, 'end': 227, 'value': '<break time="500ms"/>'}
{'time': 5695, 'type': 'word', 'start': 262, 'end': 263, 'value': '6'}
{'time': 6357, 'type': 'word', 'start': 275, 'end': 296, 'value': '<break time="500ms"/>'}
{'time': 6950, 'type': 'word', 'start': 333, 'end': 336, 'value': 'six'}
{'time': 7940, 'type': 'word', 'start': 348, 'end': 369, 'value': '

### Sync together speech words with blocks

In [74]:
# Reload the speech marks from the JSON file saved for this page.
speech_marks_path = f"output/page_{PAGE_NUMBER+1}_speech_marks.json"
with open(speech_marks_path, 'r') as json_file:
    speech_marks = json.load(json_file)

# tells real spoken words apart from self-closing SSML tags that show up as marks
def is_valid_word(mark):
    """Return False when the mark's value is a self-closing tag (``<.../>``), else True.

    Args:
        mark: A speech-mark dict with a string under ``'value'``.
    """
    value = mark['value']
    looks_like_tag = value.startswith('<') and value.endswith('/>')
    return not looks_like_tag

# keep only the marks that are actual words
speech_marks_filtered = list(filter(is_valid_word, speech_marks))

# lower-case a word and strip its punctuation so both sources compare equal
def normalize_word(word):
    """Return ``word`` lower-cased with every non-word, non-whitespace character removed."""
    return re.sub(r'[^\w\s]', '', word.lower())

# Store the normalized form on each speech mark and each PDF word (the dicts are updated in place).
for speech_mark in speech_marks_filtered:
    speech_mark['normalized_value'] = normalize_word(speech_mark['value'])

for pdf_word in words_with_coords:
    pdf_word['normalized_text'] = normalize_word(pdf_word['text'])

In [75]:
# Align the spoken word sequence with the PDF word sequence.
# get the list of normalized words and match the blocks
speech_normalized_words = [mark['normalized_value'] for mark in speech_marks_filtered]
pdf_normalized_words = [info['normalized_text'] for info in words_with_coords]

# autojunk=False: by default difflib treats frequently repeated items as junk once a
# sequence is 200+ items long, which would silently drop common words from the
# alignment on longer pages. Shorter pages give the same result either way.
matcher = SequenceMatcher(None, speech_normalized_words, pdf_normalized_words, autojunk=False)
matching_blocks = matcher.get_matching_blocks()

# build a mapping from speech marks to PDF words
# each matching block (i, j, n) pairs speech words i..i+n-1 with PDF words j..j+n-1;
# the final block returned by difflib has n == 0 and contributes nothing
word_mappings = [
    {
        'speech_mark': speech_marks_filtered[i + k],
        'pdf_word_info': words_with_coords[j + k]
    }
    for i, j, n in matching_blocks
    for k in range(n)
]

In [76]:
# Reduce every mapping to the two fields needed downstream: when the word is spoken
# and where it sits on the page.
mapped_words = [
    {
        'time_ms': mapping['speech_mark']['time'],
        'bbox': mapping['pdf_word_info']['bbox']
    }
    for mapping in word_mappings
]

print(mapped_words)

[{'time_ms': 6, 'bbox': (62.29439926147461, 51.608978271484375, 122.89439392089844, 75.4289779663086)}, {'time_ms': 811, 'bbox': (126.89439392089844, 51.608978271484375, 181.2144012451172, 75.4289779663086)}, {'time_ms': 1135, 'bbox': (186.014404296875, 51.608978271484375, 222.93441772460938, 75.4289779663086)}, {'time_ms': 1309, 'bbox': (227.93441772460938, 51.608978271484375, 263.9544372558594, 75.4289779663086)}, {'time_ms': 3198, 'bbox': (499.6839904785156, 630.9380493164062, 527.2779541015625, 658.1597290039062)}, {'time_ms': 4349, 'bbox': (491.458984375, 677.738037109375, 521.6837158203125, 704.959716796875)}, {'time_ms': 5695, 'bbox': (515.56298828125, 120.278076171875, 529.35693359375, 147.499755859375)}, {'time_ms': 6950, 'bbox': (504.0379943847656, 167.07806396484375, 530.3379516601562, 194.29974365234375)}, {'time_ms': 8540, 'bbox': (504.9989929199219, 240.87808227539062, 518.79296875, 268.0997619628906)}, {'time_ms': 9803, 'bbox': (492.52398681640625, 287.6780700683594, 547