In [1]:
import os
from pypdf import PdfReader
import pandas as pd

In [3]:
file = "../data/pdf/Boston.pdf"

In [8]:
# read and extrt text from PDF
reader = PdfReader(file)
text = ""
for page in reader.pages:
    text += page.extract_text()
# split text into lines
# create a DataFrame from the lines
df = pd.DataFrame(text.split('\n'), columns=['Text'])


In [9]:
df

Unnamed: 0,Text
0,C i t y o f B o s t o n I n t e r i m G u i d ...
1,V e r s i o n 1 . 1 P r e p a r e d b y S a n...
2,Purpose
3,GenerativeAIisasetofrelativelynewtechnologiest...
4,G e n e r a t i v e A I i s a t o o l . W e a...
5,Theseguidelinesshouldbereplacedinthefuturewit...
6,Unset
7,Unset
8,SampleUseCases
9,T h e s e a r e s o m e o f t h e t y p e s o ...


In [12]:
# Improved extraction using PyMuPDF (fitz) with spacing fixes
import re
try:
    import fitz  # PyMuPDF
except ImportError:
    raise RuntimeError("PyMuPDF not installed. Run: pip install pymupdf")

SPACED_LETTERS_RX = re.compile(r'\b(?:[A-Za-z]\s){3,}[A-Za-z]\b')
def unsplit_spaced_words(line: str) -> str:
    def _join(m: re.Match) -> str:
        return m.group(0).replace(' ', '')
    prev = None
    while prev != line:
        prev = line
        line = SPACED_LETTERS_RX.sub(_join, line)
    return line

def extract_text_preserve_words(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    lines = []
    for page in doc:
        # get words with positions to preserve natural order and spacing
        words = page.get_text("words")  # list of (x0, y0, x1, y1, word, blockno, lineno, wordno)
        # sort by line then x
        words.sort(key=lambda w: (w[6], w[0]))
        cur_line_no = None
        cur_line_words = []
        for w in words:
            line_no = w[6]
            word = w[4]
            if cur_line_no is None:
                cur_line_no = line_no
            if line_no != cur_line_no:
                # finalize previous line
                line_text = ' '.join(cur_line_words)
                lines.append(line_text)
                cur_line_words = [word]
                cur_line_no = line_no
            else:
                cur_line_words.append(word)
        if cur_line_words:
            lines.append(' '.join(cur_line_words))
    doc.close()
    # Fix spaced letters if any slipped through
    lines = [unsplit_spaced_words(l) for l in lines]
    return '\n'.join(lines)

# Use improved extractor
better_text = extract_text_preserve_words(file)
df_better = pd.DataFrame(better_text.split('\n'), columns=['Text'])
df_better

Unnamed: 0,Text
0,City Version Purpose Generative Generative The...
1,for Prepared volumes autocorrect encourage Usi...
2,"Published: based wrote, yourselves on we input..."
3,Applies ChatGPT excuse support our events to: ...
4,Schools subject about these of active technolo...
...,...
113,"Lucas, Professor of the Practice at Northeaste..."
114,Harvard Business School; Alejandro Jimenez Jar...
115,"Office of New Urban Mechanics; Jerry Kelley, p..."
116,"Innovation and Technology, Kerry Jordan, Chief..."


In [20]:
pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Downloading cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Downloading cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)
Collecting cffi>=2.0.0 (from cryptography>=36.0.0->pdfminer.six)
  Downloading cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.6 kB)
Collecting pycparser (from cffi>=2.0.0->cryptography>=36.0.0->pdfminer.six)
Collecting cffi>=2.0.0 (from cryptography>=36.0.0->pdfminer.six)
  Downloading cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.6 kB)
Collecting pycparser (from cffi>=2.0.0->cryptography>=36.0.0->pdfminer.six)
  Downloading pycparser-2.23-py3-none-any.whl.metadata (993 bytes)
Downloading pdfminer_six-20251107-py3-none-any.

In [21]:
# Alternative 1: pdfminer.six extraction (robust layout analysis)
try:
    from pdfminer.high_level import extract_text as pdfminer_extract_text
except ImportError:
    raise RuntimeError("pdfminer.six not installed. Run: pip install pdfminer.six")

text_pdfminer = pdfminer_extract_text(file)
# Optional fix for spaced letters
lines_pdfminer = [unsplit_spaced_words(l) for l in text_pdfminer.split('\n')]
df_pdfminer = pd.DataFrame(lines_pdfminer, columns=['Text'])
df_pdfminer

Unnamed: 0,Text
0,City of Boston Interim Guidelines
1,for Using Generative AI
2,
3,Version 1.1
4,"Prepared by Santiago Garces, Chief Information..."
...,...
452,"Ofﬁce of New Urban Mechanics; Jerry Kelley, pr..."
453,"Innovation and Technology, Kerry Jordan, Chief..."
454,and Technology.
455,
