In [1]:
import os
from pypdf import PdfReader
import pandas as pd

In [2]:
# Directory that is scanned for input PDFs. The path is relative, so it is
# resolved against the working directory the notebook kernel runs in.
folder_path = '../data/pdf'

In [32]:
from pdfminer.high_level import extract_text as pdfminer_extract_text


# Every entry in folder_path whose name ends with '.pdf' (case-sensitive).
# os.listdir makes no ordering promise, so row order follows the filesystem.
pdfs = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]

# One record per (document, page).
# NOTE(review): `unsplit_spaced_words` is defined in a cell further down this
# notebook, so on a fresh kernel that cell has to be executed before this one.
texts = []
for pdf in pdfs:
    file_path = os.path.join(folder_path, pdf)
    # pypdf is used only for the page count; the text itself comes from pdfminer.
    reader = PdfReader(file_path)
    for idx in range(len(reader.pages)):
        # idx is passed to pdfminer as-is; the stored page number is idx + 1.
        page_text = pdfminer_extract_text(file_path, page_numbers=[idx])
        # Re-join letter-spaced words line by line.
        page_text = '\n'.join(map(unsplit_spaced_words, page_text.split('\n')))
        texts.append(dict(document=pdf, page=idx + 1, text=page_text))

df = pd.DataFrame(texts)
df

Unnamed: 0,document,page,text
0,SaltLakeCity.pdf,1,ARTICLE E. USE OF GENERATIVE ARTIFICIAL INTELL...
1,SaltLakeCity.pdf,2,stereotypes. Users must edit or disregard bias...
2,Cambridge.pdf,1,City of Cambridge Guidelines on Using Generati...
3,Cambridge.pdf,2,"Only use City-approved, authorized AI tools fo..."
4,Cambridge.pdf,3,• Recommended Practice: To improve team learn...
5,Cambridge.pdf,4,"o Copilot Chat, a Microsoft Generative AI cha..."
6,Cambridge.pdf,5,• City of Cambridge Written Information Secur...
7,Cambridge.pdf,6,"o Test results, diagnoses, or treatment detai..."
8,Cambridge.pdf,7,"used in limited non-sensitive, low-risk \ncirc..."
9,LongBeach.pdf,1,Generative AI Guidance \n\nV 1.3 \n\nIntroduct...


In [50]:
import re, unicodedata

# Single-character substitutions applied after NFKC normalization.
UNICODE_FIXES = {
    # Soft hyphen: removed outright.
    '\u00AD': '',
    # Hyphen / dash / minus variants: folded to an ASCII hyphen.
    '\u2010': '-',
    '\u2011': '-',
    '\u2012': '-',
    '\u2013': '-',
    '\u2014': '-',
    '\u2212': '-',
    # Bullet-like marks: replaced by a space.
    '\u2022': ' ',
    '\u25CF': ' ',
    '\u25AA': ' ',
    '\u2027': ' ',
    '\u00B7': ' ',
    # fi / fl ligatures: expanded to plain letters.
    '\ufb01': 'fi',
    '\ufb02': 'fl',
}


def normalize_unicode(s: str) -> str:
    """Return ``s`` NFKC-normalized with the ``UNICODE_FIXES`` substitutions applied.

    Args:
        s: Text to normalize.

    Returns:
        The normalized string; characters not listed in ``UNICODE_FIXES`` are
        left exactly as NFKC produced them.
    """
    # Every key is a single character and no replacement contains a key, so a
    # one-pass translate gives the same result as replacing key by key.
    table = str.maketrans(UNICODE_FIXES)
    return unicodedata.normalize('NFKC', s).translate(table)

# Four or more single ASCII letters, each separated by exactly one whitespace
# character and delimited by word boundaries (e.g. "B O S T O N").
SPACED_LETTERS_RX = re.compile(r'\b(?:[A-Za-z]\s){3,}[A-Za-z]\b')
# Any single whitespace character; used to squeeze a matched run together.
_INNER_WS_RX = re.compile(r'\s')


def unsplit_spaced_words(line: str) -> str:
    """Join letter-spaced words such as ``"B O S T O N"`` into ``"BOSTON"``.

    Args:
        line: A single line of text (callers split on ``'\\n'`` first).

    Returns:
        The line with every run matched by ``SPACED_LETTERS_RX`` collapsed to
        its letters. Runs of fewer than four letters are left untouched.
    """
    def _join(m: re.Match) -> str:
        # The pattern accepts any ``\s`` between letters (tab, NBSP, ...), so
        # strip all whitespace here. Removing only ' ' left such runs unjoined.
        return _INNER_WS_RX.sub('', m.group(0))

    # Re-apply until the line stops changing.
    prev = None
    while prev != line:
        prev = line
        line = SPACED_LETTERS_RX.sub(_join, line)
    return line

# --- Word-boundary splitting; relies on letter case, so it must run before lowercasing ---
CAMEL_RX = re.compile(r'([a-z])([A-Z])')
ALNUM_RX_1 = re.compile(r'([A-Za-z])(\d)')
ALNUM_RX_2 = re.compile(r'(\d)([A-Za-z])')


def split_word_boundaries(line: str) -> str:
    """Insert a space at glued word boundaries within ``line``.

    Three transitions are split, in this order:
      1. lowercase -> uppercase (``InterimGuidelines`` -> ``Interim Guidelines``)
      2. letter -> digit        (``GPT4`` -> ``GPT 4``)
      3. digit -> letter        (``3D`` -> ``3 D``)

    Args:
        line: Text to process.

    Returns:
        The text with a single space inserted at each such transition.
    """
    for boundary_rx in (CAMEL_RX, ALNUM_RX_1, ALNUM_RX_2):
        line = boundary_rx.sub(r'\1 \2', line)
    return line

# --- Header/banner patterns seen in the source PDFs (including the Boston additions) ---
# Each entry is a regex fragment. HEADER_RX below joins them into one
# alternation, so entries are tried in list order. "." does not match a
# newline here (re.S is not set), so ".*" stops at the end of the line;
# only the entries written with \s+ can span several lines.
HEADER_PATTERNS = [
    r'City of Cambridge Guidelines on Using Generative Artificial Intelligence.*',
    r'City of Cambridge Guidelines.*',
    r'San Francisco Generative AI Guidelines.*',
    r'ARTICLE\s+E\..*',
    r'Use of Generative Artificial Intelligence.*',
    r'City of San Jos\u00e9 City Administrative Policy Manual Artificial Intelligence.*',
    r'City of San Jos\u00e9 .* Artificial Intelligence.*',
    r'Artificial Intelligence \(AI\) Policy.*',
    r'Ethical Artificial Intelligence \(AI\) Policy.*',
    r'AI/ML Governance Policy.*',
    r'District of Columbia Government.*Office of the Chief Technology Officer.*',
    r'Acceptable Use Policy Page\s*\d+\s*of\s*\d+.*',
    r'City of Lebanon New Hampshire.*ADMINISTRATIVE POLICIES.*Use of Artificial Intelligence.*',
    r'Use of Artificial Intelligence.*ADM.*',
    r'City of Boston Interim.*Guidelines.*for Using Generative AI.*',
    r'Unset\s*',
    r'Sample\s*Use\s*Cases\s*',
    r'City of Long Beach Generative AI Guidance.*',
    r'City of San Jos\u00e9.*',
    r'City Administrative Policy Manual.*',
    r'ADMINISTRATIVE POLICIES\s*&\s*PROCEDURES\s+Use of Artificial Intelligence\s+City of Lebanon\s+New Hampshire.*',
    r'ADMINISTRATIVE POLICIES\s*&\s*PROCEDURES.*Use of Artificial Intelligence.*City of Lebanon.*New Hampshire.*Policy Number.*Effective Date.*Last Revision.*Page No\..*ADM\s*-\s*\d+.*Approved by:.*',
]
# A header must start at the beginning of a line (after optional whitespace)
# and run to an end of line: re.M makes ^ and $ line anchors, re.I ignores case.
HEADER_RX = re.compile(r'^\s*(?:' + '|'.join(HEADER_PATTERNS) + r')$', flags=re.I | re.M)
# A line that contains nothing but "Page <digits>" (e.g. "Page 1").
PAGE_RX   = re.compile(r'^\s*Page\s+\d+\s*$', flags=re.I | re.M)
# http(s) URLs, plus any run of non-whitespace characters containing "@".
URL_MAIL  = re.compile(r'https?://\S+|\S+@\S+', flags=re.I)
# The phrase "table of contents" and, because re.S lets "." match newlines,
# everything that follows it up to the end of the string.
TOC_RX    = re.compile(r'\btable of contents\b.*', flags=re.I | re.S)
# Remove Lebanon policy metadata fragments: one of the labels below plus the
# rest of the line it appears on.
# NOTE(review): the pattern is not anchored to a header, so body text that
# contains one of these labels also loses the remainder of that line.
POLICY_META_RX = re.compile(r'\b(?:policy\s+number|effective\s+date|last\s+revision|page\s+no\.?|adm\s*-?\s*\d+|approved\s+by:?)\b.*', flags=re.I)

def clean_text(t) -> str:
    """Reduce one page of extracted PDF text to lowercase, space-separated tokens.

    Args:
        t: Raw page text. ``None`` and float NaN are treated as empty text;
            any other value is converted with ``str()``.

    Returns:
        A single-line string containing only ASCII letters, digits and
        hyphens, separated by single spaces. Returns ``''`` for missing input.
    """
    # None and float NaN both mean "no text". Without the NaN check,
    # str(nan) would come out of the pipeline as the token 'nan'.
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t)

    # 1) Unicode normalization (ligatures, bullets, soft hyphens)
    t = normalize_unicode(t)

    # 2) Normalize newlines
    t = re.sub(r'[\r\n]+', '\n', t)

    # 3) Strip URLs and emails. This must happen before step 5: splitting
    #    inserts spaces inside tokens such as "page2" or "fooBar", after which
    #    the \S+ in URL_MAIL no longer spans the whole URL/address and the
    #    tail leaks into the output.
    t = URL_MAIL.sub(' ', t)

    # 4) Join spaced-out letters per line (Boston PDFs)
    t = '\n'.join(unsplit_spaced_words(line) for line in t.split('\n'))

    # 5) Split CamelCase/alnum boundaries per line (BEFORE lowercasing)
    t = '\n'.join(split_word_boundaries(line) for line in t.split('\n'))

    # 6) Remove known headers/banners
    t = HEADER_RX.sub(' ', t)

    # 7) Remove page footers like "Page 1"
    t = PAGE_RX.sub(' ', t)

    # 8) Drop "Table of Contents" blocks if extracted
    t = TOC_RX.sub(' ', t)

    # 9) Remove policy metadata fragments
    t = POLICY_META_RX.sub(' ', t)

    # 10) Keep ASCII alphanumerics + hyphen; lowercase. Every other character
    #     (punctuation, non-ASCII letters) becomes a space.
    t = re.sub(r'[^a-zA-Z0-9\s-]', ' ', t.lower())

    # 11) Collapse whitespace
    return re.sub(r'\s+', ' ', t).strip()


In [51]:
# Derive the cleaned column from the raw page text: missing values become '',
# every value is coerced to str, then clean_text is applied to each page.
df['cleaned_text'] = df['text'].fillna('').astype(str).map(clean_text)

df

Unnamed: 0,document,page,text,cleaned_text
0,SaltLakeCity.pdf,1,ARTICLE E. USE OF GENERATIVE ARTIFICIAL INTELL...,52-13 e-1 purpose this article provides guidan...
1,SaltLakeCity.pdf,2,stereotypes. Users must edit or disregard bias...,stereotypes users must edit or disregard biase...
2,Cambridge.pdf,1,City of Cambridge Guidelines on Using Generati...,intelligence ai information technology departm...
3,Cambridge.pdf,2,"Only use City-approved, authorized AI tools fo...",only use city-approved authorized ai tools for...
4,Cambridge.pdf,3,• Recommended Practice: To improve team learn...,recommended practice to improve team learning ...
5,Cambridge.pdf,4,"o Copilot Chat, a Microsoft Generative AI cha...",o copilot chat a microsoft generative ai chatb...
6,Cambridge.pdf,5,• City of Cambridge Written Information Secur...,city of cambridge written information security...
7,Cambridge.pdf,6,"o Test results, diagnoses, or treatment detai...",o test results diagnoses or treatment details ...
8,Cambridge.pdf,7,"used in limited non-sensitive, low-risk \ncirc...",used in limited non-sensitive low-risk circums...
9,LongBeach.pdf,1,Generative AI Guidance \n\nV 1.3 \n\nIntroduct...,generative ai guidance v 1 3 introduction gene...


In [52]:
# Spot-check the raw text of the row at position 51. The index is positional,
# so which page this shows depends on the row order of df.
df['text'].iloc[51]

"ADMINISTRATIVE POLICIES &  PROCEDURES\n\nUse of Artificial Intelligence\n\nCity of Lebanon\nNew Hampshire\n\nPolicy Number\n\nEffective Date\n\nLast Revision\n\nPage No. \n\nADM - 143\n\n12/ 19/ 23\n\nPage 2 of 7\n\nApproved by:  \n\ncode and emails.   ChatGPT- 3,  ChatGPT- 3.5, and ChatGPT- 4 are the current models\navailable to the public to use. \n\n3.6 Generative AI uses both computer algorithms and large volumes of data to\ncreate new content,  such as audio,  code,  images,  and videos.    \n\n3.7 Sensitive Data or Personally Identifying Information ( PII) (as defined in\nADM- 450 Securing Sensitive Information)  is information that is private and must be\nprotected. \n\n3.8 Technology Review Committee ( TRC) –  This committee consists of the\nCyber Services Director,  Chief Innovation Officer,  and Asset Manager.  The\ncommittee works across all departments to review and audit any technology, \nincluding but not limited to software,  technology,  and AI.  \n\nSection 4.0: Polic