In [1]:
# Install dependencies
!pip install pymupdf pandas --quiet

# Import libraries
from google.colab import files
import fitz  # PyMuPDF
import pandas as pd
import re

# Helper: parse PDF metadata date strings (D:YYYYMMDD...)
def parse_pdf_date(dstr):
    """Convert a PDF metadata date string into ``DD-MM-YYYY``.

    PDF date strings look like ``D:YYYYMMDDHHmmSS+HH'mm'``. Only the leading
    year, month and day are used; any time or timezone suffix is ignored.
    The ``D:`` prefix is accepted but not required, because some PDF
    producers omit it.

    Args:
        dstr: Raw metadata value (e.g. the ``creationDate`` entry). May be
            ``None`` or empty.

    Returns:
        The date as a ``"DD-MM-YYYY"`` string, or ``None`` when ``dstr`` is
        missing, is not a string, does not start with an eight-digit date,
        or names an impossible calendar date (e.g. month 13).
    """
    # Function-scope import so this helper does not depend on any
    # module-level name that other notebook code might rebind.
    import datetime

    if not dstr or not isinstance(dstr, str):
        return None

    m = re.match(r"\s*(?:D:)?(\d{4})(\d{2})(\d{2})", dstr)
    if not m:
        return None

    y, mth, d = m.groups()
    try:
        # Constructing a date is the simplest way to reject values such as
        # month 13 or 30 February that the regex alone lets through.
        datetime.date(int(y), int(mth), int(d))
    except ValueError:
        return None
    return f"{d}-{mth}-{y}"

# Upload one or more BNM PDF files. Iterating the returned mapping yields the
# uploaded filenames, which are then opened from the working directory.
uploaded = files.upload()

# Title keywords mapped to document types, checked in order. "exposure draft"
# is tested first so that an exposure draft *of* a policy document or
# guideline is not misclassified as the final document.
DOC_TYPE_KEYWORDS = (
    ("exposure draft", "Exposure Draft"),
    ("guideline", "Guideline"),
    ("policy", "Policy"),
)

# Textual date such as "25 March 2024". The month must be alphabetic so that
# runs of plain numbers (e.g. "12 345 2024") are not mistaken for a date.
TEXT_DATE_RE = re.compile(r'\b(\d{1,2}\s+[A-Za-z]+\s+\d{4})\b')

records = []
for fname in uploaded:
    # The context manager releases the document handle even if extraction
    # fails part-way through.
    with fitz.open(fname) as doc:
        # PyMuPDF documents metadata as "a dict or None"; normalise to a dict.
        meta = doc.metadata or {}

        title = (meta.get('title') or '').strip()
        date = parse_pdf_date(meta.get('creationDate')) or parse_pdf_date(meta.get('modDate'))

        # Read the first page at most once, and only when a fallback needs
        # it. A zero-page document simply yields no fallback text.
        first_page = ''
        if (not title or not date) and doc.page_count:
            first_page = doc[0].get_text()

    # Title fallback: first non-empty line of the first page, else filename.
    if not title:
        title = next(
            (ln.strip() for ln in first_page.splitlines() if ln.strip()),
            fname,
        )

    # Date fallback: first "D Month YYYY" style date printed on the first page.
    # NOTE: this keeps the text as printed (e.g. "25 March 2024"), whereas
    # metadata dates are formatted DD-MM-YYYY.
    if not date:
        m = TEXT_DATE_RE.search(first_page)
        date = m.group(1) if m else None

    # Agency (static)
    agency = "Bank Negara Malaysia"

    # Document Type (heuristic from title); "Document" when no keyword matches.
    ttl = title.lower()
    doc_type = next(
        (label for keyword, label in DOC_TYPE_KEYWORDS if keyword in ttl),
        "Document",
    )

    # Append the extracted fields for this file.
    records.append({
        "Filename": fname,
        "Title": title,
        "Date": date,
        "Agency": agency,
        "Document Type": doc_type,
    })

# Create DataFrame & display (a bare trailing expression renders in a notebook)
df = pd.DataFrame(records)
df


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/24.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/24.1 MB[0m [31m20.8 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/24.1 MB[0m [31m70.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/24.1 MB[0m [31m109.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m18.3/24.1 MB[0m [31m213.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[0m [31m238.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[0m [31m238.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [3

Saving ar2023_en_book.pdf to ar2023_en_book.pdf


Unnamed: 0,Filename,Title,Date,Agency,Document Type
0,ar2023_en_book.pdf,Annual Report 2023,25-03-2024,Bank Negara Malaysia,Document
