In [8]:
# Notebook-friendly Corporate News PDF extractor
# Paste into a Jupyter notebook cell and run.
# Produces: ./corporate_news_notebook_output.xlsx

# If packages are missing, uncomment the pip install line and re-run the cell.
# !pip install pymupdf pandas openpyxl

from pathlib import Path
import fitz        # PyMuPDF
import re
import pandas as pd
from IPython.display import FileLink, display

# ---------- CONFIG ----------
# Input location handed to process_input(): a single .pdf file or a directory
# whose *.pdf files are all processed.
INPUT_PATH = Path("data/Mercury")                # change to a single PDF path or folder path in your notebook
# Excel workbook written at the end of the cell.
OUTPUT_XLSX = Path("corporate_news_notebook_output.xlsx")
# Text searched for (case-insensitively, as a substring) to find where the
# section starts on a page; lines containing it are never treated as titles.
SECTION_HEADING = "Corporate News"
# A line containing ":" is a title candidate only if it is shorter than this.
TITLE_MAX_LEN = 300
# A line is a title candidate when its largest font size is at least the
# median font size of the section multiplied by this factor.
FONT_SIZE_TITLE_FACTOR = 1.15
# Upper bound on word count for the "short header-like line" heuristics
# (section-end detection and title detection).
MAX_HEADER_LINE_WORDS = 12
# Amount subtracted from the next title's top y-coordinate to obtain the
# bottom edge of the current article segment.
ARTICLE_GAP_TOLERANCE = 4.0
# When True, a row whose title starts with a lowercase letter (optionally
# after an opening curly quote) is merged into the previous row.
MERGE_CONTINUATION_LOWERCASE = True
# When True, all links of an article are joined with ";"; otherwise only the
# first link is kept.
JOIN_MULTIPLE_LINKS = True
# Matches http:// or https:// followed by everything up to the first
# whitespace character or closing ")", "]" or "}".
URL_RE = re.compile(r'https?://[^\s\)\]\}]+')
# ----------------------------

def get_section_blocks_for_page(page, heading_text: str):
    """Return the text blocks of *page* that belong to the section named *heading_text*.

    The section starts at the first block whose text contains *heading_text*
    (case-insensitive substring match) and runs until a later block whose
    first line looks like another heading: a short line (word count within
    MAX_HEADER_LINE_WORDS, at most 140 characters) that is either mostly
    upper-case letters or title-cased.

    Returns:
        A list of block dicts starting with the heading block, or ``None``
        when no block on the page contains the heading text.
    """
    all_blocks = page.get_text("dict").get("blocks", [])

    def block_text(block):
        # Concatenate the spans of every line, one line per "\n".
        return "".join(
            "".join(piece.get("text", "") for piece in row.get("spans", [])) + "\n"
            for row in block.get("lines", [])
        )

    start = next(
        (
            pos
            for pos, block in enumerate(all_blocks)
            if heading_text.lower() in block_text(block).lower()
        ),
        None,
    )
    if start is None:
        return None

    # The heading block itself is always part of the section.
    collected = [all_blocks[start]]
    for block in all_blocks[start + 1:]:
        rows = block.get("lines")
        opening = ""
        if rows:
            opening = "".join(piece.get("text", "") for piece in rows[0].get("spans", []))
        trimmed = opening.strip()
        word_count = len(trimmed.split())
        if 0 < word_count <= MAX_HEADER_LINE_WORDS and len(trimmed) <= 140:
            alphabetic = [ch for ch in opening if ch.isalpha()]
            if alphabetic:
                upper_share = sum(1 for ch in alphabetic if ch.isupper()) / len(alphabetic)
            else:
                upper_share = 0
            # A header-like first line marks the start of the next section.
            if upper_share > 0.6 or opening.istitle():
                break
        collected.append(block)
    return collected

def lines_from_blocks(blocks):
    """Flatten block dicts into a list of non-empty text-line records.

    Each record has the keys ``text`` (span texts joined and stripped),
    ``y0`` / ``y1`` (second and fourth entries of the line's ``bbox``,
    defaulting to 0 when the line has no ``bbox``) and ``sizes`` (the truthy
    ``size`` values of the line's spans). Lines whose text is empty after
    stripping are dropped; blocks without ``lines`` contribute nothing.
    """
    def describe(raw_line):
        pieces = raw_line.get("spans", [])
        content = "".join(piece.get("text", "") for piece in pieces).strip()
        if not content:
            return None
        box = raw_line.get("bbox", [0, 0, 0, 0])
        return {
            "text": content,
            "y0": box[1],
            "y1": box[3],
            "sizes": [sz for sz in (piece.get("size", 0) for piece in pieces) if sz],
        }

    described = (
        describe(raw_line)
        for block in blocks
        for raw_line in block.get("lines", [])
    )
    return [record for record in described if record is not None]

def detect_title_indices(lines):
    """Return the indices of *lines* that look like article titles.

    A line is a candidate when it does not contain SECTION_HEADING and any of
    the following holds:

    * it contains ":" and is shorter than TITLE_MAX_LEN;
    * its largest font size is at least the section's median font size times
      FONT_SIZE_TITLE_FACTOR;
    * it has between 2 and MAX_HEADER_LINE_WORDS words and is title-cased or
      all upper-case.

    A candidate that directly follows the last *kept* index is dropped.
    Because the comparison is against the last kept index (not the last
    candidate), the third line of a run of three consecutive candidates is
    kept again.
    """
    ordered_sizes = sorted(size for entry in lines for size in entry["sizes"])
    typical_size = ordered_sizes[len(ordered_sizes) // 2] if ordered_sizes else 0

    def looks_like_title(text, font_sizes):
        if SECTION_HEADING.lower() in text.lower():
            return False
        if ":" in text and len(text) < TITLE_MAX_LEN:
            return True
        biggest = max(font_sizes) if font_sizes else 0
        if typical_size and biggest >= typical_size * FONT_SIZE_TITLE_FACTOR:
            return True
        word_count = len(text.split())
        return 1 < word_count <= MAX_HEADER_LINE_WORDS and (text.istitle() or text.isupper())

    kept = []
    for pos, entry in enumerate(lines):
        if not looks_like_title(entry["text"], entry["sizes"]):
            continue
        # Skip a candidate that sits immediately after the last kept title.
        if kept and pos == kept[-1] + 1:
            continue
        kept.append(pos)
    return kept

def build_segments(lines, title_idxs, section_bottom_y):
    """Turn title line indices into vertical article segments.

    Each segment is a dict with ``title_idx`` (index into *lines*), ``y0``
    (top of the title line) and ``y1`` (bottom edge). The bottom edge is the
    next title's top minus ARTICLE_GAP_TOLERANCE, or *section_bottom_y* for
    the last segment. Without any titles, the whole section becomes a single
    segment anchored on line 0; without lines either, the result is empty.
    """
    if not title_idxs:
        if lines:
            return [{"title_idx": 0, "y0": lines[0]["y0"], "y1": section_bottom_y}]
        return []

    final_pos = len(title_idxs) - 1
    segments = []
    for pos, line_idx in enumerate(title_idxs):
        if pos < final_pos:
            lower_edge = lines[title_idxs[pos + 1]]["y0"] - ARTICLE_GAP_TOLERANCE
        else:
            lower_edge = section_bottom_y
        segments.append({"title_idx": line_idx, "y0": lines[line_idx]["y0"], "y1": lower_edge})
    return segments

def collect_annotations(page):
    """Collect link targets found on *page* together with their rectangles.

    Sources, in order:

    1. ``page.get_links()`` entries, using their ``uri`` (or ``file``) value
       and their ``from`` rectangle.
    2. ``page.annots()`` entries whose ``info`` dict carries a ``uri`` or an
       ``A``/``URI`` value; any exception raised while walking annotations is
       swallowed (best effort), keeping what was gathered so far.

    Returns:
        A list of ``{"uri": ..., "rect": ...}`` dicts with only the first
        occurrence of each URI retained.
    """
    gathered = []
    for link in page.get_links():
        target = link.get("uri") or link.get("file")
        if not target:
            continue
        gathered.append({"uri": target, "rect": link.get("from")})

    try:
        for annot in page.annots():
            if not annot:
                continue
            meta = annot.info
            target = meta.get("uri") or (meta.get("A") or {}).get("URI")
            if target:
                gathered.append({"uri": target, "rect": annot.rect})
    except Exception:
        # Annotation access is optional; keep whatever was collected.
        pass

    # Insertion-ordered dict keeps the first entry seen for every URI.
    first_by_uri = {}
    for item in gathered:
        first_by_uri.setdefault(item["uri"], item)
    return list(first_by_uri.values())

def locate_inline_urls(lines):
    """Find URLs written out in the text of *lines*.

    Returns one dict per URL_RE match with the matched ``uri``, the ``y0`` /
    ``y1`` of the line it was found on, and that line's index as
    ``line_idx``. Matches are ordered by line, then by position in the line.
    """
    return [
        {"uri": hit.group(0), "y0": entry["y0"], "y1": entry["y1"], "line_idx": pos}
        for pos, entry in enumerate(lines)
        for hit in URL_RE.finditer(entry["text"])
    ]

def map_links_to_segments(segments, annotations, text_urls):
    """Assign every link to an article segment by vertical position.

    Args:
        segments: dicts with ``y0`` / ``y1`` vertical bounds.
        annotations: dicts with ``uri`` and ``rect``; ``rect`` may be a
            list/tuple of at least four numbers or an object exposing
            ``y0`` / ``y1``. Entries with a falsy or unrecognised rect are
            ignored.
        text_urls: dicts with ``uri``, ``y0`` and ``y1``.

    Returns:
        ``{segment_index: [uri, ...]}`` with an entry for every segment. A
        link goes to the first segment whose bounds (widened by 0.5) contain
        its vertical centre, otherwise to the segment with the nearest edge.
        A URI is listed at most once per segment.
    """
    links_by_segment = {pos: [] for pos in range(len(segments))}

    def remember(pos, uri):
        bucket = links_by_segment[pos]
        if uri not in bucket:
            bucket.append(uri)

    def place(y, uri):
        for pos, seg in enumerate(segments):
            if seg["y0"] - 0.5 <= y <= seg["y1"] + 0.5:
                remember(pos, uri)
                return
        if not segments:
            return
        # Nothing contains y: fall back to the segment with the closest edge.
        gaps = [min(abs(y - seg["y0"]), abs(y - seg["y1"])) for seg in segments]
        remember(gaps.index(min(gaps)), uri)

    def centre_y(rect):
        if isinstance(rect, (list, tuple)) and len(rect) >= 4:
            return (rect[1] + rect[3]) / 2.0
        if hasattr(rect, "y0"):
            return (rect.y0 + rect.y1) / 2.0
        return None

    for annot in annotations:
        rect = annot.get("rect")
        if not rect:
            continue
        middle = centre_y(rect)
        if middle is None:
            continue
        place(middle, annot["uri"])

    for hit in text_urls:
        place((hit["y0"] + hit["y1"]) / 2.0, hit["uri"])

    return links_by_segment

def _merge_continuation_rows(rows):
    """Fold rows that look like continuations into the row preceding them.

    A row is a continuation when its title is empty, when
    MERGE_CONTINUATION_LOWERCASE is set and the title starts with a lowercase
    letter (optionally after an opening curly quote), or when the title has
    more than 40 words and no ":". The continuation's title and description
    are appended to the previous row's description; its link is adopted only
    if the previous row has none. The first row is never merged. Rows are
    copied, so the input dicts are left untouched.
    """
    merged = []
    for row in rows:
        title = row["title"].strip()
        desc = row["description"].strip()
        if not merged:
            merged.append(row.copy())
            continue

        if not title:
            is_continuation = True
        else:
            starts_lower = title[0].islower() or (
                title.startswith('“') and len(title) > 1 and title[1].islower()
            )
            overlong = len(title.split()) > 40 and ":" not in title
            is_continuation = bool((MERGE_CONTINUATION_LOWERCASE and starts_lower) or overlong)

        if not is_continuation:
            merged.append(row.copy())
            continue

        prev = merged[-1]
        prev["description"] = (prev.get("description", "") + " " + (title + " " + desc).strip()).strip()
        if row.get("link") and not prev.get("link"):
            prev["link"] = row.get("link")
    return merged


def assemble_articles_from_page(page, pdf_name):
    """Extract the articles of the SECTION_HEADING section on one page.

    Args:
        page: page object passed on to get_section_blocks_for_page() and
            collect_annotations(); its ``number`` attribute is used for the
            1-based ``page_number``.
        pdf_name: file name recorded in each row's ``source_pdf``.

    Returns:
        A list of row dicts with the keys ``title``, ``description``,
        ``link``, ``source_pdf`` and ``page_number``; empty when the page has
        no such section or the section has no text lines.
    """
    section_blocks = get_section_blocks_for_page(page, SECTION_HEADING)
    if not section_blocks:
        return []
    lines = lines_from_blocks(section_blocks)
    if not lines:
        return []

    # No default here: a truthy placeholder bbox would make the fallback to
    # the last text line unreachable and pin the section bottom to 0.
    last_bbox = section_blocks[-1].get("bbox")
    section_bottom_y = last_bbox[3] if last_bbox else lines[-1]["y1"]

    title_idxs = detect_title_indices(lines)
    segments = build_segments(lines, title_idxs, section_bottom_y)
    annots = collect_annotations(page)
    text_urls = locate_inline_urls(lines)
    seg_links = map_links_to_segments(segments, annots, text_urls)

    heading_lc = SECTION_HEADING.lower()
    rows = []
    for i, seg in enumerate(segments):
        title_line_idx = seg["title_idx"] if seg["title_idx"] < len(lines) else 0
        title = lines[title_line_idx]["text"].strip()
        if heading_lc in title.lower():
            # The segment is anchored on the section heading itself (no title
            # was detected); use the first non-heading line as the title.
            title = next((l["text"] for l in lines if heading_lc not in l["text"].lower()), title)

        # Description = every other line whose vertical centre lies inside
        # the segment (with a 0.1 tolerance on both edges).
        desc_parts = [
            l["text"]
            for li, l in enumerate(lines)
            if li != title_line_idx
            and seg["y0"] - 0.1 <= (l["y0"] + l["y1"]) / 2.0 <= seg["y1"] + 0.1
        ]
        description = " ".join(desc_parts).strip()

        links = seg_links.get(i, [])
        if not links:
            link_field = ""
        elif JOIN_MULTIPLE_LINKS:
            link_field = ";".join(links)
        else:
            link_field = links[0]

        rows.append({
            "title": title,
            "description": description,
            "link": link_field,
            "source_pdf": pdf_name,
            "page_number": page.number + 1,
        })

    return _merge_continuation_rows(rows)

def process_input(input_path):
    """Extract article rows from one PDF or from every ``*.pdf`` in a folder.

    Args:
        input_path: path (``str`` or ``Path``) of a ``.pdf`` file or of a
            directory; directory entries are processed in sorted order.

    Returns:
        The rows produced by assemble_articles_from_page() for every page,
        with empty titles dropped and only the first row kept for each title
        (compared stripped and case-insensitively).

    Raises:
        FileNotFoundError: if *input_path* is neither an existing ``.pdf``
            file nor a directory.
    """
    input_path = Path(input_path)
    if input_path.is_file() and input_path.suffix.lower() == ".pdf":
        pdfs = [input_path]
    elif input_path.is_dir():
        pdfs = sorted(input_path.glob("*.pdf"))
    else:
        raise FileNotFoundError(f"Input {input_path} not found or not a PDF/directory")

    all_rows = []
    for pdf in pdfs:
        # Context manager closes the document even if a page fails to parse.
        with fitz.open(str(pdf)) as doc:
            for page in doc:
                all_rows.extend(assemble_articles_from_page(page, pdf.name))

    # dedupe by title
    seen = set()
    final = []
    for row in all_rows:
        key = (row["title"] or "").strip().lower()
        if not key or key in seen:
            continue
        seen.add(key)
        final.append(row)
    return final

# --- Run extraction ---
# Normalise to Path so the fallback below also works when INPUT_PATH was
# edited to a plain string.
input_path = Path(INPUT_PATH)  # edit INPUT_PATH to the folder or single file you want to process
print("Processing:", input_path)
rows = process_input(input_path)

# If no links were found for any row, as a last-resort fallback attempt to map urls in whole pdf by order
if rows and not any(r["link"] for r in rows):
    # fallback: gather all urls in all pdfs, assign by order.
    # PDFs are visited in sorted order, matching the order in which
    # process_input() produced the rows; an unsorted glob would pair rows
    # with URLs from the wrong document.
    pdf_paths = sorted(input_path.glob("*.pdf")) if input_path.is_dir() else [input_path]
    urls_all = {}  # insertion-ordered set of distinct URLs
    for p in pdf_paths:
        with fitz.open(str(p)) as doc:
            text_all = "".join(page.get_text() for page in doc)
        for u in URL_RE.findall(text_all):
            urls_all.setdefault(u, None)
    # zip stops at the shorter sequence: rows beyond the URL count keep "".
    for r, u in zip(rows, urls_all):
        r["link"] = u

# Save to Excel
df = pd.DataFrame(rows, columns=["title", "description", "link", "source_pdf", "page_number"])
df.to_excel(OUTPUT_XLSX, index=False)
print(f"Saved {len(df)} rows to {OUTPUT_XLSX.resolve()}")

# Provide download link for notebook users
display(FileLink(str(OUTPUT_XLSX.resolve())))
# Last expression of the cell: the notebook renders the first 20 rows.
df.head(20)


Processing: data\Mercury
Saved 342 rows to C:\Users\gooyt\Desktop\ai-news-agent\corporate_news_notebook_output.xlsx


Unnamed: 0,title,description,link,source_pdf,page_number
0,Ni Hsin: Explore acquisition of Hy-Fresh poult...,Ni Hsin Group Bhd on Monday signed a heads of ...,https://theedgemalaysia.com/node/760744;https:...,Market-Watch-20250701.pdf,3
1,MyEG: Changes name to Zetrix AI effective July 3,MyEG Services Bhd on Monday announced its name...,https://theedgemalaysia.com/node/760782,Market-Watch-20250701.pdf,3
2,Genting Malaysia: Submits bid for New York cas...,Genting Malaysia Bhd announced that a formal b...,https://theedgemalaysia.com/node/760781,Market-Watch-20250701.pdf,3
3,Reservoir Link: Bags Petronas Carigali contrac...,Reservoir Link Energy Bhd said on Monday that ...,https://theedgemalaysia.com/node/760767,Market-Watch-20250701.pdf,3
4,Yinson: Pause major investments after another ...,Yinson Holdings Bhd said it would hold off maj...,https://theedgemalaysia.com/node/760773,Market-Watch-20250701.pdf,3
5,Avangaad: Secures RM66.8m tugboat charter and ...,Marine transportation and offshore storage com...,https://theedgemalaysia.com/node/760891;https:...,Market-Watch-20250702.pdf,3
6,Gamuda: Signs deal to co-develop renewable ene...,Gamuda Bhd has signed an agreement with the Do...,https://theedgemalaysia.com/node/760945,Market-Watch-20250702.pdf,3
7,Cape EMS: Appoints UHY Malaysia PLT as externa...,Cape EMS Bhd has appointed UHY Malaysia PLT as...,https://theedgemalaysia.com/node/760955,Market-Watch-20250702.pdf,3
8,Velesto: Secures US$40m drilling contract from...,Velesto Energy has secured a US$40m (RM188m) d...,https://theedgemalaysia.com/node/761015,Market-Watch-20250702.pdf,3
9,Ayer Holdings: Former Kerjaya Prospek Property...,Property and plantation outfit Ayer Holdings B...,https://theedgemalaysia.com/node/760996,Market-Watch-20250702.pdf,3
