## Data Preprocessing

In [2]:
import fitz  # PyMuPDF
import re

pdf_path="ajol-file-journals_45_articles_245590_submission_proof_245590-529-589228-1-10-20230411.pdf"

In [3]:
def extract_title_footer_doi_keywords(pdf_path):
    doc = fitz.open(pdf_path)

    # -------- TITLE EXTRACTION --------
    first_page = doc[0]
    blocks = first_page.get_text("dict")["blocks"]

    title_candidates = []

    for block in blocks:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()
                font_size = span["size"]
                # Filter for longer text, avoid headers/metadata
                if text and len(text.split()) > 3 and not re.search(r'(journal|doi|volume|issue|page \d+)', text, re.IGNORECASE):
                    title_candidates.append((text, font_size, span["bbox"][1]))

    # Sort by font size descending, then y-position (top of page)
    title_candidates.sort(key=lambda x: (-x[1], x[2]))
    title = title_candidates[0][0] if title_candidates else "Unknown Title"

    # -------- CITATION INFO (Footer) EXTRACTION --------
    footer_blocks = first_page.get_text("blocks")
    footer_blocks.sort(key=lambda b: b[1], reverse=True)

    # Look for citation-style text among bottom blocks
    citation_candidates = [
        block[4].strip() for block in footer_blocks
        if re.search(r'(journal|volume|issue|vol\.|no\.|doi|issn)', block[4], re.IGNORECASE)
    ]

    citation_info = citation_candidates[0] if citation_candidates else (
        footer_blocks[0][4].strip() if footer_blocks else "Unknown Citation Info"
    )

    # -------- DOI + KEYWORDS --------
    text_to_search = ""
    for i in range(min(2, len(doc))):  # First 2 pages
        text_to_search += doc[i].get_text()

    # Extract DOI
    doi_match = re.search(r'\b(10\.\d{4,9}/[^\s"\'<>]*)', text_to_search)
    doi = f"http://dx.doi.org/{doi_match.group(1).rstrip('.,;')}" if doi_match else "DOI not found"

    # Extract keywords (supporting Keywords, Key words, Index Terms)
    keywords_match = re.search(r'(Keywords|Key words|Index Terms)\s[:\-–]?\s(.+)', text_to_search, re.IGNORECASE)
    if keywords_match:
        raw_keywords = keywords_match.group(2).split('\n')[0]  # Take only the first line in case of line breaks
        keywords = [kw.strip().strip('.') for kw in re.split(r',|;', raw_keywords) if kw.strip()]
    else:
        keywords = []

    return title, citation_info, doi, keywords

In [4]:
title, citation_info, doi, keywords = extract_title_footer_doi_keywords(pdf_path)
print("Title:", title)
print("Citation Info:", citation_info)
print("Doi:", doi)
print("Keywords:", keywords)

Title: Surgical androgen deprivation therapy in advanced prostate cancer in patients
Citation Info: African Health Sciences, Vol 23 Issue 1, March, 2023
483
Doi: http://dx.doi.org/10.4314/ahs.v23i1.50
Keywords: []


In [8]:
doc = fitz.open(pdf_path)
content = ""
for page in doc:
    content += page.get_text()

# Clean the text: remove multiple newlines and replace them with a single space
cleaned_content = re.sub(r'\s*\n\s*', ' ', content)
cleaned_content = re.sub(r'\s{2,}', ' ', cleaned_content).strip()

cleaned_content

## Testing API

In [2]:
import requests

url = "http://127.0.0.1:8000/api/v1/query"
payload = {
    "query": "Procedures for prostate cancer screening and diagnosis",
}

response = requests.post(url, json=payload)
print(response.status_code)
print(response.json())


200
{'answer': "Based on the provided context, here's a comprehensive answer regarding prostate cancer screening and diagnosis procedures in Africa:\n\n**Prostate Cancer Screening and Diagnosis Procedures in Africa**\n\nThe document highlights the current practices and challenges in prostate cancer screening and diagnosis across Africa. Key points include:\n\n1.  **Screening Modalities:**\n    *   Digital Rectal Examination (DRE): Commonly used for clinical assessment of the prostate.\n    *   Prostate-Specific Antigen (PSA) Assay: Available in almost all hospital centers surveyed and widely used for screening.  However, access may be limited by the availability of biomedical analysis laboratories and cost. The use of technologies such as MRI is beginning to have a place in prostate cancer screening and surveillance in high-income countries, however, PSA and DRE are almost always used for screening and diagnosing PCA in Africa.\n    *   The National Comprehensive Cancer Network (NCCN) 