In [18]:
import PyPDF2
import pandas as pd
import json
from pathlib import Path


# ---------- File paths ----------

# The notebook, the input PDF, and the generated workbook are all expected
# to sit side by side in one folder.
BASE_DIR = Path(".")  # current folder

# Both locations are derived from BASE_DIR, so relocating the project only
# requires editing the line above.
PDF_PATH = BASE_DIR.joinpath("Data_Input.pdf")
OUTPUT_EXCEL_PATH = BASE_DIR.joinpath("Output.xlsx")

# Echo the absolute locations so a wrong working directory is noticed early.
print("PDF path:", PDF_PATH.resolve())
print("Output will be saved as:", OUTPUT_EXCEL_PATH.resolve())


PDF path: C:\Users\Sunilraj\Desktop\Assignment_r\solution\Data_Input.pdf
Output will be saved as: C:\Users\Sunilraj\Desktop\Assignment_r\solution\Output.xlsx


In [19]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """
    Read every page of a PDF and return the combined text.

    Args:
        pdf_path: Location of the PDF. A plain string is accepted as well and
            is converted to a ``Path`` before use.

    Returns:
        The stripped text of each page that yielded any text, joined with a
        blank line between pages. An empty string when no page yielded text.

    Raises:
        FileNotFoundError: If nothing exists at ``pdf_path``.
    """
    # Coerce first so callers may pass either a str or a Path.
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

    all_text = []

    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        num_pages = len(reader.pages)
        print(f"Number of pages in PDF: {num_pages}")

        # Pages are numbered from 1 in the warning so it lines up with the
        # page count printed above and with what a PDF viewer shows.
        for page_number, page in enumerate(reader.pages, start=1):
            # Strip before testing: a page holding only whitespace counts as
            # empty and must not add a blank entry to the joined result.
            page_text = (page.extract_text() or "").strip()
            if page_text:
                all_text.append(page_text)
            else:
                print(f"Warning: No text extracted from page {page_number}")

    # Join pages with two newlines between them
    return "\n\n".join(all_text)


# ---- Extract the PDF text once, then show only its opening characters as a sanity check ----
pdf_text = extract_text_from_pdf(PDF_PATH)

# A single print call with a newline separator emits exactly what two
# consecutive print calls would: heading, blank line, then the preview.
print("First 500 characters of extracted text:\n", pdf_text[:500], sep="\n")

Number of pages in PDF: 1
First 500 characters of extracted text:

Vijay Kumar was born on March 15, 1989, in Jaipur, Rajasthan, making him 35 years old as of 2024. 
His birthdate is formatted as 1989 -03-15 in ISO format for easy parsing, while his age serves as a 
key demographic marker for analytical purposes. Born and ra ised in the Pink City of India, his 
birthplace provides valuable regional profiling context, and his O+ blood group is noted for 
emergency contact purposes. As an Indian national, his citizenship status is important for 
understanding his


In [20]:
import os
import google.generativeai as genai

# Read the API key from the environment so it never appears in the notebook.
# Surrounding whitespace (e.g. a trailing newline from a copied value) is
# removed so it cannot end up inside the key handed to the SDK.
api_key = os.getenv("GOOGLE_API_KEY", "").strip()

# An unset variable and an empty/blank one are both unusable; fail here with a
# clear message instead of letting the first API request fail later.
if not api_key:
    raise ValueError("GOOGLE_API_KEY is not set in your environment variables.")

genai.configure(api_key=api_key)

# Model name (Free + Powerful)
# NOTE(review): a later cell reassigns GEMINI_MODEL_NAME to
# "models/gemini-2.5-flash"; the extraction functions read the name when they
# are called, so that later value is the one they actually use.
GEMINI_MODEL_NAME = "gemini-1.5-flash"

print("Gemini API key loaded successfully.")
print("Using model:", GEMINI_MODEL_NAME)


Gemini API key loaded successfully.
Using model: gemini-1.5-flash


In [21]:
import google.generativeai as genai

# Materialise the listing into a list so it can be counted with len() and
# then walked in order.
models = list(genai.list_models())

print(f"Found {len(models)} models:\n")

for i, m in enumerate(models, start=1):
    # Attribute names differ between SDK versions, so each field is looked up
    # under its known spellings and the first truthy hit is used.
    name = getattr(m, "name", None) or getattr(m, "model", None) or getattr(m, "id", None)
    display_name = getattr(m, "displayName", None) or getattr(m, "display_name", None)
    description = getattr(m, "description", None)

    print(f"Model #{i}")
    print("  name        :", name)

    # Optional fields are printed only when they carry something.
    if display_name:
        print("  displayName :", display_name)
    if description:
        # Long descriptions are cut at 180 characters and marked with "...".
        if len(description) > 180:
            print("  description :", description[:180] + "...")
        else:
            print("  description :", description)

    # Not every SDK version exposes the supported methods on the model object.
    if hasattr(m, "supported_generation_methods"):
        print("  supported_generation_methods:", m.supported_generation_methods)

    print("-" * 60)


Found 57 models:

Model #1
  name        : models/embedding-gecko-001
  displayName : Embedding Gecko
  description : Obtain a distributed representation of a text.
  supported_generation_methods: ['embedText', 'countTextTokens']
------------------------------------------------------------
Model #2
  name        : models/gemini-2.5-flash
  displayName : Gemini 2.5 Flash
  description : Stable version of Gemini 2.5 Flash, our mid-size multimodal model that supports up to 1 million tokens, released in June of 2025.
  supported_generation_methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
------------------------------------------------------------
Model #3
  name        : models/gemini-2.5-pro
  displayName : Gemini 2.5 Pro
  description : Stable release (June 17th, 2025) of Gemini 2.5 Pro
  supported_generation_methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
-----------------------------------------------

In [22]:
# Cell: Robust Gemini extraction that enforces meaningful, non-empty COMMENTS (with fallback single-record comment generation)
# Paste & run this after you have configured genai (api key) and extracted pdf_text earlier.
import json
import google.generativeai as genai
from typing import List, Dict

# Choose model from your available list (the names printed by the list_models() cell).
# This reassigns GEMINI_MODEL_NAME, replacing the "gemini-1.5-flash" value set in the
# API-key cell. The functions below look the name up when they are called, so this is
# the value they use.
GEMINI_MODEL_NAME = "models/gemini-2.5-flash"

def _normalize_single_record(rec) -> Dict[str, str]:
    """
    Normalize a single record into keys: key, value, comments (strings).
    Accept alternate key names and ensure strings.
    """
    if isinstance(rec, str):
        return {"key": "Unlabeled Fact", "value": rec.strip(), "comments": ""}

    if not isinstance(rec, dict):
        return {"key": "Unlabeled Fact", "value": str(rec), "comments": ""}

    # map possible field names to canonical names
    lower_map = {k.lower(): k for k in rec.keys()}
    def find_field(candidates):
        for c in candidates:
            if c in lower_map:
                return lower_map[c]
        return None

    key_field = find_field(("key", "title", "label", "field"))
    value_field = find_field(("value", "val", "text", "content"))
    comments_field = find_field(("comments", "comment", "notes", "note", "context", "meta"))

    key = rec.get(key_field) if key_field else None
    value = rec.get(value_field) if value_field else None
    comments = rec.get(comments_field) if comments_field else ""

    # if the model put a sentence in 'key' by mistake, shift it into value
    if (value is None or value == "") and key is not None and isinstance(key, str) and len(key.split()) > 6:
        value = key
        key = "Unlabeled Fact"

    # ensure value exists (try to pick any string field)
    if value is None or value == "":
        for k in rec:
            if isinstance(rec[k], str) and rec[k].strip():
                value = rec[k]
                break
    if value is None:
        value = ""

    if key is None or str(key).strip() == "":
        short_key = " ".join(value.strip().split()[:3]) or "Unlabeled Fact"
        key = short_key if len(short_key) <= 40 else short_key[:40].strip()

    return {"key": str(key).strip(), "value": str(value).strip(), "comments": str(comments).strip()}

def _generate_single_comment_from_model(key: str, value: str, doc_text: str) -> str:
    """
    If the model omitted a comments field, call the model to generate a single
    concise, grounded comment for this (key, value) using only information present
    in doc_text. If no contextual info is present, return a safe fallback string.
    The returned text should be a short sentence (<= 30 words).
    """
    prompt = (
        "You will be given a document (source) and a single extracted fact (key + value). "
        "Produce exactly ONE concise sentence (comment) that explains the significance, "
        "purpose, or context of the VALUE using ONLY information present in the SOURCE. "
        "Do NOT invent or assume new facts. If the SOURCE contains no additional context "
        "for this value, return exactly: No additional context available.\n\n"
        "SOURCE:\n"
        f"\"\"\"{doc_text}\"\"\"\n\n"
        f"FACT KEY: {key}\n"
        f"FACT VALUE: {value}\n\n"
        "Output ONLY the single sentence comment (no JSON, no quotes)."
    )

    try:
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        resp = model.generate_content(prompt)
        raw = ""
        if hasattr(resp, "text") and resp.text:
            raw = resp.text.strip()
        elif hasattr(resp, "candidates") and resp.candidates:
            raw = resp.candidates[0].content.strip()
        else:
            raw = str(resp).strip()

        # sometimes the model may return code fences or extra whitespace - clean it
        if raw.startswith("```"):
            first = raw.find("```")
            last = raw.rfind("```")
            inner = raw[first+3:last].strip() if last > first else raw.strip("`").strip()
            raw = inner
        # ensure single-line, single sentence fallback if model returns multiple lines
        raw = raw.replace("\n", " ").strip()
        # final safety: if it looks like an instruction or too long, truncate politely
        if len(raw.split()) > 50:
            raw = " ".join(raw.split()[:45]) + "..."
        # if model returned empty, provide canonical fallback
        if raw == "":
            return "No additional context available."
        return raw
    except Exception:
        # On any error, return a safe fallback (keeps the pipeline moving)
        return "No additional context available."

def _extraction_response_text(resp) -> str:
    """Return the text carried by a model response, or "" when none can be found."""
    try:
        text = resp.text
    except (AttributeError, ValueError):
        # Some response objects lack `.text`, and the accessor may raise rather
        # than return when the response holds no usable text - presumably
        # ValueError in google-generativeai; TODO confirm against the installed SDK.
        text = None
    if isinstance(text, str) and text.strip():
        return text

    # Fall back to stitching together the text parts of the first candidate
    # that has any. The candidate's `content` is an object, not a string.
    for candidate in getattr(resp, "candidates", None) or []:
        content = getattr(candidate, "content", None)
        parts = getattr(content, "parts", None) or []
        joined = "".join(getattr(part, "text", "") or "" for part in parts)
        if joined.strip():
            return joined
    return ""


def _parse_extraction_json(text: str):
    """Decode the model output as JSON, salvaging an array wrapped in extra prose."""
    try:
        return json.loads(text)
    except json.JSONDecodeError as first_error:
        # Second chance: decode only the span from the first "[" to the last "]",
        # which recovers an array the model surrounded with explanation.
        start, end = text.find("["), text.rfind("]")
        if 0 <= start < end:
            try:
                return json.loads(text[start:end + 1])
            except json.JSONDecodeError:
                # The salvage attempt failed too; report the original error below.
                pass
        print("Failed to parse JSON. Raw output (first 3000 chars):\n")
        print(text[:3000])
        raise first_error


def call_llm_for_extraction_with_rich_comments(document_text: str, model=None) -> List[Dict[str, str]]:
    """
    Primary function to get atomic records with meaningful comments.

    1) Ask Gemini to produce a JSON array of objects {key, value, comments}.
    2) Normalize each returned record so the canonical keys exist.
    3) For any record with empty comments, run a small helper prompt to
       generate a comment grounded in the document.
    4) Return the normalized list, in which every record has non-empty
       'comments' (or the safe fallback sentence).

    Args:
        document_text: Full text of the document to structure.
        model: Optional object exposing ``generate_content(prompt)``. When
            omitted, a ``genai.GenerativeModel`` for ``GEMINI_MODEL_NAME`` is
            created.

    Returns:
        A list of dicts, each with string fields "key", "value", "comments".

    Raises:
        json.JSONDecodeError: If the model output cannot be decoded as JSON,
            even after trying to salvage an embedded array. The raw output is
            printed first.
        ValueError: If the decoded JSON is neither an array nor an object.
    """
    # Detailed prompt that enforces meaningful comments and examples
    system_instructions = (
        "You are an assistant that turns an unstructured document into a list of atomic facts. "
        "Return ONLY valid JSON — a single JSON array. Each element must be an object with fields: "
        "\"key\" (short label), \"value\" (original wording), \"comments\" (concise context/interpretation). "
        "CRITICAL: Comments must be meaningful, non-empty, and derived from the document text. "
        "Do NOT invent facts. Preserve original wording in 'value' and only paraphrase for 'key'."
    )

    # Provide explicit examples (few-shot) to show desired comment style
    few_shot_examples = """
Example (illustrative):

Source snippet:
"John Doe was born on January 2, 1990 in Mumbai. His birthdate is formatted as 1990-01-02 in ISO format."

Desired JSON elements:
[
  {
    "key": "Name",
    "value": "John Doe",
    "comments": "Subject's full name as listed in the profile."
  },
  {
    "key": "Date of Birth",
    "value": "January 2, 1990",
    "comments": "Exact date of birth used for age verification."
  },
  {
    "key": "Birth Date (ISO)",
    "value": "1990-01-02",
    "comments": "ISO formatted date included for machine parsing."
  },
  {
    "key": "Birth Place",
    "value": "Mumbai",
    "comments": "City of birth useful for regional demographic profiling."
  }
]
"""

    user_instructions = f"""
Here is the full document text to structure:

\"\"\"{document_text}\"\"\"

Return ONLY a JSON array with objects exactly like:
[
  {{
    "key": "...",
    "value": "...",
    "comments": "..."
  }},
  ...
]

NOTES on COMMENTS:
- Comments should be short (one sentence), descriptive, and explain the significance or context of the VALUE.
- Do NOT repeat the VALUE itself.
- Do NOT add facts not present in the source.
- If the source provides no additional context for a value, include a short neutral comment like: "No additional context available."
- Keys should be concise (2-4 words). Values must preserve original wording (except trivial spacing fixes).

Now output ONLY the JSON array. Do NOT include any extra text or explanation.
"""

    full_prompt = system_instructions + "\n\n" + few_shot_examples + "\n\n" + user_instructions

    # call model
    if model is None:
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
    resp = model.generate_content(full_prompt)

    # Extract raw text robustly
    raw_output = _extraction_response_text(resp).strip()

    # strip triple-backtick fences if present
    if raw_output.startswith("```"):
        last = raw_output.rfind("```")
        # With a closing fence take what lies between the two fences;
        # with only an opening fence just peel the backticks off.
        inner = raw_output[3:last].strip() if last > 0 else raw_output.strip("`").strip()
        # drop the language tag of a ```json fence
        if inner.lower().startswith("json"):
            inner = inner[len("json"):].strip()
        raw_output = inner

    parsed = _parse_extraction_json(raw_output)

    # If parsed is a dict, use the first list found among its values; else wrap
    # the dict itself as a single record.
    if isinstance(parsed, dict):
        found_list = next((v for v in parsed.values() if isinstance(v, list)), None)
        # Compare against None: an empty list is a valid "no records" answer
        # and must not be mistaken for "no list found".
        parsed = found_list if found_list is not None else [parsed]

    if not isinstance(parsed, list):
        raise ValueError("Model output is not a JSON array as expected.")

    # Normalize entries (ensure key, value, comments exist)
    normalized: List[Dict[str, str]] = []
    for rec in parsed:
        norm = _normalize_single_record(rec)
        # If comments empty, attempt to generate one grounded in document_text
        if str(norm.get("comments") or "").strip() == "":
            norm["comments"] = _generate_single_comment_from_model(norm["key"], norm["value"], document_text)
        # Final check: if the generated comment is still empty, put the fallback
        if str(norm.get("comments") or "").strip() == "":
            norm["comments"] = "No additional context available."
        normalized.append(norm)

    return normalized

# ---- Run the new function on your extracted PDF text (pdf_text must already exist) ----
# NOTE: this sends the whole document to the model, plus one extra request for every
# record that comes back without comments, so re-running the cell repeats those calls.
records_with_comments = call_llm_for_extraction_with_rich_comments(pdf_text)

# Sanity check: report how many records were produced and show the first six.
print("Extracted records (count):", len(records_with_comments))
print("First 6 records preview:")
for r in records_with_comments[:6]:
    print(r)


Extracted records (count): 62
First 6 records preview:
{'key': 'Name', 'value': 'Vijay Kumar', 'comments': "Subject's full name as listed in the profile."}
{'key': 'Date of Birth', 'value': 'March 15, 1989', 'comments': 'Exact date of birth provided in the document.'}
{'key': 'Birth City', 'value': 'Jaipur', 'comments': 'City of birth provides valuable regional profiling context.'}
{'key': 'Birth State', 'value': 'Rajasthan', 'comments': 'State of birth mentioned in the document.'}
{'key': 'Current Age', 'value': '35 years old', 'comments': 'Age as of 2024 serves as a key demographic marker for analytical purposes.'}
{'key': 'Age Reference Year', 'value': '2024', 'comments': 'The specific year for which the current age is referenced.'}


In [25]:
# Cell A: normalize small formatting issues, convert to DataFrame, and save cleaned Excel
import re
import pandas as pd
from pathlib import Path

def fix_iso_date_spacing(text: str) -> str:
    """
    Remove stray whitespace around the hyphens of ISO-style dates.

    Fixes patterns like '1989 -03-15' or '1989- 03-15' -> '1989-03-15'.

    Args:
        text: Text that may contain broken dates. Non-string input is
            returned unchanged.

    Returns:
        The text with every ``YYYY - MM - DD`` shaped group collapsed to
        ``YYYY-MM-DD``. Only the 4-2-2 digit shape is checked, not whether the
        month or day is a valid calendar value.
    """
    if not isinstance(text, str):
        return text
    # The lookarounds pin the match to whole digit groups, so the pattern
    # cannot latch onto the tail or head of a longer number such as
    # '12345 - 67 - 8901'.
    return re.sub(r'(?<!\d)(\d{4})\s*-\s*(\d{2})\s*-\s*(\d{2})(?!\d)', r'\1-\2-\3', text)

# Run normalization across all records from the extraction cell and rename the
# fields to the column headers used in the workbook.
normalized_records = []
for rec in records_with_comments:
    # `or ""` turns a missing or None field into an empty string, so one
    # incomplete record cannot crash the whole export on .strip().
    key = str(rec.get("key") or "").strip()
    value = fix_iso_date_spacing(str(rec.get("value") or "").strip())
    comments = fix_iso_date_spacing(str(rec.get("comments") or "").strip())
    normalized_records.append({"Key": key, "Value": value, "Comments": comments})

# Convert to DataFrame and replace any None/NaN with empty strings.
# Passing `columns` keeps the three headers even when there are no records.
df_clean = pd.DataFrame(normalized_records, columns=["Key","Value","Comments"]).fillna("")

# Optional: if you want to merge rows that are exact duplicates, uncomment:
# df_clean = df_clean.drop_duplicates().reset_index(drop=True)

# Save cleaned file to the location configured (and printed) in the first
# cell, so changing BASE_DIR there moves the output as well.
out_path = OUTPUT_EXCEL_PATH
df_clean.to_excel(out_path, index=False)

print(f"Saved cleaned output with {len(df_clean)} rows to: {out_path.resolve()}")
display(df_clean.head(12))


Saved cleaned output with 62 rows to: C:\Users\Sunilraj\Desktop\Assignment_r\solution\Output.xlsx


Unnamed: 0,Key,Value,Comments
0,Name,Vijay Kumar,Subject's full name as listed in the profile.
1,Date of Birth,"March 15, 1989",Exact date of birth provided in the document.
2,Birth City,Jaipur,City of birth provides valuable regional profi...
3,Birth State,Rajasthan,State of birth mentioned in the document.
4,Current Age,35 years old,Age as of 2024 serves as a key demographic mar...
5,Age Reference Year,2024,The specific year for which the current age is...
6,Birth Date (ISO),1989-03-15,ISO formatted birthdate included for easy pars...
7,Birthplace Context,Pink City of India,"Descriptive nickname for his birthplace, Jaipur."
8,Blood Group,O+,Blood group noted for emergency contact purposes.
9,Citizenship Status,Indian national,Citizenship status is important for understand...
